Example #1
  def recover_session(self, master, saver=None, checkpoint_dir=None,
                      wait_for_checkpoint=False, max_wait_secs=7200,
                      config=None):
    """Creates a `Session`, recovering if possible.

    Creates a new session on 'master'.  If the session is not initialized
    and can be recovered from a checkpoint, recover it.

    Args:
      master: `String` representation of the TensorFlow master to use.
      saver: A `Saver` object used to restore a model.
      checkpoint_dir: Path to the checkpoint files.
      wait_for_checkpoint: Whether to wait for checkpoint to become available.
      max_wait_secs: Maximum time to wait for checkpoints to become available.
      config: Optional `ConfigProto` proto used to configure the session.

    Returns:
      A pair (sess, initialized) where 'initialized' is `True` if
      the session could be recovered, `False` otherwise.
    """
    self._target = master
    sess = session.Session(self._target, graph=self._graph, config=config)
    if self._local_init_op:
      sess.run([self._local_init_op])

    # If either saver or checkpoint_dir is not specified, cannot restore. Just
    # return.
    if not saver or not checkpoint_dir:
      not_ready = self._model_not_ready(sess)
      return sess, not_ready is None

    # Waits up to max_wait_secs for a checkpoint to become available.
    wait_time = 0
    ckpt = saver_mod.get_checkpoint_state(checkpoint_dir)
    while not ckpt or not ckpt.model_checkpoint_path:
      if wait_for_checkpoint and wait_time < max_wait_secs:
        logging.info("Waiting for checkpoint to be available.")
        time.sleep(self._recovery_wait_secs)
        wait_time += self._recovery_wait_secs
        ckpt = saver_mod.get_checkpoint_state(checkpoint_dir)
      else:
        return sess, False

    # Loads the checkpoint and verifies that it makes the model ready.
    saver.restore(sess, ckpt.model_checkpoint_path)
    last_checkpoints = []
    for fname in ckpt.all_model_checkpoint_paths:
      fnames = gfile.Glob(fname)
      if fnames:
        mtime = gfile.Stat(fnames[0]).mtime
        last_checkpoints.append((fname, mtime))
    saver.set_last_checkpoints_with_time(last_checkpoints)
    not_ready = self._model_not_ready(sess)
    if not_ready:
      logging.info("Restoring model from %s did not make model ready: %s",
                   ckpt.model_checkpoint_path, not_ready)
      return sess, False
    else:
      logging.info("Restored model from %s", ckpt.model_checkpoint_path)
      return sess, True
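A brief usage sketch of the method above. It is assumed here that this is tf.train.SessionManager.recover_session (TF 1.x style API); the variable, checkpoint directory, and the empty in-process master string are hypothetical placeholders.

import tensorflow as tf

# Hypothetical usage: build a graph with a Saver, then try to recover a
# session from an existing checkpoint directory.
with tf.Graph().as_default() as graph:
    counter = tf.Variable(0, name="counter")  # stand-in for a real model
    saver = tf.train.Saver()
    sm = tf.train.SessionManager(graph=graph)
    sess, initialized = sm.recover_session(
        master="",                        # empty string = in-process master
        saver=saver,
        checkpoint_dir="/tmp/my_model",   # hypothetical path
        wait_for_checkpoint=False)
    if not initialized:
        sess.run(tf.global_variables_initializer())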
Example #2
    def test_file_operations(self):
        """Test file operations"""

        f = get_oss_path("test_file_operations")
        self.assertFalse(gfile.Exists(f))

        fh = gfile.Open(f, mode="w")
        content = "file content"
        fh.write(content)
        fh.close()
        self.assertTrue(gfile.Exists(f))

        fh = gfile.Open(f)
        self.assertEqual(fh.read(), content)

        self.assertEqual(gfile.Stat(f).length, len(content))

        f2 = get_oss_path("test_file_2")
        gfile.Rename(f, f2)
        self.assertFalse(gfile.Exists(f))
        self.assertTrue(gfile.Exists(f2))

        f3 = get_oss_path("test_file_3")
        gfile.Copy(f2, f3, overwrite=True)
        self.assertTrue(gfile.Exists(f3))
Example #3
    def test_dir_operations(self):
        """ Test directory operations"""

        d = get_oss_path("d1/d2")
        gfile.MakeDirs(d)
        self.assertTrue(gfile.Stat(d).is_directory)

        # Test listing bucket directory with and without trailing '/'
        content = gfile.ListDirectory("oss://" + bucket)
        content_s = gfile.ListDirectory("oss://" + bucket + "/")
        self.assertEqual(content, content_s)
        self.assertIn("oss_fs_test", content)
        self.assertIn("oss_fs_test/d1", content)
        self.assertIn("oss_fs_test/d1/d2", content)

        # Test listing test directory with and without trailing '/'
        content = gfile.ListDirectory("oss://" + bucket + "/oss_fs_test")
        content_s = gfile.ListDirectory("oss://" + bucket + "/oss_fs_test/")
        self.assertEqual(content, content_s)
        self.assertIn("d1", content)
        self.assertIn("d1/d2", content)

        # Test listing sub directories.
        content = gfile.ListDirectory(get_oss_path("d1"))
        content_s = gfile.ListDirectory(get_oss_path("d1/"))
        self.assertEqual(content, content_s)
        self.assertIn("d2", content)

        content = gfile.ListDirectory(get_oss_path("d1/d2"))
        content_s = gfile.ListDirectory(get_oss_path("d1/d2/"))
        self.assertEqual(content, content_s)
        self.assertEqual([], content)
Example #4
  def testStat(self):
    with gfile.GFile(self.tmp + "test_stat", "w"):
      pass
    creation_time = time.time()
    statinfo = gfile.Stat(self.tmp + "test_stat")
    # Test that the modification timestamp is within 10 seconds (either side)
    # of closing the file.
    self.assertLessEqual(statinfo.mtime, creation_time + 10)
    self.assertGreaterEqual(statinfo.mtime, creation_time - 10)
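For reference, a small sketch of the statistics object returned by a Stat call, written with the TF 2.x tf.io.gfile.stat spelling; attribute names have varied a bit across versions (the examples in this list use length, mtime, mtime_nsec, and is_directory), so treat the field list below as indicative rather than exhaustive. The path is hypothetical.

import tensorflow as tf

def describe(path):
    # Collect the commonly used fields of the FileStatistics result.
    info = tf.io.gfile.stat(path)
    return {
        "length_bytes": info.length,        # file size in bytes
        "mtime_nsec": info.mtime_nsec,      # last-modified time, nanoseconds
        "is_directory": info.is_directory,  # True for directories
    }

print(describe("/tmp/test_stat"))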
Example #5
def _load_debugged_source_file(file_path, source_file_proto):
    file_stat = gfile.Stat(file_path)
    source_file_proto.host = socket.gethostname()
    source_file_proto.file_path = file_path
    source_file_proto.last_modified = file_stat.mtime_nsec
    source_file_proto.bytes = file_stat.length
    try:
        with gfile.Open(file_path, "r") as f:
            source_file_proto.lines.extend(f.read().splitlines())
    except IOError:
        pass
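A minimal way to exercise _load_debugged_source_file without depending on the actual DebuggedSourceFile protocol buffer: a SimpleNamespace with the same field names (which simply mirror the assignments above, not a real proto definition) works as a stand-in.

from types import SimpleNamespace

# Stand-in for the proto: any object with these attributes will do, since the
# function only assigns scalar fields and extends `lines`.
proto_like = SimpleNamespace(host="", file_path="", last_modified=0,
                             bytes=0, lines=[])
_load_debugged_source_file(__file__, proto_like)  # read this script itself
print(proto_like.host, proto_like.bytes, len(proto_like.lines))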
Example #6
    def __init__(self, dump_root, debug_dump_rel_path):
        """`DebugTensorDatum` constructor.

    Args:
      dump_root: (`str`) Debug dump root directory.
      debug_dump_rel_path: (`str`) Path to a debug dump file, relative to the
          `dump_root`. For example, suppose the debug dump root
          directory is `/tmp/tfdbg_1` and the dump file is at
          `/tmp/tfdbg_1/ns_1/node_a_0_DebugIdentity_123456789`, then
          the value of the debug_dump_rel_path should be
          `ns_1/node_a_0_DebugIdenity_1234456789`.

    Raises:
      ValueError: If the base file name of the dump file does not conform to
        the dump file naming pattern:
        `node_name`_`output_slot`_`debug_op`_`timestamp`
    """

        base = os.path.basename(debug_dump_rel_path)

        if base.count("_") < 3:
            raise ValueError(
                "Dump file path does not conform to the naming pattern: %s" %
                base)

        # TODO(cais): Add hostname and pid to support dumps from distributed
        #             sessions.

        self._extended_timestamp = base.split("_")[-1]
        # It may include an index suffix at the end if file path collision happened
        # due to identical timestamps.
        if "-" in self._extended_timestamp:
            self._timestamp = int(
                self._extended_timestamp[:self._extended_timestamp.find("-")])
        else:
            self._timestamp = int(self._extended_timestamp)

        self._debug_op = base.split("_")[-2]
        self._output_slot = int(base.split("_")[-3])

        namespace = os.path.dirname(debug_dump_rel_path).replace("\\", "/")
        node_base_name = "_".join(base.split("_")[:-3])
        if not namespace or namespace == ".":
            self._node_name = node_base_name
        else:
            self._node_name = namespace + "/" + node_base_name

        self._file_path = os.path.join(dump_root, debug_dump_rel_path)
        self._dump_size_bytes = (gfile.Stat(self._file_path).length
                                 if gfile.Exists(self._file_path) else None)

        self._run_fetches_info = None
        self._run_feed_keys_info = None
Example #7
    def test_dir_operations(self):
        """ Test directory operations"""

        d = get_oss_path("d1/d2/d3/d4")
        gfile.MakeDirs(d)
        self.assertTrue(gfile.Stat(d).is_directory)

        # Test listing bucket directory with and without trailing '/'
        content = gfile.ListDirectory(
            "oss://%s\x01id=%s\x02key=%s\x02host=%s" %
            (bucket, access_id, access_key, host))
        content_s = gfile.ListDirectory(
            "oss://%s\x01id=%s\x02key=%s\x02host=%s/" %
            (bucket, access_id, access_key, host))
        self.assertEqual(content, content_s)
        self.assertIn("oss_fs_test", content)
        self.assertIn("oss_fs_test/d1", content)
        self.assertIn("oss_fs_test/d1/d2", content)

        # Test listing test directory with and without trailing '/'
        content = gfile.ListDirectory(
            "oss://%s\x01id=%s\x02key=%s\x02host=%s" %
            (bucket, access_id, access_key, host) + "/oss_fs_test")
        content_s = gfile.ListDirectory(
            "oss://%s\x01id=%s\x02key=%s\x02host=%s" %
            (bucket, access_id, access_key, host) + "/oss_fs_test/")
        self.assertEqual(content, content_s)
        self.assertIn("d1", content)
        self.assertIn("d1/d2", content)

        # Test listing sub directories.
        content = gfile.ListDirectory(get_oss_path("d1"))
        content_s = gfile.ListDirectory(get_oss_path("d1/"))
        self.assertEqual(content, content_s)
        self.assertIn("d2", content)

        content = gfile.ListDirectory(get_oss_path("d1/d2/d3/d4"))
        content_s = gfile.ListDirectory(get_oss_path("d1/d2/d3/d4/"))
        self.assertEqual(content, content_s)
        self.assertEqual([], content)

        # Test Rename directories
        self.assertTrue(gfile.Exists(get_oss_path("d1")))
        gfile.Rename(get_oss_path("d1"),
                     get_oss_path("rename_d1"),
                     overwrite=True)
        self.assertTrue(gfile.Exists(get_oss_path("rename_d1")))
        self.assertFalse(gfile.Exists(get_oss_path("d1")))

        content = gfile.ListDirectory(get_oss_path("rename_d1"))
        content_s = gfile.ListDirectory(get_oss_path("rename_d1/"))
        self.assertEqual(content, content_s)
        self.assertIn("d2", content)
Example #8
    def _restore_checkpoint(self,
                            master,
                            saver=None,
                            checkpoint_dir=None,
                            wait_for_checkpoint=False,
                            max_wait_secs=7200,
                            config=None):
        """Creates a `Session`, and tries to restore a checkpoint.


    Args:
      master: `String` representation of the TensorFlow master to use.
      saver: A `Saver` object used to restore a model.
      checkpoint_dir: Path to the checkpoint files.
      wait_for_checkpoint: Whether to wait for checkpoint to become available.
      max_wait_secs: Maximum time to wait for checkpoints to become available.
      config: Optional `ConfigProto` proto used to configure the session.

    Returns:
      A pair (sess, is_restored) where 'is_restored' is `True` if
      the session could be restored, `False` otherwise.
    """
        self._target = master
        sess = session.Session(self._target, graph=self._graph, config=config)

        # If either saver or checkpoint_dir is not specified, cannot restore. Just
        # return.
        if not saver or not checkpoint_dir:
            return sess, False

        # Waits up to max_wait_secs for a checkpoint to become available.
        wait_time = 0
        ckpt = saver_mod.get_checkpoint_state(checkpoint_dir)
        while not ckpt or not ckpt.model_checkpoint_path:
            if wait_for_checkpoint and wait_time < max_wait_secs:
                logging.info("Waiting for checkpoint to be available.")
                time.sleep(self._recovery_wait_secs)
                wait_time += self._recovery_wait_secs
                ckpt = saver_mod.get_checkpoint_state(checkpoint_dir)
            else:
                return sess, False

        # Loads the checkpoint.
        saver.restore(sess, ckpt.model_checkpoint_path)
        last_checkpoints = []
        for fname in ckpt.all_model_checkpoint_paths:
            fnames = gfile.Glob(fname)
            if fnames:
                mtime = gfile.Stat(fnames[0]).mtime
                last_checkpoints.append((fname, mtime))
        saver.set_last_checkpoints_with_time(last_checkpoints)
        return sess, True
Example #9
  def _HasOOOWrite(self, path):
    """Returns whether the path has had an out-of-order write."""
    # Check the sizes of each path before the current one.
    size = gfile.Stat(path).length
    old_size = self._finalized_sizes.get(path, None)
    if size != old_size:
      if old_size is None:
        logging.error('File %s created after file %s even though it\'s '
                      'lexicographically earlier', path, self._path)
      else:
        logging.error('File %s updated even though the current file is %s',
                      path, self._path)
      return True
    else:
      return False
Example #10
def maybe_download(directory, filename, url):
    if not gfile.Exists(directory):
        print("Creating directory %s" % directory)
        gfile.MkDir(directory)
    file_path = os.path.join(directory, filename)
    if not gfile.Exists(file_path):
        print("Downloading %s to %s..." % (url, file_path))
        print("This may take very, very long...")
        file_path, _ = urllib.urlretrieve(
            url,
            file_path)  # TODO: this probably doesn't work with GCS buckets.
        # Using urlretrieve here should be avoided, because it likely won't
        # work with Google Cloud Storage buckets, just as the plain Python
        # open(file, mode) function doesn't work with them. Since I don't know
        # how to make a download function work with GCS, so far every
        # necessary file has been uploaded to the bucket manually, so nothing
        # has to be downloaded.
        file_info = gfile.Stat(file_path)
        # gfile.Stat returns a FileStatistics object whose size field is
        # `length` (not os.stat's `st_size`).
        print("Successfully downloaded", filename, file_info.length, "bytes")
    else:
        print("File was already downloaded")
    return file_path
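One way to address the GCS concern in the comments above is to download to a local temporary file with urllib and then copy the result through gfile, which can write to gfile-backed filesystems such as GCS buckets. This is only a sketch under those assumptions (Python 3's urllib.request; the function name is hypothetical), not the tutorial's actual approach.

import os
import tempfile
import urllib.request

from tensorflow.python.platform import gfile


def maybe_download_via_local_tmp(directory, filename, url):
    """Download `url` locally, then copy it into `directory` via gfile.

    `directory` may be any gfile-supported path (e.g. a GCS bucket), because
    the final write goes through gfile.Copy instead of plain Python file I/O.
    """
    if not gfile.Exists(directory):
        gfile.MakeDirs(directory)
    file_path = os.path.join(directory, filename)
    if not gfile.Exists(file_path):
        with tempfile.NamedTemporaryFile(delete=False) as tmp:
            local_path = tmp.name
        urllib.request.urlretrieve(url, local_path)  # plain local download
        gfile.Copy(local_path, file_path, overwrite=True)
        os.remove(local_path)
        print("Downloaded %s (%d bytes)" %
              (filename, gfile.Stat(file_path).length))
    return file_path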
Example #11
  def _SetPath(self, path):
    """Sets the current path to watch for new events.

    This also records the size of the old path, if any. If the size can't be
    found, an error is logged.

    Args:
      path: The full path of the file to watch.
    """
    old_path = self._path
    if old_path and not io_wrapper.IsGCSPath(old_path):
      try:
        # We're done with the path, so store its size.
        size = gfile.Stat(old_path).length
        logging.debug('Setting latest size of %s to %d', old_path, size)
        self._finalized_sizes[old_path] = size
      except errors.OpError as e:
        logging.error('Unable to get size of %s: %s', old_path, e)

    self._path = path
    self._loader = self._loader_factory(path)
Example #12
def format_tensor(tensor,
                  tensor_name,
                  np_printoptions,
                  print_all=False,
                  tensor_slicing=None,
                  highlight_options=None,
                  include_numeric_summary=False,
                  write_path=None):
  """Generate formatted str to represent a tensor or its slices.

  Args:
    tensor: (numpy ndarray) The tensor value.
    tensor_name: (str) Name of the tensor, e.g., the tensor's debug watch key.
    np_printoptions: (dict) Numpy tensor formatting options.
    print_all: (bool) Whether the tensor is to be displayed in its entirety,
      instead of printing ellipses, even if its number of elements exceeds
      the default numpy display threshold.
      (Note: Even if this is set to true, the screen output can still be cut
       off by the UI frontend if it consists of more lines than the frontend
       can handle.)
    tensor_slicing: (str or None) Slicing of the tensor, e.g., "[:, 1]". If
      None, no slicing will be performed on the tensor.
    highlight_options: (tensor_format.HighlightOptions) options to highlight
      elements of the tensor. See the doc of tensor_format.format_tensor()
      for more details.
    include_numeric_summary: Whether a text summary of the numeric values (if
      applicable) will be included.
    write_path: A path to save the tensor value (after any slicing) to
      (optional). `numpy.save()` is used to save the value.

  Returns:
    An instance of `debugger_cli_common.RichTextLines` representing the
    (potentially sliced) tensor.
  """

  if tensor_slicing:
    # Validate the indexing.
    value = command_parser.evaluate_tensor_slice(tensor, tensor_slicing)
    sliced_name = tensor_name + tensor_slicing
  else:
    value = tensor
    sliced_name = tensor_name

  auxiliary_message = None
  if write_path:
    with gfile.Open(write_path, "wb") as output_file:
      np.save(output_file, value)
    line = debugger_cli_common.RichLine("Saved value to: ")
    line += debugger_cli_common.RichLine(write_path, font_attr="bold")
    line += " (%sB)" % bytes_to_readable_str(gfile.Stat(write_path).length)
    auxiliary_message = debugger_cli_common.rich_text_lines_from_rich_line_list(
        [line, debugger_cli_common.RichLine("")])

  if print_all:
    np_printoptions["threshold"] = value.size
  else:
    np_printoptions["threshold"] = DEFAULT_NDARRAY_DISPLAY_THRESHOLD

  return tensor_format.format_tensor(
      value,
      sliced_name,
      include_metadata=True,
      include_numeric_summary=include_numeric_summary,
      auxiliary_message=auxiliary_message,
      np_printoptions=np_printoptions,
      highlight_options=highlight_options)
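A short usage sketch of format_tensor as defined above; the tensor value, watch-key string, and slice are invented for illustration, and np_printoptions starts as an empty dict because the function fills in the "threshold" key itself.

import numpy as np

tensor_value = np.arange(12, dtype=np.float32).reshape(3, 4)  # made-up value
screen_output = format_tensor(
    tensor_value,
    "dense/BiasAdd:0:DebugIdentity",   # hypothetical debug watch key
    np_printoptions={},
    print_all=True,
    tensor_slicing="[:, 1]")           # show only the second column

# RichTextLines keeps the rendered text in its `lines` attribute.
for line in screen_output.lines:
    print(line)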
Example #13
    def test_dir_operations(self):
        """Test directory operations"""

        d = get_oss_path("d1/d2/d3/d4")
        gfile.MakeDirs(d)
        self.assertTrue(gfile.Stat(d).is_directory)

        # Test listing bucket directory with and without trailing '/'
        content = gfile.ListDirectory(
            "oss://%s\x01id=%s\x02key=%s\x02host=%s"
            % (bucket, access_id, access_key, host)
        )
        content_s = gfile.ListDirectory(
            "oss://%s\x01id=%s\x02key=%s\x02host=%s/"
            % (bucket, access_id, access_key, host)
        )
        self.assertEqual(content, content_s)
        self.assertIn("oss_fs_test", content)
        self.assertIn("oss_fs_test/d1", content)
        self.assertIn("oss_fs_test/d1/d2", content)

        # Test listing test directory with and without trailing '/'
        content = gfile.ListDirectory(
            "oss://%s\x01id=%s\x02key=%s\x02host=%s"
            % (bucket, access_id, access_key, host)
            + "/oss_fs_test"
        )
        content_s = gfile.ListDirectory(
            "oss://%s\x01id=%s\x02key=%s\x02host=%s"
            % (bucket, access_id, access_key, host)
            + "/oss_fs_test/"
        )
        self.assertEqual(content, content_s)
        self.assertIn("d1", content)
        self.assertIn("d1/d2", content)

        # Test listing sub directories.
        content = gfile.ListDirectory(get_oss_path("d1"))
        content_s = gfile.ListDirectory(get_oss_path("d1/"))
        self.assertEqual(content, content_s)
        self.assertIn("d2", content)

        content = gfile.ListDirectory(get_oss_path("d1/d2/d3/d4"))
        content_s = gfile.ListDirectory(get_oss_path("d1/d2/d3/d4/"))
        self.assertEqual(content, content_s)
        self.assertEqual([], content)

        # Test Rename directories
        self.assertTrue(gfile.Exists(get_oss_path("d1")))
        gfile.Rename(get_oss_path("d1"), get_oss_path("rename_d1"), overwrite=True)
        self.assertTrue(gfile.Exists(get_oss_path("rename_d1")))
        self.assertFalse(gfile.Exists(get_oss_path("d1")))

        content = gfile.ListDirectory(get_oss_path("rename_d1"))
        content_s = gfile.ListDirectory(get_oss_path("rename_d1/"))
        self.assertEqual(content, content_s)
        self.assertIn("d2", content)

        # Test Rename non-empty directories
        not_empty_dir = get_oss_path("not_empty_dir/")
        rename_not_empty_dir = get_oss_path("rename_not_empty_dir/")
        gfile.MakeDirs(not_empty_dir)
        not_empty_file = get_oss_path("not_empty_dir/not_empty_file")
        rename_not_empty_file = get_oss_path("rename_not_empty_dir/not_empty_file")
        with gfile.Open(not_empty_file, mode="w") as fh:
            content = "file content"
            fh.write(content)
        self.assertTrue(gfile.Exists(not_empty_dir))
        self.assertTrue(gfile.Exists(not_empty_file))
        gfile.Rename(not_empty_dir, rename_not_empty_dir, overwrite=True)
        self.assertFalse(gfile.Exists(not_empty_dir))
        self.assertFalse(gfile.Exists(not_empty_file))
        self.assertTrue(gfile.Exists(rename_not_empty_dir))
        self.assertTrue(gfile.Exists(rename_not_empty_file))
Example #14
    def run(self):
        """
        Process the Directory path provided scanning for the TB events.
        Continues to run until terminated, scan and reports only new
        events logged to the respective TB-files

        1st scan of directory: read and log all TB events to the DB
        using the MLOps-stats API

        Subsequent scan of directory: check for only the newly appended
        TB files and update DB using the MLOps-stats API.
        """

        if not self._log_dir:
            raise Exception("log_dir was not given to TBParser: {}".format(
                self._log_dir))

        self._print("calling mlops_init()")
        mlops.init()

        flist = []
        files_found = []

        while True:
            try:
                if gfile.IsDirectory(self._log_dir):
                    files_found = gfile.ListDirectory(self._log_dir)
                    self._print_verbose("found log dir [{}]".format(
                        self._log_dir))
                    self._print_verbose("Found files: {}".format(files_found))
                else:
                    self._print_verbose(
                        "could not find log dir [{}] will sleep".format(
                            self._log_dir))
                    time.sleep(self._sleep_time)
                    continue
            except Exception as e:
                self._print(
                    "Error: READ Directory attempt failed: {}".format(e))
                break

            # Get the files in the directory and their respective file sizes,
            # and keep rescanning the directory for changes.
            for file in files_found:
                file_path = os.path.join(self._log_dir, file)

                # TODO: move adding a new file to a separate routine.
                # Check whether the file has been seen before; if not, add it.
                if file not in flist:
                    # add file to the known file list
                    flist.append(file)
                    time_stamp_start = 0

                    try:
                        self._file_size[file] = gfile.Stat(file_path).length
                        self._file_time[file] = time.time()

                        is_tf_events_file = event_accumulator.IsTensorFlowEventsFile(
                            file_path)
                        if self._file_size[file] > 0 and is_tf_events_file:
                            event_list = itertools.chain(
                                *[generate_event_from_file(file_path)])
                            self._report_events(event_list, time_stamp_start)

                    except Exception as e:
                        self._print("exception : {0}".format(e))
                        time.sleep(self._sleep_time)

                # stat files to compare length
                if self._file_size[file] < gfile.Stat(file_path).length and \
                        event_accumulator.IsTensorFlowEventsFile(file_path):

                    self._file_size[file] = gfile.Stat(file_path).length

                    try:
                        time_stamp_start = self._file_time[file]
                        self._file_time[file] = time.time()
                        event_list = itertools.chain(
                            *[generate_event_from_file(file_path)])
                        self._report_events(event_list, time_stamp_start)

                    except Exception as e:
                        self._print("exception: {0}".format(e))

            time.sleep(self._sleep_time)
            continue

        mlops.done()
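The polling idea in the loop above (detecting appended bytes by comparing gfile.Stat(path).length against the last recorded size) can be isolated into a small helper; this is just a sketch with a hypothetical name, not part of the class above.

def file_grew(path, last_sizes, gfile_module):
    """Return True if `path` has more bytes than when it was last checked.

    `last_sizes` maps path -> previously observed length and is updated in
    place; `gfile_module` is whichever gfile-like module is in use
    (e.g. tensorflow.python.platform.gfile).
    """
    current = gfile_module.Stat(path).length
    grew = current > last_sizes.get(path, 0)
    last_sizes[path] = current
    return grew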