def recover_session(self,
                    master,
                    saver=None,
                    checkpoint_dir=None,
                    wait_for_checkpoint=False,
                    max_wait_secs=7200,
                    config=None):
  """Creates a `Session`, recovering if possible.

  Creates a new session on 'master'. If the session is not initialized and
  can be recovered from a checkpoint, recover it.

  Args:
    master: `String` representation of the TensorFlow master to use.
    saver: A `Saver` object used to restore a model.
    checkpoint_dir: Path to the checkpoint files.
    wait_for_checkpoint: Whether to wait for checkpoint to become available.
    max_wait_secs: Maximum time to wait for checkpoints to become available.
    config: Optional `ConfigProto` proto used to configure the session.

  Returns:
    A pair (sess, initialized) where 'initialized' is `True` if the session
    could be recovered, `False` otherwise.
  """
  self._target = master
  sess = session.Session(self._target, graph=self._graph, config=config)
  if self._local_init_op:
    sess.run([self._local_init_op])

  # If either saver or checkpoint_dir is not specified, cannot restore. Just
  # return.
  if not saver or not checkpoint_dir:
    not_ready = self._model_not_ready(sess)
    return sess, not_ready is None

  # Waits up until max_wait_secs for checkpoint to become available.
  wait_time = 0
  ckpt = saver_mod.get_checkpoint_state(checkpoint_dir)
  while not ckpt or not ckpt.model_checkpoint_path:
    if wait_for_checkpoint and wait_time < max_wait_secs:
      logging.info("Waiting for checkpoint to be available.")
      time.sleep(self._recovery_wait_secs)
      wait_time += self._recovery_wait_secs
      ckpt = saver_mod.get_checkpoint_state(checkpoint_dir)
    else:
      return sess, False

  # Loads the checkpoint and verifies that it makes the model ready.
  saver.restore(sess, ckpt.model_checkpoint_path)
  last_checkpoints = []
  for fname in ckpt.all_model_checkpoint_paths:
    fnames = gfile.Glob(fname)
    if fnames:
      mtime = gfile.Stat(fnames[0]).mtime
      last_checkpoints.append((fname, mtime))
  saver.set_last_checkpoints_with_time(last_checkpoints)
  not_ready = self._model_not_ready(sess)
  if not_ready:
    logging.info("Restoring model from %s did not make model ready: %s",
                 ckpt.model_checkpoint_path, not_ready)
    return sess, False
  else:
    logging.info("Restored model from %s", ckpt.model_checkpoint_path)
    return sess, True
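A session-recovery helper like the one above corresponds to `tf.train.SessionManager.recover_session` in the TF 1.x training library. Purely as a hedged usage sketch (the graph, variable name, and checkpoint directory are placeholders, not taken from the source), a caller might drive it like this:

import tensorflow as tf

graph = tf.Graph()
with graph.as_default():
  counter = tf.Variable(0, name="counter")   # placeholder model state
  saver = tf.train.Saver()

sm = tf.train.SessionManager(graph=graph)
sess, initialized = sm.recover_session(
    "", saver=saver, checkpoint_dir="/tmp/ckpt_dir", wait_for_checkpoint=False)
if not initialized:
  # No usable checkpoint was found; fall back to initializing from scratch.
  sess.run(tf.global_variables_initializer())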
def test_file_operations(self):
  """Test file operations"""
  f = get_oss_path("test_file_operations")
  self.assertFalse(gfile.Exists(f))

  fh = gfile.Open(f, mode="w")
  content = "file content"
  fh.write(content)
  fh.close()
  self.assertTrue(gfile.Exists(f))

  fh = gfile.Open(f)
  self.assertEqual(fh.read(), content)

  self.assertEqual(gfile.Stat(f).length, len(content))

  f2 = get_oss_path("test_file_2")
  gfile.Rename(f, f2)
  self.assertFalse(gfile.Exists(f))
  self.assertTrue(gfile.Exists(f2))

  f3 = get_oss_path("test_file_3")
  gfile.Copy(f2, f3, overwrite=True)
  self.assertTrue(gfile.Exists(f3))
def test_dir_operations(self):
  """Test directory operations"""
  d = get_oss_path("d1/d2")
  gfile.MakeDirs(d)
  self.assertTrue(gfile.Stat(d).is_directory)

  # Test listing bucket directory with and without trailing '/'
  content = gfile.ListDirectory("oss://" + bucket)
  content_s = gfile.ListDirectory("oss://" + bucket + "/")
  self.assertEqual(content, content_s)
  self.assertIn("oss_fs_test", content)
  self.assertIn("oss_fs_test/d1", content)
  self.assertIn("oss_fs_test/d1/d2", content)

  # Test listing test directory with and without trailing '/'
  content = gfile.ListDirectory("oss://" + bucket + "/oss_fs_test")
  content_s = gfile.ListDirectory("oss://" + bucket + "/oss_fs_test/")
  self.assertEqual(content, content_s)
  self.assertIn("d1", content)
  self.assertIn("d1/d2", content)

  # Test listing sub directories.
  content = gfile.ListDirectory(get_oss_path("d1"))
  content_s = gfile.ListDirectory(get_oss_path("d1/"))
  self.assertEqual(content, content_s)
  self.assertIn("d2", content)

  content = gfile.ListDirectory(get_oss_path("d1/d2"))
  content_s = gfile.ListDirectory(get_oss_path("d1/d2/"))
  self.assertEqual(content, content_s)
  self.assertEqual([], content)
def testStat(self):
  with gfile.GFile(self.tmp + "test_stat", "w"):
    pass
  creation_time = time.time()
  statinfo = gfile.Stat(self.tmp + "test_stat")
  # Test that the modification timestamp is within 10 seconds of the time
  # the file was closed (i.e. inside a 20-second window around it).
  self.assertLessEqual(statinfo.mtime, creation_time + 10)
  self.assertGreaterEqual(statinfo.mtime, creation_time - 10)
def _load_debugged_source_file(file_path, source_file_proto):
  file_stat = gfile.Stat(file_path)
  source_file_proto.host = socket.gethostname()
  source_file_proto.file_path = file_path
  source_file_proto.last_modified = file_stat.mtime_nsec
  source_file_proto.bytes = file_stat.length
  try:
    with gfile.Open(file_path, "r") as f:
      source_file_proto.lines.extend(f.read().splitlines())
  except IOError:
    pass
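For reference, a small standalone sketch of the `FileStatistics` fields the helper above reads (`length`, `mtime_nsec`); the import path and file name are just one way to exercise `gfile.Stat` and are not part of the original snippet.

from tensorflow.python.platform import gfile

path = "/tmp/example.txt"  # placeholder file
with gfile.Open(path, "w") as f:
  f.write("hello")

stat = gfile.Stat(path)
print(stat.length)        # file size in bytes
print(stat.mtime_nsec)    # last-modified time in nanoseconds since the epoch
print(stat.is_directory)  # False for a regular file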
def __init__(self, dump_root, debug_dump_rel_path):
  """`DebugTensorDatum` constructor.

  Args:
    dump_root: (`str`) Debug dump root directory.
    debug_dump_rel_path: (`str`) Path to a debug dump file, relative to the
      `dump_root`. For example, suppose the debug dump root directory is
      `/tmp/tfdbg_1` and the dump file is at
      `/tmp/tfdbg_1/ns_1/node_a_0_DebugIdentity_123456789`, then the value
      of the debug_dump_rel_path should be
      `ns_1/node_a_0_DebugIdentity_123456789`.

  Raises:
    ValueError: If the base file name of the dump file does not conform to
      the dump file naming pattern:
      `node_name`_`output_slot`_`debug_op`_`timestamp`
  """
  base = os.path.basename(debug_dump_rel_path)

  if base.count("_") < 3:
    raise ValueError(
        "Dump file path does not conform to the naming pattern: %s" % base)

  # TODO(cais): Add hostname and pid to support dumps from distributed
  # sessions.

  self._extended_timestamp = base.split("_")[-1]
  # It may include an index suffix at the end if file path collision happened
  # due to identical timestamps.
  if "-" in self._extended_timestamp:
    self._timestamp = int(
        self._extended_timestamp[:self._extended_timestamp.find("-")])
  else:
    self._timestamp = int(self._extended_timestamp)

  self._debug_op = base.split("_")[-2]
  self._output_slot = int(base.split("_")[-3])

  namespace = os.path.dirname(debug_dump_rel_path).replace("\\", "/")
  node_base_name = "_".join(base.split("_")[:-3])

  if not namespace or namespace == ".":
    self._node_name = node_base_name
  else:
    self._node_name = namespace + "/" + node_base_name

  self._file_path = os.path.join(dump_root, debug_dump_rel_path)
  self._dump_size_bytes = (gfile.Stat(self._file_path).length
                           if gfile.Exists(self._file_path) else None)

  self._run_fetches_info = None
  self._run_feed_keys_info = None
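To make the naming pattern concrete, here is a small standalone walk-through of how an example dump file name splits into its components, following the same string operations as the constructor; the path itself is illustrative only.

import os

rel_path = "ns_1/node_a_0_DebugIdentity_123456789"   # example only
base = os.path.basename(rel_path)      # "node_a_0_DebugIdentity_123456789"

parts = base.split("_")
timestamp = int(parts[-1])             # 123456789
debug_op = parts[-2]                   # "DebugIdentity"
output_slot = int(parts[-3])           # 0
node_base_name = "_".join(parts[:-3])  # "node_a"

namespace = os.path.dirname(rel_path).replace("\\", "/")   # "ns_1"
node_name = namespace + "/" + node_base_name                # "ns_1/node_a"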
def test_dir_operations(self):
  """Test directory operations"""
  d = get_oss_path("d1/d2/d3/d4")
  gfile.MakeDirs(d)
  self.assertTrue(gfile.Stat(d).is_directory)

  # Test listing bucket directory with and without trailing '/'
  content = gfile.ListDirectory(
      "oss://%s\x01id=%s\x02key=%s\x02host=%s" %
      (bucket, access_id, access_key, host))
  content_s = gfile.ListDirectory(
      "oss://%s\x01id=%s\x02key=%s\x02host=%s/" %
      (bucket, access_id, access_key, host))
  self.assertEqual(content, content_s)
  self.assertIn("oss_fs_test", content)
  self.assertIn("oss_fs_test/d1", content)
  self.assertIn("oss_fs_test/d1/d2", content)

  # Test listing test directory with and without trailing '/'
  content = gfile.ListDirectory(
      "oss://%s\x01id=%s\x02key=%s\x02host=%s" %
      (bucket, access_id, access_key, host) + "/oss_fs_test")
  content_s = gfile.ListDirectory(
      "oss://%s\x01id=%s\x02key=%s\x02host=%s" %
      (bucket, access_id, access_key, host) + "/oss_fs_test/")
  self.assertEqual(content, content_s)
  self.assertIn("d1", content)
  self.assertIn("d1/d2", content)

  # Test listing sub directories.
  content = gfile.ListDirectory(get_oss_path("d1"))
  content_s = gfile.ListDirectory(get_oss_path("d1/"))
  self.assertEqual(content, content_s)
  self.assertIn("d2", content)

  content = gfile.ListDirectory(get_oss_path("d1/d2/d3/d4"))
  content_s = gfile.ListDirectory(get_oss_path("d1/d2/d3/d4/"))
  self.assertEqual(content, content_s)
  self.assertEqual([], content)

  # Test renaming directories
  self.assertTrue(gfile.Exists(get_oss_path("d1")))
  gfile.Rename(get_oss_path("d1"), get_oss_path("rename_d1"), overwrite=True)
  self.assertTrue(gfile.Exists(get_oss_path("rename_d1")))
  self.assertFalse(gfile.Exists(get_oss_path("d1")))

  content = gfile.ListDirectory(get_oss_path("rename_d1"))
  content_s = gfile.ListDirectory(get_oss_path("rename_d1/"))
  self.assertEqual(content, content_s)
  self.assertIn("d2", content)
def _restore_checkpoint(self,
                        master,
                        saver=None,
                        checkpoint_dir=None,
                        wait_for_checkpoint=False,
                        max_wait_secs=7200,
                        config=None):
  """Creates a `Session`, and tries to restore a checkpoint.

  Args:
    master: `String` representation of the TensorFlow master to use.
    saver: A `Saver` object used to restore a model.
    checkpoint_dir: Path to the checkpoint files.
    wait_for_checkpoint: Whether to wait for checkpoint to become available.
    max_wait_secs: Maximum time to wait for checkpoints to become available.
    config: Optional `ConfigProto` proto used to configure the session.

  Returns:
    A pair (sess, is_restored) where 'is_restored' is `True` if the session
    could be restored, `False` otherwise.
  """
  self._target = master
  sess = session.Session(self._target, graph=self._graph, config=config)

  # If either saver or checkpoint_dir is not specified, cannot restore. Just
  # return.
  if not saver or not checkpoint_dir:
    return sess, False

  # Waits up until max_wait_secs for checkpoint to become available.
  wait_time = 0
  ckpt = saver_mod.get_checkpoint_state(checkpoint_dir)
  while not ckpt or not ckpt.model_checkpoint_path:
    if wait_for_checkpoint and wait_time < max_wait_secs:
      logging.info("Waiting for checkpoint to be available.")
      time.sleep(self._recovery_wait_secs)
      wait_time += self._recovery_wait_secs
      ckpt = saver_mod.get_checkpoint_state(checkpoint_dir)
    else:
      return sess, False

  # Loads the checkpoint.
  saver.restore(sess, ckpt.model_checkpoint_path)
  last_checkpoints = []
  for fname in ckpt.all_model_checkpoint_paths:
    fnames = gfile.Glob(fname)
    if fnames:
      mtime = gfile.Stat(fnames[0]).mtime
      last_checkpoints.append((fname, mtime))
  saver.set_last_checkpoints_with_time(last_checkpoints)
  return sess, True
def _HasOOOWrite(self, path):
  """Returns whether the path has had an out-of-order write."""
  # Check the sizes of each path before the current one.
  size = gfile.Stat(path).length
  old_size = self._finalized_sizes.get(path, None)
  if size != old_size:
    if old_size is None:
      logging.error('File %s created after file %s even though it\'s '
                    'lexicographically earlier', path, self._path)
    else:
      logging.error('File %s updated even though the current file is %s',
                    path, self._path)
    return True
  else:
    return False
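For orientation, a minimal standalone sketch of the size-bookkeeping idea behind `_HasOOOWrite` (and behind `_SetPath` further below): record a file's size when the reader moves past it, then flag the file if that size later differs. The dictionary and helper names here are hypothetical stand-ins for `self._finalized_sizes`.

from tensorflow.python.platform import gfile

finalized_sizes = {}  # hypothetical bookkeeping, mirrors self._finalized_sizes

def finalize(path):
  # Remember the size a file had when the reader finished with it.
  finalized_sizes[path] = gfile.Stat(path).length

def has_out_of_order_write(path):
  # A file whose current size differs from its finalized size (or that was
  # never finalized at all) has been written out of order.
  return gfile.Stat(path).length != finalized_sizes.get(path, None)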
def maybe_download(directory, filename, url):
  if not gfile.Exists(directory):
    print("Creating directory %s" % directory)
    gfile.MkDir(directory)
  file_path = os.path.join(directory, filename)
  if not gfile.Exists(file_path):
    print("Downloading %s to %s..." % (url, file_path))
    print("This may take very, very long...")
    # TODO: This probably doesn't work with GCS buckets. urllib.urlretrieve
    # writes to the local filesystem, much like the built-in open(), so it
    # cannot target a bucket path directly. Until a GCS-aware download is
    # available, every necessary file is uploaded to the bucket manually so
    # nothing has to be downloaded here.
    file_path, _ = urllib.urlretrieve(url, file_path)
    file_info = gfile.Stat(file_path)
    print("Successfully downloaded", filename, file_info.length, "bytes")
  else:
    print("File was already downloaded")
  return file_path
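One possible (untested) way around the GCS limitation noted in the comment: download to a local temporary file first, then let `gfile.Copy` move it to the bucket path, since `gfile` operates on any registered filesystem such as `gs://`. The helper name and paths below are hypothetical; the sketch uses the Python 3 `urllib.request` API.

import os
import tempfile
from urllib.request import urlretrieve
from tensorflow.python.platform import gfile

def maybe_download_via_tmp(directory, filename, url):
  # `directory` may be a gs:// (or other gfile-backed) path.
  file_path = os.path.join(directory, filename)
  if not gfile.Exists(file_path):
    local_tmp = os.path.join(tempfile.mkdtemp(), filename)
    urlretrieve(url, local_tmp)                       # plain local download
    gfile.Copy(local_tmp, file_path, overwrite=True)  # then copy into the bucket
  return file_path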
def _SetPath(self, path):
  """Sets the current path to watch for new events.

  This also records the size of the old path, if any. If the size can't be
  found, an error is logged.

  Args:
    path: The full path of the file to watch.
  """
  old_path = self._path
  if old_path and not io_wrapper.IsGCSPath(old_path):
    try:
      # We're done with the path, so store its size.
      size = gfile.Stat(old_path).length
      logging.debug('Setting latest size of %s to %d', old_path, size)
      self._finalized_sizes[old_path] = size
    except errors.OpError as e:
      logging.error('Unable to get size of %s: %s', old_path, e)

  self._path = path
  self._loader = self._loader_factory(path)
def format_tensor(tensor,
                  tensor_name,
                  np_printoptions,
                  print_all=False,
                  tensor_slicing=None,
                  highlight_options=None,
                  include_numeric_summary=False,
                  write_path=None):
  """Generates a formatted string representing a tensor or its slices.

  Args:
    tensor: (numpy ndarray) The tensor value.
    tensor_name: (str) Name of the tensor, e.g., the tensor's debug watch key.
    np_printoptions: (dict) Numpy tensor formatting options.
    print_all: (bool) Whether the tensor is to be displayed in its entirety,
      instead of printing ellipses, even if its number of elements exceeds
      the default numpy display threshold.
      (Note: Even if this is set to true, the screen output can still be cut
      off by the UI frontend if it consists of more lines than the frontend
      can handle.)
    tensor_slicing: (str or None) Slicing of the tensor, e.g., "[:, 1]". If
      None, no slicing will be performed on the tensor.
    highlight_options: (tensor_format.HighlightOptions) options to highlight
      elements of the tensor. See the doc of tensor_format.format_tensor()
      for more details.
    include_numeric_summary: Whether a text summary of the numeric values
      (if applicable) will be included.
    write_path: (str, optional) Path to which the tensor value (after any
      slicing) is saved, using `numpy.save()`.

  Returns:
    An instance of `debugger_cli_common.RichTextLines` representing the
    (potentially sliced) tensor.
  """
  if tensor_slicing:
    # Validate the indexing.
    value = command_parser.evaluate_tensor_slice(tensor, tensor_slicing)
    sliced_name = tensor_name + tensor_slicing
  else:
    value = tensor
    sliced_name = tensor_name

  auxiliary_message = None
  if write_path:
    with gfile.Open(write_path, "wb") as output_file:
      np.save(output_file, value)
    line = debugger_cli_common.RichLine("Saved value to: ")
    line += debugger_cli_common.RichLine(write_path, font_attr="bold")
    line += " (%sB)" % bytes_to_readable_str(gfile.Stat(write_path).length)
    auxiliary_message = debugger_cli_common.rich_text_lines_from_rich_line_list(
        [line, debugger_cli_common.RichLine("")])

  if print_all:
    np_printoptions["threshold"] = value.size
  else:
    np_printoptions["threshold"] = DEFAULT_NDARRAY_DISPLAY_THRESHOLD

  return tensor_format.format_tensor(
      value,
      sliced_name,
      include_metadata=True,
      include_numeric_summary=include_numeric_summary,
      auxiliary_message=auxiliary_message,
      np_printoptions=np_printoptions,
      highlight_options=highlight_options)
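A hypothetical call, assuming this module's imports (numpy, gfile, and the tfdbg helper modules) are available as in the snippet above; the array, tensor name, and save path are made up for illustration.

import numpy as np

value = np.arange(12).reshape(3, 4)       # placeholder tensor value
screen_output = format_tensor(
    value,
    "dense/kernel:0",                     # placeholder watch key / name
    {},                                   # start from default numpy print options
    print_all=True,                       # never elide elements
    tensor_slicing="[:, 1]",              # format only the second column
    write_path="/tmp/kernel_col1.npy")    # also persist the sliced value
for line in screen_output.lines:          # RichTextLines holds plain text lines
  print(line)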
def test_dir_operations(self):
  """Test directory operations"""
  d = get_oss_path("d1/d2/d3/d4")
  gfile.MakeDirs(d)
  self.assertTrue(gfile.Stat(d).is_directory)

  # Test listing bucket directory with and without trailing '/'
  content = gfile.ListDirectory(
      "oss://%s\x01id=%s\x02key=%s\x02host=%s" %
      (bucket, access_id, access_key, host))
  content_s = gfile.ListDirectory(
      "oss://%s\x01id=%s\x02key=%s\x02host=%s/" %
      (bucket, access_id, access_key, host))
  self.assertEqual(content, content_s)
  self.assertIn("oss_fs_test", content)
  self.assertIn("oss_fs_test/d1", content)
  self.assertIn("oss_fs_test/d1/d2", content)

  # Test listing test directory with and without trailing '/'
  content = gfile.ListDirectory(
      "oss://%s\x01id=%s\x02key=%s\x02host=%s" %
      (bucket, access_id, access_key, host) + "/oss_fs_test")
  content_s = gfile.ListDirectory(
      "oss://%s\x01id=%s\x02key=%s\x02host=%s" %
      (bucket, access_id, access_key, host) + "/oss_fs_test/")
  self.assertEqual(content, content_s)
  self.assertIn("d1", content)
  self.assertIn("d1/d2", content)

  # Test listing sub directories.
  content = gfile.ListDirectory(get_oss_path("d1"))
  content_s = gfile.ListDirectory(get_oss_path("d1/"))
  self.assertEqual(content, content_s)
  self.assertIn("d2", content)

  content = gfile.ListDirectory(get_oss_path("d1/d2/d3/d4"))
  content_s = gfile.ListDirectory(get_oss_path("d1/d2/d3/d4/"))
  self.assertEqual(content, content_s)
  self.assertEqual([], content)

  # Test renaming directories
  self.assertTrue(gfile.Exists(get_oss_path("d1")))
  gfile.Rename(get_oss_path("d1"), get_oss_path("rename_d1"), overwrite=True)
  self.assertTrue(gfile.Exists(get_oss_path("rename_d1")))
  self.assertFalse(gfile.Exists(get_oss_path("d1")))

  content = gfile.ListDirectory(get_oss_path("rename_d1"))
  content_s = gfile.ListDirectory(get_oss_path("rename_d1/"))
  self.assertEqual(content, content_s)
  self.assertIn("d2", content)

  # Test renaming non-empty directories
  not_empty_dir = get_oss_path("not_empty_dir/")
  rename_not_empty_dir = get_oss_path("rename_not_empty_dir/")
  gfile.MakeDirs(not_empty_dir)
  not_empty_file = get_oss_path("not_empty_dir/not_empty_file")
  rename_not_empty_file = get_oss_path("rename_not_empty_dir/not_empty_file")
  with gfile.Open(not_empty_file, mode="w") as fh:
    content = "file content"
    fh.write(content)
  self.assertTrue(gfile.Exists(not_empty_dir))
  self.assertTrue(gfile.Exists(not_empty_file))
  gfile.Rename(not_empty_dir, rename_not_empty_dir, overwrite=True)
  self.assertFalse(gfile.Exists(not_empty_dir))
  self.assertFalse(gfile.Exists(not_empty_file))
  self.assertTrue(gfile.Exists(rename_not_empty_dir))
  self.assertTrue(gfile.Exists(rename_not_empty_file))
def run(self):
  """Processes the provided directory path, scanning for TB events.

  Runs until terminated; scans the directory and reports only new events
  logged to the respective TB files.

  First scan of the directory: read and log all TB events to the DB using
  the MLOps-stats API.
  Subsequent scans of the directory: check only the newly appended TB files
  and update the DB using the MLOps-stats API.
  """
  if not self._log_dir:
    raise Exception("log_dir was not given to TBParser: {}".format(
        self._log_dir))

  self._print("calling mlops_init()")
  mlops.init()

  flist = []
  files_found = []
  while True:
    try:
      if gfile.IsDirectory(self._log_dir):
        files_found = gfile.ListDirectory(self._log_dir)
        self._print_verbose("found log dir [{}]".format(self._log_dir))
        self._print_verbose("Found files: {}".format(files_found))
      else:
        self._print_verbose(
            "could not find log dir [{}] will sleep".format(self._log_dir))
        time.sleep(self._sleep_time)
        continue
    except Exception as e:
      self._print("Error: READ Directory attempt failed: {}".format(e))
      break

    # Get the files in the directory and their respective file sizes,
    # and keep rescanning the directory for changes.
    for file in files_found:
      file_path = os.path.join(self._log_dir, file)

      # TODO: move this to a separate routine - adding a new file
      # If the file has not been seen before, add it to the known file list.
      if file not in flist:
        flist.append(file)
        time_stamp_start = 0
        try:
          self._file_size[file] = gfile.Stat(file_path).length
          self._file_time[file] = time.time()
          is_tf_events_file = event_accumulator.IsTensorFlowEventsFile(
              file_path)
          if self._file_size[file] > 0 and is_tf_events_file:
            event_list = itertools.chain(
                *[generate_event_from_file(file_path)])
            self._report_events(event_list, time_stamp_start)
        except Exception as e:
          self._print("exception : {0}".format(e))
          time.sleep(self._sleep_time)

      # Stat files to compare lengths; a grown events file has new records.
      if self._file_size[file] < gfile.Stat(file_path).length and \
          event_accumulator.IsTensorFlowEventsFile(file_path):
        self._file_size[file] = gfile.Stat(file_path).length
        try:
          time_stamp_start = self._file_time[file]
          self._file_time[file] = time.time()
          event_list = itertools.chain(
              *[generate_event_from_file(file_path)])
          self._report_events(event_list, time_stamp_start)
        except Exception as e:
          self._print("exception: {0}".format(e))
          time.sleep(self._sleep_time)
          continue

  mlops.done()
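`generate_event_from_file` is defined elsewhere in this module. Purely as a hedged sketch of what reading a TF events file can look like (an assumption, not necessarily that helper's actual implementation), the standard TF 1.x `tf.train.summary_iterator` yields one `Event` proto per record:

import tensorflow as tf

def iter_events(file_path):
  # Yield every Event proto stored in a TensorFlow events file.
  for event in tf.train.summary_iterator(file_path):
    yield event

# Hypothetical usage: the events file path below is a placeholder.
# for event in iter_events("/tmp/logdir/events.out.tfevents.1234.host"):
#   print(event.wall_time, event.step)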