def after_run(self, run_context, run_values):
    """Emits collected legacy v1 summaries and refreshes the step estimate."""
    # Gather any legacy v1 summary results, dropping the None placeholders
    # that correspond to V2 summary operations.
    pending_summaries = []
    if self._summary_writer and self._request_summary:
        pending_summaries = [
            s for s in run_values.results.get("summary", []) if s is not None
        ]

    # Heuristic: the post-run step is the (possibly stale) fetched value + 1.
    self._current_step = run_values.results["global_step"] + 1

    # Read the actual post-run global step if we need better accuracy because
    # 1) we will request summaries on the next run (based on estimate now) and
    #    must record an accurate "last triggered step" value, or
    # 2) we have legacy v1 summaries to emit using the post-run step value.
    # Doing both reads here (instead of (1) in before_run()) consolidates them
    # into a single session.run when both conditions apply.
    if self._timer.should_trigger_for_step(self._current_step) or pending_summaries:
        self._current_step = run_context.session.run(self._global_step_tensor)

    # Emit any legacy v1 summaries at the (now accurate) step value.
    if pending_summaries:
        with ops.default_session(run_context.session):
            for s in pending_summaries:
                self._summary_writer.add_summary(s, self._current_step)
def end(self, session):
    """Saves a final checkpoint if needed, notifies listeners, and flushes."""
    last_step = session.run(self._global_step_tensor)
    # Only save if the timer has not already triggered at this exact step.
    if last_step != self._timer.last_triggered_step():
        self._save(session, last_step)
    for listener in self._listeners:
        listener.end(session, last_step)
    with ops.default_session(session):
        self._summary_writer.flush()
def main_loop(self):
    """Runs the training loop.

    Initializes graph variables, starts auxiliary threads/processes, then
    iterates epochs of `step_per_epoch` steps each under the default session.
    The chief worker additionally runs the neptune_mp_server main loop first.
    Cleanup (callbacks.after_train, coordinator stop, writer/session close)
    always runs via the finally block.
    """
    # some final operations that might modify the graph
    logger.info("[{}] Initializing graph variables ...".format(os.environ['PBS_ARRAY_INDEX']))
    # self.sess.run(tf.initialize_all_variables())
    self.config.session_init.init(self.sess)
    # tf.get_default_graph().finalize()
    callbacks = self.config.callbacks
    logger.info("[{}] Starting concurrency...".format(os.environ['PBS_ARRAY_INDEX']))
    self._start_concurrency()
    # with self.sess.as_default():
    logger.info("[{}] Setting default session".format(os.environ['PBS_ARRAY_INDEX']))
    with ops.default_session(self.sess):
        try:
            logger.info("[{}] Getting global step".format(os.environ['PBS_ARRAY_INDEX']))
            self.global_step = get_global_step()
            logger.info("[{}] Start training with global_step={}".format(
                os.environ['PBS_ARRAY_INDEX'], self.global_step))
            if self.config.extra_arg['is_chief']:
                server = neptune_mp_server.Server(
                    self.config.extra_arg['n_workers'],
                    port=self.config.extra_arg['port'],
                    debug_charts=self.config.extra_arg['debug_charts'],
                    adam_debug=self.config.extra_arg['adam_debug'],
                    schedule_hyper=self.config.extra_arg['schedule_hyper'],
                    experiment_dir=self.config.extra_arg['experiment_dir'])
                server.main_loop()
            callbacks.before_train()
            for epoch in range(self.config.starting_epoch, self.config.max_epoch + 1):
                with timed_operation(
                        'Epoch {}, global_step={}'.format(
                            epoch, self.global_step + self.config.step_per_epoch)):
                    for step in tqdm.trange(
                            self.config.step_per_epoch,
                            **get_tqdm_kwargs(leave=True)):
                        if self.coord.should_stop():
                            return
                        self.run_step()
                        callbacks.trigger_step()
                        # Was a bare `except:` -- narrowed so KeyboardInterrupt
                        # and SystemExit are no longer swallowed here.
                        try:
                            self.global_step += 1
                        except Exception:
                            self.global_step = -1
                self.trigger_epoch()
            # print-function form for consistency with the call below
            # (was a Python-2 print statement).
            print('EPOCH ENDS HERE')
        except (KeyboardInterrupt, Exception):
            raise
        finally:
            # Do I need to run queue.close?
            print('Handling finally block')
            callbacks.after_train()
            self.coord.request_stop()
            self.summary_writer.close()
            self.sess.close()
def _start_concurrency(self):
    """Launches queue runners and all auxiliary threads/processes.

    Must run before training starts so the input pipeline is live.
    """
    logger.info("Starting all threads & procs ...")
    tf.train.start_queue_runners(
        sess=self.sess.get(), coord=self.coord, daemon=True, start=True)
    # with self.sess.as_default():
    with ops.default_session(self.sess):
        # avoid sigint get handled by other processes
        start_proc_mask_signal(self._extra_threads_procs)
def as_default(self):
    """Returns a context manager that makes this object the default session.

    Within a `with sess.as_default():` block, calls to @{tf.Operation.run} or
    @{tf.Tensor.eval} execute in this session, and @{tf.get_default_session}
    returns it:

    ```python
    c = tf.constant(..)
    sess = tf.Session()

    with sess.as_default():
      assert tf.get_default_session() is sess
      print(c.eval())
    ```

    *N.B.* Exiting the context does *not* close the session; you must close
    it explicitly (or use `with tf.Session():`, which closes automatically,
    including on an uncaught exception):

    ```python
    c = tf.constant(...)
    sess = tf.Session()
    with sess.as_default():
      print(c.eval())
    # ...
    with sess.as_default():
      print(c.eval())

    sess.close()
    ```

    *N.B.* The default session is thread-local: a new thread must enter its
    own `with sess.as_default():` block to see this session as the default.

    *N.B.* Entering this context does not affect the current default graph;
    if `sess.graph` differs from @{tf.get_default_graph}, also enter a
    `with sess.graph.as_default():` block.

    Returns:
      A context manager using this session as the default session.
    """
    return ops.default_session(self)
def as_default(self):
    """Returns a context manager installing this session as the default.

    Use with the `with` keyword so that @{tf.Operation.run} and
    @{tf.Tensor.eval} calls run in this session; @{tf.get_default_session}
    then returns this object:

    ```python
    c = tf.constant(..)
    sess = tf.Session()

    with sess.as_default():
      assert tf.get_default_session() is sess
      print(c.eval())
    ```

    *N.B.* The `as_default` context manager *does not* close the session on
    exit — close it explicitly, or create it via `with tf.Session():` to have
    it closed automatically (even on an uncaught exception):

    ```python
    c = tf.constant(...)
    sess = tf.Session()
    with sess.as_default():
      print(c.eval())
    # ...
    with sess.as_default():
      print(c.eval())

    sess.close()
    ```

    *N.B.* The default session is a property of the current thread; other
    threads need their own `with sess.as_default():` block.

    *N.B.* This does not change the current default graph. When using
    multiple graphs and `sess.graph` is not @{tf.get_default_graph}, enter
    `with sess.graph.as_default():` as well.

    Returns:
      A context manager using this session as the default session.
    """
    return ops.default_session(self)
def as_default(self):
    """Returns a context manager that makes this object the default session.

    Use with the `with` keyword to specify that calls to
    [`Operation.run()`](../../api_docs/python/framework.md#Operation.run) or
    [`Tensor.eval()`](../../api_docs/python/framework.md#Tensor.eval) should be
    executed in this session.

    ```python
    c = tf.constant(..)
    sess = tf.Session()

    with sess.as_default():
      assert tf.get_default_session() is sess
      print(c.eval())
    ```

    To get the current default session, use
    [`tf.get_default_session()`](#get_default_session).

    *N.B.* The `as_default` context manager *does not* close the
    session when you exit the context, and you must close the session
    explicitly.

    ```python
    c = tf.constant(...)
    sess = tf.Session()
    with sess.as_default():
      print(c.eval())
    # ...
    with sess.as_default():
      print(c.eval())

    sess.close()
    ```

    Alternatively, you can use `with tf.Session():` to create a
    session that is automatically closed on exiting the context,
    including when an uncaught exception is raised.

    *N.B.* The default session is a property of the current thread. If you
    create a new thread, and wish to use the default session in that
    thread, you must explicitly add a `with sess.as_default():` in that
    thread's function.

    Returns:
      A context manager using this session as the default session.
    """
    return ops.default_session(self)
def as_default(self):
    """Returns a context manager that makes this object the default session.

    Use with the `with` keyword to specify that calls to
    [`Operation.run()`](../../api_docs/python/framework.md#Operation.run) or
    [`Tensor.eval()`](../../api_docs/python/framework.md#Tensor.eval) should be
    executed in this session.

    ```python
    c = tf.constant(..)
    sess = tf.Session()

    with sess.as_default():
      assert tf.get_default_session() is sess
      print(c.eval())
    ```

    To get the current default session, use
    [`tf.get_default_session()`](#get_default_session).

    *N.B.* The `as_default` context manager *does not* close the
    session when you exit the context, and you must close the session
    explicitly.

    ```python
    c = tf.constant(...)
    sess = tf.Session()
    with sess.as_default():
      print(c.eval())
    # ...
    with sess.as_default():
      print(c.eval())

    sess.close()
    ```

    Alternatively, you can use `with tf.Session():` to create a
    session that is automatically closed on exiting the context,
    including when an uncaught exception is raised.

    *N.B.* The default session is a property of the current thread. If you
    create a new thread, and wish to use the default session in that
    thread, you must explicitly add a `with sess.as_default():` in that
    thread's function.

    Returns:
      A context manager using this session as the default session.
    """
    return ops.default_session(self)
def after_run(self, run_context, run_values):
    """Saves profiling data and run metadata when this run was traced."""
    fetched_step = run_values.results["global_step"]
    # Pre-run value plus one is the estimate; refine it only when needed.
    global_step = fetched_step + 1
    if self._request_summary:
        # Re-read the true post-run step so the trace is filed accurately.
        global_step = run_context.session.run(self._global_step_tensor)
        self._timer.update_last_triggered_step(global_step)
        self._save(global_step, self._output_file.format(global_step),
                   run_values.run_metadata.step_stats)
        with ops.default_session(run_context.session):
            self._file_writer.add_run_metadata(
                run_values.run_metadata,
                "step_%d" % global_step,
                global_step=global_step)
    self._next_step = global_step + 1
def after_run(self, run_context, run_values):
    """Logs steps/sec when enough steps elapsed since the last trigger."""
    fetched_step = run_values.results
    if self._timer.should_trigger_for_step(fetched_step + self._steps_per_run):
        # The fetched value predates the train op; read the true step now.
        actual_step = run_context.session.run(self._global_step_tensor)
        if self._timer.should_trigger_for_step(actual_step):
            elapsed_time, elapsed_steps = self._timer.update_last_triggered_step(
                actual_step)
            if elapsed_time is not None:
                with ops.default_session(run_context.session):
                    self._log_and_record(elapsed_steps, elapsed_time, actual_step)
    self._last_global_step = fetched_step
def after_create_session(self, session, coord):
    """Writes the graph/meta-graph once and saves the initial checkpoint."""
    del coord
    # Ensure the V2 summary writer resource exists before anything uses it.
    session.run(summary_ops_v2.summary_writer_initializer_op())
    global_step = session.run(self._global_step_tensor)
    # The graph is only finalized now; begin() was too early because later
    # hooks could still have modified it.
    training_util.write_graph(
        ops.get_default_graph().as_graph_def(add_shapes=True),
        self._checkpoint_dir,
        "graph.pbtxt")
    saver = self._get_saver()
    saver_def = saver.saver_def if saver else None
    graph = ops.get_default_graph()
    meta_graph_def = meta_graph.create_meta_graph_def(
        graph_def=graph.as_graph_def(add_shapes=True), saver_def=saver_def)
    with ops.default_session(session):
        self._summary_writer.add_graph(graph)
        self._summary_writer.add_meta_graph(meta_graph_def)
    # The checkpoint saved here is the state at step "global_step".
    self._save(session, global_step)
    self._timer.update_last_triggered_step(global_step)
def before_run(self, run_context):
    """Decides whether to fetch summaries this run; returns the run args."""
    if self._current_step is None:
        # First run: record a SessionLog.START at the pre-run global step.
        self._current_step = run_context.session.run(self._global_step_tensor)
        with ops.default_session(run_context.session):
            self._summary_writer.add_session_log(
                SessionLog(status=SessionLog.START), self._current_step)
    fetches = {"global_step": self._global_step_tensor}
    self._request_summary = self._timer.should_trigger_for_step(
        self._current_step)
    if self._request_summary:
        self._timer.update_last_triggered_step(self._current_step)
        if self._get_summary_op() is not None:
            fetches["summary"] = self._get_summary_op()
    feeds = {}
    # The placeholder (when present) gates summary recording in the graph.
    if self._placeholder is not None and self._request_summary:
        feeds[self._placeholder] = self._request_summary
    return SessionRunArgs(fetches=fetches, feed_dict=feeds)
def run(self):
    """Feeds datapoints from the dataflow into the enqueue op until stopped.

    Loops forever over the dataflow, feeding each datapoint to `self.op`.
    Exits cleanly on queue cancellation or coordinator stop; any other
    exception is logged. Always closes the queue and requests coordinator
    stop on the way out.
    """
    self.dataflow.reset_state()
    with ops.default_session(self.sess):
        try:
            while True:
                for dp in self.dataflow.get_data():
                    if self.coord.should_stop():
                        return
                    feed = dict(zip(self.input_vars, dp))
                    self.op.run(feed_dict=feed)
        except tf.errors.CancelledError:
            # Queue was closed -- normal shutdown path. (Dropped the unused
            # `as e` binding.)
            pass
        except Exception:
            logger.exception("Exception in EnqueueThread:")
        finally:
            try:
                self.sess.run(self.close_op)
            except RuntimeError:
                # session already closed
                pass
            self.coord.request_stop()
            logger.info("Enqueue Thread Exited.")
def after_run(self, run_context, run_values):
    """Logs steps/sec and warns when the global step appears frozen."""
    fetched_step = run_values.results
    if self._timer.should_trigger_for_step(fetched_step + self._steps_per_run):
        # The fetched value predates the train op; read the true step now.
        actual_step = run_context.session.run(self._global_step_tensor)
        if self._timer.should_trigger_for_step(actual_step):
            elapsed_time, elapsed_steps = self._timer.update_last_triggered_step(
                actual_step)
            if elapsed_time is not None:
                with ops.default_session(run_context.session):
                    self._log_and_record(elapsed_steps, elapsed_time, actual_step)
    # Detect a non-increasing global step. The timer's last_triggered_step is
    # not used here because it may record a different step value, making the
    # comparison unreliable; comparing successive stale reads is simpler.
    if fetched_step == self._last_global_step:
        # Count consecutive observations of a stuck step. Some optimizers
        # (e.g. SyncReplicaOptimizer) legitimately skip the increment in the
        # worker's main train step, so only warn every 20th observation.
        self._global_step_check_count += 1
        if self._global_step_check_count % 20 == 0:
            self._global_step_check_count = 0
            logging.warning(
                "It seems that global step (tf.train.get_global_step) has not "
                "been increased. Current value (could be stable): %s vs previous "
                "value: %s. You could increase the global step by passing "
                "tf.train.get_global_step() to Optimizer.apply_gradients or "
                "Optimizer.minimize.", fetched_step, self._last_global_step)
    else:
        # Any observed increment resets the counter.
        self._global_step_check_count = 0
    self._last_global_step = fetched_step
def _save(self, session, step):
    """Saves the latest checkpoint, returns should_stop."""
    logging.info("Saving checkpoints for %d into %s.", step, self._save_path)
    for listener in self._listeners:
        listener.before_save(session, step)
    self._get_saver().save(session, self._save_path, global_step=step)
    with ops.default_session(session):
        self._summary_writer.add_session_log(
            SessionLog(
                status=SessionLog.CHECKPOINT, checkpoint_path=self._save_path),
            step)
        self._summary_writer.flush()
    # Every listener is consulted, even after one has requested a stop.
    should_stop = False
    for listener in self._listeners:
        if listener.after_save(session, step):
            logging.info(
                "A CheckpointSaverListener requested that training be stopped. "
                "listener: {}".format(listener))
            should_stop = True
    return should_stop
def as_default(self):
    """Returns a context manager that makes this object the default session.

    Returns:
      A context manager (from `ops.default_session`) using this session as
      the default session for the enclosed block.
    """
    return ops.default_session(self)
def as_default(self):
    """Returns a context manager that makes this object the default session.

    Returns:
      A context manager (from `ops.default_session`) using this session as
      the default session for the enclosed block.
    """
    return ops.default_session(self)
def end(self, session):
    """Flushes any pending summaries when training ends."""
    if self._summary_writer is None:
        return
    with ops.default_session(session):
        self._summary_writer.flush()
def as_default(self):
    """Returns a context manager that makes this object the default session.

    Use with the `with` keyword to specify that calls to
    [`Operation.run()`](../../api_docs/python/framework.md#Operation.run) or
    [`Tensor.eval()`](../../api_docs/python/framework.md#Tensor.eval) should be
    executed in this session.

    ```python
    c = tf.constant(..)
    sess = tf.Session()

    with sess.as_default():
      assert tf.get_default_session() is sess
      print(c.eval())
    ```

    To get the current default session, use
    [`tf.get_default_session()`](#get_default_session).

    *N.B.* The `as_default` context manager *does not* close the
    session when you exit the context, and you must close the session
    explicitly.

    ```python
    c = tf.constant(...)
    sess = tf.Session()
    with sess.as_default():
      print(c.eval())
    # ...
    with sess.as_default():
      print(c.eval())

    sess.close()
    ```

    Alternatively, you can use `with tf.Session():` to create a
    session that is automatically closed on exiting the context,
    including when an uncaught exception is raised.

    *N.B.* The default session is a property of the current thread. If you
    create a new thread, and wish to use the default session in that
    thread, you must explicitly add a `with sess.as_default():` in that
    thread's function.

    Returns:
      A context manager using this session as the default session.
    """
    return ops.default_session(self)


# Eventually, this registration could be opened up to support custom
# Tensor expansions. Expects tuples of (Type, fetch_fn, feed_fn),
# where the signatures are:
#   fetch_fn : Type -> (list of Tensors,
#                       lambda: list of fetched np.ndarray -> TypeVal)
#   feed_fn  : Type, TypeVal -> list of (Tensor, value)
# Conceptually, fetch_fn describes how to expand fetch into its
# component Tensors and how to contract the fetched results back into
# a single return value. feed_fn describes how to unpack a single fed
# value and map it to feeds of individual tensors.
def end(self, session):
    """Flushes the file writer when the session ends."""
    with ops.default_session(session):
        self._file_writer.flush()
def end(self, session=None):
    """Flushes the summary writer, but only if a writer and session exist."""
    if not (self._summary_writer and session):
        return
    with ops.default_session(session):
        self._summary_writer.flush()