def after_run(self, run_context, run_values):
    """Emit pending legacy v1 summaries and refresh the current-step estimate."""
    # Gather non-None legacy v1 summaries (None entries correspond to V2
    # summary operations and are skipped).
    pending = []
    if self._summary_writer and self._request_summary:
        pending = [s for s in run_values.results.get("summary", [])
                   if s is not None]
    # Cheap estimate: the possibly-stale pre-run value plus one.
    self._current_step = run_values.results["global_step"] + 1
    # Re-read the true post-run global step only when accuracy matters:
    # 1) the next run may trigger summaries (based on the estimate), so the
    #    recorded "last triggered step" must be accurate, or
    # 2) legacy v1 summaries must be emitted with the post-run step value.
    # Doing both checks here consolidates the reads when both apply (case (1)
    # alone could have been handled in before_run()).
    refresh_needed = self._timer.should_trigger_for_step(self._current_step)
    if refresh_needed or pending:
        self._current_step = run_context.session.run(self._global_step_tensor)
    # Emit any legacy v1 summaries at the (possibly refreshed) step.
    if pending:
        with ops.default_session(run_context.session):
            for s in pending:
                self._summary_writer.add_summary(s, self._current_step)
 def end(self, session):
     """Save a final checkpoint if needed, notify listeners, flush summaries."""
     final_step = session.run(self._global_step_tensor)
     # Skip a redundant save when the timer already checkpointed this step.
     if self._timer.last_triggered_step() != final_step:
         self._save(session, final_step)
     for listener in self._listeners:
         listener.end(session, final_step)
     with ops.default_session(session):
         self._summary_writer.flush()
Ejemplo n.º 3
0
    def main_loop(self):
        """Run the full training loop.

        Initializes variables via the configured session initializer, starts
        queue runners / extra processes, then iterates epochs of
        `step_per_epoch` steps until `max_epoch` is reached or the
        coordinator requests a stop. Cleanup (callbacks, coordinator,
        summary writer, session) runs in the `finally` block even on error.
        """
        # NOTE(review): raises KeyError if PBS_ARRAY_INDEX is unset — assumed
        # to always be present under the PBS scheduler; confirm.
        task_id = os.environ['PBS_ARRAY_INDEX']
        # Some final operations during session init may still modify the graph.
        logger.info("[{}] Initializing graph variables ...".format(task_id))
        self.config.session_init.init(self.sess)
        callbacks = self.config.callbacks
        logger.info("[{}] Starting concurrency...".format(task_id))
        self._start_concurrency()
        logger.info("[{}] Setting default session".format(task_id))
        with ops.default_session(self.sess):
            try:
                logger.info("[{}] Getting global step".format(task_id))
                self.global_step = get_global_step()
                logger.info("[{}] Start training with global_step={}".format(
                    task_id, self.global_step))

                if self.config.extra_arg['is_chief']:
                    # The chief process hosts the metrics server; this call
                    # blocks for the server's lifetime.
                    server = neptune_mp_server.Server(
                            self.config.extra_arg['n_workers'],
                            port=self.config.extra_arg['port'],
                            debug_charts=self.config.extra_arg['debug_charts'],
                            adam_debug=self.config.extra_arg['adam_debug'],
                            schedule_hyper=self.config.extra_arg['schedule_hyper'],
                            experiment_dir=self.config.extra_arg['experiment_dir'])
                    server.main_loop()

                callbacks.before_train()
                for epoch in range(self.config.starting_epoch,
                                   self.config.max_epoch + 1):
                    with timed_operation(
                            'Epoch {}, global_step={}'.format(
                                epoch,
                                self.global_step + self.config.step_per_epoch)):
                        for step in tqdm.trange(
                                self.config.step_per_epoch,
                                **get_tqdm_kwargs(leave=True)):
                            if self.coord.should_stop():
                                return
                            self.run_step()
                            callbacks.trigger_step()
                            try:
                                self.global_step += 1
                            except Exception:
                                # Was a bare `except:`; narrowed so Ctrl-C and
                                # SystemExit are no longer swallowed. -1 marks
                                # an unusable global step.
                                self.global_step = -1
                        self.trigger_epoch()
                        # Was a Python 2 print statement (syntax error under
                        # Python 3); the rest of this method already uses the
                        # print() function.
                        print('EPOCH ENDS HERE')
            except (KeyboardInterrupt, Exception):
                raise
            finally:
                # TODO(review): should the input queue be closed here as well?
                print('Handling finally block')
                callbacks.after_train()
                self.coord.request_stop()
                self.summary_writer.close()
                self.sess.close()
Ejemplo n.º 4
0
    def _start_concurrency(self):
        """Launch queue runners and extra threads/processes before training."""
        logger.info("Starting all threads & procs ...")
        tf.train.start_queue_runners(
            sess=self.sess.get(), coord=self.coord, daemon=True, start=True)
        with ops.default_session(self.sess):
            # Start child procs with SIGINT masked so the signal is not
            # handled by other processes.
            start_proc_mask_signal(self._extra_threads_procs)
Ejemplo n.º 5
0
  def as_default(self):
    """Returns a context manager that makes this object the default session.

    Inside a `with sess.as_default():` block, calls to @{tf.Operation.run}
    and @{tf.Tensor.eval} are executed in this session:

    ```python
    c = tf.constant(..)
    sess = tf.Session()

    with sess.as_default():
      assert tf.get_default_session() is sess
      print(c.eval())
    ```

    Use @{tf.get_default_session} to retrieve the current default session.

    *N.B.* Exiting the `as_default` context does *not* close the session;
    you must close it explicitly:

    ```python
    c = tf.constant(...)
    sess = tf.Session()
    with sess.as_default():
      print(c.eval())
    # ...
    with sess.as_default():
      print(c.eval())

    sess.close()
    ```

    Alternatively, `with tf.Session():` creates a session that is closed
    automatically when the context exits, including when an uncaught
    exception is raised.

    *N.B.* The default session is a property of the current thread: a newly
    created thread must install it with its own `with sess.as_default():`
    block.

    *N.B.* Entering `with sess.as_default():` does not change the current
    default graph. When working with multiple graphs, if `sess.graph`
    differs from @{tf.get_default_graph}, also enter a
    `with sess.graph.as_default():` block.

    Returns:
      A context manager using this session as the default session.
    """
    return ops.default_session(self)
Ejemplo n.º 6
0
  def as_default(self):
    """Returns a context manager that makes this object the default session.

    Within the returned context manager, @{tf.Operation.run} and
    @{tf.Tensor.eval} execute in this session by default:

    ```python
    c = tf.constant(..)
    sess = tf.Session()

    with sess.as_default():
      assert tf.get_default_session() is sess
      print(c.eval())
    ```

    The current default session is available via @{tf.get_default_session}.

    Note that the `as_default` context manager does *not* close the session
    on exit — the session must be closed explicitly:

    ```python
    c = tf.constant(...)
    sess = tf.Session()
    with sess.as_default():
      print(c.eval())
    # ...
    with sess.as_default():
      print(c.eval())

    sess.close()
    ```

    As an alternative, `with tf.Session():` yields a session that is
    automatically closed on exiting the context, including when an uncaught
    exception is raised.

    The default session is per-thread: if you create a new thread and want
    it to use this session, add a `with sess.as_default():` block inside
    that thread's function.

    Entering `with sess.as_default():` leaves the current default graph
    untouched. When using multiple graphs, if `sess.graph` is different
    from the value of @{tf.get_default_graph}, explicitly enter a
    `with sess.graph.as_default():` block to make `sess.graph` the default
    graph.

    Returns:
      A context manager using this session as the default session.
    """
    return ops.default_session(self)
Ejemplo n.º 7
0
    def as_default(self):
        """Returns a context manager that makes this object the default session.

    Use with the `with` keyword to specify that calls to
    [`Operation.run()`](../../api_docs/python/framework.md#Operation.run) or
    [`Tensor.eval()`](../../api_docs/python/framework.md#Tensor.eval) should be
    executed in this session.

    ```python
    c = tf.constant(..)
    sess = tf.Session()

    with sess.as_default():
      assert tf.get_default_session() is sess
      print(c.eval())
    ```

    To get the current default session, use
    [`tf.get_default_session()`](#get_default_session).


    *N.B.* The `as_default` context manager *does not* close the
    session when you exit the context, and you must close the session
    explicitly.

    ```python
    c = tf.constant(...)
    sess = tf.Session()
    with sess.as_default():
      print(c.eval())
    # ...
    with sess.as_default():
      print(c.eval())

    sess.close()
    ```

    Alternatively, you can use `with tf.Session():` to create a
    session that is automatically closed on exiting the context,
    including when an uncaught exception is raised.

    *N.B.* The default session is a property of the current thread. If you
    create a new thread, and wish to use the default session in that
    thread, you must explicitly add a `with sess.as_default():` in that
    thread's function.

    Returns:
      A context manager using this session as the default session.

    """
        return ops.default_session(self)
Ejemplo n.º 8
0
  def as_default(self):
    """Returns a context manager that makes this object the default session.

    Use with the `with` keyword to specify that calls to
    [`Operation.run()`](../../api_docs/python/framework.md#Operation.run) or
    [`Tensor.eval()`](../../api_docs/python/framework.md#Tensor.eval) should be
    executed in this session.

    ```python
    c = tf.constant(..)
    sess = tf.Session()

    with sess.as_default():
      assert tf.get_default_session() is sess
      print(c.eval())
    ```

    To get the current default session, use
    [`tf.get_default_session()`](#get_default_session).


    *N.B.* The `as_default` context manager *does not* close the
    session when you exit the context, and you must close the session
    explicitly.

    ```python
    c = tf.constant(...)
    sess = tf.Session()
    with sess.as_default():
      print(c.eval())
    # ...
    with sess.as_default():
      print(c.eval())

    sess.close()
    ```

    Alternatively, you can use `with tf.Session():` to create a
    session that is automatically closed on exiting the context,
    including when an uncaught exception is raised.

    *N.B.* The default session is a property of the current thread. If you
    create a new thread, and wish to use the default session in that
    thread, you must explicitly add a `with sess.as_default():` in that
    thread's function.

    Returns:
      A context manager using this session as the default session.

    """
    return ops.default_session(self)
    def after_run(self, run_context, run_values):
        """Record collected RunMetadata for this step; schedule the next one."""
        # Estimate the post-run step from the (possibly stale) fetched value.
        current = run_values.results["global_step"] + 1
        if self._request_summary:
            # A trace was requested for this run: read the exact step and
            # persist the profiling data under it.
            current = run_context.session.run(self._global_step_tensor)
            self._timer.update_last_triggered_step(current)
            self._save(current, self._output_file.format(current),
                       run_values.run_metadata.step_stats)
            with ops.default_session(run_context.session):
                self._file_writer.add_run_metadata(
                    run_values.run_metadata,
                    "step_%d" % current,
                    global_step=current)
        self._next_step = current + 1
Ejemplo n.º 10
0
  def after_run(self, run_context, run_values):
    """Log step rate when the timer fires, tracking the stale global step."""
    stale_step = run_values.results
    # Pre-check with the cheap (possibly stale) value to avoid an extra
    # session.run on most steps.
    if self._timer.should_trigger_for_step(stale_step + self._steps_per_run):
      # Fetch the authoritative value after the train op has run.
      real_step = run_context.session.run(self._global_step_tensor)
      if self._timer.should_trigger_for_step(real_step):
        elapsed_time, elapsed_steps = (
            self._timer.update_last_triggered_step(real_step))
        if elapsed_time is not None:
          with ops.default_session(run_context.session):
            self._log_and_record(elapsed_steps, elapsed_time, real_step)
    self._last_global_step = stale_step
Ejemplo n.º 11
0
 def after_create_session(self, session, coord):
     """Writes graph/meta-graph to the summary dir and saves a first checkpoint.

     Runs after session creation, once the graph is finalized: initializes
     the summary writer resource, records the graph and meta-graph, then
     saves a checkpoint at the current global step and marks the timer as
     triggered for that step.
     """
     del coord  # unused
     # Ensure summary writer resource has been initialized.
     session.run(summary_ops_v2.summary_writer_initializer_op())
     global_step = session.run(self._global_step_tensor)
     # Write graph and saver_def once graph is finalized, which isn't true yet
     # in begin() since later hooks can still change the graph.
     training_util.write_graph(
         ops.get_default_graph().as_graph_def(add_shapes=True),
         self._checkpoint_dir, "graph.pbtxt")
     saver_def = self._get_saver().saver_def if self._get_saver() else None
     graph = ops.get_default_graph()
     meta_graph_def = meta_graph.create_meta_graph_def(
         graph_def=graph.as_graph_def(add_shapes=True), saver_def=saver_def)
     with ops.default_session(session):
         self._summary_writer.add_graph(graph)
         self._summary_writer.add_meta_graph(meta_graph_def)
     # The checkpoint saved here is the state at step "global_step".
     self._save(session, global_step)
     self._timer.update_last_triggered_step(global_step)
Ejemplo n.º 12
0
 def before_run(self, run_context):
     """Decides whether this run should fetch summaries; builds the run args."""
     if self._current_step is None:
         # First run: record a SessionLog.START at the pre-run global step.
         self._current_step = run_context.session.run(
             self._global_step_tensor)
         with ops.default_session(run_context.session):
             self._summary_writer.add_session_log(
                 SessionLog(status=SessionLog.START), self._current_step)
     fetches = {"global_step": self._global_step_tensor}
     self._request_summary = self._timer.should_trigger_for_step(
         self._current_step)
     if self._request_summary:
         self._timer.update_last_triggered_step(self._current_step)
         if self._get_summary_op() is not None:
             fetches["summary"] = self._get_summary_op()
     feed_dict = {}
     if self._placeholder is not None and self._request_summary:
         feed_dict[self._placeholder] = self._request_summary
     return SessionRunArgs(fetches=fetches, feed_dict=feed_dict)
Ejemplo n.º 13
0
    def run(self):
        """Feeds datapoints from `self.dataflow` into the enqueue op until stopped."""
        self.dataflow.reset_state()

        with ops.default_session(self.sess):
            try:
                while True:
                    for dp in self.dataflow.get_data():
                        if self.coord.should_stop():
                            return
                        # Pair each input variable with its datapoint component.
                        feed = dict(zip(self.input_vars, dp))
                        self.op.run(feed_dict=feed)
            except tf.errors.CancelledError as e:
                # Queue was cancelled/closed: treated as a normal shutdown.
                pass
            except Exception:
                logger.exception("Exception in EnqueueThread:")
            finally:
                # Best-effort close of the queue, then stop the coordinator.
                try:
                    self.sess.run(self.close_op)
                except RuntimeError:    # session already closed
                    pass
                self.coord.request_stop()
                logger.info("Enqueue Thread Exited.")
Ejemplo n.º 14
0
    def after_run(self, run_context, run_values):
        """Logs the step rate when due and watches for a stalled global step."""
        stale_global_step = run_values.results
        # Pre-check with the cheap (possibly stale) value so most steps avoid
        # an extra session.run.
        if self._timer.should_trigger_for_step(stale_global_step +
                                               self._steps_per_run):
            # Read the authoritative value after the train op has run.
            global_step = run_context.session.run(self._global_step_tensor)
            if self._timer.should_trigger_for_step(global_step):
                elapsed_time, elapsed_steps = (
                    self._timer.update_last_triggered_step(global_step))
                if elapsed_time is not None:
                    with ops.default_session(run_context.session):
                        self._log_and_record(elapsed_steps, elapsed_time,
                                             global_step)

        # Stall detection compares against our own previously recorded stale
        # value; the timer's last_triggered_step may hold a different global
        # step, which would make the comparison unreliable.
        if stale_global_step != self._last_global_step:
            # Progress observed: reset the stall counter.
            self._global_step_check_count = 0
        else:
            # Count consecutive non-increments. Some optimizers legitimately
            # skip increments by design (e.g. SyncReplicaOptimizer doesn't
            # increase the global step in a worker's main train step), so only
            # warn periodically.
            self._global_step_check_count += 1
            if self._global_step_check_count % 20 == 0:
                self._global_step_check_count = 0
                logging.warning(
                    "It seems that global step (tf.train.get_global_step) has not "
                    "been increased. Current value (could be stable): %s vs previous "
                    "value: %s. You could increase the global step by passing "
                    "tf.train.get_global_step() to Optimizer.apply_gradients or "
                    "Optimizer.minimize.", stale_global_step,
                    self._last_global_step)

        self._last_global_step = stale_global_step
Ejemplo n.º 15
0
    def _save(self, session, step):
        """Saves the latest checkpoint, returns should_stop.

        Args:
          session: the session to checkpoint.
          step: global step number to attach to the checkpoint.

        Returns:
          True if any listener's `after_save` requested that training stop.
        """
        logging.info("Saving checkpoints for %d into %s.", step,
                     self._save_path)

        # Give listeners a chance to act before the checkpoint is written.
        for l in self._listeners:
            l.before_save(session, step)

        self._get_saver().save(session, self._save_path, global_step=step)
        # Record a CHECKPOINT session log so readers learn about the new file.
        with ops.default_session(session):
            self._summary_writer.add_session_log(
                SessionLog(status=SessionLog.CHECKPOINT,
                           checkpoint_path=self._save_path), step)
            self._summary_writer.flush()

        should_stop = False
        for l in self._listeners:
            if l.after_save(session, step):
                logging.info(
                    "A CheckpointSaverListener requested that training be stopped. "
                    "listener: {}".format(l))
                should_stop = True
        return should_stop
Ejemplo n.º 16
0
 def as_default(self):
   """Returns a context manager installing this session as the default session."""
   return ops.default_session(self)
Ejemplo n.º 17
0
 def as_default(self):
   """Returns a context manager installing this session as the default session."""
   return ops.default_session(self)
Ejemplo n.º 18
0
 def end(self, session):
     """Flush any buffered summaries when training ends; no-op without a writer."""
     if self._summary_writer is None:
         return
     with ops.default_session(session):
         self._summary_writer.flush()
Ejemplo n.º 19
0
  def as_default(self):
    """Returns a context manager that makes this object the default session.

    Use with the `with` keyword to specify that calls to
    [`Operation.run()`](../../api_docs/python/framework.md#Operation.run) or
    [`Tensor.eval()`](../../api_docs/python/framework.md#Tensor.eval) should be
    executed in this session.

    ```python
    c = tf.constant(..)
    sess = tf.Session()

    with sess.as_default():
      assert tf.get_default_session() is sess
      print(c.eval())
    ```

    To get the current default session, use
    [`tf.get_default_session()`](#get_default_session).


    *N.B.* The `as_default` context manager *does not* close the
    session when you exit the context, and you must close the session
    explicitly.

    ```python
    c = tf.constant(...)
    sess = tf.Session()
    with sess.as_default():
      print(c.eval())
    # ...
    with sess.as_default():
      print(c.eval())

    sess.close()
    ```

    Alternatively, you can use `with tf.Session():` to create a
    session that is automatically closed on exiting the context,
    including when an uncaught exception is raised.

    *N.B.* The default session is a property of the current thread. If you
    create a new thread, and wish to use the default session in that
    thread, you must explicitly add a `with sess.as_default():` in that
    thread's function.

    Returns:
      A context manager using this session as the default session.

    """
    return ops.default_session(self)

  # Eventually, this registration could be opened up to support custom
  # Tensor expansions. Expects tuples of (Type, fetch_fn, feed_fn),
  # where the signatures are:
  #   fetch_fn : Type -> (list of Tensors,
  #                       lambda: list of fetched np.ndarray -> TypeVal)
  #   feed_fn  : Type, TypeVal -> list of (Tensor, value)
  # Conceptually, fetch_fn describes how to expand fetch into its
  # component Tensors and how to contract the fetched results back into
  # a single return value. feed_fn describes how to unpack a single fed
  # value and map it to one or more feed tensors and their values.
Ejemplo n.º 20
0
 def end(self, session):
     """Flushes the file writer with `session` installed as the default."""
     with ops.default_session(session):
         self._file_writer.flush()
Ejemplo n.º 21
0
 def end(self, session=None):
     """Flush pending summaries; a no-op without both a writer and a session."""
     if not (self._summary_writer and session):
         return
     with ops.default_session(session):
         self._summary_writer.flush()