Example #1
  def testEvaluationLoopTimeout(self):
    checkpoint_dir = os.path.join(self.get_temp_dir(),
                                  'evaluation_loop_timeout')
    if not gfile.Exists(checkpoint_dir):
      gfile.MakeDirs(checkpoint_dir)

    # We need a variable that the saver will try to restore.
    variables.get_or_create_global_step()

    # Run with placeholders. If we actually try to evaluate this, we'd fail
    # since we're not using a feed_dict.
    cant_run_op = array_ops.placeholder(dtype=dtypes.float32)

    start = time.time()
    final_values = evaluation.evaluate_repeatedly(
        checkpoint_dir=checkpoint_dir,
        eval_ops=cant_run_op,
        hooks=[evaluation.StopAfterNEvalsHook(10)],
        timeout=6)
    end = time.time()
    self.assertFalse(final_values)

    # Assert that we've waited for the duration of the timeout (minus the sleep
    # time).
    self.assertGreater(end - start, 5.0)

    # Then the timeout kicks in and stops the loop.
    self.assertLess(end - start, 7)
 def test_step_counter_every_n_steps(self):
   with ops.Graph().as_default() as g, session_lib.Session() as sess:
     variables.get_or_create_global_step()
     train_op = training_util._increment_global_step(1)
     summary_writer = fake_summary_writer.FakeSummaryWriter(self.log_dir, g)
     hook = basic_session_run_hooks.StepCounterHook(
         summary_writer=summary_writer, every_n_steps=10)
     hook.begin()
     sess.run(variables_lib.global_variables_initializer())
     mon_sess = monitored_session._HookedSession(sess, [hook])
     with test.mock.patch.object(tf_logging, 'warning') as mock_log:
       for _ in range(30):
         time.sleep(0.01)
         mon_sess.run(train_op)
       # logging.warning should not be called.
       self.assertIsNone(mock_log.call_args)
     hook.end(sess)
     summary_writer.assert_summaries(
         test_case=self,
         expected_logdir=self.log_dir,
         expected_graph=g,
         expected_summaries={})
     self.assertItemsEqual([11, 21], summary_writer.summaries.keys())
     for step in [11, 21]:
       summary_value = summary_writer.summaries[step][0].value[0]
       self.assertEqual('global_step/sec', summary_value.tag)
       self.assertGreater(summary_value.simple_value, 0)
  def test_two_listeners_with_default_saver(self):
    with ops.Graph().as_default():
      global_step = variables.get_or_create_global_step()
      train_op = state_ops.assign_add(global_step, 1)
      listener1 = MockCheckpointSaverListener()
      listener2 = MockCheckpointSaverListener()
      hook = basic_session_run_hooks.CheckpointSaverHook(
          self.model_dir,
          save_steps=1,
          listeners=[listener1, listener2])
      with monitored_session.SingularMonitoredSession(
          hooks=[hook],
          checkpoint_dir=self.model_dir) as sess:
        sess.run(train_op)
        sess.run(train_op)
        global_step_val = sess.run(global_step)
      listener1_counts = listener1.get_counts()
      listener2_counts = listener2.get_counts()
    self.assertEqual(2, global_step_val)
    self.assertEqual({
        'begin': 1,
        'before_save': 2,
        'after_save': 2,
        'end': 1
    }, listener1_counts)
    self.assertEqual(listener1_counts, listener2_counts)

    with ops.Graph().as_default():
      global_step = variables.get_or_create_global_step()
      with monitored_session.SingularMonitoredSession(
          checkpoint_dir=self.model_dir) as sess2:
        global_step_saved_val = sess2.run(global_step)
    self.assertEqual(2, global_step_saved_val)
  def test_step_counter_every_n_secs(self):
    with ops.Graph().as_default() as g, session_lib.Session() as sess:
      variables.get_or_create_global_step()
      train_op = training_util._increment_global_step(1)
      summary_writer = fake_summary_writer.FakeSummaryWriter(self.log_dir, g)
      hook = basic_session_run_hooks.StepCounterHook(
          summary_writer=summary_writer, every_n_steps=None, every_n_secs=0.1)

      hook.begin()
      sess.run(variables_lib.global_variables_initializer())
      mon_sess = monitored_session._HookedSession(sess, [hook])
      mon_sess.run(train_op)
      time.sleep(0.2)
      mon_sess.run(train_op)
      time.sleep(0.2)
      mon_sess.run(train_op)
      hook.end(sess)

      summary_writer.assert_summaries(
          test_case=self,
          expected_logdir=self.log_dir,
          expected_graph=g,
          expected_summaries={})
      self.assertTrue(summary_writer.summaries, 'No summaries were created.')
      self.assertItemsEqual([2, 3], summary_writer.summaries.keys())
      for summary in summary_writer.summaries.values():
        summary_value = summary[0].value[0]
        self.assertEqual('global_step/sec', summary_value.tag)
        self.assertGreater(summary_value.simple_value, 0)
Example #5
  def testEvaluateWithEvalFeedDict(self):
    # Create a checkpoint.
    checkpoint_dir = os.path.join(self.get_temp_dir(),
                                  'evaluate_with_eval_feed_dict')
    self._train_model(checkpoint_dir, num_steps=1)

    # We need a variable that the saver will try to restore.
    variables.get_or_create_global_step()

    # Create a variable and an eval op that increments it with a placeholder.
    my_var = variables.local_variable(0.0, name='my_var')
    increment = array_ops.placeholder(dtype=dtypes.float32)
    eval_ops = state_ops.assign_add(my_var, increment)

    increment_value = 3
    num_evals = 5
    expected_value = increment_value * num_evals
    final_values = evaluation.evaluate_repeatedly(
        checkpoint_dir=checkpoint_dir,
        eval_ops=eval_ops,
        feed_dict={increment: increment_value},
        final_ops={'my_var': array_ops.identity(my_var)},
        hooks=[evaluation.StopAfterNEvalsHook(num_evals),],
        max_number_of_evaluations=1)
    self.assertEqual(final_values['my_var'], expected_value)
 def test_not_wait_for_step_zero(self):
   with ops.Graph().as_default():
     variables.get_or_create_global_step()
     hook = basic_session_run_hooks.GlobalStepWaiterHook(wait_until_step=0)
     hook.begin()
     with session_lib.Session() as sess:
        # before_run should return without waiting for the global step to increment.
       hook.before_run(
           session_run_hook.SessionRunContext(
               original_args=None, session=sess))
  def setUp(self):
    test.TestCase.setUp(self)

    self.log_dir = 'log/dir'
    self.summary_writer = fake_summary_writer.FakeSummaryWriter(self.log_dir)

    var = variables_lib.Variable(0.0)
    tensor = state_ops.assign_add(var, 1.0)
    tensor2 = tensor * 2
    self.summary_op = summary_lib.scalar('my_summary', tensor)
    self.summary_op2 = summary_lib.scalar('my_summary2', tensor2)

    variables.get_or_create_global_step()
    self.train_op = training_util._increment_global_step(1)
Example #8
 def test_recover_and_retry_on_aborted_error(self):
   # Tests that we silently retry and recover on abort.  This test uses
   # a CheckpointSaver to have something to recover from.
   logdir = _test_dir(self.get_temp_dir(),
                      'test_recover_and_retry_on_aborted_error')
   with ops.Graph().as_default():
     gstep = variables_lib.get_or_create_global_step()
     do_step = state_ops.assign_add(gstep, 1)
     scaffold = monitored_session.Scaffold()
     abort_hook = RaiseOnceAtCountN(
         4, errors_impl.AbortedError(None, None, 'Abort'))
     # Save after each step.
     ckpt_hook = basic_session_run_hooks.CheckpointSaverHook(
         logdir, save_steps=1, scaffold=scaffold)
     hooks = [abort_hook, ckpt_hook]
     with monitored_session.MonitoredSession(
         session_creator=monitored_session.ChiefSessionCreator(
             scaffold, checkpoint_dir=logdir),
         hooks=hooks) as session:
       self.assertEqual(0, session.run(gstep))
       self.assertEqual(1, session.run(do_step))
       self.assertEqual(2, session.run(do_step))
       self.assertFalse(session.should_stop())
       # Here at step 3, the hook triggers and raises AbortedError.  The
       # MonitoredSession automatically restores and retries.
       self.assertEqual(3, session.run(do_step))
       self.assertTrue(abort_hook.raised)
       self.assertFalse(session.should_stop())
       self.assertEqual(4, session.run(do_step))
       self.assertFalse(session.should_stop())
Example #9
 def test_recovery(self):
   logdir = _test_dir(self.get_temp_dir(), 'test_recovery')
   with ops.Graph().as_default():
     gstep = variables_lib.get_or_create_global_step()
     do_step = state_ops.assign_add(gstep, 1)
     scaffold = monitored_session.Scaffold()
      # Use a hook to save the model after every step.  It also saves it at
      # the end.
     hooks = [
         basic_session_run_hooks.CheckpointSaverHook(
             logdir, save_steps=1, scaffold=scaffold)
     ]
     with monitored_session.MonitoredSession(
         session_creator=monitored_session.ChiefSessionCreator(
             scaffold, checkpoint_dir=logdir),
         hooks=hooks) as session:
       self.assertEqual(0, session.run(gstep))
       self.assertEqual(1, session.run(do_step))
       self.assertEqual(2, session.run(do_step))
     # A restart will find the checkpoint and recover automatically.
     with monitored_session.MonitoredSession(
         session_creator=monitored_session.ChiefSessionCreator(
             scaffold, checkpoint_dir=logdir)) as session:
       self.assertEqual(2, session.run(gstep))
     # A restart will find the checkpoint and recover automatically.
     with monitored_session.MonitoredSession(
         session_creator=monitored_session.ChiefSessionCreator(
             scaffold,
             checkpoint_filename_with_path=saver_lib.latest_checkpoint(
                 logdir))) as session:
       self.assertEqual(2, session.run(gstep))
Example #10
  def test_num_steps(self):
    logdir = _test_dir(self.get_temp_dir(), 'test_num_steps')
    with ops.Graph().as_default():
      gstep = variables_lib.get_or_create_global_step()
      do_step = state_ops.assign_add(gstep, 1)
      # Do 3 steps and save.
      hooks = [basic_session_run_hooks.StopAtStepHook(num_steps=3)]
      scaffold = monitored_session.Scaffold().finalize()
      with monitored_session.MonitoredSession(hooks=hooks) as session:
        session.run(do_step)
        self.assertFalse(session.should_stop())
        session.run(do_step)
        self.assertFalse(session.should_stop())
        session.run(do_step)
        self.assertTrue(session.should_stop())
        save_path = scaffold.saver.save(session._coordinated_creator.tf_sess,
                                        os.path.join(logdir, 'step-3'))
      # Restore and do 4 steps.
      def load_ckpt(scaffold, sess):
        scaffold.saver.restore(sess, save_path)

      session_creator = monitored_session.ChiefSessionCreator(
          scaffold=monitored_session.Scaffold(init_fn=load_ckpt))
      hooks = [basic_session_run_hooks.StopAtStepHook(num_steps=4)]
      with monitored_session.MonitoredSession(
          hooks=hooks, session_creator=session_creator) as session:
        self.assertEqual(4, session.run(do_step))
        self.assertFalse(session.should_stop())
        session.run(do_step)
        self.assertFalse(session.should_stop())
        session.run(do_step)
        self.assertFalse(session.should_stop())
        session.run(do_step)
        self.assertTrue(session.should_stop())
Example #11
    def __init__(
        self,
        global_step_tensor=None,
        init_op=None,
        init_feed_dict=None,
        init_fn=None,
        ready_op=None,
        local_init_op=None,
        summary_op=None,
        saver=None,
        keep_checkpoint_max=5,
    ):
        """Create a scaffold.

    Args:
      global_step_tensor: Optional tensor to use as the global step counter.
      init_op: Optional op for initializing variables.
      init_feed_dict: Optional session feed dictionary to use when running the
        init_op.
      init_fn: Optional function to use to initialize the model after running
        the init_op.  Will be called as `init_fn(scaffold, session)`.
      ready_op: Optional op to verify that the variables are initialized.  Must
        return an empty scalar string tensor when the variables are
        initialized, or a non-empty one listing the names of the
        non-initialized variables.
      local_init_op: Optional op to initialize local variables.
      summary_op: Optional op to gather all summaries.  Must return a scalar
        string tensor containing a serialized `Summary` proto.
      saver: Optional `tf.train.Saver` object to use to save and restore variables.
      keep_checkpoint_max: Optional parameter to use to construct a saver if
        none is already there in the graph.
    """
        if global_step_tensor is None:
            global_step_tensor = contrib_variables.get_or_create_global_step()
        self.global_step_tensor = global_step_tensor
        if init_op is None:
            init_op = Scaffold._get_or_default(ops.GraphKeys.INIT_OP, variables.initialize_all_variables)
        self.init_op = init_op
        self.init_feed_dict = init_feed_dict
        # NOTE(touts): modifying the init function to be passed the scaffold is a
        # hack to make it easy to find the saver.  Is there a better way?
        if init_fn:
            self.init_fn = lambda sess: init_fn(self, sess)
        else:
            self.init_fn = None
        if ready_op is None:
            ready_op = Scaffold._get_or_default(ops.GraphKeys.READY_OP, variables.report_uninitialized_variables)
        self.ready_op = ready_op
        if local_init_op is None:
            local_init_op = Scaffold._get_or_default(ops.GraphKeys.LOCAL_INIT_OP, Scaffold._default_local_init_op)
        self.local_init_op = local_init_op
        if summary_op is None:
            summary_op = Scaffold._get_or_default(ops.GraphKeys.SUMMARY_OP, logging_ops.merge_all_summaries)
        self.summary_op = summary_op
        # pylint: disable=g-long-lambda
        if saver is None:
            saver = Scaffold._get_or_default(
                ops.GraphKeys.SAVERS, lambda: training_saver.Saver(sharded=True, max_to_keep=keep_checkpoint_max)
            )
        # pylint: enable=g-long-lambda
        self.saver = saver
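The constructor above mirrors the core `tf.train.Scaffold` API in TF 1.x: it fills in default init/ready/saver ops and exposes an `init_fn(scaffold, session)` callback. A minimal, hedged usage sketch of that pattern (the checkpoint directory and the trivial train op are assumptions for illustration, not part of the example above):

import tensorflow as tf  # assumes TensorFlow 1.x

def _init_fn(scaffold, session):
    # Called once after variables are initialized; the scaffold's saver is available here.
    print('initialized, saver:', scaffold.saver)

with tf.Graph().as_default():
    global_step = tf.train.get_or_create_global_step()
    train_op = tf.assign_add(global_step, 1)
    scaffold = tf.train.Scaffold(init_fn=_init_fn, saver=tf.train.Saver(max_to_keep=5))
    with tf.train.MonitoredTrainingSession(scaffold=scaffold,
                                           checkpoint_dir='/tmp/scaffold_demo') as sess:
        sess.run(train_op)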
Example #12
  def _get_train_ops(self, features, targets):
    """See base class."""
    if not isinstance(self._linear_optimizer, sdca_optimizer.SDCAOptimizer):
      return super(LinearRegressor, self)._get_train_ops(features, targets)
    assert not self._joint_weights, ("_joint_weights is incompatible with"
                                     " SDCAOptimizer.")
    global_step = contrib_variables.get_or_create_global_step()

    logits, columns_to_variables, bias = (
        layers.weighted_sum_from_feature_columns(
            columns_to_tensors=features,
            feature_columns=self._linear_feature_columns,
            num_outputs=self._target_column.num_label_columns,
            weight_collections=[self._linear_model.get_scope_name()],
            scope=self._linear_model.get_scope_name()))
    with ops.control_dependencies([self._centered_bias()]):
      loss = self._target_column.loss(logits, targets, features)
      logging_ops.scalar_summary("loss", loss)

      _add_bias_column(self._linear_feature_columns, features, bias, targets,
                       columns_to_variables)

    train_op = self._linear_optimizer.get_train_step(
        columns_to_variables, self._target_column.weight_column_name,
        self._loss_type(), features, targets, global_step)
    return train_op, loss
Example #13
  def _get_train_ops(self, features, targets):
    """See base class."""
    if not isinstance(self._linear_optimizer, sdca_optimizer.SDCAOptimizer):
      return super(LinearRegressor, self)._get_train_ops(features, targets)
    assert not self._joint_weights, ("_joint_weights is incompatible with"
                                     " SDCAOptimizer.")
    global_step = contrib_variables.get_or_create_global_step()

    logits, columns_to_variables, bias = (
        layers.weighted_sum_from_feature_columns(
            columns_to_tensors=features,
            feature_columns=self._linear_feature_columns,
            num_outputs=self._head.logits_dimension,
            weight_collections=[self._linear_model.get_scope_name()],
            scope=self._linear_model.get_scope_name()))
    _add_bias_column(self._linear_feature_columns, features, bias, targets,
                     columns_to_variables)

    def _train_op_fn(unused_loss):
      sdca_model, train_op = self._linear_optimizer.get_train_step(
          columns_to_variables, self._weight_column_name,
          self._loss_type(), features, targets, global_step)
      return sdca_model.update_weights(train_op)

    model_fn_ops = self._head.head_ops(features, targets,
                                       estimator.ModeKeys.TRAIN, _train_op_fn,
                                       logits=logits)
    return model_fn_ops.training_op, model_fn_ops.loss
Example #14
  def testReturnsSingleCheckpointIfOneShardedCheckpoint(self):
    checkpoint_dir = os.path.join(self.get_temp_dir(),
                                  'one_checkpoint_found_sharded')
    if not gfile.Exists(checkpoint_dir):
      gfile.MakeDirs(checkpoint_dir)

    global_step = variables.get_or_create_global_step()

    # This will result in 3 different checkpoint shard files.
    with ops.device('/cpu:0'):
      variables_lib.Variable(10, name='v0')
    with ops.device('/cpu:1'):
      variables_lib.Variable(20, name='v1')

    saver = saver_lib.Saver(sharded=True)

    with session_lib.Session(
        target='',
        config=config_pb2.ConfigProto(device_count={'CPU': 2})) as session:

      session.run(variables_lib.global_variables_initializer())
      save_path = os.path.join(checkpoint_dir, 'model.ckpt')
      saver.save(session, save_path, global_step=global_step)

    num_found = 0
    for _ in evaluation.checkpoints_iterator(checkpoint_dir, timeout=0):
      num_found += 1
    self.assertEqual(num_found, 1)
 def setUp(self):
   self.model_dir = tempfile.mkdtemp()
   self.graph = ops.Graph()
   with self.graph.as_default():
     self.scaffold = monitored_session.Scaffold()
     self.global_step = variables.get_or_create_global_step()
     self.train_op = state_ops.assign_add(self.global_step, 1)
  def test_stop_based_on_num_step(self):
    h = basic_session_run_hooks.StopAtStepHook(num_steps=10)

    with ops.Graph().as_default():
      global_step = variables.get_or_create_global_step()
      no_op = control_flow_ops.no_op()
      h.begin()
      with session_lib.Session() as sess:
        mon_sess = monitored_session._HookedSession(sess, [h])
        sess.run(state_ops.assign(global_step, 5))
        h.after_create_session(sess, None)
        mon_sess.run(no_op)
        self.assertFalse(mon_sess.should_stop())
        sess.run(state_ops.assign(global_step, 13))
        mon_sess.run(no_op)
        self.assertFalse(mon_sess.should_stop())
        sess.run(state_ops.assign(global_step, 14))
        mon_sess.run(no_op)
        self.assertFalse(mon_sess.should_stop())
        sess.run(state_ops.assign(global_step, 15))
        mon_sess.run(no_op)
        self.assertTrue(mon_sess.should_stop())
        sess.run(state_ops.assign(global_step, 16))
        mon_sess._should_stop = False
        mon_sess.run(no_op)
        self.assertTrue(mon_sess.should_stop())
 def setUp(self):
   self.model_dir = tempfile.mkdtemp()
   self.graph = ops.Graph()
   with self.graph.as_default():
     self.scaffold = monitored_session.Scaffold()
     self.global_step = variables.get_or_create_global_step()
     self.train_op = training_util._increment_global_step(1)
 def test_listener_with_monitored_session(self):
   with ops.Graph().as_default():
     scaffold = monitored_session.Scaffold()
     global_step = variables.get_or_create_global_step()
     train_op = state_ops.assign_add(global_step, 1)
     listener = MockCheckpointSaverListener()
     hook = basic_session_run_hooks.CheckpointSaverHook(
         self.model_dir,
         save_steps=1,
         scaffold=scaffold,
         listeners=[listener])
     with monitored_session.SingularMonitoredSession(
         hooks=[hook],
         scaffold=scaffold,
         checkpoint_dir=self.model_dir) as sess:
       sess.run(train_op)
       sess.run(train_op)
       global_step_val = sess.run(global_step)
     listener_counts = listener.get_counts()
   self.assertEqual(2, global_step_val)
   self.assertEqual({
       'begin': 1,
       'before_save': 2,
       'after_save': 2,
       'end': 1
   }, listener_counts)
Example #19
  def testNoneGlobalStep(self):
    with ops.Graph().as_default():
      random_seed.set_random_seed(0)
      tf_inputs = constant_op.constant(self._inputs, dtype=dtypes.float32)
      tf_labels = constant_op.constant(self._labels, dtype=dtypes.float32)

      tf_predictions = batchnorm_classifier(tf_inputs)
      loss_ops.log_loss(tf_predictions, tf_labels)
      total_loss = loss_ops.get_total_loss()
      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)

      train_op = training.create_train_op(
          total_loss, optimizer, global_step=None)

      global_step = variables_lib.get_or_create_global_step()

      with session_lib.Session() as sess:
        # Initialize all variables
        sess.run(variables_lib2.global_variables_initializer())

        for _ in range(10):
          sess.run([train_op])
        global_step = global_step.eval()
        # Since train_op doesn't use global_step, it shouldn't change.
        self.assertAllClose(global_step, 0)
Example #20
  def finalize(self):
    """Creates operations if needed and finalizes the graph."""
    if self._global_step_tensor is None:
      self._global_step_tensor = contrib_variables.get_or_create_global_step()
    if self._init_op is None:
      self._init_op = Scaffold._get_or_default(
          'init_op', ops.GraphKeys.INIT_OP, variables.initialize_all_variables)
    if self._ready_op is None:
      self._ready_op = Scaffold._get_or_default(
          'ready_op', ops.GraphKeys.READY_OP,
          variables.report_uninitialized_variables)
    if self._local_init_op is None:
      self._local_init_op = Scaffold._get_or_default(
          'local_init_op', ops.GraphKeys.LOCAL_INIT_OP,
          Scaffold._default_local_init_op)
    if self._summary_op is None:
      self._summary_op = Scaffold._get_or_default(
          'summary_op', ops.GraphKeys.SUMMARY_OP,
          logging_ops.merge_all_summaries)
    # pylint: disable=g-long-lambda
    if self._saver is None:
      self._saver = Scaffold._get_or_default(
          'saver',
          ops.GraphKeys.SAVERS,
          lambda: training_saver.Saver(sharded=True,
                                       max_to_keep=self._keep_checkpoint_max))
    # pylint: enable=g-long-lambda

    ops.get_default_graph().finalize()
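finalize() above fills in any missing default ops (init, ready, local init, summary, saver) and then freezes the graph, so it must run after all model ops are built and before the session is created. A small sketch of that ordering with the core `tf.train.Scaffold` (the toy train op is an assumption):

import tensorflow as tf  # assumes TensorFlow 1.x

with tf.Graph().as_default():
    global_step = tf.train.get_or_create_global_step()
    train_op = tf.assign_add(global_step, 1)   # build all ops first
    scaffold = tf.train.Scaffold()
    scaffold.finalize()                        # defaults created, graph is now frozen
    with tf.Session() as sess:
        sess.run(scaffold.init_op)             # default init op supplied by the scaffold
        sess.run(train_op)
        print(sess.run(global_step))           # -> 1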
Example #21
  def __init__(self,
               log_dir=None,
               summary_writer=None,
               summary_op=None,
               feed_dict=None):
    """Constructs the Summary Hook.

    Args:
      log_dir: The directory where the summary events are saved to.  Used only
        when `summary_writer` is not specified.
      summary_writer: A `tf.summary.FileWriter` to write summary events with.
      summary_op: The summary op to run. If left as `None`, then all summaries
        in the tf.GraphKeys.SUMMARIES collection are used.
      feed_dict: An optional feed dictionary to use when evaluating the
        summaries.

    Raises:
      ValueError: If both `log_dir` and `summary_writer` are `None`.
    """
    self._summary_op = summary_op
    self._feed_dict = feed_dict
    self._summary_writer = summary_writer
    self._log_dir = log_dir
    if self._log_dir is None and self._summary_writer is None:
      raise ValueError('One of log_dir or summary_writer should be used.')
    self._global_step = variables.get_or_create_global_step()
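This constructor appears to match `tf.contrib.training.SummaryAtEndHook`; the global step it creates (or reuses) is what the end-of-session summaries are written against. A hedged usage sketch, assuming that class and a throwaway log directory:

import tensorflow as tf  # assumes TensorFlow 1.x with tf.contrib
from tensorflow.contrib.training import SummaryAtEndHook  # assumed to be the class defined above

with tf.Graph().as_default():
    global_step = tf.train.get_or_create_global_step()
    train_op = tf.assign_add(global_step, 1)
    tf.summary.scalar('step', tf.cast(global_step, tf.float32))
    hook = SummaryAtEndHook(log_dir='/tmp/summary_at_end_demo')  # writes summaries when the session ends
    with tf.train.MonitoredSession(hooks=[hook]) as sess:
        for _ in range(5):
            sess.run(train_op)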
 def setUp(self):
   super(ProfilerHookTest, self).setUp()
   self.output_dir = tempfile.mkdtemp()
   self.graph = ops.Graph()
   self.filepattern = os.path.join(self.output_dir, 'timeline-*.json')
   with self.graph.as_default():
     self.global_step = variables.get_or_create_global_step()
     self.train_op = state_ops.assign_add(self.global_step, 1)
 def test_log_warning_if_global_step_not_increased(self):
   with ops.Graph().as_default(), session_lib.Session() as sess:
     variables.get_or_create_global_step()
     train_op = training_util._increment_global_step(0)  # keep same.
     sess.run(variables_lib.global_variables_initializer())
     hook = basic_session_run_hooks.StepCounterHook(
         every_n_steps=1, every_n_secs=None)
     hook.begin()
     mon_sess = monitored_session._HookedSession(sess, [hook])
     mon_sess.run(train_op)  # Run one step to record global step.
     with test.mock.patch.object(tf_logging, 'warning') as mock_log:
       for _ in range(30):
         mon_sess.run(train_op)
       self.assertRegexpMatches(
           str(mock_log.call_args),
           'global step.*has not been increased')
     hook.end(sess)
Example #24
 def test_invalid_graph(self):
   # Create inputs.
   model_dir = tempfile.mkdtemp()
   hook = trainer_hooks.FeatureImportanceSummarySaver(model_dir)
   with ops.Graph().as_default():
     # Begin won't be able to find the required tensors in the graph.
     _ = variables.get_or_create_global_step()
     with self.assertRaises(KeyError):
       hook.begin()
Example #25
  def setUp(self):
    super(EvaluationTest, self).setUp()

    num_classes = 8
    batch_size = 16
    inputs, labels = GenerateTestData(num_classes, batch_size)
    self._expected_accuracy = GroundTruthAccuracy(inputs, labels, batch_size)

    self._global_step = variables_lib.get_or_create_global_step()
    self._inputs = constant_op.constant(inputs, dtype=dtypes.float32)
    self._labels = constant_op.constant(labels, dtype=dtypes.int64)
    self._predictions, self._scale = TestModel(self._inputs)
Example #26
 def test_stop_cleanly_when_no_exception_in_with_body(self):
   # Tests that regular exceptions pass through
   with ops.Graph().as_default():
     gstep = variables_lib.get_or_create_global_step()
     do_step = state_ops.assign_add(gstep, 1)
     session = monitored_session.MonitoredSession()
     with session:
       self.assertEqual(1, session.run(do_step))
       self.assertEqual(2, session.run(do_step))
       self.assertFalse(session.should_stop())
     # Should have closed.
     self.assertTrue(session.should_stop())
     self.assertTrue(session._is_closed())
Example #27
    def setUp(self):
        super(EvaluationTest, self).setUp()

        num_classes = 8
        batch_size = 16
        inputs, labels = GenerateTestData(num_classes, batch_size)
        self._expected_accuracy = GroundTruthAccuracy(inputs, labels,
                                                      batch_size)

        self._global_step = variables_lib.get_or_create_global_step()
        self._inputs = constant_op.constant(inputs, dtype=dtypes.float32)
        self._labels = constant_op.constant(labels, dtype=dtypes.int64)
        self._predictions, self._scale = TestModel(self._inputs)
Example #28
  def setUp(self):
    test.TestCase.setUp(self)

    self.log_dir = 'log/dir'
    self.summary_writer = fake_summary_writer.FakeSummaryWriter(self.log_dir)

    var = variable_scope.get_variable('var', initializer=0.0, use_resource=True)
    tensor = state_ops.assign_add(var, 1.0)
    self.summary_op = summary_lib.scalar('my_summary', tensor)

    with variable_scope.variable_scope('foo', use_resource=True):
      global_step = variables.get_or_create_global_step()
    self.train_op = state_ops.assign_add(global_step, 1)
  def setUp(self):
    test.TestCase.setUp(self)

    self.log_dir = 'log/dir'
    self.summary_writer = fake_summary_writer.FakeSummaryWriter(self.log_dir)

    var = variable_scope.get_variable('var', initializer=0.0, use_resource=True)
    tensor = state_ops.assign_add(var, 1.0)
    self.summary_op = summary_lib.scalar('my_summary', tensor)

    with variable_scope.variable_scope('foo', use_resource=True):
      global_step = variables.get_or_create_global_step()
    self.train_op = state_ops.assign_add(global_step, 1)
Example #30
    def test(self):
        """ Test the loaded model """
        tf.logging.set_verbosity(tf.logging.INFO)
        logging.info("Checking Source-Target Network")

        # Create the global step for monitoring the learning_rate and training.
        global_step = get_or_create_global_step()

        # variable collection
        source_vars = collect_vars('source')
        target_vars = collect_vars('target')

        self.source_saver = tf.train.Saver(max_to_keep=None,
                                           var_list=source_vars.values())
        self.target_saver = tf.train.Saver(max_to_keep=None,
                                           var_list=target_vars.values())

        def restore_fn(sess):
            self.source_saver.restore(
                sess, F.source_checkpoint_dir + F.source_checkpoint_file)
            self.target_saver.restore(
                sess, F.target_checkpoint_dir + F.target_checkpoint_file)
            return

        self.test_handle_op = self.test_iter.string_handle()

        # Define your supervisor for running a managed session.
        if F.load_chkpt:
            sv = tf.train.Supervisor(logdir=F.log_eval_dir,
                                     summary_op=None,
                                     init_fn=restore_fn,
                                     saver=None)
        else:
            sv = tf.train.Supervisor(logdir=F.log_eval_dir,
                                     summary_op=None,
                                     init_fn=None,
                                     saver=None)

        current_best_loss = 1000.  #TODO: Read it from a file for multiple restarts
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=F.gpu_frac)
        with sv.managed_session(config=tf.ConfigProto(
                gpu_options=gpu_options)) as sess:
            self.test_handle = sess.run(self.test_handle_op)

            eval_loss = []
            while True:
                loss_source, loss_target = sess.run(
                    [self.source_loss, self.target_loss],
                    feed_dict={self.dataloader.split_handle: self.test_handle})
                logging.info("Batch-Loss Source: {}, Target: {}".format(
                    loss_source, loss_target))
Example #31
  def setUp(self):
    test.TestCase.setUp(self)

    self.log_dir = 'log/dir'
    self.summary_writer = fake_summary_writer.FakeSummaryWriter(self.log_dir)

    var = variables_lib.Variable(0.0)
    tensor = state_ops.assign_add(var, 1.0)
    tensor2 = tensor * 2
    self.summary_op = summary_lib.scalar('my_summary', tensor)
    self.summary_op2 = summary_lib.scalar('my_summary2', tensor2)

    global_step = variables.get_or_create_global_step()
    self.train_op = state_ops.assign_add(global_step, 1)
Example #32
  def __init__(self, log_dir, summary_op=None, feed_dict=None):
    """Constructs the Summary Hook.

    Args:
      log_dir: The directory where the logs are saved to.
      summary_op: The summary op to run. If left as `None`, then all summaries
        in the tf.GraphKeys.SUMMARIES collection are used.
      feed_dict: An optional feed dictionary to use when evaluating the
        summaries.
    """
    self._summary_op = summary_op
    self._feed_dict = feed_dict
    self._summary_writer = summary_io.SummaryWriter(log_dir)
    self._global_step = variables.get_or_create_global_step()
Example #33
 def test_summaries(self):
   logdir = _test_dir(self.get_temp_dir(), 'test_summaries')
   with ops.Graph().as_default():
     gstep = variables_lib.get_or_create_global_step()
     new_gstep = state_ops.assign_add(gstep, 1)
     summary.scalar('my_summary_tag', new_gstep * 2)
     with monitored_session.MonitoredTrainingSession(
         is_chief=True, checkpoint_dir=logdir) as session:
       for _ in range(101):  # 100 is default summary writing steps
         session.run(new_gstep)
   summaries = util_test.latest_summaries(logdir)
   tags = [s.summary.value[0].tag for s in summaries]
   self.assertIn('my_summary_tag', tags)
   self.assertIn('global_step/sec', tags)
Example #34
 def test_saving_restoring_checkpoint(self):
   logdir = _test_dir(self.get_temp_dir(), 'test_saving_restoring_checkpoint')
   with ops.Graph().as_default():
     gstep = variables_lib.get_or_create_global_step()
     do_step = state_ops.assign_add(gstep, 1)
     with monitored_session.MonitoredTrainingSession(
         is_chief=True, checkpoint_dir=logdir) as session:
       self.assertEqual(0, session.run(gstep))
       self.assertEqual(1, session.run(do_step))
       self.assertEqual(2, session.run(do_step))
     # A restart will find the checkpoint and recover automatically.
     with monitored_session.MonitoredTrainingSession(
         is_chief=True, checkpoint_dir=logdir) as session:
       self.assertEqual(2, session.run(gstep))
Example #35
def train():
    img = tf.placeholder(shape=[config.batch_size, config.Config['min_dim'], config.Config['min_dim'], 3], dtype=tf.float32)
    anchors_num = sum(
        [config.Config['feature_maps'][s] ** 2 * config.Config['aspect_num'][s] for s in range(6)])

    loc = tf.placeholder(shape=[config.batch_size, anchors_num, 4], dtype=tf.float32)
    conf = tf.placeholder(shape=[config.batch_size, anchors_num], dtype=tf.float32)

    pred_loc, pred_confs, vbs = inceptionv3_500_ince.inception_v2_ssd(img, config)

    train_tensors = get_loss(conf, loc, pred_loc, pred_confs, config)
    global_step = get_or_create_global_step()

    # Define your exponentially decaying learning rate
    lr = tf.train.exponential_decay(
        learning_rate=0.001,
        global_step=global_step,
        decay_steps=20000,
        decay_rate=0.7,
        staircase=True)
    tf.summary.scalar('lr', lr)
    sum_op = tf.summary.merge_all()

    gen = data_gen.get_batch_inception(batch_size=config.batch_size, image_size=config.Config['min_dim'], max_detect=50)
    optimizer = tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9)
    train_op = slim.learning.create_train_op(train_tensors, optimizer)

    saver = tf.train.Saver(vbs)

    def restore(sess):
        saver.restore(sess, '/home/dsl/all_check/inception_v3.ckpt')

    sv = tf.train.Supervisor(logdir='/home/dsl/all_check/face_detect/voc-v32', summary_op=None, init_fn=restore)

    with sv.managed_session() as sess:
        for step in range(1000000000):

            images, true_box, true_label = q.get()

            loct, conft = np_utils.get_loc_conf(true_box, true_label,
                                                batch_size=config.batch_size, cfg=config.Config)
            feed_dict = {img: images, loc: loct, conf: conft}
            t = time.time()
            ls, step = sess.run([train_op, global_step], feed_dict=feed_dict)
            if step % 10 == 0:
                print(time.time() - t)
                summaries = sess.run(sum_op, feed_dict=feed_dict)
                sv.summary_computed(sess, summaries)
                print(ls)
Example #36
  def test_stop_based_with_multiple_steps(self):
    h = basic_session_run_hooks.StopAtStepHook(num_steps=10)

    with ops.Graph().as_default():
      global_step = variables.get_or_create_global_step()
      no_op = control_flow_ops.no_op()
      h.begin()
      with session_lib.Session() as sess:
        mon_sess = monitored_session._HookedSession(sess, [h])
        sess.run(state_ops.assign(global_step, 5))
        h.after_create_session(sess, None)
        mon_sess.run(no_op)
        self.assertFalse(mon_sess.should_stop())
        sess.run(state_ops.assign(global_step, 15))
        mon_sess.run(no_op)
        self.assertTrue(mon_sess.should_stop())
Example #37
  def testReturnsSingleCheckpointIfOneCheckpointFound(self):
    checkpoint_dir = os.path.join(self.get_temp_dir(), 'one_checkpoint_found')
    if not gfile.Exists(checkpoint_dir):
      gfile.MakeDirs(checkpoint_dir)

    global_step = variables.get_or_create_global_step()
    saver = saver_lib.Saver()  # Saves the global step.

    with self.test_session() as session:
      session.run(variables_lib.global_variables_initializer())
      save_path = os.path.join(checkpoint_dir, 'model.ckpt')
      saver.save(session, save_path, global_step=global_step)

    num_found = 0
    for _ in evaluation.checkpoints_iterator(checkpoint_dir, timeout=0):
      num_found += 1
    self.assertEqual(num_found, 1)
Example #38
    def __init__(self, corpus, **opts):
        self.corpus = corpus

        self.opts = opts

        self.global_step = get_or_create_global_step()
        self.increment_global_step_op = tf.assign(self.global_step, self.global_step + 1, name="increment_global_step")

        self.corpus_size = get_corpus_size(self.corpus["train"])
        self.corpus_size_valid = get_corpus_size(self.corpus["valid"])

        self.word2idx, self.idx2word = build_vocab(self.corpus["train"])
        self.vocab_size = len(self.word2idx)

        self.generator_template = tf.make_template(GENERATOR_PREFIX, generator)
        self.discriminator_template = tf.make_template(DISCRIMINATOR_PREFIX, discriminator)

        self.enqueue_data, _, source, target, sequence_length = \
            prepare_data(self.corpus["train"], self.word2idx, num_threads=7, **self.opts)

        # TODO: option to either do pretrain or just generate?
        self.g_tensors_pretrain = self.generator_template(
            source, target, sequence_length, self.vocab_size, **self.opts)

        self.enqueue_data_valid, self.input_ph, source_valid, target_valid, sequence_length_valid = \
            prepare_data(self.corpus["valid"], self.word2idx, num_threads=1, **self.opts)

        self.g_tensors_pretrain_valid = self.generator_template(
            source_valid, target_valid, sequence_length_valid, self.vocab_size, **self.opts)

        self.decoder_fn = prepare_custom_decoder(sequence_length)

        self.g_tensors_fake = self.generator_template(
            source, target, sequence_length, self.vocab_size, decoder_fn=self.decoder_fn, **self.opts)

        # TODO: using the rnn outputs from pretraining as "real" instead of target embeddings (aka professor forcing)
        self.d_tensors_real = self.discriminator_template(
            self.g_tensors_pretrain.rnn_outputs, sequence_length, is_real=True, **self.opts)

        # TODO: check to see if sequence_length is correct
        self.d_tensors_fake = self.discriminator_template(
            self.g_tensors_fake.rnn_outputs, None, is_real=False, **self.opts)

        self.g_tvars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=GENERATOR_PREFIX)
        self.d_tvars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=DISCRIMINATOR_PREFIX)
Example #39
 def test_run(self):
   # Create inputs.
   model_dir = tempfile.mkdtemp()
   hook = trainer_hooks.FeatureImportanceSummarySaver(model_dir)
   with ops.Graph().as_default(), tf_session.Session() as sess:
     global_step = variables.get_or_create_global_step()
     with ops.name_scope("gbdt"):
       constant_op.constant(["featA", "featB"], name="feature_names")
       constant_op.constant([0, 2], name="feature_usage_counts")
       constant_op.constant([0, 0.8], name="feature_gains")
     # Begin finds tensors in the graph.
     hook.begin()
     sess.run(tf_variables.global_variables_initializer())
     # Run hook in a monitored session.
     train_op = state_ops.assign_add(global_step, 1)
     mon_sess = monitored_session._HookedSession(sess, [hook])
     mon_sess.run(train_op)
     hook.end(sess)
     # Ensure output summary dirs are created.
     self.assertTrue(os.path.exists(os.path.join(model_dir, "featA")))
     self.assertTrue(os.path.exists(os.path.join(model_dir, "featB")))
Example #40
 def test_wait_for_step(self):
   with ops.Graph().as_default():
     gstep = variables.get_or_create_global_step()
     hook = basic_session_run_hooks.GlobalStepWaiterHook(wait_until_step=1000)
     hook.begin()
     with session_lib.Session() as sess:
       sess.run(variables_lib.global_variables_initializer())
       waiter = threading.Thread(
           target=hook.before_run,
           args=(session_run_hook.SessionRunContext(
               original_args=None, session=sess),))
       waiter.daemon = True
       waiter.start()
       time.sleep(1.0)
       self.assertTrue(waiter.is_alive())
       sess.run(state_ops.assign(gstep, 500))
       time.sleep(1.0)
       self.assertTrue(waiter.is_alive())
       sess.run(state_ops.assign(gstep, 1100))
       time.sleep(1.2)
       self.assertFalse(waiter.is_alive())
Example #41
  def testGlobalStepNotIncrementedWhenSetToNone(self):
    with ops.Graph().as_default():
      random_seed.set_random_seed(0)
      tf_inputs = constant_op.constant(self._inputs, dtype=dtypes.float32)
      tf_labels = constant_op.constant(self._labels, dtype=dtypes.float32)

      tf_predictions = batchnorm_classifier(tf_inputs)
      loss = losses.log_loss(tf_labels, tf_predictions)
      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)
      train_op = training.create_train_op(loss, optimizer, global_step=None)

      global_step = variables_lib.get_or_create_global_step()

      with self.test_session() as session:
        # Initialize all variables
        session.run(variables_lib2.global_variables_initializer())

        for _ in range(10):
          session.run(train_op)

        # Since train_op doesn't use global_step, it shouldn't change.
        self.assertAllClose(global_step.eval(), 0)
Example #42
def get_train_op(args, total_loss, learning_rate):
    # Generate moving averages of all losses and associated summaries.
    global_step = variables.get_or_create_global_step()
    loss_averages_op = _add_loss_summaries(total_loss)

    # Compute gradients.
    with tf.control_dependencies([loss_averages_op]):

        opt = get_optimizer(args, learning_rate)
        grads = opt.compute_gradients(total_loss, tf.global_variables())

    # Apply gradients.
    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

    # Track the moving averages of all trainable variables.
    variable_averages = tf.train.ExponentialMovingAverage(
        args.moving_average_decay, global_step)
    variables_averages_op = variable_averages.apply(tf.trainable_variables())

    with tf.control_dependencies([apply_gradient_op, variables_averages_op]):
        train_op = tf.no_op(name='train')

    return train_op
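get_train_op above follows the pattern used in the TensorFlow CIFAR-10 tutorial: summarize the averaged losses, apply gradients while bumping the global step, then maintain exponential moving averages of the trainable variables behind a no-op. A self-contained sketch of the same pattern using only core TF 1.x ops (the toy variable, loss, learning rate, and decay value are assumptions for illustration):

import tensorflow as tf  # assumes TensorFlow 1.x

with tf.Graph().as_default():
    global_step = tf.train.get_or_create_global_step()
    weight = tf.Variable(3.0)
    total_loss = tf.square(weight)  # toy loss standing in for the model loss

    opt = tf.train.GradientDescentOptimizer(0.1)
    grads = opt.compute_gradients(total_loss)
    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

    # Track moving averages of all trainable variables, as in get_train_op.
    variable_averages = tf.train.ExponentialMovingAverage(0.999, global_step)
    variables_averages_op = variable_averages.apply(tf.trainable_variables())

    with tf.control_dependencies([apply_gradient_op, variables_averages_op]):
        train_op = tf.no_op(name='train')

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for _ in range(5):
            sess.run(train_op)
        print(sess.run(global_step))  # -> 5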
 def test_step_counter_every_n_steps(self):
     with ops.Graph().as_default() as g, session_lib.Session() as sess:
         global_step = variables.get_or_create_global_step()
         train_op = state_ops.assign_add(global_step, 1)
         summary_writer = fake_summary_writer.FakeSummaryWriter(
             self.log_dir, g)
         hook = basic_session_run_hooks.StepCounterHook(
             summary_writer=summary_writer, every_n_steps=10)
         hook.begin()
         sess.run(variables_lib.global_variables_initializer())
         mon_sess = monitored_session._HookedSession(sess, [hook])
         for _ in range(30):
             time.sleep(0.01)
             mon_sess.run(train_op)
         hook.end(sess)
         summary_writer.assert_summaries(test_case=self,
                                         expected_logdir=self.log_dir,
                                         expected_graph=g,
                                         expected_summaries={})
         self.assertItemsEqual([11, 21], summary_writer.summaries.keys())
         for step in [11, 21]:
             summary_value = summary_writer.summaries[step][0].value[0]
             self.assertEqual('global_step/sec', summary_value.tag)
             self.assertGreater(summary_value.simple_value, 0)
Example #44
  def testUseGlobalStep(self):
    with ops.Graph().as_default():
      random_seed.set_random_seed(0)
      tf_inputs = constant_op.constant(self._inputs, dtype=dtypes.float32)
      tf_labels = constant_op.constant(self._labels, dtype=dtypes.float32)

      tf_predictions = batchnorm_classifier(tf_inputs)
      loss_ops.log_loss(tf_predictions, tf_labels)
      total_loss = loss_ops.get_total_loss()
      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)

      train_op = training.create_train_op(total_loss, optimizer)

      global_step = variables_lib.get_or_create_global_step()

      with session_lib.Session() as sess:
        # Initialize all variables
        sess.run(variables_lib2.global_variables_initializer())

        for _ in range(10):
          sess.run([train_op])
        global_step = global_step.eval()
        # After 10 updates global_step should be 10.
        self.assertAllClose(global_step, 10)
Example #45
def run():
    with tf.Graph().as_default() as graph:
        tf.logging.set_verbosity(tf.logging.INFO)

        #===================TEST BRANCH=======================
        #Load the files into one input queue
        images = tf.convert_to_tensor(image_files)
        input_queue = tf.train.slice_input_producer([images], shuffle=False)

        #Decode the image and annotation raw content
        image = tf.read_file(input_queue[0])
        image = tf.image.decode_image(image, channels=3)
        preprocessed_image = preprocess(image, None, image_height, image_width)

        images = tf.train.batch([preprocessed_image],
                                batch_size=batch_size,
                                allow_smaller_final_batch=True)

        #Create the model inference
        with slim.arg_scope(ENet_arg_scope()):
            logits, probabilities = ENet(images,
                                         num_classes,
                                         batch_size=batch_size,
                                         is_training=True,
                                         reuse=None,
                                         num_initial_blocks=num_initial_blocks,
                                         stage_two_repeat=stage_two_repeat,
                                         skip_connections=skip_connections)

        # Set up the variables to restore and restoring function from a saver.
        exclude = []
        variables_to_restore = slim.get_variables_to_restore(exclude=exclude)

        saver = tf.train.Saver(variables_to_restore)

        def restore_fn(sess):
            return saver.restore(sess, checkpoint_file)

        #State the metrics that you want to predict. We get predictions that are not one-hot encoded.
        predictions = tf.argmax(probabilities, -1)

        #Create the global step and an increment op for monitoring
        global_step = get_or_create_global_step()
        global_step_op = tf.assign(
            global_step, global_step + 1
        )  #no apply_gradient method so manually increasing the global_step

        #Define your supervisor for running a managed session. Do not run the summary_op automatically or else it will consume too much memory
        sv = tf.train.Supervisor(logdir=logdir,
                                 summary_op=None,
                                 init_fn=restore_fn)

        #Run the managed session
        with sv.managed_session() as sess:

            #Save the images
            if save_images:
                if not os.path.exists(photo_dir):
                    os.mkdir(photo_dir)

                for step in range(int(num_steps_per_epoch)):
                    # Compute summaries every 10 steps and continue evaluating
                    time_run = time.time()
                    predictions_val = sess.run([predictions])
                    time_run_end = time.time()
                    predictions_val_tuple = predictions_val[0]

                    print('total cost (seconds)', time_run_end - time_run)

                    for i in range(predictions_val_tuple.shape[0]):
                        predicted_annotation = predictions_val_tuple[i]

                        # plt.subplot(1, 2, 1)
                        plt.imshow(predicted_annotation)
                        # plt.subplot(1, 2, 2)
                        # plt.imshow(img)
                        plt.savefig(photo_dir + "/image_" +
                                    str(image_files[step * num_epochs +
                                                    i])[15:])
Example #46
    def __init__(self,
                 global_step_tensor=None,
                 init_op=None,
                 init_feed_dict=None,
                 init_fn=None,
                 ready_op=None,
                 local_init_op=None,
                 summary_op=None,
                 saver=None,
                 keep_checkpoint_max=5):
        """Create a scaffold.

    Args:
      global_step_tensor: Optional tensor to use as the global step counter.
      init_op: Optional op for initializing variables.
      init_feed_dict: Optional session feed dictionary to use when running the
        init_op.
      init_fn: Optional function to use to initialize the model after running
        the init_op.  Will be called as `init_fn(scaffold, session)`.
      ready_op: Optional op to verify that the variables are initialized.  Must
        return an empty scalar string tensor when the variables are
        initialized, or a non-empty one listing the names of the
        non-initialized variables.
      local_init_op: Optional op to initialize local variables.
      summary_op: Optional op to gather all summaries.  Must return a scalar
        string tensor containing a serialized `Summary` proto.
      saver: Optional `tf.train.Saver` object to use to save and restore variables.
      keep_checkpoint_max: Optional parameter to use to construct a saver if
        none is already there in the graph.
    """
        if global_step_tensor is None:
            global_step_tensor = contrib_variables.get_or_create_global_step()
        self.global_step_tensor = global_step_tensor
        if init_op is None:
            init_op = Scaffold._get_or_default(
                ops.GraphKeys.INIT_OP, variables.initialize_all_variables)
        self.init_op = init_op
        self.init_feed_dict = init_feed_dict
        # NOTE(touts): modifying the init function to be passed the scaffold is a
        # hack to make it easy to find the saver.  Is there a better way?
        if init_fn:
            self.init_fn = lambda sess: init_fn(self, sess)
        else:
            self.init_fn = None
        if ready_op is None:
            ready_op = Scaffold._get_or_default(
                ops.GraphKeys.READY_OP,
                variables.report_uninitialized_variables)
        self.ready_op = ready_op
        if local_init_op is None:
            local_init_op = Scaffold._get_or_default(
                ops.GraphKeys.LOCAL_INIT_OP, Scaffold._default_local_init_op)
        self.local_init_op = local_init_op
        if summary_op is None:
            summary_op = Scaffold._get_or_default(
                ops.GraphKeys.SUMMARY_OP, logging_ops.merge_all_summaries)
        self.summary_op = summary_op
        # pylint: disable=g-long-lambda
        if saver is None:
            saver = Scaffold._get_or_default(
                ops.GraphKeys.SAVERS,
                lambda: training_saver.Saver(sharded=True,
                                             max_to_keep=keep_checkpoint_max))
        # pylint: enable=g-long-lambda
        self.saver = saver
def run():
    #Create the log directory here. Must be done here, otherwise importing this module would create it unnecessarily.
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)




    check = 0  # check whether the parsed data has already been generated

    #======================= TRAINING PROCESS =========================
    #Now we start to construct the graph and build our model
    with tf.Graph().as_default() as graph:
        tf.logging.set_verbosity(tf.logging.INFO) #Set the verbosity to INFO level

        #First create the dataset and load one batch
        #dataset = get_split('train', dataset_dir, file_pattern=file_pattern)
        #images, _, labels = load_batch(dataset, batch_size=batch_size)

        ##here##########################################################################################
        parser = argparse.ArgumentParser()
        parser.add_argument("--data_path", default='../sample', help="data_path")
        parser.add_argument("--save_path", default='../preprocessed', help="data_path")
        args = parser.parse_args()

        if not os.path.exists(args.save_path):
            os.makedirs(args.save_path)

        if check == 0 :
            preprocesser = json_to_mel()

            abs_data_path = os.path.abspath(args.data_path)
            abs_save_path = os.path.abspath(args.save_path)

            json_path = os.path.join(abs_data_path, '*.json')
            save_clean_path = os.path.join(abs_save_path, 'clean.pkl')  ## change this to two files
            #save_mel_path = os.path.join(abs_save_path, 'mel.pkl')

            clean_data_frame, clean_data_time = preprocesser.parser(preprocesser.read(json_path), save_clean_path)
            clean_data_mel, clean_data_label = preprocesser.split_by_emo(abs_data_path, abs_save_path, clean_data_frame,
                                                                     clean_data_time)


        with open('abc2.bin', 'rb') as file:
            data = pickle.load(file)

        #####################################################################################################

        #Know the number steps to take before decaying the learning rate and batches per epoch
        num_batches_per_epoch = int(dataset.num_samples / batch_size)
        num_steps_per_epoch = num_batches_per_epoch #Because one step is one batch processed
        decay_steps = int(num_epochs_before_decay * num_steps_per_epoch)

        #Create the model inference
        with slim.arg_scope(inception_resnet_v2_arg_scope()):
            logits, end_points = inception_resnet_v2(images, num_classes = dataset.num_classes, is_training = True)

        #Define the scopes that you want to exclude for restoration
        exclude = ['InceptionResnetV2/Logits', 'InceptionResnetV2/AuxLogits']
        variables_to_restore = slim.get_variables_to_restore(exclude = exclude)

        #Perform one-hot-encoding of the labels (Try one-hot-encoding within the load_batch function!)
        one_hot_labels = slim.one_hot_encoding(labels, dataset.num_classes)

        #Performs the equivalent to tf.nn.sparse_softmax_cross_entropy_with_logits but enhanced with checks
        loss = tf.losses.softmax_cross_entropy(onehot_labels = one_hot_labels, logits = logits)
        total_loss = tf.losses.get_total_loss()    #obtain the regularization losses as well

        #Create the global step for monitoring the learning_rate and training.
        global_step = get_or_create_global_step()

        #Define your exponentially decaying learning rate
        lr = tf.train.exponential_decay(
            learning_rate = initial_learning_rate,
            global_step = global_step,
            decay_steps = decay_steps,
            decay_rate = learning_rate_decay_factor,
            staircase = True)

        #Now we can define the optimizer that takes on the learning rate
        optimizer = tf.train.AdamOptimizer(learning_rate = lr)

        #Create the train_op.
        train_op = slim.learning.create_train_op(total_loss, optimizer)

        # Accuracy
        predictions = tf.argmax(end_points['Predictions'], 1)
        probabilities = end_points['Predictions']
        accuracy, accuracy_update = tf.contrib.metrics.streaming_accuracy(predictions, labels)
        metrics_op = tf.group(accuracy_update, probabilities)

        #Summary's
        tf.summary.scalar('losses/Total_Loss', total_loss)
        tf.summary.scalar('accuracy', accuracy)
        tf.summary.scalar('learning_rate', lr)
        my_summary_op = tf.summary.merge_all()

        # training step function that runs both the train_op, metrics_op and updates the global_step concurrently.
        def train_step(sess, train_op, global_step):
            '''
            Simply runs a session for the three arguments provided and logs the time elapsed for each global step
            '''
            #Check the time for each sess run
            start_time = time.time()
            total_loss, global_step_count, _ = sess.run([train_op, global_step, metrics_op])
            time_elapsed = time.time() - start_time

            #Run the logging to print some results
            logging.info('global step %s: loss: %.4f (%.2f sec/step)', global_step_count, total_loss, time_elapsed)

            return total_loss, global_step_count

        #Saver that restores the variables from a checkpoint file into a session
        saver = tf.train.Saver(variables_to_restore)
        def restore_fn(sess):
            return saver.restore(sess, checkpoint_file)

        #Define supervisor for running a managed session. Do not run the summary_op automatically or else it will consume too much memory
        sv = tf.train.Supervisor(logdir = log_dir, summary_op = None, init_fn = restore_fn)


        #Run the managed session
        with sv.managed_session() as sess:
            for step in range(num_steps_per_epoch * num_epochs):
                #At the start of every epoch, show the vital information:
                if step % num_batches_per_epoch == 0:
                    logging.info('Epoch %s/%s', step/num_batches_per_epoch + 1, num_epochs)
                    learning_rate_value, accuracy_value = sess.run([lr, accuracy])
                    logging.info('Current Learning Rate: %s', learning_rate_value)
                    logging.info('Current Streaming Accuracy: %s', accuracy_value)

                    # optionally, print your logits and predictions for a sanity check that things are going fine.
                    logits_value, probabilities_value, predictions_value, labels_value = sess.run([logits, probabilities, predictions, labels])
                    print('logits: \n', logits_value)
                    print('Probabilities: \n', probabilities_value)
                    print('predictions: \n', predictions_value)
                    print('Labels: \n', labels_value)

                #Log the summaries every 10 steps.
                if step % 10 == 0:
                    loss, _ = train_step(sess, train_op, sv.global_step)
                    summaries = sess.run(my_summary_op)
                    sv.summary_computed(sess, summaries)

                #If not, simply run the training step
                else:
                    loss, _ = train_step(sess, train_op, sv.global_step)

            #We log the final training loss and accuracy
            logging.info('Final Loss: %s', loss)
            logging.info('Final Accuracy: %s', sess.run(accuracy))

            #Once all the training has been done, save the log files and checkpoint model
            logging.info('Finished training! Saving model to disk now.')
            # saver.save(sess, "./flowers_model.ckpt")
            sv.saver.save(sess, sv.save_path, global_step = sv.global_step)
def train(split, train_steps, train_dir, fc_after, level, checkpoints_dir=checkpoints_dir, checkpoint='model.ckpt-150000'):
    """ Trains the given neural network and saves the weights and summary information into a new checkpoint file in train_dir.

    Args:
      split: Which split of the flower dataset to train the network on.
      train_steps: Number of steps to train the network for.
      train_dir: Directory in which new checkpoints are stored and from which old checkpoints are loaded.
      fc_after: Layer after which the intermediate fully connected layer is attached.
      level: Abstraction level of the labels; one of "family", "genus", "species" or "organs".
      checkpoints_dir: Directory containing the pre-trained checkpoint.
      checkpoint: Filename of the pre-trained checkpoint to restore from.
    Returns:
      Nothing; the final batch loss is printed once training finishes.
    """

    
    with tf.Graph().as_default():
        tf.logging.set_verbosity(tf.logging.INFO) # showing INFO logs
        
        dataset = dataVisualisation.get_split(split_name = split, dataset_dir = flowers_data_dir, label_type="multiple")
        images, _, label_species, labels_genus, labels_family, labels_organ= load_batch_intermediate(dataset,height=224, width=224, is_training=True, batch_size=100)      
        

        #print(family, genus, species)

        abstraction_levels = {"family": labels_family, "genus": labels_genus, "species": label_species, "organs": labels_organ}
        levels_length = {"family": 124, "genus": 516, "species": 1000, "organs": 7}

        labels = tf.stack(abstraction_levels.get(level, label_species))

        one_hot_labels = slim.one_hot_encoding(labels, levels_length.get(level, 1000))

        # Forward pass with non-flipped images
        logits,_ = my_intermediate_cnn(images, is_training=True, fc_after=fc_after, num_classes = levels_length.get(level, 1000))
        #print(logits, one_hot_labels)
        #with tf.Session() as sess:
        #    print(sess.run(tf.shape(logits)))

        tf.losses.softmax_cross_entropy(one_hot_labels, logits)      
        total_loss = tf.losses.get_total_loss()
        tf.summary.scalar('losses/Total_Loss', total_loss)
        
        
        # Learning rate decay
        global_step = variables.get_or_create_global_step()
        boundaries = [tf.constant(100000, dtype="int64"), tf.constant(200000, dtype="int64"), tf.constant(300000, dtype="int64")]
        values = [0.001, 0.0001, 0.00001, 0.000001]
        my_learning_rate = tf.train.piecewise_constant(global_step, boundaries, values)
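        # Piecewise-constant schedule: the learning rate is 0.001 for steps up to 100000,
        # then 0.0001 up to 200000, 0.00001 up to 300000, and 0.000001 afterwards.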
        
        #for v in tf.trainable_variables():
        #    print(v)
            
        #for v in slim.get_variables(scope="resnet_v2_50/fc_intermediate/"):
        #    print(v)

        # Specify the optimizer and create the train op:
        optimizer = tf.train.MomentumOptimizer(learning_rate=my_learning_rate, momentum = 0.9) 
        train_op = slim.learning.create_train_op(total_loss=total_loss, optimizer=optimizer, variables_to_train=slim.get_variables(scope="fc_intermediate"))
        
        saver = tf.train.Saver(max_to_keep=1)
 
        
        # Run the training:
        final_loss = slim.learning.train(
            train_op,
            logdir=train_dir,
            log_every_n_steps=50,
            init_fn= get_init_fn(fc_after),
            number_of_steps=train_steps,
            global_step = global_step, 
            saver = saver)

    print('Finished training. Last batch loss %f' % final_loss)
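
A minimal sketch of how this helper might be invoked; the argument values below are illustrative only and depend on the dataset splits, layer names and directories defined elsewhere in this project:

# Hypothetical call; the split, step count, directory and layer name are placeholders.
train(split='train',
      train_steps=10000,
      train_dir='./train_species',
      fc_after='block3',
      level='species')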
Ejemplo n.º 49
0
def train(train_op,
          logdir,
          train_step_fn=train_step,
          train_step_kwargs=_USE_DEFAULT,
          log_every_n_steps=1,
          graph=None,
          master='',
          is_chief=True,
          global_step=None,
          number_of_steps=None,
          init_op=_USE_DEFAULT,
          init_feed_dict=None,
          local_init_op=_USE_DEFAULT,
          init_fn=None,
          ready_op=_USE_DEFAULT,
          summary_op=_USE_DEFAULT,
          save_summaries_secs=600,
          summary_writer=_USE_DEFAULT,
          startup_delay_steps=0,
          saver=None,
          save_interval_secs=600,
          sync_optimizer=None,
          session_config=None,
          trace_every_n_steps=None):
    """Runs a training loop using a TensorFlow supervisor.

  When the sync_optimizer is supplied, gradient updates are applied
  synchronously. Otherwise, gradient updates are applied asynchronously.

  Args:
    train_op: A `Tensor` that, when executed, will apply the gradients and
      return the loss value.
    logdir: The directory where training logs are written to. If None, model
      checkpoints and summaries will not be written.
    train_step_fn: The function to call in order to execute a single gradient
      step. The function must take exactly four arguments: the current
      session, the `train_op` `Tensor`, a global step `Tensor` and a dictionary.
    train_step_kwargs: A dictionary which is passed to the `train_step_fn`. By
      default, two `Boolean`, scalar ops called "should_stop" and "should_log"
      are provided.
    log_every_n_steps: The frequency, in terms of global steps, at which the
      loss and global step are logged.
    graph: The graph to pass to the supervisor. If no graph is supplied the
      default graph is used.
    master: The address of the tensorflow master.
    is_chief: Specifies whether or not the training is being run by the primary
      replica during replica training.
    global_step: The `Tensor` representing the global step. If left as `None`,
      then slim.variables.get_or_create_global_step() is used.
    number_of_steps: The max number of gradient steps to take during training.
      If the value is left as None, training proceeds indefinitely.
    init_op: The initialization operation. If left to its default value, then
      the session is initialized by calling `tf.global_variables_initializer()`.
    init_feed_dict: A feed dictionary to use when executing the `init_op`.
    local_init_op: The local initialization operation. If left to its default
      value, then the session is initialized by calling
      `tf.local_variables_initializer()` and `tf.tables_initializer()`.
    init_fn: An optional callable to be executed after `init_op` is called. The
      callable must accept one argument, the session being initialized.
    ready_op: Operation to check if the model is ready to use. If left to its
      default value, then the session checks for readiness by calling
      `tf.report_uninitialized_variables()`.
    summary_op: The summary operation.
    save_summaries_secs: How often, in seconds, to save summaries.
    summary_writer: `SummaryWriter` to use.  Can be `None`
      to indicate that no summaries should be written. If unset, we
      create a SummaryWriter.
    startup_delay_steps: The number of steps to wait for before beginning. Note
      that this must be 0 if a sync_optimizer is supplied.
    saver: Saver to save checkpoints. If None, a default one will be created
      and used.
    save_interval_secs: How often, in seconds, to save the model to `logdir`.
    sync_optimizer: An instance of tf.train.SyncReplicasOptimizer. If the
      argument is supplied, gradient updates will be synchronous. If left as
      `None`, gradient updates will be asynchronous.
    session_config: An instance of `tf.ConfigProto` that will be used to
      configure the `Session`. If left as `None`, the default will be used.
    trace_every_n_steps: produce and save a `Timeline` in Chrome trace format
      and add it to the summaries every `trace_every_n_steps`. If None, no trace
      information will be produced or saved.

  Returns:
    the value of the loss function after training.

  Raises:
    ValueError: if `train_op` is empty or if `startup_delay_steps` is
      non-zero when `sync_optimizer` is supplied, if `number_of_steps` is
      negative, or if `trace_every_n_steps` is not `None` and no `logdir` is
      provided.
  """
    if train_op is None:
        raise ValueError('train_op cannot be None.')

    if logdir is None:
        if summary_op != _USE_DEFAULT:
            raise ValueError('Cannot provide summary_op because logdir=None')
        if saver is not None:
            raise ValueError('Cannot provide saver because logdir=None')
        if trace_every_n_steps is not None:
            raise ValueError('Cannot provide trace_every_n_steps because '
                             'logdir=None')

    if sync_optimizer is not None and startup_delay_steps > 0:
        raise ValueError(
            'startup_delay_steps must be zero when sync_optimizer is supplied.'
        )

    if number_of_steps is not None and number_of_steps <= 0:
        raise ValueError(
            '`number_of_steps` must be either None or a positive number.')

    graph = graph or ops.get_default_graph()
    with graph.as_default():
        if global_step is None:
            global_step = variables.get_or_create_global_step()
        saver = saver or tf_saver.Saver()

        with ops.name_scope('init_ops'):
            if init_op == _USE_DEFAULT:
                init_op = tf_variables.global_variables_initializer()

            if ready_op == _USE_DEFAULT:
                ready_op = tf_variables.report_uninitialized_variables()

            if local_init_op == _USE_DEFAULT:
                local_init_op = control_flow_ops.group(
                    tf_variables.local_variables_initializer(),
                    data_flow_ops.tables_initializer())

            if sync_optimizer is not None and isinstance(
                    sync_optimizer,
                    sync_replicas_optimizer.SyncReplicasOptimizer):
                with ops.control_dependencies(
                    [local_init_op] if local_init_op is not None else []):
                    if is_chief:
                        local_init_op = sync_optimizer.chief_init_op
                    else:
                        local_init_op = sync_optimizer.local_step_init_op
                ready_for_local_init_op = sync_optimizer.ready_for_local_init_op
            else:
                ready_for_local_init_op = None

        if summary_op == _USE_DEFAULT:
            summary_op = summary.merge_all()

        if summary_writer == _USE_DEFAULT:
            summary_writer = supervisor.Supervisor.USE_DEFAULT

        if is_chief and sync_optimizer is not None:
            if not isinstance(sync_optimizer,
                              (sync_replicas_optimizer.SyncReplicasOptimizer)):
                raise ValueError(
                    '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer.'
                )

            # Need to create these BEFORE the supervisor finalizes the graph:
            init_tokens_op = sync_optimizer.get_init_tokens_op()
            chief_queue_runner = sync_optimizer.get_chief_queue_runner()

        if train_step_kwargs == _USE_DEFAULT:
            with ops.name_scope('train_step'):
                train_step_kwargs = {}

                if number_of_steps:
                    should_stop_op = math_ops.greater_equal(
                        global_step, number_of_steps)
                else:
                    should_stop_op = constant_op.constant(False)
                train_step_kwargs['should_stop'] = should_stop_op
                train_step_kwargs['should_log'] = math_ops.equal(
                    math_ops.mod(global_step, log_every_n_steps), 0)
                if is_chief and trace_every_n_steps is not None:
                    train_step_kwargs['should_trace'] = math_ops.equal(
                        math_ops.mod(global_step, trace_every_n_steps), 0)
                    train_step_kwargs['logdir'] = logdir

    sv = supervisor.Supervisor(graph=graph,
                               is_chief=is_chief,
                               logdir=logdir,
                               init_op=init_op,
                               init_feed_dict=init_feed_dict,
                               local_init_op=local_init_op,
                               ready_for_local_init_op=ready_for_local_init_op,
                               ready_op=ready_op,
                               summary_op=summary_op,
                               summary_writer=summary_writer,
                               global_step=global_step,
                               saver=saver,
                               save_summaries_secs=save_summaries_secs,
                               save_model_secs=save_interval_secs,
                               init_fn=init_fn)

    if summary_writer is not None:
        train_step_kwargs['summary_writer'] = sv.summary_writer

    should_retry = True
    while should_retry:
        try:
            should_retry = False
            with sv.managed_session(master,
                                    start_standard_services=False,
                                    config=session_config) as sess:
                logging.info('Starting Session.')
                if is_chief:
                    if logdir:
                        sv.start_standard_services(sess)
                elif startup_delay_steps > 0:
                    _wait_for_step(
                        sess, global_step,
                        min(startup_delay_steps, number_of_steps
                            or sys.maxsize))
                sv.start_queue_runners(sess)
                logging.info('Starting Queues.')
                if is_chief and sync_optimizer is not None:
                    sv.start_queue_runners(sess, [chief_queue_runner])
                    sess.run(init_tokens_op)
                try:
                    while not sv.should_stop():
                        total_loss, should_stop = train_step_fn(
                            sess, train_op, global_step, train_step_kwargs)
                        if should_stop:
                            logging.info('Stopping Training.')
                            break
                except errors.OutOfRangeError:
                    # OutOfRangeError is thrown when epoch limit per
                    # tf.train.limit_epochs is reached.
                    logging.info('Caught OutOfRangeError. Stopping Training.')
                if logdir and sv.is_chief:
                    logging.info('Finished training! Saving model to disk.')
                    sv.saver.save(sess,
                                  sv.save_path,
                                  global_step=sv.global_step)

        except errors.AbortedError:
            # Always re-run on AbortedError as it indicates a restart of one of the
            # distributed tensorflow servers.
            logging.info('Retrying training!')
            should_retry = True

    return total_loss
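
A minimal usage sketch for a training loop with this signature, assuming the illustrative toy loss and optimizer below (all names and values are placeholders, not part of the original snippet):

# Illustrative only: a trivial loss and optimizer stand in for a real model.
import tensorflow as tf
import tensorflow.contrib.slim as slim

weight = tf.Variable(2.0, name='weight')
total_loss = tf.square(weight)                      # toy loss to minimize
optimizer = tf.train.GradientDescentOptimizer(0.1)
train_op = slim.learning.create_train_op(total_loss, optimizer)

final_loss = train(train_op,
                   logdir='/tmp/train_logs',
                   number_of_steps=100,
                   log_every_n_steps=10,
                   save_summaries_secs=300)
print('Final loss: %f' % final_loss)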
Ejemplo n.º 50
0
def run():
    end_points = {}
    if not os.path.exists(log_eval):
        os.mkdir(log_eval)
    with tf.Graph().as_default() as graph:
        tf.logging.set_verbosity(
            tf.logging.INFO)  # Set the verbosity to INFO level

        ########################################################
        # Get RGB dataset and the Imagenet trained on RGB images
        ########################################################

        # First create the dataset and load one batch
        dataset_rgb = get_split('validation',
                                dataset_dir_rgb,
                                file_pattern=file_pattern)
        images_rgb, labels_rgb = load_batch(dataset_rgb, batch_size=batch_size)

        # Know the number of steps to take before decaying the learning rate and the number of batches per epoch
        num_batches_per_epoch = int(dataset_rgb.num_samples / batch_size)
        num_steps_per_epoch = num_batches_per_epoch  # Because one step is one batch processed

        with tf.variable_scope("net_rgb"):
            # Create the model inference
            with slim.arg_scope(inception_v3_arg_scope()):
                logits_rgb, end_points_rgb = inception_v3(
                    images_rgb,
                    num_classes=dataset_rgb.num_classes,
                    is_training=True)

        ########################################################
        # Get depth dataset and the Imagenet trained on depth images
        ########################################################

        # First create the dataset and load one batch
        dataset_depth = get_split('validation',
                                  dataset_dir_depth,
                                  file_pattern=file_pattern)
        images_depth, labels_depth = load_batch(dataset_depth,
                                                batch_size=batch_size)

        # Create the model inference
        with tf.variable_scope("net_depth"):
            with slim.arg_scope(inception_v3_arg_scope()):
                logits_depth, end_points_depth = inception_v3(
                    images_depth,
                    num_classes=dataset_rgb.num_classes,
                    is_training=True)

        ########################################################
        # Combine the models with the concatenation operation
        # and add an FC layer on top
        ########################################################

        #
        with tf.variable_scope("concat_dense"):
            W_master = tf.Variable(tf.random_uniform([10, 5], -0.01, 0.01),
                                   name="weights_concat")
            b_master = tf.Variable(tf.zeros([5]), name="bias_concat")

            h_master = tf.matmul(tf.concat(
                (logits_rgb, logits_depth), axis=1), W_master) + b_master

            logits2 = tf.layers.dense(inputs=h_master,
                                      units=(num_classes * 2),
                                      name="dense_concat1")

            logits = tf.layers.dense(inputs=logits2,
                                     units=num_classes,
                                     name="dense_concat0")

        end_points['Logits'] = logits
        end_points['Predictions'] = slim.softmax(logits, scope='Predictions')

        variables_to_restore = slim.get_variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)

        def restore_fn(sess):
            return saver.restore(sess, checkpoint_file)

        ####################################################
        # EVALUATION
        ####################################################

        predictions = tf.argmax(end_points['Predictions'], 1)
        accuracy, accuracy_update = tf.contrib.metrics.streaming_accuracy(
            predictions, labels_rgb)
        metrics_op = tf.group(accuracy_update)

        global_step = get_or_create_global_step()
        global_step_op = tf.assign(global_step, global_step + 1)
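        # Evaluation runs no optimizer, so nothing advances the global step automatically;
        # this assign op increments it manually once per evaluation step.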

        conf_m = np.zeros((5, 5))

        def eval_step(sess, metrics_op, global_step, confusion_m):
            '''
            Takes in a session, runs the metrics op and logs some information.
            '''

            start_time = time.time()
            _, global_step_count, accuracy_value = sess.run(
                [metrics_op, global_step_op, accuracy])
            time_elapsed = time.time() - start_time

            images_rgb_im, images_depth_im, labels, prediction = sess.run(
                [images_rgb, images_depth, labels_rgb, predictions])

            confusion_m += confusion_matrix(labels,
                                            prediction,
                                            labels=[0, 1, 2, 3, 4])

            logging.info(
                'Global Step %s: Streaming Accuracy: %.4f (%.2f sec/step)',
                global_step_count, accuracy_value, time_elapsed)

            return accuracy_value

        tf.summary.scalar('Validation_Accuracy', accuracy)
        my_summary_op = tf.summary.merge_all()

        sv = tf.train.Supervisor(logdir=log_eval,
                                 summary_op=None,
                                 saver=None,
                                 init_fn=restore_fn)

        with sv.managed_session() as sess:
            num_steps_per_epoch = int(num_steps_per_epoch)
            for step in range(num_steps_per_epoch * num_epochs):
                sess.run(sv.global_step)
                if step % num_batches_per_epoch == 0:
                    logging.info('Epoch: %s/%s',
                                 step / num_batches_per_epoch + 1, num_epochs)
                    logging.info('Current Streaming Accuracy: %.4f',
                                 sess.run(accuracy))

                if step % 10 == 0:
                    eval_step(sess,
                              metrics_op=metrics_op,
                              global_step=sv.global_step,
                              confusion_m=conf_m)
                    summaries = sess.run(my_summary_op)
                    sv.summary_computed(sess, summaries)

                else:
                    eval_step(sess,
                              metrics_op=metrics_op,
                              global_step=sv.global_step,
                              confusion_m=conf_m)

            logging.info('Final Streaming Accuracy: %.4f', sess.run(accuracy))

            images_rgb, images_depth, labels, predictions = sess.run(
                [images_rgb, images_depth, labels_rgb, predictions])

            print(sess.run(end_points['Predictions']))

            print(conf_m)

            for i in range(10):
                label, prediction = labels[i], predictions[i]
                prediction_name, label_name = dataset_rgb.labels_to_name[
                    prediction], dataset_rgb.labels_to_name[label]
                text = 'Prediction: %s \n Ground Truth: %s' % (prediction_name,
                                                               label_name)
                print(text)
            logging.info(
                'Model evaluation has completed! Visit TensorBoard for more information regarding your evaluation.'
            )
Ejemplo n.º 51
0
def run():
    with tf.Graph().as_default() as graph:
        tf.logging.set_verbosity(tf.logging.INFO)

        #===================TEST BRANCH=======================
        #Load the files into one input queue
        images = tf.convert_to_tensor(image_files)
        annotations = tf.convert_to_tensor(annotation_files)
        input_queue = tf.train.slice_input_producer([images, annotations])

        #Decode the image and annotation raw content
        image = tf.read_file(input_queue[0])
        image = tf.image.decode_image(image, channels=3)
        annotation = tf.read_file(input_queue[1])
        annotation = tf.image.decode_image(annotation)

        #preprocess and batch up the image and annotation
        preprocessed_image, preprocessed_annotation = preprocess(
            image, annotation, image_height, image_width)
        images, annotations = tf.train.batch(
            [preprocessed_image, preprocessed_annotation],
            batch_size=batch_size,
            allow_smaller_final_batch=True)

        #Create the model inference
        with slim.arg_scope(ENet_arg_scope()):
            logits, probabilities = ENet(images,
                                         num_classes,
                                         batch_size=batch_size,
                                         is_training=True,
                                         reuse=None,
                                         num_initial_blocks=num_initial_blocks,
                                         stage_two_repeat=stage_two_repeat,
                                         skip_connections=skip_connections)

        # Set up the variables to restore and restoring function from a saver.
        exclude = []
        variables_to_restore = slim.get_variables_to_restore(exclude=exclude)

        saver = tf.train.Saver(variables_to_restore)

        def restore_fn(sess):
            return saver.restore(sess, checkpoint_file)

        #perform one-hot-encoding on the ground truth annotation to get same shape as the logits
        annotations = tf.reshape(annotations,
                                 shape=[batch_size, image_height, image_width])
        annotations_ohe = tf.one_hot(annotations, num_classes, axis=-1)
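        #After the reshape the annotations are [batch_size, image_height, image_width];
        #the one-hot version adds a trailing class axis so its shape matches the logits.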
        annotations = tf.cast(annotations, tf.int64)

        #State the metrics that you want to predict. We get predictions that are not one-hot encoded.
        predictions = tf.argmax(probabilities, -1)
        accuracy, accuracy_update = tf.contrib.metrics.streaming_accuracy(
            predictions, annotations)
        mean_IOU, mean_IOU_update = tf.contrib.metrics.streaming_mean_iou(
            predictions=predictions,
            labels=annotations,
            num_classes=num_classes)
        per_class_accuracy, per_class_accuracy_update = tf.metrics.mean_per_class_accuracy(
            labels=annotations,
            predictions=predictions,
            num_classes=num_classes)
        metrics_op = tf.group(accuracy_update, mean_IOU_update,
                              per_class_accuracy_update)

        #Create the global step and an increment op for monitoring
        global_step = get_or_create_global_step()
        global_step_op = tf.assign(
            global_step, global_step + 1
        )  #no apply_gradient method so manually increasing the global_step

        #Create an evaluation step function
        def eval_step(sess, metrics_op, global_step):
            '''
            Takes in a session, runs the metrics op and logs some information.
            '''
            start_time = time.time()
            _, global_step_count, accuracy_value, mean_IOU_value, per_class_accuracy_value = sess.run(
                [
                    metrics_op, global_step_op, accuracy, mean_IOU,
                    per_class_accuracy
                ])
            time_elapsed = time.time() - start_time

            #Log some information
            logging.info(
                'Global Step %s: Streaming Accuracy: %.4f     Streaming Mean IOU: %.4f     Per-class Accuracy: %.4f (%.2f sec/step)',
                global_step_count, accuracy_value, mean_IOU_value,
                per_class_accuracy_value, time_elapsed)

            return accuracy_value, mean_IOU_value, per_class_accuracy_value

        #Create your summaries
        tf.summary.scalar('Monitor/test_accuracy', accuracy)
        tf.summary.scalar('Monitor/test_mean_per_class_accuracy',
                          per_class_accuracy)
        tf.summary.scalar('Monitor/test_mean_IOU', mean_IOU)
        my_summary_op = tf.summary.merge_all()

        #Define your supervisor for running a managed session. Do not run the summary_op automatically or else it will consume too much memory
        sv = tf.train.Supervisor(logdir=logdir,
                                 summary_op=None,
                                 init_fn=restore_fn)

        #Run the managed session
        with sv.managed_session() as sess:
            for step in range(int(num_steps_per_epoch * num_epochs)):
                #Print vital information at the start of every epoch, as always
                if step % num_batches_per_epoch == 0:
                    accuracy_value, mean_IOU_value = sess.run(
                        [accuracy, mean_IOU])
                    logging.info('Epoch: %s/%s',
                                 step / num_batches_per_epoch + 1, num_epochs)
                    logging.info('Current Streaming Accuracy: %.4f',
                                 accuracy_value)
                    logging.info('Current Streaming Mean IOU: %.4f',
                                 mean_IOU_value)

                #Compute summaries every 10 steps and continue evaluating
                if step % 10 == 0:
                    test_accuracy, test_mean_IOU, test_per_class_accuracy = eval_step(
                        sess,
                        metrics_op=metrics_op,
                        global_step=sv.global_step)
                    summaries = sess.run(my_summary_op)
                    sv.summary_computed(sess, summaries)

                #Otherwise just run as per normal
                else:
                    test_accuracy, test_mean_IOU, test_per_class_accuracy = eval_step(
                        sess,
                        metrics_op=metrics_op,
                        global_step=sv.global_step)

            #At the end of all the evaluation, show the final accuracy
            logging.info('Final Streaming Accuracy: %.4f', test_accuracy)
            logging.info('Final Mean IOU: %.4f', test_mean_IOU)
            logging.info('Final Per Class Accuracy %.4f',
                         test_per_class_accuracy)

            #Show end of evaluation
            logging.info('Finished evaluating!')

            #Save the images
            if save_images:
                if not os.path.exists(photo_dir):
                    os.mkdir(photo_dir)

                #Save the image visualizations for the first 10 images.
                logging.info('Saving the images now...')
                predictions_val, annotations_val = sess.run(
                    [predictions, annotations])

                for i in range(10):
                    predicted_annotation = predictions_val[i]
                    annotation = annotations_val[i]

                    plt.subplot(1, 2, 1)
                    plt.imshow(predicted_annotation)
                    plt.subplot(1, 2, 2)
                    plt.imshow(annotation)
                    plt.savefig(photo_dir + "/image_" + str(i))
Ejemplo n.º 52
0
def run():
    # Create the log directory here. It must be done here, otherwise importing this module would create it unnecessarily.
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)

    # ======================= TRAINING PROCESS =========================
    # Now we start to construct the graph and build our model
    with tf.Graph().as_default() as graph:
        # Set the verbosity to INFO level so that INFO-level log messages are printed
        tf.logging.set_verbosity(tf.logging.INFO)

        # First create the dataset and load one batch
        dataset = get_split('train', dataset_dir, file_pattern=file_pattern)
        images, _, labels = load_batch(dataset, batch_size=batch_size)

        # Know the number of steps to take before decaying the learning rate and the number of batches per epoch
        num_batches_per_epoch = dataset.num_samples // batch_size
        num_steps_per_epoch = num_batches_per_epoch  # Because one step is one batch processed
        decay_steps = int(num_epochs_before_decay * num_steps_per_epoch)

        # Create the model inference
        with slim.arg_scope(xception_arg_scope()):
            logits, end_points = xception(images, num_classes=dataset.num_classes, is_training=True)

        # Perform one-hot-encoding of the labels (Try one-hot-encoding within the load_batch function!)
        one_hot_labels = slim.one_hot_encoding(labels, dataset.num_classes)

        # Performs the equivalent of tf.nn.sparse_softmax_cross_entropy_with_logits but enhanced with checks
        loss = tf.losses.softmax_cross_entropy(onehot_labels=one_hot_labels, logits=logits)
        total_loss = tf.losses.get_total_loss()  # obtain the regularization losses as well

        # Create the global step for monitoring the learning_rate and training.
        global_step = get_or_create_global_step()

        # Define your exponentially decaying learning rate
        lr = tf.train.exponential_decay(
            learning_rate=initial_learning_rate,
            global_step=global_step,
            decay_steps=decay_steps,
            decay_rate=learning_rate_decay_factor,
            staircase=True)

        # Now we can define the optimizer that takes on the learning rate
        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        # optimizer = tf.train.RMSPropOptimizer(learning_rate = lr, momentum=0.9)

        # Create the train_op.
        train_op = slim.learning.create_train_op(total_loss, optimizer)

        # State the metrics that you want to predict. We get predictions that are not one-hot encoded.
        predictions = tf.argmax(end_points['Predictions'], 1)
        probabilities = end_points['Predictions']
        accuracy, accuracy_update = tf.contrib.metrics.streaming_accuracy(predictions, labels)
        metrics_op = tf.group(accuracy_update, probabilities)

        # Now finally create all the summaries you need to monitor and group them into one summary op.
        tf.summary.scalar('losses/Total_Loss', total_loss)
        tf.summary.scalar('accuracy', accuracy)
        tf.summary.scalar('learning_rate', lr)
        my_summary_op = tf.summary.merge_all()

        # Now we need to create a training step function that runs both the train_op, metrics_op and updates the global_step concurrently.
        def train_step(sess, train_op, global_step):
            '''
            Runs a session for the three arguments provided and logs the time elapsed for each global step
            '''
            # Check the time for each sess run
            start_time = time.time()
            total_loss, global_step_count, _ = sess.run([train_op, global_step, metrics_op])
            time_elapsed = time.time() - start_time

            # Run the logging to print some results
            logging.info('global step %s: loss: %.4f (%.2f sec/step)', global_step_count, total_loss, time_elapsed)

            return total_loss, global_step_count

        # Define your supervisor for running a managed session.
        # Do not run the summary_op automatically or else it will consume too much memory
        sv = tf.train.Supervisor(logdir=log_dir, summary_op=None)

        # Run the managed session
        with sv.managed_session() as sess:
            for step in range(num_steps_per_epoch * num_epochs):
                # At the start of every epoch, show the vital information:
                if step % num_batches_per_epoch == 0:
                    logging.info('Epoch %s/%s', step / num_batches_per_epoch + 1, num_epochs)
                    learning_rate_value, accuracy_value = sess.run([lr, accuracy])
                    logging.info('Current Learning Rate: %s', learning_rate_value)
                    logging.info('Current Streaming Accuracy: %s', accuracy_value)

                    # optionally, print your logits and predictions for a sanity check that things are going fine.
                    logits_value, probabilities_value, predictions_value, labels_value = sess.run(
                        [logits, probabilities, predictions, labels])
                    print('logits: \n', logits_value[:5])
                    print('Probabilities: \n', probabilities_value[:5])
                    print('predictions: \n', predictions_value[:5])
                    print('Labels: \n', labels_value[:5])

                # Log the summaries every 10 steps.
                if step % 10 == 0:
                    loss, _ = train_step(sess, train_op, sv.global_step)
                    summaries = sess.run(my_summary_op)
                    sv.summary_computed(sess, summaries)

                # If not, simply run the training step
                else:
                    loss, _ = train_step(sess, train_op, sv.global_step)

            # We log the final training loss and accuracy
            logging.info('Final Loss: %s', loss)
            logging.info('Final Accuracy: %s', sess.run(accuracy))

            # Once all the training has been done, save the log files and checkpoint model
            logging.info('Finished training! Saving model to disk now.')
Ejemplo n.º 53
0
    def _build_graph(self):
        with tf.Graph().as_default() as graph:
            tf.logging.set_verbosity(
                tf.logging.INFO)  # Set the verbosity to INFO level

            # First create the dataset and load one batch
            def load_batch_from_tfrecord(
                    self,
                    split_name,
                    dataset_dir=self.tfrecord_dir,
                    num_classes=self.num_classes,
                    file_pattern_for_counting=self.tfrecord_prefix,
                    batch_size=self.batch_size):
                is_training = True if split_name == 'train' else False
                file_pattern = self.tfrecord_prefix + '_%s_*.tfrecord'
                dataset = get_split(split_name, dataset_dir, num_classes,
                                    file_pattern, file_pattern_for_counting)
                images, _, labels = load_batch(dataset,
                                               batch_size,
                                               num_classes,
                                               height=self.image_size,
                                               width=self.image_size,
                                               is_training=is_training)
                return images, labels, dataset.num_samples

            ## get train data
            train_images, self.train_labels, self.num_samples = load_batch_from_tfrecord(
                self, 'train')
            ## get validation data
            val_images, self.val_labels, self.val_num_samples = load_batch_from_tfrecord(
                self, 'validation')
            # Know the number of steps to take before decaying the learning rate and the number of batches per epoch
            self.num_batches_per_epoch = (self.num_samples -
                                          1) / self.batch_size + 1
            self.val_num_batches_per_epoch = (self.val_num_samples -
                                              1) / self.batch_size + 1

            with slim.arg_scope(inception_resnet_v2_arg_scope()):
                logits, end_points = inception_resnet_v2(
                    train_images,
                    num_classes=self.num_classes,
                    is_training=True)
            ## convert into probabilities
            self.probabilities = tf.sigmoid(logits)

            ## New loss: just the sum of the 14 label-wise log losses
            loss = tf.losses.log_loss(labels=self.train_labels,
                                      predictions=self.probabilities)
            # total_loss = tf.losses.get_total_loss()  # obtain the regularization losses as well
            l2_loss = tf.add_n(
                [tf.nn.l2_loss(var) for var in tf.trainable_variables()])
            total_loss = loss + l2_loss * self.weight_decay
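            ## The L2 term sums the squared norms of all trainable variables; scaling it by
            ## weight_decay adds the regularization manually instead of relying on tf.losses.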

            ## Convert into actual predictions
            lesion_pred = tf.cast(tf.greater_equal(self.probabilities, 0.5),
                                  tf.float32)

            # Create the global step for monitoring the learning_rate and training.
            self.global_step = get_or_create_global_step()
            decay_steps = int(self.step_size * self.num_batches_per_epoch)
            # Define your exponentially decaying learning rate
            self.lr = tf.train.exponential_decay(
                learning_rate=self.learning_rate,
                global_step=self.global_step,
                decay_steps=decay_steps,
                decay_rate=self.lr_decay_factor,
                staircase=True)
            # Now we can define the optimizer that takes on the learning rate
            optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
            # Create the train_op.
            self.train_op = slim.learning.create_train_op(
                total_loss, optimizer)
            # State the metrics that you want to predict. We get predictions that are not one-hot encoded.
            self.accuracy = tf.reduce_mean(
                tf.cast(tf.equal(lesion_pred, self.train_labels), tf.float32))

            # def val_graph(images, labels):
            with slim.arg_scope(inception_resnet_v2_arg_scope()):
                val_logits, val_end_points = inception_resnet_v2(
                    val_images,
                    num_classes=self.num_classes,
                    is_training=False,
                    reuse=True)
                self.val_probabilities = tf.sigmoid(val_logits)

            ## New loss: just the sum of the 14 label-wise log losses
            self.val_loss = tf.losses.log_loss(
                labels=self.val_labels, predictions=self.val_probabilities)
            val_lesion_pred = tf.cast(
                tf.greater_equal(self.val_probabilities, 0.5), tf.float32)
            self.val_accuracy = tf.reduce_mean(
                tf.cast(tf.equal(val_lesion_pred, self.val_labels),
                        tf.float32))

            # Now finally create all the summaries you need to monitor and group them into one summary op.
            tf.summary.scalar('losses/Total_Loss', total_loss)
            tf.summary.scalar('accuracy', self.accuracy)
            # tf.summary.scalar('auc', auc)
            tf.summary.scalar('learning_rate', self.lr)
            tf.summary.scalar('val_losses', self.val_loss)
            tf.summary.scalar('val_accuracy', self.val_accuracy)
            self.my_summary_op = tf.summary.merge_all()
Ejemplo n.º 54
0
def run():
    with tf.Graph().as_default() as graph:
        tf.logging.set_verbosity(tf.logging.INFO)

        #===================TRAINING BRANCH=======================
        #Load the files into one input queue
        images = tf.convert_to_tensor(image_files)
        annotations = tf.convert_to_tensor(annotation_files)
        input_queue = tf.train.slice_input_producer(
            [images,
             annotations])  #Slice_input producer shuffles the data by default.

        #Decode the image and annotation raw content
        image = tf.read_file(input_queue[0])
        image = tf.image.decode_image(image, channels=3)
        annotation = tf.read_file(input_queue[1])
        annotation = tf.image.decode_image(annotation)

        #preprocess and batch up the image and annotation
        preprocessed_image, preprocessed_annotation = preprocess(
            image, annotation, image_height, image_width)
        images, annotations = tf.train.batch(
            [preprocessed_image, preprocessed_annotation],
            batch_size=batch_size,
            allow_smaller_final_batch=True)

        #Create the model inference
        with slim.arg_scope(ENet_arg_scope(weight_decay=weight_decay)):
            logits, probabilities = ENet(images,
                                         num_classes,
                                         batch_size=batch_size,
                                         is_training=True,
                                         reuse=None,
                                         num_initial_blocks=num_initial_blocks,
                                         stage_two_repeat=stage_two_repeat,
                                         skip_connections=skip_connections)

        #perform one-hot-encoding on the ground truth annotation to get same shape as the logits
        annotations = tf.reshape(annotations,
                                 shape=[batch_size, image_height, image_width])
        annotations_ohe = tf.one_hot(annotations, num_classes, axis=-1)

        #Actually compute the loss
        loss = weighted_cross_entropy(logits=logits,
                                      onehot_labels=annotations_ohe,
                                      class_weights=class_weights)
        total_loss = tf.losses.get_total_loss()

        #Create the global step for monitoring the learning_rate and training.
        global_step = get_or_create_global_step()

        #Define your exponentially decaying learning rate
        lr = tf.train.exponential_decay(learning_rate=initial_learning_rate,
                                        global_step=global_step,
                                        decay_steps=decay_steps,
                                        decay_rate=learning_rate_decay_factor,
                                        staircase=True)

        #Now we can define the optimizer that takes on the learning rate
        optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=epsilon)

        #Create the train_op.
        train_op = slim.learning.create_train_op(total_loss, optimizer)

        #State the metrics that you want to predict. We get predictions that are not one-hot encoded.
        predictions = tf.argmax(probabilities, -1)
        accuracy, accuracy_update = tf.contrib.metrics.streaming_accuracy(
            predictions, annotations)
        mean_IOU, mean_IOU_update = tf.contrib.metrics.streaming_mean_iou(
            predictions=predictions,
            labels=annotations,
            num_classes=num_classes)
        metrics_op = tf.group(accuracy_update, mean_IOU_update)

        #Now we need to create a training step function that runs both the train_op, metrics_op and updates the global_step concurrently.
        def train_step(sess, train_op, global_step, metrics_op):
            '''
            Runs a session for the arguments provided and logs the time elapsed for each global step
            '''
            #Check the time for each sess run
            start_time = time.time()
            total_loss, global_step_count, accuracy_val, mean_IOU_val, _ = sess.run(
                [train_op, global_step, accuracy, mean_IOU, metrics_op])
            time_elapsed = time.time() - start_time

            #Run the logging to show some results
            logging.info(
                'global step %s: loss: %.4f (%.2f sec/step)    Current Streaming Accuracy: %.4f    Current Mean IOU: %.4f',
                global_step_count, total_loss, time_elapsed, accuracy_val,
                mean_IOU_val)

            return total_loss, accuracy_val, mean_IOU_val

        #================VALIDATION BRANCH========================
        #Load the files into one input queue
        images_val = tf.convert_to_tensor(image_val_files)
        annotations_val = tf.convert_to_tensor(annotation_val_files)
        input_queue_val = tf.train.slice_input_producer(
            [images_val, annotations_val])

        #Decode the image and annotation raw content
        image_val = tf.read_file(input_queue_val[0])
        image_val = tf.image.decode_jpeg(image_val, channels=3)
        annotation_val = tf.read_file(input_queue_val[1])
        annotation_val = tf.image.decode_png(annotation_val)

        #preprocess and batch up the image and annotation
        preprocessed_image_val, preprocessed_annotation_val = preprocess(
            image_val, annotation_val, image_height, image_width)
        images_val, annotations_val = tf.train.batch(
            [preprocessed_image_val, preprocessed_annotation_val],
            batch_size=eval_batch_size,
            allow_smaller_final_batch=True)

        with slim.arg_scope(ENet_arg_scope(weight_decay=weight_decay)):
            logits_val, probabilities_val = ENet(
                images_val,
                num_classes,
                batch_size=eval_batch_size,
                is_training=True,
                reuse=True,
                num_initial_blocks=num_initial_blocks,
                stage_two_repeat=stage_two_repeat,
                skip_connections=skip_connections)

        #perform one-hot-encoding on the ground truth annotation to get same shape as the logits
        annotations_val = tf.reshape(
            annotations_val,
            shape=[eval_batch_size, image_height, image_width])
        annotations_ohe_val = tf.one_hot(annotations_val, num_classes, axis=-1)

        #State the metrics that you want to predict. We get predictions that are not one-hot encoded. ----> Should we use OHE instead?
        predictions_val = tf.argmax(probabilities_val, -1)
        accuracy_val, accuracy_val_update = tf.contrib.metrics.streaming_accuracy(
            predictions_val, annotations_val)
        mean_IOU_val, mean_IOU_val_update = tf.contrib.metrics.streaming_mean_iou(
            predictions=predictions_val,
            labels=annotations_val,
            num_classes=num_classes)
        metrics_op_val = tf.group(accuracy_val_update, mean_IOU_val_update)

        #Create an output for showing the segmentation output of validation images
        segmentation_output_val = tf.cast(predictions_val, dtype=tf.float32)
        segmentation_output_val = tf.reshape(
            segmentation_output_val, shape=[-1, image_height, image_width, 1])
        segmentation_ground_truth_val = tf.cast(annotations_val,
                                                dtype=tf.float32)
        segmentation_ground_truth_val = tf.reshape(
            segmentation_ground_truth_val,
            shape=[-1, image_height, image_width, 1])

        def eval_step(sess, metrics_op):
            '''
            Takes in a session, runs the metrics op and logs some information.
            '''
            start_time = time.time()
            _, accuracy_value, mean_IOU_value = sess.run(
                [metrics_op, accuracy_val, mean_IOU_val])
            time_elapsed = time.time() - start_time

            #Log some information
            logging.info(
                '---VALIDATION--- Validation Accuracy: %.4f    Validation Mean IOU: %.4f    (%.2f sec/step)',
                accuracy_value, mean_IOU_value, time_elapsed)

            return accuracy_value, mean_IOU_value

        #=====================================================

        #Now finally create all the summaries you need to monitor and group them into one summary op.
        tf.summary.scalar('Monitor/Total_Loss', total_loss)
        tf.summary.scalar('Monitor/validation_accuracy', accuracy_val)
        tf.summary.scalar('Monitor/training_accuracy', accuracy)
        tf.summary.scalar('Monitor/validation_mean_IOU', mean_IOU_val)
        tf.summary.scalar('Monitor/training_mean_IOU', mean_IOU)
        tf.summary.scalar('Monitor/learning_rate', lr)
        tf.summary.image('Images/Validation_original_image',
                         images_val,
                         max_outputs=1)
        tf.summary.image('Images/Validation_segmentation_output',
                         segmentation_output_val,
                         max_outputs=1)
        tf.summary.image('Images/Validation_segmentation_ground_truth',
                         segmentation_ground_truth_val,
                         max_outputs=1)
        my_summary_op = tf.summary.merge_all()

        #Define your supervisor for running a managed session. Do not run the summary_op automatically or else it will consume too much memory
        sv = tf.train.Supervisor(logdir=logdir, summary_op=None, init_fn=None)

        # Run the managed session
        with sv.managed_session() as sess:
            for step in range(int(num_steps_per_epoch * num_epochs)):
                #At the start of every epoch, show the vital information:
                if step % num_batches_per_epoch == 0:
                    logging.info('Epoch %s/%s',
                                 step / num_batches_per_epoch + 1, num_epochs)
                    learning_rate_value = sess.run([lr])
                    logging.info('Current Learning Rate: %s',
                                 learning_rate_value)

                #Log the summaries every 10 steps or at the end of each epoch, whichever is lower.
                if step % min(num_steps_per_epoch, 10) == 0:
                    loss, training_accuracy, training_mean_IOU = train_step(
                        sess, train_op, sv.global_step, metrics_op=metrics_op)

                    #Check the validation data only at every third of an epoch
                    if step % (num_steps_per_epoch // 3) == 0:
                        for i in range(
                                len(image_val_files) // eval_batch_size):
                            validation_accuracy, validation_mean_IOU = eval_step(
                                sess, metrics_op_val)

                    summaries = sess.run(my_summary_op)
                    sv.summary_computed(sess, summaries)

                #If not, simply run the training step
                else:
                    loss, training_accuracy, training_mean_IOU = train_step(
                        sess, train_op, sv.global_step, metrics_op=metrics_op)

            #We log the final training loss
            logging.info('Final Loss: %s', loss)
            logging.info('Final Training Accuracy: %s', training_accuracy)
            logging.info('Final Training Mean IOU: %s', training_mean_IOU)
            logging.info('Final Validation Accuracy: %s', validation_accuracy)
            logging.info('Final Validation Mean IOU: %s', validation_mean_IOU)

            #Once all the training has been done, save the log files and checkpoint model
            logging.info('Finished training! Saving model to disk now.')
            sv.saver.save(sess, sv.save_path, global_step=sv.global_step)

            if save_images:
                if not os.path.exists(photo_dir):
                    os.mkdir(photo_dir)

                #Plot the predictions - check validation images only
                logging.info('Saving the images now...')
                predictions_value, annotations_value = sess.run(
                    [predictions_val, annotations_val])

                for i in range(eval_batch_size):
                    predicted_annotation = predictions_value[i]
                    annotation = annotations_value[i]

                    plt.subplot(1, 2, 1)
                    plt.imshow(predicted_annotation)
                    plt.subplot(1, 2, 2)
                    plt.imshow(annotation)
                    plt.savefig(photo_dir + "/image_" + str(i))
Ejemplo n.º 55
0
def run():
    """ Trainer Runner

    Runs the ALL Detection System 2019 NCS1 Classifier Trainer.
    """

    humanStart, clockStart = Trainer.Helpers.timerStart()

    Trainer.Helpers.logger.info(
        "ALL Detection System 2019 NCS1 Trainer started.")

    # Open the labels file
    Trainer.labels = open(
        Trainer.confs["Classifier"]["DatasetDir"] + "/" +
        Trainer.confs["Classifier"]["Labels"], 'r')

    # Create a dictionary to refer each label to their string name
    for line in Trainer.labels:
        label, string_name = line.split(':')
        string_name = string_name[:-1]  # Remove newline
        Trainer.labelsToName[int(label)] = string_name

    # Create a dictionary that will help people understand your dataset better. This is required by the Dataset class later.
    Trainer.items_to_descriptions = {
        'image': 'A 3-channel RGB colour image, e.g. office, people',
        'label': 'A label that starts from zero'
    }

    # Create the log directory here. It must be done here, otherwise importing the module would create it unnecessarily.
    if not os.path.exists(Trainer.confs["Classifier"]["LogDir"]):
        os.mkdir(Trainer.confs["Classifier"]["LogDir"])

    # Now we start to construct the graph and build our model
    with tf.Graph().as_default() as graph:
        # Set the verbosity to INFO level
        tf.logging.set_verbosity(tf.logging.INFO)

        # First create the dataset and load one batch
        dataset = Trainer.getSplit('train')
        images, _, labels = Trainer.loadBatch(dataset)

        # Work out the number of batches per epoch and how many steps to take before decaying the learning rate
        num_batches_per_epoch = dataset.num_samples // Trainer.confs[
            "Classifier"]["BatchSize"]
        # Because one step is one batch processed
        num_steps_per_epoch = num_batches_per_epoch
        decay_steps = int(Trainer.confs["Classifier"]["EpochsBeforeDecay"] *
                          num_steps_per_epoch)

        # Create the model inference
        with slim.arg_scope(inception_v3_arg_scope()):
            logits, end_points = inception_v3(images,
                                              num_classes=dataset.num_classes,
                                              is_training=True)

        # Perform one-hot-encoding of the labels (Try one-hot-encoding within the load_batch function!)
        one_hot_labels = slim.one_hot_encoding(labels, dataset.num_classes)

        # Performs the equivalent of tf.nn.sparse_softmax_cross_entropy_with_logits, enhanced with extra checks
        loss = tf.losses.softmax_cross_entropy(onehot_labels=one_hot_labels,
                                               logits=logits)
        # obtain the regularization losses as well
        total_loss = tf.losses.get_total_loss()

        # Create the global step for monitoring the learning_rate and training.
        global_step = get_or_create_global_step()

        # Define your exponentially decaying learning rate
        lr = tf.train.exponential_decay(
            learning_rate=Trainer.confs["Classifier"]["LearningRate"],
            global_step=global_step,
            decay_steps=decay_steps,
            decay_rate=Trainer.confs["Classifier"]["LearningRateDecay"],
            staircase=True)

        # Now we can define the optimizer that takes on the learning rate
        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        # optimizer = tf.train.RMSPropOptimizer(learning_rate = lr, momentum=0.9)

        # Create the train_op.
        train_op = slim.learning.create_train_op(total_loss, optimizer)

        # State the metrics that you want to track. The predictions returned here are not one-hot encoded.
        predictions = tf.argmax(end_points['Predictions'], 1)
        probabilities = end_points['Predictions']
        accuracy, accuracy_update = tf.contrib.metrics.streaming_accuracy(
            predictions, labels)
        metrics_op = tf.group(accuracy_update, probabilities)

        # Now finally create all the summaries you need to monitor and group them into one summary op.
        tf.summary.scalar('losses/Total_Loss', total_loss)
        tf.summary.scalar('accuracy', accuracy)
        tf.summary.scalar('learning_rate', lr)
        my_summary_op = tf.summary.merge_all()

        # Now we need to create a training step function that runs the train_op and metrics_op and updates the global_step, all in one session call.
        def train_step(sess, train_op, global_step, epochCount):
            '''
            Runs a session for the ops provided and logs the time elapsed for each global step
            '''
            # Check the time for each sess run
            start_time = time.time()
            total_loss, global_step_count, _ = sess.run(
                [train_op, global_step, metrics_op])
            time_elapsed = time.time() - start_time

            # Run the logging to print some results
            logging.info(' Epch %.2f Glb Stp %s: Loss: %.4f (%.2f sec/step)',
                         epochCount, global_step_count, total_loss,
                         time_elapsed)

            return total_loss, global_step_count

        # Define your supervisor for running a managed session. Do not run the summary_op automatically or else it will consume too much memory
        sv = tf.train.Supervisor(logdir=Trainer.confs["Classifier"]["LogDir"],
                                 summary_op=None)

        # Run the managed session
        with sv.managed_session() as sess:
            for step in range(num_steps_per_epoch *
                              Trainer.confs["Classifier"]["Epochs"]):
                # At the start of every epoch, show the vital information:
                if step % num_batches_per_epoch == 0:
                    logging.info('Epoch %s/%s',
                                 step / num_batches_per_epoch + 1,
                                 Trainer.confs["Classifier"]["Epochs"])
                    learning_rate_value, accuracy_value = sess.run(
                        [lr, accuracy])
                    logging.info('Current Learning Rate: %s',
                                 learning_rate_value)
                    logging.info('Current Streaming Accuracy: %s',
                                 accuracy_value)

                    # optionally, print your logits and predictions for a sanity check that things are going fine.
                    logits_value, probabilities_value, predictions_value, labels_value = sess.run(
                        [logits, probabilities, predictions, labels])
                    print('logits: \n', logits_value[:5])
                    print('Probabilities: \n', probabilities_value[:5])
                    print('predictions: \n', predictions_value[:100])
                    print('Labels:\n:', labels_value[:100])

                # Log the summaries every 10 steps.
                if step % 10 == 0:
                    loss, _ = train_step(sess, train_op, sv.global_step,
                                         step / num_batches_per_epoch + 1)
                    summaries = sess.run(my_summary_op)
                    sv.summary_computed(sess, summaries)
                # If not, simply run the training step
                else:
                    loss, _ = train_step(sess, train_op, sv.global_step,
                                         step / num_batches_per_epoch + 1)

            # We log the final training loss and accuracy
            logging.info('Final Loss: %s', loss)
            logging.info('Final Accuracy: %s', sess.run(accuracy))

            # Once all the training has been done, save the log files and checkpoint model
            logging.info('Finished training! Saving model to disk now.')

    checkpoint_file = tf.train.latest_checkpoint(
        Trainer.confs["Classifier"]["LogDir"])

    with tf.Graph().as_default() as graph:

        # images = tf.placeholder(shape=[None, ImageSize, ImageSize, 3], dtype=tf.float32, name = 'Placeholder_only')
        images = tf.placeholder("float", [
            1, Trainer.confs["Classifier"]["ImageSize"],
            Trainer.confs["Classifier"]["ImageSize"], 3
        ],
                                name="input")

        with slim.arg_scope(inception_v3_arg_scope()):
            logits, end_points = inception_v3(
                images,
                num_classes=Trainer.confs["Classifier"]["NumClasses"],
                is_training=False)
        probabilities = tf.nn.softmax(logits)
        saver = tf.train.Saver(slim.get_variables_to_restore())

        # Setup graph def
        input_graph_def = graph.as_graph_def()
        output_node_names = Trainer.confs["Classifier"]["OutputNode"]
        output_graph_name = Trainer.confs["Classifier"]["ALLGraph"]

        with tf.Session() as sess:
            saver.restore(sess, checkpoint_file)

            # Exporting the graph
            print("Exporting graph...")
            output_graph_def = graph_util.convert_variables_to_constants(
                sess, input_graph_def, output_node_names.split(","))

            with tf.gfile.GFile(output_graph_name, "wb") as f:
                f.write(output_graph_def.SerializeToString())

        clockEnd, difference, humanEnd = Trainer.Helpers.timerEnd(clockStart)

    Trainer.Helpers.logger.info(
        "ALL Detection System 2019 NCS1 Trainer ended in " + str(difference))
Example No. 56
0
def train():
    img = tf.placeholder(
        shape=[config.batch_size, config.image_size, config.image_size, 3],
        dtype=tf.float32)
    anchors_num = sum([
        config.Config['feature_maps'][s]**2 * config.Config['aspect_num'][s]
        for s in range(3)
    ])

    input_loc_t = tf.placeholder(shape=[config.batch_size, anchors_num, 4],
                                 dtype=tf.float32)
    input_conf_t = tf.placeholder(shape=[config.batch_size, anchors_num],
                                  dtype=tf.float32)
    input_gt_mask = tf.placeholder(shape=[
        config.batch_size, config.mask_pool_shape * 2,
        config.mask_pool_shape * 2, 100
    ],
                                   dtype=tf.int32)
    input_gt_box = tf.placeholder(shape=[config.batch_size, 100, 4],
                                  dtype=tf.float32)
    input_mask_index = tf.placeholder(shape=[config.batch_size, anchors_num],
                                      dtype=tf.int32)

    #gen = data_gen.get_batch_shapes(batch_size=config.batch_size, image_size=config.image_size,mask_pool_size=config.mask_pool_shape*2)

    input_gt_mask_trans = tf.transpose(input_gt_mask, [0, 3, 1, 2])
    pred_loc, pred_confs, mask_fp, vbs = iv2_mask_add.gen_box(img, config)

    target_mask = mask_model.get_target_mask(input_gt_box, input_gt_mask_trans,
                                             input_mask_index, config)

    train_tensors = mask_model.get_loss(input_conf_t, input_loc_t, pred_loc,
                                        pred_confs, target_mask, mask_fp,
                                        config)
    global_step = get_or_create_global_step()
    lr = tf.train.exponential_decay(learning_rate=0.001,
                                    global_step=global_step,
                                    decay_steps=100000,
                                    decay_rate=0.7,
                                    staircase=True)
    tf.summary.scalar('lr', lr)
    sum_op = tf.summary.merge_all()

    # Use the decayed learning rate defined above rather than a hard-coded value
    optimizer = tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9)
    train_op = slim.learning.create_train_op(train_tensors, optimizer)

    saver = tf.train.Saver(vbs)

    def restore(sess):
        saver.restore(sess, '/home/dsl/all_check/inception_v2.ckpt')

    sv = tf.train.Supervisor(
        logdir='/home/dsl/all_check/face_detect/coco_mask',
        summary_op=None,
        init_fn=restore)

    with sv.managed_session() as sess:
        for step in range(1000000000):

            data_images, data_true_box, data_true_label, data_true_mask = \
                data_loader.q.get()

            data_loct, data_conft, data_mask_index = np_utils.get_loc_conf_mask(
                data_true_box,
                data_true_label,
                batch_size=config.batch_size,
                cfg=config.Config)

            feed_dict = {
                img: data_images,
                input_loc_t: data_loct,
                input_conf_t: data_conft,
                input_gt_mask: data_true_mask,
                input_gt_box: data_true_box,
                input_mask_index: data_mask_index
            }

            t = time.time()

            # Note that `step` is re-bound here to the actual global step value.
            ls, step = sess.run([train_op, global_step], feed_dict=feed_dict)
            print(ls)

            if step % 10 == 0:
                tt = time.time() - t
                print(data_true_label)

                print('step:{} class_loss:{} loc_loss:{} mask_loss:{} timestp:{}'.format(
                    step, ls[0], ls[1], ls[2], tt))
                summaries = sess.run(sum_op, feed_dict=feed_dict)
                sv.summary_computed(sess, summaries)
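As a quick sanity check of the schedule defined in this example, tf.train.exponential_decay with staircase=True reduces to lr = 0.001 * 0.7 ** (global_step // 100000). A small standalone sketch of the same arithmetic, added here only for illustration:

def decayed_lr(global_step, base_lr=0.001, decay_steps=100000, decay_rate=0.7):
    """Staircase exponential decay, mirroring tf.train.exponential_decay(staircase=True)."""
    return base_lr * decay_rate ** (global_step // decay_steps)


for s in (0, 100000, 500000, 1000000):
    print(s, decayed_lr(s))
# 0       -> 0.001
# 100000  -> 0.0007
# 500000  -> 0.00016807
# 1000000 -> ~2.82e-05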
Example No. 57
0
 def begin(self):
     if self._replace_summary_op:
         self._summary_op = summary.merge_all()
     self._global_step = variables.get_or_create_global_step()
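The begin method above is the setup half of a summary-writing SessionRunHook. Below is a minimal sketch of what the surrounding hook might look like; the class name and the save interval are assumptions, not the original code.

import tensorflow as tf


class SummaryAtIntervalHook(tf.train.SessionRunHook):
    """Write merged summaries every `save_steps` global steps (a sketch, not the original hook)."""

    def __init__(self, summary_writer, save_steps=100):
        self._summary_writer = summary_writer
        self._save_steps = save_steps

    def begin(self):
        # Mirror the begin() shown above: merge summaries and fetch the global step.
        self._summary_op = tf.summary.merge_all()
        self._global_step = tf.train.get_or_create_global_step()

    def before_run(self, run_context):
        return tf.train.SessionRunArgs([self._global_step, self._summary_op])

    def after_run(self, run_context, run_values):
        global_step, summary = run_values.results
        if global_step % self._save_steps == 0:
            self._summary_writer.add_summary(summary, global_step)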
Example No. 58
0
def run():
    #Create log_dir for evaluation information
    if not os.path.exists(log_eval):
        os.mkdir(log_eval)

    #Just construct the graph from scratch again
    with tf.Graph().as_default() as graph:
        tf.logging.set_verbosity(tf.logging.INFO)
        #Get the dataset first and load one batch of validation images and labels tensors. Set is_training as False so as to use the evaluation preprocessing
        dataset = get_split('validation', dataset_dir)
        images, raw_images, labels = load_batch(dataset,
                                                batch_size=batch_size,
                                                is_training=False)

        #Create some information about the training steps
        num_batches_per_epoch = dataset.num_samples / batch_size
        num_steps_per_epoch = num_batches_per_epoch

        #Now create the inference model but set is_training=False
        with slim.arg_scope(inception_resnet_v2_arg_scope()):
            logits, end_points = inception_resnet_v2(
                images, num_classes=dataset.num_classes, is_training=False)

        #Get all the variables to restore from the checkpoint file and create the saver to restore them
        variables_to_restore = slim.get_variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)

        def restore_fn(sess):
            return saver.restore(sess, checkpoint_file)

        #Just define the metrics to track, without the loss or anything else
        predictions = tf.argmax(end_points['Predictions'], 1)
        accuracy, accuracy_update = tf.contrib.metrics.streaming_accuracy(
            predictions, labels)
        metrics_op = tf.group(accuracy_update)

        #Create the global step and an increment op for monitoring
        global_step = get_or_create_global_step()
        global_step_op = tf.assign(
            global_step, global_step + 1
        )  #There is no apply_gradients call here, so increment the global_step manually

        #Create an evaluation step function
        def eval_step(sess, metrics_op, global_step):
            '''
            Simply takes in a session, runs the metrics op and logs some information.
            '''
            start_time = time.time()
            _, global_step_count, accuracy_value = sess.run(
                [metrics_op, global_step_op, accuracy])
            time_elapsed = time.time() - start_time

            #Log some information
            logging.info(
                'Global Step %s: Streaming Accuracy: %.4f (%.2f sec/step)',
                global_step_count, accuracy_value, time_elapsed)

            return accuracy_value

        #Define some scalar quantities to monitor
        tf.summary.scalar('Validation_Accuracy', accuracy)
        my_summary_op = tf.summary.merge_all()

        #Get your supervisor
        sv = tf.train.Supervisor(logdir=log_eval,
                                 summary_op=None,
                                 saver=None,
                                 init_fn=restore_fn)

        #Now we are ready to run in one session
        with sv.managed_session() as sess:
            for step in xrange(num_steps_per_epoch * num_epochs):
                sess.run(sv.global_step)
                #Print vital information at the start of every epoch, as always
                if step % num_batches_per_epoch == 0:
                    logging.info('Epoch: %s/%s',
                                 step / num_batches_per_epoch + 1, num_epochs)
                    logging.info('Current Streaming Accuracy: %.4f',
                                 sess.run(accuracy))

                #Compute summaries every 10 steps and continue evaluating
                if step % 10 == 0:
                    eval_step(sess,
                              metrics_op=metrics_op,
                              global_step=sv.global_step)
                    summaries = sess.run(my_summary_op)
                    sv.summary_computed(sess, summaries)

                #Otherwise just run as per normal
                else:
                    eval_step(sess,
                              metrics_op=metrics_op,
                              global_step=sv.global_step)

            #At the end of all the evaluation, show the final accuracy
            logging.info('Final Streaming Accuracy: %.4f', sess.run(accuracy))

            #Now we want to visualize the last batch's images just to see what our model has predicted
            raw_images, labels, predictions = sess.run(
                [raw_images, labels, predictions])
            for i in range(10):
                image, label, prediction = raw_images[i], labels[
                    i], predictions[i]
                prediction_name, label_name = dataset.labels_to_name[
                    prediction], dataset.labels_to_name[label]
                text = 'Prediction: %s \n Ground Truth: %s' % (prediction_name,
                                                               label_name)
                img_plot = plt.imshow(image)

                #Set up the plot and hide axes
                plt.title(text)
                img_plot.axes.get_yaxis().set_ticks([])
                img_plot.axes.get_xaxis().set_ticks([])
                plt.show()

            logging.info(
                'Model evaluation has completed! Visit TensorBoard for more information regarding your evaluation.'
            )
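One detail the evaluation loop above relies on implicitly: streaming metrics such as streaming_accuracy keep their running count and total in local variables (which the Supervisor initializes), and the accuracy tensor only reflects batches that have passed through the update op. A toy standalone sketch of the same accumulate-then-read pattern, with made-up data:

import tensorflow as tf

labels = tf.placeholder(tf.int64, [None])
predictions = tf.placeholder(tf.int64, [None])
accuracy, accuracy_update = tf.metrics.accuracy(labels, predictions)

with tf.Session() as sess:
    # The running total/count accumulators live in *local* variables.
    sess.run(tf.local_variables_initializer())
    sess.run(accuracy_update, {labels: [0, 1, 1, 0], predictions: [0, 1, 0, 0]})
    print(sess.run(accuracy))  # 0.75 after the first batch
    sess.run(accuracy_update, {labels: [1, 1], predictions: [1, 1]})
    print(sess.run(accuracy))  # 5/6 ~= 0.83 after accumulating the second batch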
Example No. 59
0
def evaluation_loop(master,
                    checkpoint_dir,
                    logdir,
                    num_evals=1,
                    eval_op=None,
                    eval_op_feed_dict=None,
                    final_op=None,
                    final_op_feed_dict=None,
                    summary_op=_USE_DEFAULT,
                    summary_op_feed_dict=None,
                    variables_to_restore=None,
                    eval_interval_secs=60,
                    max_number_of_evaluations=None):
  """Runs TF-Slim's Evaluation Loop.

  Args:
    master: The BNS address of the TensorFlow master.
    checkpoint_dir: The directory where checkpoints are stored.
    logdir: The directory where the TensorFlow summaries are written to.
    num_evals: The number of times to run `eval_op`.
    eval_op: An operation run `num_evals` times.
    eval_op_feed_dict: The feed dictionary to use when executing the `eval_op`.
    final_op: An operation to execute after all of the `eval_op` executions. The
      value of `final_op` is returned.
    final_op_feed_dict: A feed dictionary to use when executing `final_op`.
    summary_op: The summary_op to evaluate after running TF-Slim's metric ops. By
      default the summary_op is set to tf.merge_all_summaries().
    summary_op_feed_dict: An optional feed dictionary to use when running the
      `summary_op`.
    variables_to_restore: A list of TensorFlow variables to restore during
      evaluation. If the argument is left as `None` then
      slim.variables.GetVariablesToRestore() is used.
    eval_interval_secs: The minimum number of seconds between evaluations.
    max_number_of_evaluations: The max number of iterations of the evaluation.
      If the value is left as `None`, the evaluation continues indefinitely.
  """
  if summary_op == _USE_DEFAULT:
    summary_op = logging_ops.merge_all_summaries()

  global_step = variables.get_or_create_global_step()

  init_op = control_flow_ops.group(tf_variables.initialize_all_variables(),
                                   tf_variables.initialize_local_variables(),
                                   data_flow_ops.initialize_all_tables())

  saver = tf_saver.Saver(variables_to_restore or
                         variables.get_variables_to_restore())

  summary_writer = summary_io.SummaryWriter(logdir)

  sv = supervisor.Supervisor(graph=ops.get_default_graph(),
                             logdir=logdir,
                             init_op=init_op,
                             summary_op=None,
                             summary_writer=None,
                             global_step=None,
                             saver=saver)

  last_checkpoint = None
  number_of_evaluations = 0
  while True:
    last_checkpoint = wait_for_new_checkpoint(checkpoint_dir, last_checkpoint)
    start = time.time()
    logging.info('Starting evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S',
                                                           time.gmtime()))

    with sv.managed_session(master, start_standard_services=False) as sess:
      sv.saver.restore(sess, last_checkpoint)
      sv.start_queue_runners(sess)
      evaluation(sess,
                 num_evals=num_evals,
                 eval_op=eval_op,
                 eval_op_feed_dict=eval_op_feed_dict,
                 final_op=final_op,
                 final_op_feed_dict=final_op_feed_dict,
                 summary_op=summary_op,
                 summary_op_feed_dict=summary_op_feed_dict,
                 summary_writer=summary_writer,
                 global_step=global_step)

    logging.info('Finished evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S',
                                                           time.gmtime()))
    number_of_evaluations += 1
    if (max_number_of_evaluations and
        number_of_evaluations >= max_number_of_evaluations):
      logging.info('Reached max_number_of_evaluations=%s. Exit',
                   max_number_of_evaluations)
      break

    time_to_next_eval = start + eval_interval_secs - time.time()
    if time_to_next_eval > 0:
      time.sleep(time_to_next_eval)
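For reference, here is a hedged usage sketch of a loop like this one through TF-Slim's packaged entry point. The tensors are toy stand-ins and the directory paths are placeholders; a real caller would build predictions and labels from its model and evaluation dataset.

import tensorflow as tf

slim = tf.contrib.slim

# Toy tensors standing in for a real model's outputs on one evaluation batch.
labels = tf.constant([0, 1, 1, 0])
predictions = tf.constant([0, 1, 0, 0])

accuracy, accuracy_update = tf.metrics.accuracy(labels, predictions)
tf.summary.scalar('eval/accuracy', accuracy)

slim.evaluation.evaluation_loop(
    master='',
    checkpoint_dir='/tmp/train_logs',   # placeholder path
    logdir='/tmp/eval_logs',            # placeholder path
    num_evals=100,                      # how many batches to run per evaluation pass
    eval_op=accuracy_update,
    eval_interval_secs=60)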