def testManagedEndOfInputOneQueue(self):
     # Tests that the supervisor finishes without an error when using
     # a fixed number of epochs, reading from a single queue.
     logdir = _test_dir("managed_end_of_input_one_queue")
     os.makedirs(logdir)
     data_path = self._csv_data(logdir)
     with ops.Graph().as_default():
         # Create an input pipeline that reads the file 3 times.
         filename_queue = input_lib.string_input_producer([data_path],
                                                          num_epochs=3)
         reader = io_ops.TextLineReader()
         _, csv = reader.read(filename_queue)
         rec = parsing_ops.decode_csv(csv, record_defaults=[[1], [1], [1]])
         sv = supervisor.Supervisor(logdir=logdir)
         with sv.managed_session("") as sess:
             while not sv.should_stop():
                 sess.run(rec)
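
For reference, a minimal standalone sketch of the same end-of-input pattern, written against the public tf.compat.v1 aliases rather than the internal test modules used above. It is illustrative only: the CSV file and logdir paths are placeholders the sketch creates for itself.

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

# A tiny CSV fixture so the sketch is self-contained.
with open("/tmp/sv_demo.csv", "w") as f:
    f.write("1,2,3\n4,5,6\n")

with tf.Graph().as_default():
    # With num_epochs=3 the producer raises OutOfRangeError after three passes.
    filename_queue = tf.train.string_input_producer(["/tmp/sv_demo.csv"],
                                                    num_epochs=3)
    reader = tf.TextLineReader()
    _, csv_line = reader.read(filename_queue)
    record = tf.decode_csv(csv_line, record_defaults=[[1], [1], [1]])

    sv = tf.train.Supervisor(logdir="/tmp/sv_end_of_input")
    with sv.managed_session("") as sess:
        # managed_session turns the end-of-input OutOfRangeError into a clean
        # stop, so the loop exits without raising.
        while not sv.should_stop():
            sess.run(record)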
Example No. 2
 def testManagedSessionUserError(self):
   logdir = self._test_dir("managed_user_error")
   with ops.Graph().as_default():
     my_op = constant_op.constant(1.0)
     sv = supervisor.Supervisor(logdir=logdir)
     last_step = None
     with self.assertRaisesRegex(RuntimeError, "failing here"):
       with sv.managed_session("") as sess:
         for step in range(10):
           last_step = step
           if step == 1:
             raise RuntimeError("failing here")
           else:
             self.evaluate(my_op)
     # Supervisor has been stopped.
     self.assertTrue(sv.should_stop())
     self.assertEqual(1, last_step)
Example No. 3
 def testManagedSessionIgnoreOutOfRangeError(self):
     logdir = _test_dir("managed_out_of_range")
     with ops.Graph().as_default():
         my_op = constant_op.constant(1.0)
         sv = supervisor.Supervisor(logdir=logdir)
         last_step = None
         with sv.managed_session("") as sess:
              for step in range(10):
                 last_step = step
                 if step == 3:
                     raise errors_impl.OutOfRangeError(
                         my_op.op.node_def, my_op.op, "all done")
                 else:
                     sess.run(my_op)
         # Supervisor has been stopped.  OutOfRangeError was not thrown.
         self.assertTrue(sv.should_stop())
         self.assertEqual(3, last_step)
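
The two tests above pin down managed_session's error semantics: OutOfRangeError is treated as a normal end of input, while any other exception stops the supervisor and is re-raised to the caller. A minimal sketch of both behaviors, assuming the public tf.compat.v1 API; the logdirs are placeholders.

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

# Any ordinary exception stops the supervisor and is re-raised to the caller.
with tf.Graph().as_default():
    my_op = tf.constant(1.0)
    sv = tf.train.Supervisor(logdir="/tmp/sv_user_error")
    try:
        with sv.managed_session("") as sess:
            sess.run(my_op)
            raise RuntimeError("failing here")
    except RuntimeError as e:
        print("re-raised:", e, "; should_stop:", sv.should_stop())

# OutOfRangeError is treated as a normal end of input: the supervisor stops,
# but nothing propagates out of the with-block.
with tf.Graph().as_default():
    my_op = tf.constant(1.0)
    sv = tf.train.Supervisor(logdir="/tmp/sv_out_of_range")
    with sv.managed_session("") as sess:
        sess.run(my_op)
        raise tf.errors.OutOfRangeError(my_op.op.node_def, my_op.op, "all done")
    print("no exception escaped; should_stop:", sv.should_stop())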
Example No. 4
    def testPrepareSessionAfterStopForNonChief(self):
        logdir = self._test_dir("prepare_after_stop_nonchief")
        with ops.Graph().as_default():
            sv = supervisor.Supervisor(logdir=logdir, is_chief=False)

            # Create a first session and then stop.
            sess = sv.prepare_or_wait_for_session("")
            sv.stop()
            sess.close()
            self.assertTrue(sv.should_stop())

            # Now create a second session and test that we don't stay stopped
            # until we ask to stop again.
            sess2 = sv.prepare_or_wait_for_session("")
            self.assertFalse(sv.should_stop())
            sv.stop()
            sess2.close()
            self.assertTrue(sv.should_stop())
Example No. 5
    def testBasicTrainLoop(self):
        logdir = _test_dir("basic_train_loop")
        # Counts the number of calls.
        num_calls = [0]

        def train_fn(unused_sess, sv, y, a):
            num_calls[0] += 1
            self.assertEqual("y", y)
            self.assertEqual("A", a)
            if num_calls[0] == 3:
                sv.request_stop()

        with ops.Graph().as_default():
            sv = supervisor.Supervisor(logdir=logdir)
            basic_loops.basic_train_loop(sv,
                                         train_fn,
                                         args=(sv, "y"),
                                         kwargs={"a": "A"})
            self.assertEqual(3, num_calls[0])
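
basic_train_loop is also exposed publicly as tf.compat.v1.train.basic_train_loop. A minimal usage sketch of the same pattern, with an illustrative logdir:

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

num_calls = [0]

def train_fn(sess, sv):
    # Called repeatedly by basic_train_loop until the supervisor is told to stop.
    num_calls[0] += 1
    if num_calls[0] == 3:
        sv.request_stop()

with tf.Graph().as_default():
    sv = tf.train.Supervisor(logdir="/tmp/sv_basic_train_loop")
    # basic_train_loop(supervisor, train_step_fn, args=None, kwargs=None, master='')
    tf.train.basic_train_loop(sv, train_fn, args=(sv,))
    print("train_fn ran", num_calls[0], "times")   # 3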
Example No. 6
  def testLocalInitOpForNonChief(self):
    logdir = self._test_dir("default_local_init_op_non_chief")
    with ops.Graph().as_default():
      with ops.device("/job:localhost"):
        # A local variable.
        v = variables.Variable(
            [1.0, 2.0, 3.0],
            trainable=False,
            collections=[ops.GraphKeys.LOCAL_VARIABLES])
        # This shouldn't add a variable to the VARIABLES collection responsible
        # for variables that are saved/restored from checkpoints.
        self.assertEquals(len(variables.global_variables()), 0)

      # Suppress normal variable inits to make sure the local one is
      # initialized via local_init_op.
      sv = supervisor.Supervisor(logdir=logdir, init_op=None, is_chief=False)
      sess = sv.prepare_or_wait_for_session("")
      self.assertAllClose([1.0, 2.0, 3.0], sess.run(v))
      sv.stop()
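
A standalone sketch of the same local-variable behavior, assuming the public tf.compat.v1 API; the logdir is a placeholder. With init_op=None the normal variable initialization is skipped, so the value read back can only have come from the Supervisor's default local_init_op.

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

with tf.Graph().as_default():
    # A local (non-checkpointed) variable; it does not appear in GLOBAL_VARIABLES.
    v = tf.Variable([1.0, 2.0, 3.0], trainable=False,
                    collections=[tf.GraphKeys.LOCAL_VARIABLES])
    assert not tf.global_variables()

    # init_op=None suppresses the normal variable init, so the value printed
    # below can only have been set by the default local_init_op.
    sv = tf.train.Supervisor(logdir="/tmp/sv_local_init",
                             init_op=None,
                             is_chief=False)
    sess = sv.prepare_or_wait_for_session("")
    print(sess.run(v))    # [1. 2. 3.]
    sv.stop()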
Example No. 7
    def testTFRecordReader(self):
        with self.cached_session():
            self._tfrecord_paths = test_utils.create_tfrecord_files(
                tempfile.mkdtemp(), num_files=3)

        key, value = parallel_reader.parallel_read(
            self._tfrecord_paths, reader_class=io_ops.TFRecordReader, num_readers=3)

        sv = supervisor.Supervisor(logdir=tempfile.mkdtemp())
        with sv.prepare_or_wait_for_session() as sess:
            sv.start_queue_runners(sess)

            flowers = 0
            num_reads = 100
            for _ in range(num_reads):
                current_key, _ = sess.run([key, value])
                if 'flowers' in str(current_key):
                    flowers += 1
            self.assertGreater(flowers, 0)
            self.assertEqual(flowers, num_reads)
Example No. 8
    def testTFRecordReader(self):
        with self.test_session():
            self._tfrecord_paths = test_utils.create_tfrecord_files(
                self.get_temp_dir(), num_files=3)

        key, value = parallel_reader.parallel_read(
            self._tfrecord_paths,
            reader_class=io_ops.TFRecordReader,
            num_readers=3)

        sv = supervisor.Supervisor(logdir=self.get_temp_dir())
        with sv.managed_session() as sess:

            flowers = 0
            num_reads = 100
            for _ in range(num_reads):
                current_key, _ = sess.run([key, value])
                if 'flowers' in str(current_key):
                    flowers += 1
            self.assertGreater(flowers, 0)
            self.assertEquals(flowers, num_reads)
Example No. 9
    def get_session(is_chief):
      g = ops.Graph()
      with g.as_default():
        with ops.device("/job:local"):
          v = variables.Variable(
              1.0, name="ready_for_local_init_op_restore_v_" + str(uid))
          vadd = v.assign_add(1)
          w = variables.Variable(
              v,
              trainable=False,
              collections=[ops.GraphKeys.LOCAL_VARIABLES],
              name="ready_for_local_init_op_restore_w_" + str(uid))
          ready_for_local_init_op = variables.report_uninitialized_variables(
              variables.global_variables())
      sv = supervisor.Supervisor(
          logdir=logdir,
          is_chief=is_chief,
          graph=g,
          recovery_wait_secs=1,
          ready_for_local_init_op=ready_for_local_init_op)
      sess = sv.prepare_or_wait_for_session(server.target)

      return sv, sess, v, vadd, w
Example No. 10
    def testBasicTrainLoopRetryOnAborted(self):
        logdir = _test_dir("basic_train_loop_exception_aborts")

        class AbortAndRetry:
            def __init__(self):
                self.num_calls = 0
                self.retries_left = 2

            def train_fn(self, unused_sess):
                self.num_calls += 1
                if self.num_calls % 3 == 2:
                    self.retries_left -= 1
                if self.retries_left > 0:
                    raise errors_impl.AbortedError(None, None, "Aborted here")
                else:
                    raise RuntimeError("Failed Again")

        with ops.Graph().as_default():
            sv = supervisor.Supervisor(logdir=logdir)
            aar = AbortAndRetry()
            with self.assertRaisesRegex(RuntimeError, "Failed Again"):
                basic_loops.basic_train_loop(sv, aar.train_fn)
            self.assertEqual(0, aar.retries_left)
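
basic_train_loop retries the whole managed session whenever the loop body raises AbortedError, which is what the test above exercises. A minimal sketch of that retry behavior using the public tf.compat.v1 API; the class name and logdir are illustrative.

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

class AbortTwiceThenStop:
    """Raises AbortedError on the first two calls, then asks the supervisor to stop."""

    def __init__(self):
        self.aborts_left = 2

    def train_fn(self, sess, sv):
        if self.aborts_left > 0:
            self.aborts_left -= 1
            # basic_train_loop catches AbortedError and re-enters managed_session.
            raise tf.errors.AbortedError(None, None, "aborted, will retry")
        sv.request_stop()

with tf.Graph().as_default():
    sv = tf.train.Supervisor(logdir="/tmp/sv_abort_retry")
    helper = AbortTwiceThenStop()
    tf.train.basic_train_loop(sv, helper.train_fn, args=(sv,))
    print("aborts consumed:", helper.aborts_left == 0)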
Example No. 11
    def _verify_all_data_sources_read(self, shared_queue):
        with self.cached_session():
            tfrecord_paths = test_utils.create_tfrecord_files(
                self.get_temp_dir(), num_files=3)

        num_readers = len(tfrecord_paths)
        p_reader = parallel_reader.ParallelReader(io_ops.TFRecordReader,
                                                  shared_queue,
                                                  num_readers=num_readers)

        data_files = parallel_reader.get_data_files(tfrecord_paths)
        filename_queue = input_lib.string_input_producer(data_files)
        key, value = p_reader.read(filename_queue)

        count0 = 0
        count1 = 0
        count2 = 0

        num_reads = 50

        sv = supervisor.Supervisor(logdir=self.get_temp_dir())
        with sv.prepare_or_wait_for_session() as sess:
            sv.start_queue_runners(sess)

            for _ in range(num_reads):
                current_key, _ = sess.run([key, value])
                if '0-of-3' in str(current_key):
                    count0 += 1
                if '1-of-3' in str(current_key):
                    count1 += 1
                if '2-of-3' in str(current_key):
                    count2 += 1

        self.assertGreater(count0, 0)
        self.assertGreater(count1, 0)
        self.assertGreater(count2, 0)
        self.assertEqual(count0 + count1 + count2, num_reads)
Example No. 12
 def testManagedMainErrorTwoQueues(self):
   # Tests that the supervisor correctly raises a main loop
   # error even when using multiple queues for input.
   logdir = self._test_dir("managed_main_error_two_queues")
   os.makedirs(logdir)
   data_path = self._csv_data(logdir)
   with self.assertRaisesRegexp(RuntimeError, "fail at step 3"):
     with ops.Graph().as_default():
       # Create an input pipeline that reads the file 3 times.
       filename_queue = input_lib.string_input_producer(
           [data_path], num_epochs=3)
       reader = io_ops.TextLineReader()
       _, csv = reader.read(filename_queue)
       rec = parsing_ops.decode_csv(csv, record_defaults=[[1], [1], [1]])
       shuff_rec = input_lib.shuffle_batch(rec, 1, 6, 4)
       sv = supervisor.Supervisor(logdir=logdir)
       with sv.managed_session("") as sess:
         for step in range(9):
           if sv.should_stop():
             break
           elif step == 3:
             raise RuntimeError("fail at step 3")
           else:
             sess.run(shuff_rec)
Example No. 13
    def testLocalInitOp(self):
        logdir = self._test_dir("default_local_init_op")
        with ops.Graph().as_default():
            # A local variable.
            v = variables.Variable([1.0, 2.0, 3.0],
                                   trainable=False,
                                   collections=[ops.GraphKeys.LOCAL_VARIABLES])

            # An entity which is initialized through a TABLE_INITIALIZER.
            w = variables.Variable([4, 5, 6], trainable=False, collections=[])
            ops.add_to_collection(ops.GraphKeys.TABLE_INITIALIZERS,
                                  w.initializer)

            # This shouldn't add a variable to the VARIABLES collection responsible
            # for variables that are saved/restored from checkpoints.
            self.assertEquals(len(variables.global_variables()), 0)

            # Suppress normal variable inits to make sure the local one is
            # initialized via local_init_op.
            sv = supervisor.Supervisor(logdir=logdir, init_op=None)
            sess = sv.prepare_or_wait_for_session("")
            self.assertAllClose([1.0, 2.0, 3.0], sess.run(v))
            self.assertAllClose([4, 5, 6], sess.run(w))
            sv.stop()
Example No. 14
def do_training(train_op, init_fn=None, summary_op=None, lr=None):
    global savers
    graph = ops.get_default_graph()
    with graph.as_default():
        global_step = variables.get_or_create_global_step()
        saver = tf_saver.Saver(max_to_keep=0)

        with ops.name_scope('init_ops'):
            init_op = tf_variables.global_variables_initializer()

            ready_op = tf_variables.report_uninitialized_variables()

            local_init_op = control_flow_ops.group(
                tf_variables.local_variables_initializer(),
                data_flow_ops.tables_initializer())

        summary_writer = supervisor.Supervisor.USE_DEFAULT
        with ops.name_scope('train_step'):
            train_step_kwargs = {}

            if FLAGS.max_number_of_steps is not None:
                should_stop_op = math_ops.greater_equal(
                    global_step, FLAGS.max_number_of_steps)
            else:
                should_stop_op = constant_op.constant(False)
            train_step_kwargs['should_stop'] = should_stop_op
            if FLAGS.log_every_n_steps > 0:
                train_step_kwargs['should_log'] = math_ops.equal(
                    math_ops.mod(global_step, FLAGS.log_every_n_steps), 0)
        prefix = "loc/net"
        lp = len(prefix)
        vdic = {
            "InceptionV2" + v.op.name[lp:]: v
            for v in tf.trainable_variables()
            if v.name.startswith(prefix) and v.name.find("Logits/") < 0
        }
        _saver = tf_saver.Saver(vdic)
        savers.append(_saver)
        for i in xrange(NUM_STN):
            prefix = "stn%d/net" % i
            lp = len(prefix)
            vdic = {
                "InceptionV2" + v.op.name[lp:]: v
                for v in tf.trainable_variables()
                if v.name.startswith(prefix) and v.name.find("Logits/") < 0
            }
            # saver = tf.train.Saver(vdic)
            _saver = tf_saver.Saver(vdic)
            savers.append(_saver)
    prt("savers %d" % len(savers))

    is_chief = True
    logdir = FLAGS.train_dir

    sv = supervisor.Supervisor(graph=graph,
                               is_chief=is_chief,
                               logdir=logdir,
                               init_op=init_op,
                               init_feed_dict=None,
                               local_init_op=local_init_op,
                               ready_for_local_init_op=None,
                               ready_op=ready_op,
                               summary_op=summary_op,
                               summary_writer=summary_writer,
                               global_step=global_step,
                               saver=saver,
                               save_summaries_secs=FLAGS.save_summaries_secs,
                               save_model_secs=FLAGS.save_interval_secs,
                               init_fn=init_fn)

    if summary_writer is not None:
        train_step_kwargs['summary_writer'] = sv.summary_writer

    with sv.managed_session('', start_standard_services=False,
                            config=None) as sess:
        logging.info('Starting Session.')
        if is_chief:
            if logdir:
                sv.start_standard_services(sess)
        elif startup_delay_steps > 0:
            _wait_for_step(
                sess, global_step,
                min(startup_delay_steps, number_of_steps or sys.maxint))
        sv.start_queue_runners(sess)
        logging.info('Starting Queues.')
        try:
            while not sv.should_stop():
                total_loss, global_step_value, should_stop = train_step(
                    sess, train_op, global_step, lr, train_step_kwargs)
                current_epoch = int(
                    math.ceil(float(global_step_value) / FLAGS.steps_in_epoch))
                if global_step_value > 0 and global_step_value % FLAGS.save_every_n_steps == 0:
                    sv.saver.save(sess,
                                  sv.save_path,
                                  global_step=sv.global_step)

                if should_stop:
                    logging.info('Stopping Training.')
                    break
        except errors.OutOfRangeError:
            # OutOfRangeError is thrown when epoch limit per
            # tf.train.limit_epochs is reached.
            logging.info('Caught OutOfRangeError. Stopping Training.')
        if logdir and sv.is_chief:
            logging.info('Finished training! Saving model to disk.')
            sv.saver.save(sess, sv.save_path, global_step=sv.global_step)
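
Stripped of the model-specific saver bookkeeping and FLAGS, the Supervisor wiring in do_training reduces to the pattern below. This is a hedged sketch with a toy loss standing in for the real graph; every name, threshold, and path in it is illustrative rather than taken from the code above.

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

with tf.Graph().as_default():
    global_step = tf.train.get_or_create_global_step()
    x = tf.Variable(5.0)                      # toy parameter
    loss = tf.square(x)                       # toy loss standing in for the model
    train_op = tf.train.GradientDescentOptimizer(0.1).minimize(
        loss, global_step=global_step)
    should_stop_op = tf.greater_equal(global_step, 100)

    sv = tf.train.Supervisor(logdir="/tmp/sv_train_demo",
                             is_chief=True,
                             global_step=global_step,
                             save_model_secs=60)

    # start_standard_services=False lets the chief decide when to start the
    # checkpoint/summary threads, mirroring do_training() above.
    with sv.managed_session("", start_standard_services=False) as sess:
        sv.start_standard_services(sess)
        sv.start_queue_runners(sess)
        while not sv.should_stop():
            _, stop = sess.run([train_op, should_stop_op])
            if stop:
                break
        sv.saver.save(sess, sv.save_path, global_step=sv.global_step)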
Example No. 15
def _train_internal(graph,
                    output_dir,
                    train_op,
                    loss_op,
                    global_step_tensor,
                    init_op,
                    init_feed_dict,
                    init_fn,
                    log_every_steps,
                    supervisor_is_chief,
                    supervisor_master,
                    supervisor_save_model_secs,
                    keep_checkpoint_max,
                    supervisor_save_summaries_steps,
                    feed_fn,
                    steps,
                    fail_on_nan_loss,
                    monitors,
                    max_steps):
  """See train."""
  if (steps is not None) and (max_steps is not None):
    raise ValueError('Can not provide both steps and max_steps.')
  if not output_dir:
    raise ValueError('Output directory should be non-empty %s.' % output_dir)
  if train_op is None:
    raise ValueError('Missing train_op.')
  if loss_op is None:
    raise ValueError('Missing loss_op.')

  with graph.as_default():
    global_step_tensor = contrib_variables.assert_or_get_global_step(
        graph, global_step_tensor)
    if global_step_tensor is None:
      raise ValueError('No "global_step" was provided or found in the graph.')

    # Get current step.
    try:
      start_step = load_variable(output_dir, global_step_tensor.name)
    except (errors.NotFoundError, ValueError):
      start_step = 0

    summary_writer = (get_summary_writer(output_dir)
                      if supervisor_is_chief else None)

    # Add default chief monitors if none were provided.
    if not monitors:
      monitors = monitors_lib.get_default_monitors(
          loss_op=loss_op,
          summary_op=logging_ops.get_summary_op(),
          save_summary_steps=supervisor_save_summaries_steps,
          summary_writer=summary_writer) if supervisor_is_chief else []

    # TODO(ipolosukhin): Replace all functionality of Supervisor
    # with Chief-Exclusive Monitors.
    if not supervisor_is_chief:
      # Prune the list of monitors to the ones runnable on all workers.
      monitors = [monitor for monitor in monitors if monitor.run_on_all_workers]

    if max_steps is None:
      max_steps = (start_step + steps) if steps else None
    # Start monitors, can create graph parts.
    for monitor in monitors:
      monitor.begin(max_steps=max_steps)

  supervisor = tf_supervisor.Supervisor(
      graph,
      init_op=init_op or tf_supervisor.Supervisor.USE_DEFAULT,
      init_feed_dict=init_feed_dict,
      is_chief=supervisor_is_chief,
      logdir=output_dir,
      saver=_make_saver(graph, keep_checkpoint_max),
      global_step=global_step_tensor,
      summary_op=None,
      summary_writer=summary_writer,
      save_model_secs=supervisor_save_model_secs,
      init_fn=init_fn)
  session = supervisor.PrepareSession(master=supervisor_master,
                                      start_standard_services=True)
  supervisor.StartQueueRunners(session)

  with session:
    get_current_step = lambda: session.run(global_step_tensor)

    start_step = get_current_step()
    last_step = start_step
    last_log_step = start_step
    loss_value = None
    logging.info('Training steps [%d,%s)', last_step, 'inf'
                 if max_steps is None else str(max_steps))

    excinfo = None
    try:
      while not supervisor.ShouldStop() and (
          (max_steps is None) or (last_step < max_steps)):
        start_time = time.time()
        feed_dict = feed_fn() if feed_fn is not None else None

        outputs, should_stop = _run_with_monitors(
            session, last_step + 1, [train_op, loss_op], feed_dict, monitors)

        loss_value = outputs[loss_op.name]
        if np.isnan(loss_value):
          failure_message = 'Model diverged with loss = NaN.'
          if fail_on_nan_loss:
            logging.error(failure_message)
            raise monitors_lib.NanLossDuringTrainingError()
          else:
            logging.warning(failure_message)

        if should_stop:
          break

        this_step = get_current_step()

        if this_step <= last_step:
          logging.error(
              'Global step was not incremented by train op at step %s'
              ': new step %d', last_step, this_step)

        last_step = this_step
        is_last_step = (max_steps is not None) and (last_step >= max_steps)
        if is_last_step or (last_step - last_log_step >= log_every_steps):
          logging.info(
              'training step %d, loss = %.5f (%.3f sec/batch).',
              last_step, loss_value, float(time.time() - start_time))
          last_log_step = last_step
    except errors.OutOfRangeError as e:
      logging.warn('Got exception during tf.learn training loop possibly '
                   'due to exhausted input queue %s.', e)
    except StopIteration:
      logging.info('Exhausted input iterator.')
    except BaseException as e:  # pylint: disable=broad-except
      # Hold on to any other exceptions while we try recording a final
      # checkpoint and summary.
      excinfo = sys.exc_info()
    finally:
      try:
        # Call supervisor.Stop() from within a try block because it re-raises
        # exceptions thrown by the supervised threads.
        supervisor.Stop(close_summary_writer=False)

        # Save one last checkpoint and summaries
        # TODO(wicke): This should be handled by Supervisor

        # In case we encountered an exception in the try block before we updated
        # last_step, update it here (again).
        last_step = get_current_step()
        if supervisor_is_chief:
          ckpt_path = supervisor.save_path
          logging.info('Saving checkpoint for step %d to checkpoint: %s.',
                       last_step, ckpt_path)
          supervisor.saver.save(session, ckpt_path, global_step=last_step)

          # Finish monitors.
          for monitor in monitors:
            monitor.end()

      # catch OutOfRangeError which is thrown when queue is out of data (and for
      # other reasons as well).
      except errors.OutOfRangeError as e:
        logging.warn('OutOfRangeError in tf.learn final checkpoint possibly '
                     'due to exhausted input queue. Note: summary_op is not '
                     'expected to trigger dequeues. %s.', e)
      except BaseException as e:  # pylint: disable=broad-except
        # If we don't already have an exception to re-raise, raise this one.
        if not excinfo:
          raise
        # Otherwise, log this one and raise the other in the finally block.
        logging.error('Got exception during tf.learn final checkpoint %s.', e)
      finally:
        if excinfo:
          reraise(*excinfo)
    return loss_value
Example No. 16
def main(_):
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    if not os.path.isfile(FLAGS.checkpoint_path):
        FLAGS.eval_dir = os.path.join(FLAGS.checkpoint_path, 'eval')
    else:
        FLAGS.eval_dir = os.path.join(os.path.dirname(FLAGS.checkpoint_path),
                                      'eval')

    try:
        os.makedirs(FLAGS.eval_dir)
    except OSError:
        pass

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        tf_global_step = slim.get_or_create_global_step()

        ######################
        # Select the dataset #
        ######################
        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name,
            FLAGS.dataset_split_name,
            FLAGS.dataset_dir.split(','),
            FLAGS.dataset_list_dir,
            num_samples=FLAGS.frames_per_video,
            modality=FLAGS.modality,
            split_id=FLAGS.split_id)

        ####################
        # Select the model #
        ####################
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            num_classes=(dataset.num_classes - FLAGS.labels_offset),
            batch_size=FLAGS.batch_size,
            is_training=False)

        ##############################################################
        # Create a dataset provider that loads data from the dataset #
        ##############################################################
        provider = dataset_data_provider.DatasetDataProvider(
            dataset,
            shuffle=FLAGS.force_random_shuffle,
            common_queue_capacity=2 * FLAGS.batch_size,
            common_queue_min=FLAGS.batch_size,
            bgr_flips=FLAGS.bgr_flip)
        [image, label] = provider.get(['image', 'label'])
        label = tf.cast(tf.string_to_number(label, tf.int32), tf.int64)
        label.set_shape(())
        label -= FLAGS.labels_offset

        #####################################
        # Select the preprocessing function #
        #####################################
        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=False)

        eval_image_size = FLAGS.eval_image_size or network_fn.default_image_size

        image = image_preprocessing_fn(image,
                                       eval_image_size,
                                       eval_image_size,
                                       model_name=FLAGS.model_name,
                                       ncrops=FLAGS.ncrops,
                                       out_dim_scale=FLAGS.out_dim_scale)

        images, labels = tf.train.batch(
            [image, label],
            batch_size=FLAGS.batch_size,
            num_threads=1 if FLAGS.store_feat is not None else
            FLAGS.num_preprocessing_threads,
            capacity=5 * FLAGS.batch_size)

        ####################
        # Define the model #
        ####################
        kwargs = {}
        if FLAGS.conv_endpoint is not None:
            kwargs['conv_endpoint'] = FLAGS.conv_endpoint
        logits, end_points = network_fn(
            images,
            pool_type=FLAGS.pooling,
            classifier_type=FLAGS.classifier_type,
            num_channels_stream=provider.num_channels_stream,
            netvlad_centers=FLAGS.netvlad_initCenters.split(','),
            stream_pool_type=FLAGS.stream_pool_type,
            **kwargs)
        end_points['images'] = images
        end_points['labels'] = labels

        if FLAGS.moving_average_decay:
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, tf_global_step)
            variables_to_restore = variable_averages.variables_to_restore(
                slim.get_model_variables())
            variables_to_restore[tf_global_step.op.name] = tf_global_step
        else:
            variables_to_restore = slim.get_variables_to_restore()

        # print(dir(variables_to_restore))
        print(type(variables_to_restore))
        ignore_variables = [
            'stream0/vgg_16/fc8/weights:0',
            'stream0/vgg_16/fc8/biases:0',
        ]
        new_variables_to_restore = []
        for var in variables_to_restore:
            if (var.name not in ignore_variables):
                new_variables_to_restore.append(var)

        variables_to_restore = new_variables_to_restore
        for var in variables_to_restore:
            print(var.name)

        predictions = tf.argmax(logits, 1)
        # rgirdhar: Because of the following, can't use with batch_size=1
        if FLAGS.batch_size > 1:
            labels = tf.squeeze(labels)

        # Define the metrics:
        names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
            'Accuracy':
            slim.metrics.streaming_accuracy(predictions, labels),
            'Recall@5':
            slim.metrics.streaming_recall_at_k(logits, labels, 5),
        })

        # Print the summaries to screen.
        for name, value in names_to_values.items():
            summary_name = 'eval/%s' % name
            op = tf.scalar_summary(summary_name, value, collections=[])
            op = tf.Print(op, [value], summary_name)
            tf.add_to_collection(tf.GraphKeys.SUMMARIES, op)

        # TODO(sguada) use num_epochs=1
        if FLAGS.max_num_batches:
            num_batches = FLAGS.max_num_batches
        else:
            # This ensures that we make a single pass over all of the data.
            num_batches = int(
                math.ceil(dataset.num_samples / float(FLAGS.batch_size)))

        if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
            checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
        else:
            checkpoint_path = FLAGS.checkpoint_path

        tf.logging.info('Evaluating %s' % checkpoint_path)

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True

        if FLAGS.store_feat is not None:
            assert (FLAGS.store_feat_path is not None)
            from tensorflow.python.training import supervisor
            from tensorflow.python.framework import ops
            import h5py
            saver = tf.train.Saver(variables_to_restore)
            sv = supervisor.Supervisor(graph=ops.get_default_graph(),
                                       logdir=None,
                                       summary_op=None,
                                       summary_writer=None,
                                       global_step=None,
                                       saver=None)
            ept_names_to_store = FLAGS.store_feat.split(',')
            try:
                ept_to_store = [end_points[el] for el in ept_names_to_store]
            except KeyError:
                logging.error('Endpoint not found')
                logging.error('Choose from %s' % ','.join(end_points.keys()))
                raise
            res = dict([(epname, []) for epname in ept_names_to_store])
            with sv.managed_session(FLAGS.master,
                                    start_standard_services=False,
                                    config=config) as sess:
                saver.restore(sess, checkpoint_path)
                sv.start_queue_runners(sess)
                for j in range(num_batches):
                    if j % 10 == 0:
                        logging.info('Doing batch %d/%d' % (j, num_batches))
                    feats = sess.run(ept_to_store)
                    for eid, epname in enumerate(ept_names_to_store):
                        res[epname].append(feats[eid])
            logging.info('Writing out features to %s' % FLAGS.store_feat_path)
            with h5py.File(FLAGS.store_feat_path, 'w') as fout:
                for epname in res.keys():
                    fout.create_dataset(
                        epname,
                        data=np.concatenate(res[epname], axis=0),
                        compression='gzip',
                        compression_opts=FLAGS.feat_store_compression_opt)
        else:
            slim.evaluation.evaluate_once(
                master=FLAGS.master,
                checkpoint_path=checkpoint_path,
                logdir=FLAGS.eval_dir,
                num_evals=num_batches,
                eval_op=names_to_updates.values(),
                variables_to_restore=variables_to_restore,
                session_config=config)
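
The store_feat branch above uses the Supervisor purely as a session and queue-runner manager (logdir=None, saver=None, start_standard_services=False) and restores the checkpoint by hand. A minimal sketch of just that pattern, with a toy variable and a hypothetical checkpoint path:

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

with tf.Graph().as_default():
    w = tf.get_variable("w", initializer=[1.0, 2.0, 3.0])
    feature = w * 2.0                       # stand-in for a real endpoint tensor
    restorer = tf.train.Saver([w])

    # With no logdir/saver/summaries the Supervisor only manages the session
    # and queue-runner threads; restoring the checkpoint is done by hand.
    sv = tf.train.Supervisor(logdir=None,
                             summary_op=None,
                             summary_writer=None,
                             global_step=None,
                             saver=None)
    with sv.managed_session("", start_standard_services=False) as sess:
        restorer.restore(sess, "/tmp/model.ckpt")   # hypothetical checkpoint path
        sv.start_queue_runners(sess)
        print(sess.run(feature))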
Example No. 17
def main(_):
  if not FLAGS.dataset_dir:
    raise ValueError('You must supply the dataset directory with --dataset_dir')
  times = {}
  tf.logging.set_verbosity(tf.logging.INFO)
  with tf.Graph().as_default():
    start = time.time()
    tf_global_step = slim.get_or_create_global_step()
    times['global_step'] = time.time() - start

    ######################
    # Select the dataset #

    start = time.time()
    dataset = dataset_factory.get_dataset(
      FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir, suffix=FLAGS.dataset_name_suffix)
    times['get_dataset'] = time.time() - start

    ####################
    # Select the model #
    ####################
    start = time.time()
    network_fn = nets_factory.get_network_fn(
      FLAGS.model_name,
      num_classes=(dataset.num_classes - FLAGS.labels_offset),
      is_training=False)
    times['select_model'] = time.time() - start

    ##############################################################
    # Create a dataset provider that loads data from the dataset #
    ##############################################################
    start = time.time()
    provider = slim.dataset_data_provider.DatasetDataProvider(
      dataset,
      shuffle=False,
      common_queue_capacity=2 * FLAGS.batch_size,
      common_queue_min=FLAGS.batch_size)
    times['get_provider'] = time.time() - start
    start = time.time()
    [image] = provider.get(['image'])
    times['get_image'] = time.time() - start

    #####################################
    # Select the preprocessing function #
    #####################################
    start = time.time()
    preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
    image_preprocessing_fn = preprocessing_factory.get_preprocessing(
      preprocessing_name,
      is_training=False)
    times['get_preprocessing'] = time.time() - start

    eval_image_size = FLAGS.eval_image_size or network_fn.default_image_size

    start = time.time()
    image = image_preprocessing_fn(image, eval_image_size, eval_image_size)
    times['preprocessing'] = time.time() - start

    start = time.time()
    images = tf.train.batch(
      [image],
      batch_size=FLAGS.batch_size,
      num_threads=FLAGS.num_preprocessing_threads,
      capacity=5 * FLAGS.batch_size)
    times['get_batch'] = time.time() - start

    start = time.time()
    tf.image_summary('test_images', images, FLAGS.batch_size)
    times['image_summary'] = time.time() - start

    ####################
    # Define the model #
    ####################
    start = time.time()
    logits, _ = network_fn(images)
    times['do_network'] = time.time() - start

    # with tf.variable_scope('resnet_v2_152/block1/unit_1/bottleneck_v2/conv1', reuse=True):
    #   weights = tf.get_variable('weights')
    #   kernel_transposed = put_kernels_on_grid(weights)
    # scale weights to [0 1], type is still float
    # x_min = tf.reduce_min(weights)
    # x_max = tf.reduce_max(weights)
    # kernel_0_to_1 = (weights - x_min) / (x_max - x_min)
    #
    # # to tf.image_summary format [batch_size, height, width, channels]
    # kernel_transposed = tf.transpose(kernel_0_to_1, [3, 0, 1, 2])

    # this will display random 3 filters from the 64 in conv1
    # tf.image_summary('conv1/filters', kernel_transposed, max_images=50)

    if FLAGS.moving_average_decay:
      variable_averages = tf.train.ExponentialMovingAverage(
        FLAGS.moving_average_decay, tf_global_step)
      variables_to_restore = variable_averages.variables_to_restore(
        slim.get_model_variables())
      variables_to_restore[tf_global_step.op.name] = tf_global_step
    else:
      variables_to_restore = slim.get_variables_to_restore()

    if len(logits.get_shape()) == 4:
      logits = tf.reshape(logits, [int(logits.get_shape()[0]), -1])

    softmax = tf.nn.softmax(logits)
    # predictions = tf.argmax(logits, 1)

    # Define the metrics:
    # names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
    # 'Predictions': predictions,
    # 'Predictions': slim.metrics.streaming_accuracy(predictions, labels),
    # 'Predictions@5': slim.metrics.streaming_recall_at_k(
    #   logits, labels, 5),
    # })

    # Print the summaries to screen.
    # for name, value in names_to_values.iteritems():
    #   summary_name = 'eval/%s' % name
    #   op = tf.scalar_summary(summary_name, value, collections=[])
    #   op = tf.Print(op, [value], summary_name)
    #   tf.add_to_collection(tf.GraphKeys.SUMMARIES, op)

    # TODO(sguada) use num_epochs=1
    if FLAGS.max_num_batches:
      num_batches = FLAGS.max_num_batches
    else:
      # This ensures that we make a single pass over all of the data.
      num_batches = math.ceil(dataset.num_samples / float(FLAGS.batch_size))

    start = time.time()
    if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
      checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
    else:
      checkpoint_path = FLAGS.checkpoint_path
    times['load_checkpoint'] = time.time() - start

    tf.logging.info('Evaluating %s' % checkpoint_path)
    # evaluate_loop

    from tensorflow.contrib.framework.python.ops import variables
    from tensorflow.core.protobuf import saver_pb2
    from tensorflow.python.training import saver as tf_saver
    from tensorflow.python.framework import ops
    from tensorflow.python.training import supervisor
    saver = tf_saver.Saver(
      variables_to_restore or variables.get_variables_to_restore(),
      write_version=saver_pb2.SaverDef.V1)
    sv = supervisor.Supervisor(graph=ops.get_default_graph(),
                               logdir=FLAGS.eval_dir,
                               summary_op=None,
                               summary_writer=None,
                               global_step=None,
                               saver=None)
    # init = tf.initialize_all_variables()
    # sess = tf.Session()
    with sv.managed_session(FLAGS.master, start_standard_services=False) as sess:
      # sess.run(init)
      saver.restore(sess, checkpoint_path)
      sv.start_queue_runners(sess)
      start = time.time()
      final_op_value = sess.run(logits)
      # final_op_value = slim.evaluation.evaluate_once(
      #   master=FLAGS.master,
      #   checkpoint_path=checkpoint_path,
      #   logdir=FLAGS.eval_dir,
      #   num_evals=num_batches,
      #   final_op=[softmax, logits],
      #   # eval_op=names_to_updates.values(),
      #   variables_to_restore=variables_to_restore)
      times['exec'] = time.time() - start

    print(final_op_value[1].shape)
    result_predict = np.reshape(final_op_value[1], (FLAGS.batch_size, final_op_value[1].shape[-1]))
    # print(final_op_value)
    print(result_predict)
    print(np.argsort(result_predict[:, 1])[-5:])
  print(times)
Example No. 18
 def testStandardServicesWithGlobalStep(self):
     logdir = self._test_dir("standard_services_with_global_step")
     # Create a checkpoint.
     with ops.Graph().as_default():
         v = variables.VariableV1([123], name="global_step")
         sv = supervisor.Supervisor(logdir=logdir)
         meta_graph_def = meta_graph.create_meta_graph_def(
             saver_def=sv.saver.saver_def)
         sess = sv.prepare_or_wait_for_session("")
         # This is where the checkpoint will appear, with step number 123.
         save_path = "%s-123" % sv.save_path
         self._wait_for_glob(save_path, 3.0)
         self._wait_for_glob(os.path.join(logdir, "*events*"),
                             3.0,
                             for_checkpoint=False)
         # Wait to make sure everything is written to file before stopping.
         time.sleep(1)
         sv.stop()
     # There should be an event file with a version number.
     rr = _summary_iterator(logdir)
     ev = next(rr)
     self.assertEquals("brain.Event:2", ev.file_version)
     ev = next(rr)
     ev_graph = graph_pb2.GraphDef()
     ev_graph.ParseFromString(ev.graph_def)
     self.assertProtoEquals(sess.graph.as_graph_def(add_shapes=True),
                            ev_graph)
     ev = next(rr)
     ev_meta_graph = meta_graph_pb2.MetaGraphDef()
     ev_meta_graph.ParseFromString(ev.meta_graph_def)
     self.assertProtoEquals(meta_graph_def, ev_meta_graph)
     self.assertProtoEquals(sess.graph.as_graph_def(add_shapes=True),
                            ev_meta_graph.graph_def)
     ev = next(rr)
      # It is actually nondeterministic whether SessionLog.START gets written
      # before the summary or the checkpoint, but this works when run 10000 times.
     self.assertEquals(123, ev.step)
     self.assertEquals(event_pb2.SessionLog.START, ev.session_log.status)
     first = next(rr)
     second = next(rr)
      # It is nondeterministic whether the value gets written before the checkpoint
     # since they are on separate threads, so we check for both conditions.
     if first.HasField("summary"):
         self.assertProtoEquals(
             """value { tag: 'global_step/sec'
                                     simple_value: 0.0 }""", first.summary)
         self.assertEquals(123, second.step)
         self.assertEquals(event_pb2.SessionLog.CHECKPOINT,
                           second.session_log.status)
     else:
         self.assertEquals(123, first.step)
         self.assertEquals(event_pb2.SessionLog.CHECKPOINT,
                           first.session_log.status)
         self.assertProtoEquals(
             """value { tag: 'global_step/sec'
                                     simple_value: 0.0 }""", second.summary)
     ev = next(rr)
     self.assertEquals(event_pb2.SessionLog.STOP, ev.session_log.status)
     self.assertRaises(StopIteration, lambda: next(rr))
      # There should be a checkpoint file with the restored global_step variable.
     with ops.Graph().as_default(), self.cached_session() as sess:
         v = variables.VariableV1([-12], name="global_step")
         sav = saver_lib.Saver([v])
         sav.restore(sess, save_path)
         self.assertEqual(123, self.evaluate(v)[0])
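
The test above walks the events file with an internal _summary_iterator helper; the same inspection can be done with the public tf.compat.v1.train.summary_iterator. A small sketch, with a placeholder logdir:

import tensorflow.compat.v1 as tf

logdir = "/tmp/sv_logdir"   # placeholder: the supervisor's logdir from above
for events_path in tf.io.gfile.glob(logdir + "/*events*"):
    for event in tf.train.summary_iterator(events_path):
        if event.HasField("session_log"):
            print(event.step, "session_log status:", event.session_log.status)
        elif event.HasField("summary"):
            for value in event.summary.value:
                print(event.step, value.tag, value.simple_value)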
Example No. 19
 def testNoQueueRunners(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
         sv = supervisor.Supervisor(
             logdir=self._test_dir("no_queue_runners"))
         self.assertEqual(0, len(sv.start_queue_runners(sess)))
         sv.stop()
Example No. 20
def evaluation_loop(master,
                    checkpoint_dir,
                    logdir,
                    num_evals=1,
                    eval_op=None,
                    eval_op_feed_dict=None,
                    final_op=None,
                    final_op_feed_dict=None,
                    summary_op=_USE_DEFAULT,
                    summary_op_feed_dict=None,
                    variables_to_restore=None,
                    eval_interval_secs=60,
                    max_number_of_evaluations=None):
  """Runs TF-Slim's Evaluation Loop.

  Args:
    master: The BNS address of the TensorFlow master.
    checkpoint_dir: The directory where checkpoints are stored.
    logdir: The directory where the TensorFlow summaries are written to.
    num_evals: The number of times to run `eval_op`.
    eval_op: An operation run `num_evals` times.
    eval_op_feed_dict: The feed dictionary to use when executing the `eval_op`.
    final_op: An operation to execute after all of the `eval_op` executions. The
      value of `final_op` is returned.
    final_op_feed_dict: A feed dictionary to use when executing `final_op`.
    summary_op: The summary_op to evaluate after running TF-Slim's metric ops. By
      default the summary_op is set to tf.merge_all_summaries().
    summary_op_feed_dict: An optional feed dictionary to use when running the
      `summary_op`.
    variables_to_restore: A list of TensorFlow variables to restore during
      evaluation. If the argument is left as `None` then
      slim.variables.GetVariablesToRestore() is used.
    eval_interval_secs: The minimum number of seconds between evaluations.
    max_number_of_evaluations: The maximum number of evaluation iterations.
      If the value is left as `None`, the evaluation continues indefinitely.
  """
  if summary_op == _USE_DEFAULT:
    summary_op = logging_ops.merge_all_summaries()

  global_step = variables.get_or_create_global_step()

  init_op = control_flow_ops.group(tf_variables.initialize_all_variables(),
                                   tf_variables.initialize_local_variables(),
                                   data_flow_ops.initialize_all_tables())

  saver = tf_saver.Saver(variables_to_restore or
                         variables.get_variables_to_restore())

  summary_writer = summary_io.SummaryWriter(logdir)

  sv = supervisor.Supervisor(graph=ops.get_default_graph(),
                             logdir=logdir,
                             init_op=init_op,
                             summary_op=None,
                             summary_writer=None,
                             global_step=None,
                             saver=saver)

  last_checkpoint = None
  number_of_evaluations = 0
  while True:
    last_checkpoint = wait_for_new_checkpoint(checkpoint_dir, last_checkpoint)
    start = time.time()
    logging.info('Starting evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S',
                                                           time.gmtime()))

    with sv.managed_session(master, start_standard_services=False) as sess:
      sv.saver.restore(sess, last_checkpoint)
      sv.start_queue_runners(sess)
      evaluation(sess,
                 num_evals=num_evals,
                 eval_op=eval_op,
                 eval_op_feed_dict=eval_op_feed_dict,
                 final_op=final_op,
                 final_op_feed_dict=final_op_feed_dict,
                 summary_op=summary_op,
                 summary_op_feed_dict=summary_op_feed_dict,
                 summary_writer=summary_writer,
                 global_step=global_step)

    logging.info('Finished evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S',
                                                           time.gmtime()))
    number_of_evaluations += 1
    if (max_number_of_evaluations and
        number_of_evaluations >= max_number_of_evaluations):
      logging.info('Reached max_number_of_evaluations=%s. Exit',
                   max_number_of_evaluations)
      break

    time_to_next_eval = start + eval_interval_secs - time.time()
    if time_to_next_eval > 0:
      time.sleep(time_to_next_eval)
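
The loop above blocks in wait_for_new_checkpoint between evaluations; the public tf.compat.v1.train.checkpoints_iterator helper provides the same wait-for-new-checkpoint behavior. A hedged sketch of that variant with a toy graph; the checkpoint directory and timing values are placeholders.

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

checkpoint_dir = "/tmp/train_logdir"    # placeholder: where the trainer checkpoints
with tf.Graph().as_default():
    w = tf.get_variable("w", shape=[3])
    eval_op = tf.reduce_mean(w)         # stand-in for real metric update ops
    restorer = tf.train.Saver()

    sv = tf.train.Supervisor(logdir=None, summary_op=None,
                             summary_writer=None, global_step=None, saver=None)

    # checkpoints_iterator blocks until a checkpoint newer than the last one it
    # yielded appears; timeout bounds how long it waits before giving up.
    for ckpt_path in tf.train.checkpoints_iterator(checkpoint_dir,
                                                   min_interval_secs=60,
                                                   timeout=600):
        with sv.managed_session("", start_standard_services=False) as sess:
            restorer.restore(sess, ckpt_path)
            sv.start_queue_runners(sess)
            print(ckpt_path, sess.run(eval_op))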
Example No. 21
 def _start_standard_services():
     with ops.Graph().as_default():
         sv = supervisor.Supervisor(is_chief=False)
         sess = sv.prepare_or_wait_for_session("")
         sv.start_standard_services(sess)
Example No. 22
def train(train_op,
          logdir,
          train_step_fn=train_step,
          train_step_kwargs=_USE_DEFAULT,
          log_every_n_steps=1,
          graph=None,
          master='',
          is_chief=True,
          global_step=None,
          number_of_steps=None,
          init_op=_USE_DEFAULT,
          init_feed_dict=None,
          local_init_op=None,
          init_fn=None,
          summary_op=_USE_DEFAULT,
          save_summaries_secs=600,
          startup_delay_steps=0,
          saver=None,
          save_interval_secs=600,
          sync_optimizer=None):
    """Runs a training loop using a TensorFlow supervisor.

  When the sync_optimizer is supplied, gradient updates are applied
  synchronously. Otherwise, gradient updates are applied asynchronously.

  Args:
    train_op: A `Tensor` that, when executed, will apply the gradients and
      return the loss value.
    logdir: The directory where training logs are written to.
    train_step_fn: The function to call in order to execute a single gradient
      step. The function must take exactly four arguments: the current
      session, the `train_op` `Tensor`, a global step `Tensor` and a dictionary.
    train_step_kwargs: A dictionary which is passed to the `train_step_fn`. By
      default, two `Boolean`, scalar ops called "should_stop" and "should_log"
      are provided.
    log_every_n_steps: The frequency, in terms of global steps, that the loss
      and global step are logged.
    graph: The graph to pass to the supervisor. If no graph is supplied the
      default graph is used.
    master: The BNS name of the tensorflow master.
    is_chief: Specifies whether or not the training is being run by the primary
      replica during replica training.
    global_step: The `Tensor` representing the global step. If left as `None`,
      then slim.variables.get_or_create_global_step() is used.
    number_of_steps: The max number of gradient steps to take during training.
      If the value is left as None, training proceeds indefinitely.
    init_op: The initialization operation. If left to its default value, then
      the session is initialized by calling `tf.initialize_all_variables()`.
    init_feed_dict: A feed dictionary to use when executing the `init_op`.
    local_init_op: The local initialization operation. If None,
      then the session is initialized by calling
      `tf.initialize_local_variables()` and `tf.initialize_all_tables()`.
    init_fn: An optional callable to be executed after `init_op` is called. The
      callable must accept one argument, the session being initialized.
    summary_op: The summary operation.
    save_summaries_secs: How often, in seconds, to save summaries.
    startup_delay_steps: The number of steps to wait before beginning. Note
      that this must be 0 if a sync_optimizer is supplied.
    saver: Saver to save checkpoints. If none, a default one will be created
      and used.
    save_interval_secs: How often, in seconds, to save the model to `logdir`.
    sync_optimizer: an instance of tf.train.SyncReplicasOptimizer. If the
      argument is supplied, gradient updates will be synchronous. If left as
      `None`, gradient updates will be asynchronous.

  Returns:
    the value of the loss function after training.

  Raises:
    ValueError: if `train_op` is empty or if `startup_delay_steps` is
      non-zero when `sync_optimizer` is supplied, or if `number_of_steps` is
      negative.
  """
    if train_op is None:
        raise ValueError('train_op cannot be None.')

    if sync_optimizer and startup_delay_steps > 0:
        raise ValueError(
            'startup_delay_steps must be zero when sync_optimizer is supplied.'
        )

    if number_of_steps is not None and number_of_steps <= 0:
        raise ValueError(
            '`number_of_steps` must be either None or a positive number.')

    graph = graph or ops.get_default_graph()
    with graph.as_default():
        if global_step is None:
            global_step = variables.get_or_create_global_step()
        saver = saver or tf_saver.Saver()

    if init_op == _USE_DEFAULT:
        init_op = tf_variables.initialize_all_variables()

    if summary_op == _USE_DEFAULT:
        summary_op = logging_ops.merge_all_summaries()

    cleanup_op = None

    if is_chief and sync_optimizer:
        if not isinstance(sync_optimizer,
                          sync_replicas_optimizer.SyncReplicasOptimizer):
            raise ValueError(
                '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer')

        # Need to create these BEFORE the supervisor finalizes the graph:
        with ops.control_dependencies([init_op]):
            init_tokens_op = sync_optimizer.get_init_tokens_op()
        init_op = init_tokens_op
        chief_queue_runner = sync_optimizer.get_chief_queue_runner()
        cleanup_op = sync_optimizer.get_clean_up_op()

    if train_step_kwargs == _USE_DEFAULT:
        train_step_kwargs = {}

        if number_of_steps:
            should_stop_op = math_ops.greater_equal(global_step,
                                                    number_of_steps)
        else:
            should_stop_op = constant_op.constant(False)
        train_step_kwargs['should_stop'] = should_stop_op
        train_step_kwargs['should_log'] = math_ops.equal(
            math_ops.mod(global_step, log_every_n_steps), 0)

    sv = supervisor.Supervisor(graph=graph,
                               is_chief=is_chief,
                               logdir=logdir,
                               init_op=init_op,
                               init_feed_dict=init_feed_dict,
                               local_init_op=local_init_op,
                               summary_op=summary_op,
                               global_step=global_step,
                               saver=saver,
                               save_summaries_secs=save_summaries_secs,
                               save_model_secs=save_interval_secs,
                               init_fn=init_fn)

    with sv.managed_session(master, start_standard_services=False) as sess:
        if is_chief:
            sv.start_standard_services(sess)
        elif not is_chief and startup_delay_steps > 0:
            _wait_for_step(
                sess, global_step,
                min(startup_delay_steps, number_of_steps or sys.maxint))
        sv.start_queue_runners(sess)
        if is_chief and sync_optimizer:
            sv.start_queue_runners(sess, [chief_queue_runner])

        try:
            while not sv.should_stop():
                total_loss, should_stop = train_step_fn(
                    sess, train_op, global_step, train_step_kwargs)
                if should_stop:
                    break
        finally:
            if sv.is_chief and cleanup_op is not None:
                sess.run(cleanup_op)

        # This waits for service threads to finish.
        sv.Stop()

        if sv.is_chief:
            logging.info('Finished training! Saving model to disk.')
            sv.saver.save(sess, sv.save_path, global_step=sv.global_step)

        return total_loss
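
A hedged usage sketch of calling the train function defined above with a toy model. It assumes the module-level helpers the function references (train_step, _USE_DEFAULT, and the imported TF modules) are in scope, and it builds a train_op that evaluates to the loss, as the docstring requires; the logdir and hyperparameters are illustrative.

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

with tf.Graph().as_default():
    # Toy model: one parameter, a squared loss, and a train_op that both
    # applies the update and evaluates to the loss.
    x = tf.Variable(5.0)
    loss = tf.square(x)
    global_step = tf.train.get_or_create_global_step()
    minimize_op = tf.train.GradientDescentOptimizer(0.1).minimize(
        loss, global_step=global_step)
    with tf.control_dependencies([minimize_op]):
        train_op = tf.identity(loss, name="train_op")

    # `train` is the function defined above (exposed in TF-Slim as
    # slim.learning.train); number_of_steps bounds the loop via should_stop.
    final_loss = train(train_op,
                       logdir="/tmp/slim_train_demo",
                       number_of_steps=100,
                       log_every_n_steps=10,
                       save_summaries_secs=30,
                       save_interval_secs=60)
    print("final loss:", final_loss)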
Example No. 23
def evaluation_loop(master,
                    checkpoint_dir,
                    logdir,
                    num_evals=1,
                    initial_op=None,
                    initial_op_feed_dict=None,
                    eval_op=None,
                    eval_op_feed_dict=None,
                    final_op=None,
                    final_op_feed_dict=None,
                    summary_op=_USE_DEFAULT,
                    summary_op_feed_dict=None,
                    variables_to_restore=None,
                    eval_interval_secs=60,
                    max_number_of_evaluations=None,
                    session_config=None,
                    timeout=None):
    """Runs TF-Slim's Evaluation Loop.

  Args:
    master: The BNS address of the TensorFlow master.
    checkpoint_dir: The directory where checkpoints are stored.
    logdir: The directory where the TensorFlow summaries are written to.
    num_evals: The number of times to run `eval_op`.
    initial_op: An operation run at the beginning of evaluation.
    initial_op_feed_dict: A feed dictionary to use when executing `initial_op`.
    eval_op: An operation run `num_evals` times.
    eval_op_feed_dict: The feed dictionary to use when executing the `eval_op`.
    final_op: An operation to execute after all of the `eval_op` executions. The
      value of `final_op` is returned.
    final_op_feed_dict: A feed dictionary to use when executing `final_op`.
    summary_op: The summary_op to evaluate after running TF-Slim's metric ops. By
      default the summary_op is set to tf.summary.merge_all().
    summary_op_feed_dict: An optional feed dictionary to use when running the
      `summary_op`.
    variables_to_restore: A list of TensorFlow variables to restore during
      evaluation. If the argument is left as `None` then
      slim.variables.GetVariablesToRestore() is used.
    eval_interval_secs: The minimum number of seconds between evaluations.
    max_number_of_evaluations: The maximum number of evaluation iterations.
      If the value is left as `None`, the evaluation continues indefinitely.
    session_config: An instance of `tf.ConfigProto` that will be used to
      configure the `Session`. If left as `None`, the default will be used.
    timeout: The maximum amount of time to wait between checkpoints. If left as
      `None`, then the process will wait indefinitely.

  Returns:
    The value of `final_op` or `None` if `final_op` is `None`.
  """
    if summary_op == _USE_DEFAULT:
        summary_op = summary.merge_all()

    global_step = variables.get_or_create_global_step()

    saver = tf_saver.Saver(variables_to_restore
                           or variables.get_variables_to_restore())

    summary_writer = summary_io.SummaryWriter(logdir)

    sv = supervisor.Supervisor(graph=ops.get_default_graph(),
                               logdir=logdir,
                               summary_op=None,
                               summary_writer=None,
                               global_step=None,
                               saver=saver)

    number_of_evaluations = 0
    for checkpoint_path in checkpoints_iterator(checkpoint_dir,
                                                eval_interval_secs, timeout):
        logging.info('Starting evaluation at ' +
                     time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime()))

        with sv.managed_session(master,
                                start_standard_services=False,
                                config=session_config) as sess:
            sv.saver.restore(sess, checkpoint_path)
            sv.start_queue_runners(sess)
            final_op_value = evaluation(
                sess,
                num_evals=num_evals,
                initial_op=initial_op,
                initial_op_feed_dict=initial_op_feed_dict,
                eval_op=eval_op,
                eval_op_feed_dict=eval_op_feed_dict,
                final_op=final_op,
                final_op_feed_dict=final_op_feed_dict,
                summary_op=summary_op,
                summary_op_feed_dict=summary_op_feed_dict,
                summary_writer=summary_writer,
                global_step=global_step)

        logging.info('Finished evaluation at ' +
                     time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime()))
        number_of_evaluations += 1
        if (max_number_of_evaluations
                and number_of_evaluations >= max_number_of_evaluations):
            logging.info('Reached max_number_of_evaluations=%s. Exit',
                         max_number_of_evaluations)
            return final_op_value

    logging.info(
        'Timed-out waiting for new checkpoint file. Exiting evaluation loop.')
    return final_op_value
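
# A minimal usage sketch (not part of the original example). It assumes the
# metric update ops `names_to_updates` and the batch count `num_batches` were
# built beforehand, e.g. with slim.metrics.aggregate_metric_map; the paths are
# placeholders.
evaluation_loop(
    master='',
    checkpoint_dir='/tmp/mymodel/train',
    logdir='/tmp/mymodel/eval',
    num_evals=num_batches,
    eval_op=list(names_to_updates.values()),
    eval_interval_secs=300,
    timeout=3600)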
Example No. 24
def train(train_op,
          logdir,
          train_step_fn=train_step,
          train_step_kwargs=_USE_DEFAULT,
          log_every_n_steps=1,
          graph=None,
          master='',
          is_chief=True,
          global_step=None,
          number_of_steps=None,
          init_op=_USE_DEFAULT,
          init_feed_dict=None,
          local_init_op=_USE_DEFAULT,
          init_fn=None,
          ready_op=_USE_DEFAULT,
          summary_op=_USE_DEFAULT,
          save_summaries_secs=600,
          summary_writer=_USE_DEFAULT,
          startup_delay_steps=0,
          saver=None,
          save_interval_secs=600,
          sync_optimizer=None,
          session_config=None,
          trace_every_n_steps=None):
    """Runs a training loop using a TensorFlow supervisor.

  When `sync_optimizer` is supplied, gradient updates are applied
  synchronously. Otherwise, gradient updates are applied asynchronously.

  Args:
    train_op: A `Tensor` that, when executed, will apply the gradients and
      return the loss value.
    logdir: The directory where training logs are written to. If None, model
      checkpoints and summaries will not be written.
    train_step_fn: The function to call in order to execute a single gradient
      step. The function must take exactly four arguments: the current
      session, the `train_op` `Tensor`, a global step `Tensor` and a dictionary.
    train_step_kwargs: A dictionary which is passed to the `train_step_fn`. By
      default, two `Boolean`, scalar ops called "should_stop" and "should_log"
      are provided.
    log_every_n_steps: The frequency, in terms of global steps, that the loss
      and global step are logged.
    graph: The graph to pass to the supervisor. If no graph is supplied the
      default graph is used.
    master: The address of the tensorflow master.
    is_chief: Specifies whether or not the training is being run by the primary
      replica during replica training.
    global_step: The `Tensor` representing the global step. If left as `None`,
      then slim.variables.get_or_create_global_step() is used.
    number_of_steps: The max number of gradient steps to take during training.
      If the value is left as None, training proceeds indefinitely.
    init_op: The initialization operation. If left to its default value, then
      the session is initialized by calling `tf.global_variables_initializer()`.
    init_feed_dict: A feed dictionary to use when executing the `init_op`.
    local_init_op: The local initialization operation. If left to its default
      value, then the session is initialized by calling
      `tf.local_variables_initializer()` and `tf.tables_initializer()`.
    init_fn: An optional callable to be executed after `init_op` is called. The
      callable must accept one argument, the session being initialized.
    ready_op: Operation to check if the model is ready to use. If left to its
      default value, then the session checks for readiness by calling
      `tf.report_uninitialized_variables()`.
    summary_op: The summary operation.
    save_summaries_secs: How often, in seconds, to save summaries.
    summary_writer: `SummaryWriter` to use.  Can be `None`
      to indicate that no summaries should be written. If unset, we
      create a SummaryWriter.
    startup_delay_steps: The number of steps to wait for before beginning. Note
      that this must be 0 if a sync_optimizer is supplied.
    saver: Saver to save checkpoints. If None, a default one will be created
      and used.
    save_interval_secs: How often, in seconds, to save the model to `logdir`.
    sync_optimizer: an instance of tf.train.SyncReplicasOptimizer. If the
      argument is supplied, gradient updates will be synchronous. If left as
      `None`, gradient updates will be asynchronous.
    session_config: An instance of `tf.ConfigProto` that will be used to
      configure the `Session`. If left as `None`, the default will be used.
    trace_every_n_steps: produce and save a `Timeline` in Chrome trace format
      and add it to the summaries every `trace_every_n_steps`. If None, no trace
      information will be produced or saved.

  Returns:
    the value of the loss function after training.

  Raises:
    ValueError: if `train_op` is `None` or if `startup_delay_steps` is
      non-zero when `sync_optimizer` is supplied, if `number_of_steps` is
      negative, or if `trace_every_n_steps` is not `None` and no `logdir` is
      provided.
  """
    if train_op is None:
        raise ValueError('train_op cannot be None.')

    if logdir is None:
        if summary_op != _USE_DEFAULT:
            raise ValueError('Cannot provide summary_op because logdir=None')
        if saver is not None:
            raise ValueError('Cannot provide saver because logdir=None')
        if trace_every_n_steps is not None:
            raise ValueError('Cannot provide trace_every_n_steps because '
                             'logdir=None')

    if sync_optimizer is not None and startup_delay_steps > 0:
        raise ValueError(
            'startup_delay_steps must be zero when sync_optimizer is supplied.'
        )

    if number_of_steps is not None and number_of_steps <= 0:
        raise ValueError(
            '`number_of_steps` must be either None or a positive number.')

    graph = graph or ops.get_default_graph()
    with graph.as_default():
        if global_step is None:
            global_step = variables.get_or_create_global_step()
        saver = saver or tf_saver.Saver()

        with ops.name_scope('init_ops'):
            if init_op == _USE_DEFAULT:
                init_op = tf_variables.global_variables_initializer()

            if ready_op == _USE_DEFAULT:
                ready_op = tf_variables.report_uninitialized_variables()

            if local_init_op == _USE_DEFAULT:
                local_init_op = control_flow_ops.group(
                    tf_variables.local_variables_initializer(),
                    data_flow_ops.tables_initializer())

            if sync_optimizer is not None and isinstance(
                    sync_optimizer,
                    sync_replicas_optimizer.SyncReplicasOptimizer):
                with ops.control_dependencies(
                    [local_init_op] if local_init_op is not None else []):
                    if is_chief:
                        local_init_op = sync_optimizer.chief_init_op
                    else:
                        local_init_op = sync_optimizer.local_step_init_op
                ready_for_local_init_op = sync_optimizer.ready_for_local_init_op
            else:
                ready_for_local_init_op = None

        if summary_op == _USE_DEFAULT:
            summary_op = summary.merge_all()

        if summary_writer == _USE_DEFAULT:
            summary_writer = supervisor.Supervisor.USE_DEFAULT

        if is_chief and sync_optimizer is not None:
            if not isinstance(sync_optimizer,
                              (sync_replicas_optimizer.SyncReplicasOptimizer)):
                raise ValueError(
                    '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer.'
                )

            # Need to create these BEFORE the supervisor finalizes the graph:
            init_tokens_op = sync_optimizer.get_init_tokens_op()
            chief_queue_runner = sync_optimizer.get_chief_queue_runner()

        if train_step_kwargs == _USE_DEFAULT:
            with ops.name_scope('train_step'):
                train_step_kwargs = {}

                if number_of_steps:
                    should_stop_op = math_ops.greater_equal(
                        global_step, number_of_steps)
                else:
                    should_stop_op = constant_op.constant(False)
                train_step_kwargs['should_stop'] = should_stop_op
                train_step_kwargs['should_log'] = math_ops.equal(
                    math_ops.mod(global_step, log_every_n_steps), 0)
                if is_chief and trace_every_n_steps is not None:
                    train_step_kwargs['should_trace'] = math_ops.equal(
                        math_ops.mod(global_step, trace_every_n_steps), 0)
                    train_step_kwargs['logdir'] = logdir

    sv = supervisor.Supervisor(graph=graph,
                               is_chief=is_chief,
                               logdir=logdir,
                               init_op=init_op,
                               init_feed_dict=init_feed_dict,
                               local_init_op=local_init_op,
                               ready_for_local_init_op=ready_for_local_init_op,
                               ready_op=ready_op,
                               summary_op=summary_op,
                               summary_writer=summary_writer,
                               global_step=global_step,
                               saver=saver,
                               save_summaries_secs=save_summaries_secs,
                               save_model_secs=save_interval_secs,
                               init_fn=init_fn)

    if summary_writer is not None:
        train_step_kwargs['summary_writer'] = sv.summary_writer

    # Ensure `total_loss` is defined even if training stops before the first step.
    total_loss = None
    should_retry = True
    while should_retry:
        try:
            should_retry = False
            with sv.managed_session(master,
                                    start_standard_services=False,
                                    config=session_config) as sess:
                logging.info('Starting Session.')
                if is_chief:
                    if logdir:
                        sv.start_standard_services(sess)
                elif startup_delay_steps > 0:
                    _wait_for_step(
                        sess, global_step,
                        min(startup_delay_steps, number_of_steps
                            or sys.maxint))
                sv.start_queue_runners(sess)
                logging.info('Starting Queues.')
                if is_chief and sync_optimizer is not None:
                    sv.start_queue_runners(sess, [chief_queue_runner])
                    sess.run(init_tokens_op)
                try:
                    while not sv.should_stop():
                        total_loss, should_stop = train_step_fn(
                            sess, train_op, global_step, train_step_kwargs)
                        if should_stop:
                            logging.info('Stopping Training.')
                            break
                except errors.OutOfRangeError:
                    # OutOfRangeError is thrown when epoch limit per
                    # tf.train.limit_epochs is reached.
                    logging.info('Caught OutOfRangeError. Stopping Training.')
                if logdir and sv.is_chief:
                    logging.info('Finished training! Saving model to disk.')
                    sv.saver.save(sess,
                                  sv.save_path,
                                  global_step=sv.global_step)

        except errors.AbortedError:
            # Always re-run on AbortedError as it indicates a restart of one of the
            # distributed tensorflow servers.
            logging.info('Retrying training!')
            should_retry = True

    return total_loss
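
# A minimal usage sketch (not part of the original example), assuming `images`
# and one-hot `labels` come from an input pipeline and that `tf` and `slim`
# (tf.contrib.slim) are imported; `my_model_fn` is a hypothetical model builder.
logits = my_model_fn(images)
loss = tf.losses.softmax_cross_entropy(onehot_labels=labels, logits=logits)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
train_op = slim.learning.create_train_op(loss, optimizer)
final_loss = train(train_op,
                   logdir='/tmp/mymodel/train',
                   number_of_steps=1000,
                   save_summaries_secs=300,
                   save_interval_secs=600)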
Example No. 25
def train(graph,
          output_dir,
          train_op,
          loss_op,
          global_step_tensor=None,
          init_op=None,
          init_feed_dict=None,
          init_fn=None,
          log_every_steps=10,
          supervisor_is_chief=True,
          supervisor_master='',
          supervisor_save_model_secs=600,
          supervisor_save_summaries_steps=100,
          feed_fn=None,
          steps=None,
          fail_on_nan_loss=True,
          monitors=None):
    """Train a model.

  Given `graph`, a directory to write outputs to (`output_dir`), and some ops,
  run a training loop. The given `train_op` performs one step of training on the
  model and is expected to increment the `global_step_tensor`, a scalar integer
  tensor counting training steps. The `loss_op` represents the objective
  function of the training. This function uses `Supervisor` to initialize the
  graph (from a checkpoint if one is available in `output_dir`), write summaries
  defined in the graph, and write regular checkpoints as defined by
  `supervisor_save_model_secs`.

  Training continues until `global_step_tensor` evaluates to `max_steps`, or,
  if `fail_on_nan_loss` is `True`, until `loss_op` evaluates to `NaN`, in which
  case a `NanLossDuringTrainingError` is raised.

  Args:
    graph: A graph to train. It is expected that this graph is not in use
      elsewhere.
    output_dir: A directory to write outputs to.
    train_op: An op that performs one training step when run.
    loss_op: A scalar loss tensor.
    global_step_tensor: A tensor representing the global step. If none is given,
      one is extracted from the graph using the same logic as in `Supervisor`.
    init_op: An op that initializes the graph. If `None`, use `Supervisor`'s
      default.
    init_feed_dict: A dictionary that maps `Tensor` objects to feed values.
      This feed dictionary will be used when `init_op` is evaluated.
    init_fn: Optional callable passed to Supervisor to initialize the model.
    log_every_steps: Output logs regularly. The logs contain timing data and the
      current loss.
    supervisor_is_chief: Whether the current process is the chief supervisor in
      charge of restoring the model and running standard services.
    supervisor_master: The master string to use when preparing the session.
    supervisor_save_model_secs: Save a checkpoint every
      `supervisor_save_model_secs` seconds when training.
    supervisor_save_summaries_steps: Save summaries every
      `supervisor_save_summaries_steps` steps when training.
    feed_fn: A function that is called every iteration to produce a `feed_dict`
      passed to `session.run` calls. Optional.
    steps: Trains for this many steps (e.g. current global step + `steps`).
    fail_on_nan_loss: If true, raise `NanLossDuringTrainingError` if `loss_op`
      evaluates to `NaN`. If false, continue training as if nothing happened.
    monitors: List of `BaseMonitor` subclass instances. Used for callbacks
      inside the training loop.

  Returns:
    The final loss value.

  Raises:
    ValueError: If `global_step_tensor` is not provided. See
        `tf.contrib.framework.get_global_step` for how we look it up if not
        provided explicitly.
    NanLossDuringTrainingError: If `fail_on_nan_loss` is `True`, and loss ever
        evaluates to `NaN`.
  """
    if not output_dir:
        raise ValueError('Output directory should be non-empty.')

    with graph.as_default():
        global_step_tensor = contrib_variables.assert_or_get_global_step(
            graph, global_step_tensor)
        if global_step_tensor is None:
            raise ValueError(
                'No "global_step" was provided or found in the graph.')

        # Get current step.
        try:
            start_step = checkpoints.load_variable(output_dir,
                                                   global_step_tensor.name)
        except (errors.NotFoundError, ValueError):
            start_step = 0

        summary_writer = (get_summary_writer(output_dir)
                          if supervisor_is_chief else None)

        # TODO(ipolosukhin): Replace all functionality of Supervisor with Monitors.
        if not supervisor_is_chief:
            # monitors should run only on the chief.
            monitors = []
        elif not monitors:
            monitors = monitors_lib.get_default_monitors(
                loss_op=loss_op,
                summary_op=logging_ops.get_summary_op(),
                save_summary_steps=supervisor_save_summaries_steps,
                summary_writer=summary_writer)

        # Start monitors, can create graph parts.
        for monitor in monitors:
            monitor.begin(max_steps=start_step + steps)

    supervisor = tf_supervisor.Supervisor(
        graph,
        init_op=init_op or tf_supervisor.Supervisor.USE_DEFAULT,
        init_feed_dict=init_feed_dict,
        is_chief=supervisor_is_chief,
        logdir=output_dir,
        saver=_make_saver(graph),
        global_step=global_step_tensor,
        summary_op=None,
        summary_writer=summary_writer,
        save_model_secs=supervisor_save_model_secs,
        init_fn=init_fn)
    session = supervisor.PrepareSession(master=supervisor_master,
                                        start_standard_services=True)
    supervisor.StartQueueRunners(session)

    with session:
        get_current_step = lambda: session.run(global_step_tensor)

        start_step = get_current_step()
        max_steps = start_step + steps
        last_step = start_step
        last_log_step = start_step
        loss_value = None
        logging.info('Training steps [%d,%s)', last_step,
                     'inf' if max_steps is None else str(max_steps))

        excinfo = None
        try:
            while not supervisor.ShouldStop() and ((max_steps is None) or
                                                   (last_step < max_steps)):
                start_time = time.time()
                feed_dict = feed_fn() if feed_fn is not None else None

                outputs, should_stop = _run_with_monitors(
                    session, last_step + 1, [train_op, loss_op], feed_dict,
                    monitors)

                loss_value = outputs[loss_op.name]
                if np.isnan(loss_value):
                    failure_message = 'Model diverged with loss = NaN.'
                    if fail_on_nan_loss:
                        logging.error(failure_message)
                        raise NanLossDuringTrainingError()
                    else:
                        logging.warning(failure_message)

                if should_stop:
                    break

                this_step = get_current_step()

                if this_step <= last_step:
                    logging.error(
                        'Global step was not incremented by train op at step %s'
                        ': new step %d', last_step, this_step)

                last_step = this_step
                is_last_step = (max_steps
                                is not None) and (last_step >= max_steps)
                if is_last_step or (last_step - last_log_step >=
                                    log_every_steps):
                    logging.info(
                        'training step %d, loss = %.5f (%.3f sec/batch).',
                        last_step, loss_value, float(time.time() - start_time))
                    last_log_step = last_step
        except errors.OutOfRangeError as e:
            logging.warn(
                'Got exception during tf.learn training loop possibly '
                'due to exhausted input queue %s.', e)
        except BaseException as e:  # pylint: disable=broad-except
            # Hold on to any other exceptions while we try recording a final
            # checkpoint and summary.
            excinfo = sys.exc_info()
        finally:
            try:
                # Call supervisor.Stop() from within a try block because it re-raises
                # exceptions thrown by the supervised threads.
                supervisor.Stop(close_summary_writer=False)

                # Save one last checkpoint and summaries
                # TODO(wicke): This should be handled by Supervisor

                # In case we encountered an exception in the try block before we updated
                # last_step, update it here (again).
                last_step = get_current_step()
                if supervisor_is_chief:
                    ckpt_path = supervisor.save_path
                    logging.info(
                        'Saving checkpoint for step %d to checkpoint: %s.',
                        last_step, ckpt_path)
                    supervisor.saver.save(session,
                                          ckpt_path,
                                          global_step=last_step)

                    # Finish monitors.
                    for monitor in monitors:
                        monitor.end()

            # catch OutOfRangeError which is thrown when queue is out of data (and for
            # other reasons as well).
            except errors.OutOfRangeError as e:
                logging.warn(
                    'OutOfRangeError in tf.learn final checkpoint possibly '
                    'due to exhausted input queue. Note: summary_op is not '
                    'expected to trigger dequeues. %s.', e)
            except BaseException as e:  # pylint: disable=broad-except
                # If we don't already have an exception to re-raise, raise this one.
                if not excinfo:
                    raise
                # Otherwise, log this one and raise the other in the finally block.
                logging.error(
                    'Got exception during tf.learn final checkpoint %s.', e)
            finally:
                if excinfo:
                    reraise(*excinfo)
        return loss_value
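
# A minimal usage sketch (not part of the original example) for the
# graph-actions style `train` above; `build_loss` is a hypothetical model/loss
# builder and `tf.contrib.framework` is assumed to be available.
graph = tf.Graph()
with graph.as_default():
    global_step = tf.contrib.framework.get_or_create_global_step()
    loss_op = build_loss()
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
    train_op = optimizer.minimize(loss_op, global_step=global_step)

final_loss = train(graph=graph,
                   output_dir='/tmp/mymodel',
                   train_op=train_op,
                   loss_op=loss_op,
                   steps=1000,
                   log_every_steps=50)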
Example No. 26
 network_fn = nets_factory.get_network_fn(model_name)
 end_points = network_fn(img, is_training=False)
 print(end_points)
 task1 = tf.to_int32(tf.argmax(end_points['Logits'], 1))
 
 training_accuracy1 = slim.metrics.accuracy(task1, tf.to_int32(lb))
 
 variables_to_restore = slim.get_variables_to_restore()
 checkpoint_path = latest_checkpoint(train_dir)
 saver = Saver(variables_to_restore)
 config = ConfigProto()
 config.gpu_options.allow_growth=True
 sess = Session(config=config)
 sv = supervisor.Supervisor(logdir=checkpoint_path,
                            summary_op=None,
                            summary_writer=None,
                            global_step=None,
                            saver=None)
 correct = 0
 predict = 0
 with sv.managed_session(master='', start_standard_services=False, config=config) as sess:
     saver.restore(sess, checkpoint_path)
     optim_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
     layer = {}
     name = ['conv1w','conv1b',
             'conv2w','conv2b',
             'conv3w','conv3b',
             'conv4w','conv4b',
             'conv5w','conv5b',
             'conv6w','conv6b',
             'conv7w','conv7b',
Example No. 27
def train(train_op,
          logdir,
          train_step_fn=train_step,
          train_step_kwargs=_USE_DEFAULT,
          log_every_n_steps=1,
          graph=None,
          master='',
          is_chief=True,
          global_step=None,
          number_of_steps=None,
          init_op=_USE_DEFAULT,
          init_feed_dict=None,
          local_init_op=_USE_DEFAULT,
          init_fn=None,
          ready_op=_USE_DEFAULT,
          summary_op=_USE_DEFAULT,
          save_summaries_secs=600,
          summary_writer=_USE_DEFAULT,
          startup_delay_steps=0,
          saver=None,
          save_interval_secs=600,
          sync_optimizer=None,
          session_config=None,
          session_wrapper=None,
          trace_every_n_steps=None,
          batch_size=1,
          num_examples=None,
          config_summary_list=None):
    """Runs a training loop using a TensorFlow supervisor.

  When `sync_optimizer` is supplied, gradient updates are applied
  synchronously. Otherwise, gradient updates are applied asynchronously.

  Args:
    train_op: A `Tensor` that, when executed, will apply the gradients and
      return the loss value.
    logdir: The directory where training logs are written to. If None, model
      checkpoints and summaries will not be written.
    train_step_fn: The function to call in order to execute a single gradient
      step. The function must take exactly four arguments: the current
      session, the `train_op` `Tensor`, a global step `Tensor` and a dictionary.
    train_step_kwargs: A dictionary which is passed to the `train_step_fn`. By
      default, two `Boolean`, scalar ops called "should_stop" and "should_log"
      are provided.
    log_every_n_steps: The frequency, in terms of global steps, that the loss
      and global step are logged.
    graph: The graph to pass to the supervisor. If no graph is supplied the
      default graph is used.
    master: The address of the tensorflow master.
    is_chief: Specifies whether or not the training is being run by the primary
      replica during replica training.
    global_step: The `Tensor` representing the global step. If left as `None`,
      then slim.variables.get_or_create_global_step() is used.
    number_of_steps: The max number of gradient steps to take during training,
      as measured by 'global_step': training will stop if global_step is
      greater than 'number_of_steps'. If the value is left as None, training
      proceeds indefinitely.
    init_op: The initialization operation. If left to its default value, then
      the session is initialized by calling `tf.global_variables_initializer()`.
    init_feed_dict: A feed dictionary to use when executing the `init_op`.
    local_init_op: The local initialization operation. If left to its default
      value, then the session is initialized by calling
      `tf.local_variables_initializer()` and `tf.tables_initializer()`.
    init_fn: An optional callable to be executed after `init_op` is called. The
      callable must accept one argument, the session being initialized.
    ready_op: Operation to check if the model is ready to use. If left to its
      default value, then the session checks for readiness by calling
      `tf.report_uninitialized_variables()`.
    summary_op: The summary operation.
    save_summaries_secs: How often, in seconds, to save summaries.
    summary_writer: `SummaryWriter` to use.  Can be `None`
      to indicate that no summaries should be written. If unset, we
      create a SummaryWriter.
    startup_delay_steps: The number of steps to wait for before beginning. Note
      that this must be 0 if a sync_optimizer is supplied.
    saver: Saver to save checkpoints. If None, a default one will be created
      and used.
    save_interval_secs: How often, in seconds, to save the model to `logdir`.
    sync_optimizer: an instance of tf.train.SyncReplicasOptimizer, or a list of
      them. If the argument is supplied, gradient updates will be synchronous.
      If left as `None`, gradient updates will be asynchronous.
    session_config: An instance of `tf.ConfigProto` that will be used to
      configure the `Session`. If left as `None`, the default will be used.
    session_wrapper: A function that takes a `tf.Session` object as the only
      argument and returns a wrapped session object that has the same methods
      that the original object has, or `None`. Iff not `None`, the wrapped
      object will be used for training.
    trace_every_n_steps: produce and save a `Timeline` in Chrome trace format
      and add it to the summaries every `trace_every_n_steps`. If None, no trace
      information will be produced or saved.
    batch_size: The training batch size, used together with `num_examples` to
      derive the number of gradient steps in one epoch.
    num_examples: The number of examples in the dataset used for training.
    config_summary_list: An optional list of config summaries to add to the
      summary writer before training starts.

  Returns:
    the value of the loss function after training.

  Raises:
    ValueError: if `train_op` is `None` or if `startup_delay_steps` is
      non-zero when `sync_optimizer` is supplied, if `number_of_steps` is
      negative, or if `trace_every_n_steps` is not `None` and no `logdir` is
      provided.
  """
    if train_op is None:
        raise ValueError('train_op cannot be None.')
    if not isinstance(train_op, list):
        train_op = [train_op]

    # Allocate log function to each step.
    log_fn_list = [log.info, log.infov]

    def _iter_log_fn():
        for log_fn in log_fn_list:
            yield log_fn

    it = itertools.cycle(_iter_log_fn())
    current_log_fn = it.next()

    if logdir is None:
        if summary_op != _USE_DEFAULT:
            raise ValueError('Cannot provide summary_op because logdir=None')
        if saver is not None:
            raise ValueError('Cannot provide saver because logdir=None')
        if trace_every_n_steps is not None:
            raise ValueError('Cannot provide trace_every_n_steps because '
                             'logdir=None')

    if isinstance(sync_optimizer,
                  sync_replicas_optimizer.SyncReplicasOptimizer):
        sync_optimizer = [sync_optimizer]
    if sync_optimizer is not None and startup_delay_steps > 0:
        raise ValueError(
            'startup_delay_steps must be zero when sync_optimizer is supplied.'
        )

    if number_of_steps is not None and number_of_steps <= 0:
        raise ValueError(
            '`number_of_steps` must be either None or a positive number.')

    graph = graph or ops.get_default_graph()
    with graph.as_default():
        if global_step is None:
            global_step = training_util.get_or_create_global_step()
        saver = saver or tf_saver.Saver()

        if sync_optimizer is not None:
            for opt in sync_optimizer:
                if not isinstance(
                        opt, sync_replicas_optimizer.SyncReplicasOptimizer):
                    raise ValueError(
                        '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer.'
                    )

        with ops.name_scope('init_ops'):
            if init_op == _USE_DEFAULT:
                init_op = variables.global_variables_initializer()

            if ready_op == _USE_DEFAULT:
                ready_op = variables.report_uninitialized_variables()

            if local_init_op == _USE_DEFAULT:
                local_init_op = control_flow_ops.group(
                    variables.local_variables_initializer(),
                    lookup_ops.tables_initializer())

            if sync_optimizer is not None and isinstance(sync_optimizer, list):
                with ops.control_dependencies(
                    [local_init_op] if local_init_op is not None else []):
                    if is_chief:
                        local_init_op = control_flow_ops.group(
                            *[opt.chief_init_op for opt in sync_optimizer])
                    else:
                        local_init_op = control_flow_ops.group(
                            *[opt.local_step_init_op for opt in sync_optimizer])
                ready_for_local_init_op = control_flow_ops.group(
                    *[opt.ready_for_local_init_op for opt in sync_optimizer])
            else:
                ready_for_local_init_op = None

        if summary_op == _USE_DEFAULT:
            summary_op = summary.merge_all()

        if summary_writer == _USE_DEFAULT:
            summary_writer = supervisor.Supervisor.USE_DEFAULT

        if is_chief and sync_optimizer is not None:
            # Need to create these BEFORE the supervisor finalizes the graph:
            init_tokens_op = [
                opt.get_init_tokens_op() for opt in sync_optimizer
            ]
            chief_queue_runner = [
                opt.get_chief_queue_runner() for opt in sync_optimizer
            ]

        if train_step_kwargs == _USE_DEFAULT:
            with ops.name_scope('train_step'):
                train_step_kwargs = {}

                if number_of_steps:
                    should_stop_op = math_ops.greater_equal(
                        global_step, number_of_steps)
                else:
                    should_stop_op = constant_op.constant(False)
                train_step_kwargs['should_stop'] = should_stop_op
                if log_every_n_steps > 0:
                    train_step_kwargs['should_log'] = math_ops.equal(
                        math_ops.mod(global_step, log_every_n_steps), 0)
                if is_chief and trace_every_n_steps is not None:
                    train_step_kwargs['should_trace'] = math_ops.equal(
                        math_ops.mod(global_step, trace_every_n_steps), 0)
                    train_step_kwargs['logdir'] = logdir

    sv = supervisor.Supervisor(graph=graph,
                               is_chief=is_chief,
                               logdir=logdir,
                               init_op=init_op,
                               init_feed_dict=init_feed_dict,
                               local_init_op=local_init_op,
                               ready_for_local_init_op=ready_for_local_init_op,
                               ready_op=ready_op,
                               summary_op=summary_op,
                               summary_writer=summary_writer,
                               global_step=global_step,
                               saver=saver,
                               save_summaries_secs=save_summaries_secs,
                               save_model_secs=save_interval_secs,
                               init_fn=init_fn)

    if summary_writer is not None:
        train_step_kwargs['summary_writer'] = sv.summary_writer

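    # Number of gradient steps that make up one epoch; this is passed to
    # `train_step_fn` for epoch-aware logging and requires `num_examples`.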
    steps_in_epoch = int(num_examples / batch_size)

    total_loss = 0.0
    should_retry = True
    while should_retry:
        try:
            should_retry = False
            with sv.managed_session(master,
                                    start_standard_services=False,
                                    config=session_config) as sess:
                log.infov('Starting Session.')
                if session_wrapper is not None:
                    log.info('Wrapping session with wrapper function: %s',
                             session_wrapper)
                    sess = session_wrapper(sess)
                if is_chief:
                    if logdir:
                        sv.start_standard_services(sess)
                elif startup_delay_steps > 0:
                    _wait_for_step(
                        sess, global_step,
                        min(startup_delay_steps, number_of_steps
                            or sys.maxint))
                threads = sv.start_queue_runners(sess)
                log.infov('Starting Queues.')
                if is_chief and sync_optimizer is not None:
                    sv.start_queue_runners(sess, chief_queue_runner)
                    sess.run(init_tokens_op)
                sess.graph.finalize()
                # try:
                if config_summary_list is not None:
                    for config_summary in config_summary_list:
                        sv.summary_writer.add_summary(
                            config_summary.eval(session=sess))

                while not sv.should_stop():
                    for _train_op in train_op:
                        total_loss, should_stop, np_global_step = train_step_fn(
                            sess, _train_op, global_step, train_step_kwargs,
                            batch_size, steps_in_epoch, current_log_fn)
                        if should_stop:
                            log.infov('Stopping Training.')
                            sv.request_stop()
                            break

                # except errors.OutOfRangeError:
                #   # OutOfRangeError is thrown when epoch limit per
                #   # tf.train.limit_epochs is reached.
                #   log.warn('Caught OutOfRangeError. Stopping Training.')
                if logdir and sv.is_chief:
                    log.warn('Finished training! Saving model to disk.')
                    sv.saver.save(sess,
                                  sv.save_path,
                                  global_step=sv.global_step)
                    sv.stop(threads, close_summary_writer=True)

                    def _last_checkpoint_path(sv_save_path,
                                              additional_dir_name='last'):
                        dir_list = sv_save_path.split('/')
                        dir_list.insert(-1, additional_dir_name)
                        last_checkpoint_dir_path = '/'.join(dir_list[:-1])
                        last_checkpoint_path = '/'.join(dir_list)
                        return last_checkpoint_dir_path, last_checkpoint_path

                    # Save the last checkpoint again to a 'last' directory for the next training with
                    # different configuration.
                    last_checkpoint_dir_path, last_checkpoint_path = _last_checkpoint_path(
                        sv.save_path, 'last')
                    if os.path.exists(last_checkpoint_dir_path):
                        shutil.rmtree(last_checkpoint_dir_path)
                    os.makedirs(last_checkpoint_dir_path)
                    sv.saver.save(sess,
                                  last_checkpoint_path,
                                  global_step=sv.global_step)

        except errors.AbortedError:
            # Always re-run on AbortedError as it indicates a restart of one of the
            # distributed tensorflow servers.
            log.warn('Retrying training!')
            should_retry = True

    return total_loss
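
# A sketch of a `train_step_fn` compatible with the call made above, which
# passes (sess, train_op, global_step, train_step_kwargs, batch_size,
# steps_in_epoch, log_fn) and expects (total_loss, should_stop, np_global_step)
# back. The default `train_step` is not shown in this snippet, so this is an
# assumption about the expected interface rather than the original code.
def my_train_step(sess, train_op, global_step, train_step_kwargs, batch_size,
                  steps_in_epoch, log_fn):
    # Run one gradient update; `train_op` returns the loss value.
    total_loss, np_global_step = sess.run([train_op, global_step])
    # Honour the 'should_log' op prepared under name_scope('train_step').
    if 'should_log' in train_step_kwargs and sess.run(
            train_step_kwargs['should_log']):
        epoch = np_global_step // steps_in_epoch
        log_fn('epoch %d, step %d: loss = %.4f' %
               (epoch, np_global_step, total_loss))
    # Honour the 'should_stop' op so training halts at `number_of_steps`.
    should_stop = sess.run(train_step_kwargs['should_stop'])
    return total_loss, should_stop, np_global_step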
Example No. 28
def main():
    args, cfg = parse_args()
    train_dir = get_output_dir(
        'default' if args.cfg_file is None else args.cfg_file)
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    print('Using Config:')
    pprint.pprint(cfg)

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        tf_global_step = tf.train.get_or_create_global_step()

        ######################
        # Select the dataset #
        ######################
        kwargs = {}
        if cfg.TEST.VIDEO_FRAMES_PER_VIDEO > 1:
            kwargs['num_samples'] = cfg.TEST.VIDEO_FRAMES_PER_VIDEO
            kwargs['modality'] = cfg.INPUT.VIDEO.MODALITY
            kwargs['split_id'] = cfg.INPUT.SPLIT_ID
        if args.dataset_list_dir is not None:
            kwargs['dataset_list_dir'] = args.dataset_list_dir
        elif cfg.DATASET_LIST_DIR != '':
            kwargs['dataset_list_dir'] = cfg.DATASET_LIST_DIR
        if cfg.INPUT_FILE_STYLE_LABEL != '':
            kwargs['input_file_style_label'] = cfg.INPUT_FILE_STYLE_LABEL
        dataset, num_pose_keypoints = dataset_factory.get_dataset(
            cfg.DATASET_NAME, cfg.TEST.DATASET_SPLIT_NAME, cfg.DATASET_DIR,
            **kwargs)

        ####################
        # Select the model #
        ####################
        network_fn = nets_factory.get_network_fn(
            cfg.MODEL_NAME,
            num_classes=dataset.num_classes,
            num_pose_keypoints=num_pose_keypoints,
            is_training=False,
            cfg=cfg)

        ##############################################################
        # Create a dataset provider that loads data from the dataset #
        ##############################################################
        provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            shuffle=False,
            num_epochs=1,
            common_queue_capacity=2 * cfg.TEST.BATCH_SIZE,
            common_queue_min=cfg.TEST.BATCH_SIZE)
        [image, action_label] = get_input(provider, cfg,
                                          ['image', 'action_label'])
        # label -= FLAGS.labels_offset

        #####################################
        # Select the preprocessing function #
        #####################################
        preprocessing_name = cfg.MODEL_NAME
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=False)

        eval_image_size = cfg.TRAIN.IMAGE_SIZE or network_fn.default_image_size

        image = image_preprocessing_fn(image,
                                       eval_image_size,
                                       eval_image_size,
                                       resize_side_min=cfg.TRAIN.RESIZE_SIDE,
                                       resize_side_max=cfg.TRAIN.RESIZE_SIDE)

        # additional preprocessing as required
        if 'flips' in args.preprocs:
            tf.logging.info('Flipping all images while testing!')
            image = tf.stack(
                [tf.image.flip_left_right(el) for el in tf.unstack(image)])

        images, action_labels = tf.train.batch(
            [image, action_label],
            batch_size=cfg.TEST.BATCH_SIZE,
            # Use a single thread so batch order is deterministic; with more
            # threads the order can change between runs due to scheduling:
            # http://stackoverflow.com/questions/35001027/does-batching-queue-tf-train-batch-not-preserve-order#comment57731040_35001027
            # num_threads=1 if args.save else cfg.NUM_PREPROCESSING_THREADS,
            # The above was too unsafe (forgetting --save randomized the order),
            # so a single thread is used by default. Better safe than sorry.
            num_threads=1,
            # Smaller final batches are only allowed when each video contributes
            # a single frame; otherwise logits must be averaged over the frames,
            # which requires a fully defined first dimension.
            allow_smaller_final_batch=(cfg.TEST.VIDEO_FRAMES_PER_VIDEO == 1),
            capacity=5 * cfg.TEST.BATCH_SIZE)

        ####################
        # Define the model #
        ####################
        logits, end_points = network_fn(images)
        end_points['images'] = images

        if cfg.TEST.MOVING_AVERAGE_DECAY:
            variable_averages = tf.train.ExponentialMovingAverage(
                cfg.TEST.MOVING_AVERAGE_DECAY, tf_global_step)
            variables_to_restore = variable_averages.variables_to_restore(
                slim.get_model_variables())
            variables_to_restore[tf_global_step.op.name] = tf_global_step
        else:
            variables_to_restore = slim.get_variables_to_restore()

        predictions = tf.argmax(logits, 1)
        if cfg.TRAIN.LOSS_FN_ACTION.startswith('multi-label'):
            logits = tf.sigmoid(logits)
        else:
            logits = tf.nn.softmax(logits, -1)
        labels = tf.squeeze(action_labels)
        end_points['labels'] = labels

        # Define the metrics:
        names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
            'Accuracy':
            slim.metrics.streaming_accuracy(predictions, labels),
            # 'Recall@5': slim.metrics.streaming_recall_at_k(
            #     logits, labels, 5),
        })

        # Print the summaries to screen.
        for name, value in names_to_values.iteritems():
            summary_name = 'eval/%s' % name
            op = tf.summary.scalar(summary_name, value, collections=[])
            op = tf.Print(op, [value], summary_name)
            tf.add_to_collection(tf.GraphKeys.SUMMARIES, op)

        # TODO(sguada) use num_epochs=1
        if cfg.TEST.MAX_NUM_BATCHES:
            num_batches = cfg.TEST.MAX_NUM_BATCHES
        else:
            # This ensures that we make a single pass over all of the data.
            num_batches = math.ceil(dataset.num_samples /
                                    float(cfg.TEST.BATCH_SIZE))

        # just test the latest trained model
        checkpoint_path = cfg.TEST.CHECKPOINT_PATH or train_dir
        if tf.gfile.IsDirectory(checkpoint_path):
            checkpoint_path = tf.train.latest_checkpoint(checkpoint_path)
        checkpoint_step = int(checkpoint_path.split('-')[-1])

        tf.logging.info('Evaluating %s' % checkpoint_path)

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        summary_writer = tf.summary.FileWriter(logdir=train_dir)

        if cfg.TEST.EVAL_METRIC == 'mAP' or args.save or args.ept:
            from tensorflow.python.training import supervisor
            from tensorflow.python.framework import ops
            import h5py
            saver = tf.train.Saver(variables_to_restore)
            sv = supervisor.Supervisor(graph=ops.get_default_graph(),
                                       logdir=None,
                                       summary_op=None,
                                       summary_writer=summary_writer,
                                       global_step=None,
                                       saver=None)
            all_labels = []
            end_points['logits'] = logits
            end_points_to_save = args.ept + ['logits']
            end_points_to_save = list(set(end_points_to_save))
            all_feats = dict([(ename, []) for ename in end_points_to_save])
            start_time = time.time()
            with sv.managed_session('',
                                    start_standard_services=False,
                                    config=config) as sess:
                saver.restore(sess, checkpoint_path)
                sv.start_queue_runners(sess)
                for j in tqdm(range(int(math.ceil(num_batches)))):
                    feats = sess.run([
                        action_labels,
                        [end_points[ename] for ename in end_points_to_save]
                    ])
                    all_labels.append(feats[0])
                    for ept_id, ename in enumerate(end_points_to_save):
                        all_feats[ename].append(feats[1][ept_id])
            print(time.time() - start_time)
            APs = []
            all_labels = np.concatenate(all_labels)
            if args.save or args.ept:
                res_outdir = os.path.join(train_dir, 'Features/')
                mkdir_p(res_outdir)
                outfpath = args.outfpath or os.path.join(
                    res_outdir, 'features_ckpt_{}_{}.h5'.format(
                        cfg.TEST.DATASET_SPLIT_NAME, checkpoint_step))
                print(
                    'Saving the features/logits/labels to {}'.format(outfpath))
                with h5py.File(outfpath, 'a') as fout:
                    for ename in end_points_to_save:
                        if ename in fout:
                            tf.logging.warning(
                                'Deleting {} from output HDF5 to write the '
                                'new features.'.format(ename))
                            del fout[ename]
                        if ename == 'labels':
                            feat_to_save = np.array(all_feats[ename])
                        else:
                            feat_to_save = np.concatenate(all_feats[ename])
                        try:
                            fout.create_dataset(ename,
                                                data=feat_to_save,
                                                compression='gzip',
                                                compression_opts=9)
                        except:
                            pdb.set_trace()  # manually deal with it and continue
                    if 'labels' in fout:
                        del fout['labels']
                    fout.create_dataset('labels',
                                        data=all_labels,
                                        compression='gzip',
                                        compression_opts=9)

            if args.ept:
                tf.logging.info(
                    'Evaluation had --ept passed in. '
                    'This indicates script was used for feature '
                    'extraction. Hence, not performing any evaluation.')
                return
            # Evaluation code
            all_logits = np.concatenate(all_feats['logits'])
            acc = np.mean(all_logits.argmax(axis=1) == all_labels)
            mAP = compute_map(all_logits, all_labels)[0]
            print('Mean AP: {}'.format(mAP))
            print('Accuracy: {}'.format(acc))
            summary_writer.add_summary(
                tf.Summary(value=[
                    tf.Summary.Value(
                        tag='mAP/{}'.format(cfg.TEST.DATASET_SPLIT_NAME),
                        simple_value=mAP)
                ]),
                global_step=checkpoint_step)
            summary_writer.add_summary(
                tf.Summary(value=[
                    tf.Summary.Value(
                        tag='Accuracy/{}'.format(cfg.TEST.DATASET_SPLIT_NAME),
                        simple_value=acc)
                ]),
                global_step=checkpoint_step)
        else:
            slim.evaluation.evaluate_once(
                master='',
                checkpoint_path=checkpoint_path,
                logdir=train_dir,
                num_evals=num_batches,
                eval_op=names_to_updates.values(),
                variables_to_restore=variables_to_restore,
                session_config=config)
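
# `compute_map` is not defined in this snippet; it is called above as
# `compute_map(all_logits, all_labels)[0]`. A hypothetical stand-in for the
# single-label case, assuming scikit-learn is available, could look like this:
import numpy as np
from sklearn.metrics import average_precision_score

def compute_map_sketch(logits, labels):
    """Returns (mean AP over classes with positives, list of per-class APs)."""
    num_classes = logits.shape[1]
    onehot = np.eye(num_classes)[labels]  # one-hot ground truth
    aps = [average_precision_score(onehot[:, c], logits[:, c])
           for c in range(num_classes) if onehot[:, c].any()]
    return float(np.mean(aps)), aps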
Example No. 29
def main(_):
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')
    print("START!")
    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        tf_global_step = slim.get_or_create_global_step()

        ######################
        # Select the dataset #
        ######################
        dataset = dataset_factory.get_dataset(FLAGS.dataset_name,
                                              FLAGS.dataset_split_name,
                                              FLAGS.dataset_dir)

        ####################
        # Select the model #
        ####################
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            num_classes=(dataset.num_classes - FLAGS.labels_offset),
            is_training=False)
        #print(dataset.num_classes)
        #print(dir(dataset))
        #print(dataset.num_samples)
        #print(dataset.get_shape())
        ##############################################################
        # Create a dataset provider that loads data from the dataset #
        ##############################################################
        provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            shuffle=False,
            common_queue_capacity=2 * FLAGS.batch_size,
            common_queue_min=FLAGS.batch_size)
        files = True
        if files:
            [image, label,
             filename] = provider.get(['image', 'label', 'filename'])
        else:
            [image, label] = provider.get(['image', 'label'])
        label -= FLAGS.labels_offset

        #####################################
        # Select the preprocessing function #
        #####################################
        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=False)

        eval_image_size = FLAGS.eval_image_size or network_fn.default_image_size

        image = image_preprocessing_fn(image, eval_image_size, eval_image_size)

        if files:
            images, labels, filenames = tf.train.batch(
                [image, label, filename],
                batch_size=FLAGS.batch_size,
                num_threads=FLAGS.num_preprocessing_threads,
                capacity=FLAGS.batch_size,
                allow_smaller_final_batch=True)
        else:
            images, labels = tf.train.batch(
                [image, label],
                batch_size=FLAGS.batch_size,
                num_threads=FLAGS.num_preprocessing_threads,
                capacity=FLAGS.batch_size,
                allow_smaller_final_batch=True)

        ####################
        # Define the model #
        ####################
        logits, endpoints = network_fn(images)

        if FLAGS.moving_average_decay:
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, tf_global_step)
            variables_to_restore = variable_averages.variables_to_restore(
                slim.get_model_variables())
            variables_to_restore[tf_global_step.op.name] = tf_global_step
        else:
            variables_to_restore = slim.get_variables_to_restore()

        probabilities = tf.nn.softmax(logits)

        # TODO(sguada) use num_epochs=1
        if FLAGS.max_num_batches:
            num_batches = FLAGS.max_num_batches
        else:
            # This ensures that we make a single pass over all of the data.
            num_batches = math.ceil(dataset.num_samples /
                                    float(FLAGS.batch_size))

        if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
            checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
        else:
            checkpoint_path = FLAGS.checkpoint_path

        init_fn = slim.assign_from_checkpoint_fn(checkpoint_path,
                                                 variables_to_restore)

        tf.logging.info('Evaluating %s' % checkpoint_path)

        ###
        import time

        from tensorflow.contrib.framework.python.ops import variables
        from tensorflow.python.framework import ops
        from tensorflow.python.ops import logging_ops
        from tensorflow.python.platform import tf_logging as logging
        from tensorflow.python.training import saver as tf_saver
        from tensorflow.python.training import summary_io
        from tensorflow.python.training import supervisor
        from tensorflow.python.training import training_util

        saver = tf_saver.Saver(variables_to_restore
                               or variables.get_variables_to_restore())

        #summary_writer = summary_io.SummaryWriter(logdir)

        sv = supervisor.Supervisor(graph=ops.get_default_graph(),
                                   logdir=FLAGS.eval_dir,
                                   summary_op=None,
                                   summary_writer=None,
                                   global_step=None,
                                   saver=None)

        logging.info('Starting evaluation at ' +
                     time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime()))

        import collections

        with sv.managed_session(FLAGS.master,
                                start_standard_services=False,
                                config=None) as sess:
            saver.restore(sess, checkpoint_path)
            sv.start_queue_runners(sess)

            if FLAGS.result_type == "classify":
                ##export classification

                classifications = {"classifications": {}}
                filenamelist = []
                for i in xrange(int(num_batches) + 1):
                    np_probabilities, np_labels, np_filenames, np_endpoints = sess.run(
                        [probabilities, labels, filenames, endpoints])
                    #print({i:endpoints[i].get_shape() for i in endpoints.keys()})
                    #return -1
                    for j in xrange(FLAGS.batch_size):
                        if not np_filenames[j] in filenamelist:
                            filenamelist.append(np_filenames[j])
                            tmpprob = []

                            for l in np.argsort(
                                    np_probabilities[j, :]
                            )[::-1][:5]:  #iterate over best 5 probs
                                tmpprob.append([
                                    str(dataset.labels_to_names[l]).rstrip(
                                        "\r"), "{0:.2f}".format(
                                            np_probabilities[j, l] * 100)
                                ])

                            tmp = {np_filenames[j]: tmpprob}
                            classifications["classifications"].update(tmp)
                        else:
                            pass

                    print(i)
                print(len(classifications["classifications"]))
                #print(filenamelist)
                sortedclass = collections.OrderedDict()
                for k in sorted(classifications["classifications"]):
                    sortedclass.update(
                        {k: classifications["classifications"][k]})
                classifications["classifications"] = sortedclass

                jsonencoded = json.dumps(classifications)
                with open(
                        os.path.join(FLAGS.result_path,
                                     FLAGS.result_name + ".json"),
                        'w') as jsonfile:
                    jsonfile.write(jsonencoded)
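                # Illustrative output layout (filenames and labels are made up):
                # {"classifications": {"img_0001.jpg": [["tabby cat", "87.21"],
                #                                       ["tiger cat", "9.04"], ...]}}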

            if FLAGS.result_type == "stats":
                np_probabilities, np_labels, np_filenames, np_endpoints = sess.run(
                    [probabilities, labels, filenames, endpoints])
                print({i: endpoints[i].get_shape()
                       for i in endpoints.keys()})  #layer shapes
                allparams = 0
                for variable in tf.trainable_variables():  #iterate over vars
                    shape = variable.get_shape()
                    currpar = 1
                    for dim in shape:  #iterate over shape of var
                        currpar *= dim.value
                    allparams += currpar  #add
                print(allparams)

                return -1  #kill

            if FLAGS.result_type == "decaf":
                # Extract DeCAF-style features (activations from a late layer)
                # and write them to an ARFF file.
                features = []
                filenamelist = []

                layerdefinition = {
                    "alexnet_v2": "alexnet_v2/fc7/Relu:0",
                    "inception_v1": "MaxPool_0a_7x7",
                    "inception_v3": "AvgPool_1a_{}x{}",
                    "inception_resnet_v2": "AvgPool_1a_8x8",
                    "vgg_16": "vgg_16/fc7/Relu:0",
                    "resnet_v1_152": "pool5"
                }
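                # layerdefinition maps each model name to the endpoint whose
                # activations are used as the image descriptor. The inception_v3
                # entry looks like an unfilled "{}x{}" template; the concrete
                # endpoint name depends on the final pooling kernel size for the
                # chosen input resolution.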

                for i in xrange(int(num_batches)):
                    np_probabilities, np_labels, np_filenames, np_endpoints = sess.run(
                        [probabilities, labels, filenames, endpoints])

                    for j in xrange(FLAGS.batch_size):
                        if not np_filenames[j] in filenamelist:
                            filenamelist.append(np_filenames[j])

                            tmp_descr = (np_endpoints[layerdefinition[
                                FLAGS.model_name]][j][0][0]).tolist()

                            tmp_descr.insert(0, (np_filenames[j]).replace(
                                ".jpg", ""))
                            features.append(tmp_descr)
                    print(i)
                toARFF(
                    features, FLAGS.result_name,
                    os.path.join(FLAGS.result_path,
                                 FLAGS.result_name + ".arff"))
Ejemplo n.º 30
def evaluate_once(master,
                  checkpoint_path,
                  logdir,
                  num_evals=1,
                  initial_op=None,
                  initial_op_feed_dict=None,
                  eval_op=None,
                  eval_op_feed_dict=None,
                  final_op=None,
                  final_op_feed_dict=None,
                  summary_op=_USE_DEFAULT,
                  summary_op_feed_dict=None,
                  variables_to_restore=None,
                  session_config=None):
    """Evaluates the model at the given checkpoint path.

  Args:
    master: The BNS address of the TensorFlow master.
    checkpoint_path: The path to a checkpoint to use for evaluation.
    logdir: The directory where the TensorFlow summaries are written to.
    num_evals: The number of times to run `eval_op`.
    initial_op: An operation run at the beginning of evaluation.
    initial_op_feed_dict: A feed dictionary to use when executing `initial_op`.
    eval_op: An operation run `num_evals` times.
    eval_op_feed_dict: The feed dictionary to use when executing the `eval_op`.
    final_op: An operation to execute after all of the `eval_op` executions. The
      value of `final_op` is returned.
    final_op_feed_dict: A feed dictionary to use when executing `final_op`.
    summary_op: The summary op to evaluate after running TF-Slim's metric ops.
      By default, `summary_op` is set to `tf.summary.merge_all()`.
    summary_op_feed_dict: An optional feed dictionary to use when running the
      `summary_op`.
    variables_to_restore: A list of TensorFlow variables to restore during
      evaluation. If left as `None`, `variables.get_variables_to_restore()`
      is used.
    session_config: An instance of `tf.ConfigProto` that will be used to
      configure the `Session`. If left as `None`, the default will be used.

  Returns:
    The value of `final_op` or `None` if `final_op` is `None`.
  """
    if summary_op == _USE_DEFAULT:
        summary_op = summary.merge_all()

    global_step = variables.get_or_create_global_step()

    saver = tf_saver.Saver(variables_to_restore
                           or variables.get_variables_to_restore(),
                           write_version=saver_pb2.SaverDef.V1)

    summary_writer = summary_io.SummaryWriter(logdir)

    sv = supervisor.Supervisor(graph=ops.get_default_graph(),
                               logdir=logdir,
                               summary_op=None,
                               summary_writer=None,
                               global_step=None,
                               saver=None)
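    # As in the snippet above, this Supervisor only owns the session; restoring
    # the checkpoint and writing summaries are handled explicitly via saver and
    # summary_writer.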

    logging.info('Starting evaluation at ' +
                 time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime()))
    with sv.managed_session(master,
                            start_standard_services=False,
                            config=session_config) as sess:
        saver.restore(sess, checkpoint_path)
        sv.start_queue_runners(sess)
        final_op_value = evaluation(sess,
                                    num_evals=num_evals,
                                    initial_op=initial_op,
                                    initial_op_feed_dict=initial_op_feed_dict,
                                    eval_op=eval_op,
                                    eval_op_feed_dict=eval_op_feed_dict,
                                    final_op=final_op,
                                    final_op_feed_dict=final_op_feed_dict,
                                    summary_op=summary_op,
                                    summary_op_feed_dict=summary_op_feed_dict,
                                    summary_writer=summary_writer,
                                    global_step=global_step)

    logging.info('Finished evaluation at ' +
                 time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime()))

    return final_op_value
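
A hedged usage sketch for evaluate_once above, wiring it to TF-Slim streaming metrics. The model function my_model, the input tensors images and labels, and the checkpoint/log paths are hypothetical placeholders, not taken from the original code.

import tensorflow as tf

slim = tf.contrib.slim

# Placeholder model and inputs; in practice these come from an input pipeline
# (e.g. slim.dataset_data_provider) and a real network definition.
logits = my_model(images)
predictions = tf.argmax(logits, 1)

# Streaming metrics return (value, update) op pairs; the update ops are what
# eval_op runs num_evals times, and final_op reads back the aggregated value.
names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
    'accuracy': slim.metrics.streaming_accuracy(predictions, labels),
})

accuracy = evaluate_once(
    master='',
    checkpoint_path='/tmp/model/model.ckpt-10000',
    logdir='/tmp/eval',
    num_evals=100,
    eval_op=list(names_to_updates.values()),
    final_op=names_to_values['accuracy'])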