Ejemplo n.º 1
0
def train():
    """Train the Adience model until ``FLAGS.max_steps`` is reached.

    Builds the input pipeline, the model, the loss and the train op in one
    fresh graph and repeatedly runs the train op inside a
    ``tf.train.MonitoredTrainingSession`` that checkpoints to
    ``FLAGS.train_dir``. Three hooks are attached: one stops at
    ``FLAGS.max_steps``, one monitors the loss for NaN, and one prints the
    loss and throughput every ``FLAGS.log_frequency`` steps.
    """
    # The global step, the model and the session must all live in the SAME
    # graph; building the model outside this block would put it in a different
    # graph than the global step and the train op / StopAtStepHook would fail.
    with tf.Graph().as_default():
        global_step = tf.train.get_or_create_global_step()

        # NOTE(review): the device scope covers the model and the session, as
        # in the original code. Narrowing it to the input pipeline would allow
        # the model to be placed on an accelerator -- confirm the intent.
        with tf.device('/cpu:0'):
            # NOTE(review): 'distored_inputs' looks like a misspelling of
            # 'distorted_inputs'; kept as-is because the adience module is not
            # visible from here -- verify against the module.
            images, labels = adience.distored_inputs()

            logits = adience.inference(images)

            loss = adience.loss(logits, labels)

            train_op = adience.train(loss, global_step)

            class _LoggerHook(tf.train.SessionRunHook):
                """Prints loss and throughput every FLAGS.log_frequency runs."""

                def begin(self):
                    # Local run counter; starts at -1 so the first run is 0.
                    self._step = -1
                    self._start_time = time.time()

                def before_run(self, run_context):
                    self._step += 1
                    # Fetch the loss alongside whatever the caller runs.
                    return tf.train.SessionRunArgs(loss)

                def after_run(self, run_context, run_values):
                    if self._step % FLAGS.log_frequency != 0:
                        return

                    current_time = time.time()
                    duration = current_time - self._start_time
                    # Restart the measurement window so the next report covers
                    # only the steps since this one, not the whole run.
                    self._start_time = current_time

                    loss_value = run_values.results
                    examples_per_sec = (
                        FLAGS.log_frequency * FLAGS.batch_size / duration)
                    sec_per_batch = float(duration / FLAGS.log_frequency)

                    format_str = ('%s: step %d, loss = %.2f '
                                  '(%.1f examples/sec; %.3f sec/batch)')
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))

            hooks = [
                tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                tf.train.NanTensorHook(loss),
                _LoggerHook(),
            ]
            session_config = tf.ConfigProto(
                log_device_placement=FLAGS.log_device_placement)

            with tf.train.MonitoredTrainingSession(
                    checkpoint_dir=FLAGS.train_dir,
                    hooks=hooks,
                    config=session_config) as mon_sess:
                while not mon_sess.should_stop():
                    mon_sess.run(train_op)
Ejemplo n.º 2
0
def evaluate():
    """Evaluate the Adience model, once or repeatedly.

    Builds the evaluation graph (inputs, inference, top-1 correctness op),
    prepares a saver that maps trainable variables to their moving-average
    checkpoint names, and calls ``eval_once`` in a loop. The loop ends after
    the first pass when ``FLAGS.run_once`` is set; otherwise it sleeps
    ``FLAGS.eval_interval_secs`` seconds between passes.

    Side effect: overwrites ``FLAGS.num_examples`` with
    ``adience.NUM_EXAMPLES_PER_EPOCH_FOR_EVAL``.
    """
    with tf.Graph().as_default():
        # Get images and labels for Adience.
        eval_data = FLAGS.eval_data == 'test'
        images, labels = adience.inputs(eval_data=eval_data)

        FLAGS.num_examples = adience.NUM_EXAMPLES_PER_EPOCH_FOR_EVAL

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = adience.inference(images)

        # Calculate predictions: whether the label is the top-1 logit.
        top_k_op = tf.nn.in_top_k(logits, labels, 1)

        # Restore the moving average version of the learned variables for
        # eval: trainable variables are looked up under their moving-average
        # name, every other variable under its own op name.
        variable_averages = tf.train.ExponentialMovingAverage(
            adience.MOVING_AVERAGE_DECAY)
        # Fetch the trainable collection once instead of once per variable.
        trainable_variables = tf.trainable_variables()
        variables_to_restore = {}
        for v in tf.all_variables():
            if v in trainable_variables:
                restore_name = variable_averages.average_name(v)
            else:
                restore_name = v.op.name
            variables_to_restore[restore_name] = v
        saver = tf.train.Saver(variables_to_restore)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        graph_def = tf.get_default_graph().as_graph_def()
        summary_writer = tf.train.SummaryWriter(FLAGS.eval_dir,
                                                graph_def=graph_def)

        try:
            while True:
                eval_once(saver, summary_writer, top_k_op, summary_op)
                if FLAGS.run_once:
                    break
                time.sleep(FLAGS.eval_interval_secs)
        finally:
            # Flush and release the event file even if evaluation is
            # interrupted or raises.
            summary_writer.close()
Ejemplo n.º 3
0
def train(train_continue):
    """Train Adience for a number of steps.

    Args:
        train_continue: When falsy, training starts at step 0 and the saver
            covers all variables under their own names. When truthy, the
            latest checkpoint in ``FLAGS.train_dir`` (if any) is restored and
            training resumes at the step after the one encoded in the
            checkpoint file name; if no checkpoint exists, training starts at
            step 0.

    Prints progress every 10 steps, writes summaries every 100 steps and
    saves a checkpoint to ``FLAGS.train_dir`` every 1000 steps and at the
    final step. Prints a message and does no training when the resume step is
    already at or beyond ``FLAGS.max_steps``.

    Raises:
        AssertionError: If the loss becomes NaN.
    """
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        # Get images and labels for Adience.
        images, labels = adience.distorted_inputs()
        print("distorted images")

        # Build a Graph that computes the logits predictions from the
        # inference model.
        print('call inference')
        logits = adience.inference(images)

        # Calculate loss.
        print('call loss')
        loss = adience.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        print('train_op')
        train_op = adience.train(loss, global_step)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()

        # Start running operations on the Graph.
        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        # First step to run; stays 0 unless a checkpoint is restored, so it is
        # always defined even when resuming finds no checkpoint.
        load_step = 0

        # Create a saver.
        if not train_continue:
            saver = tf.train.Saver(tf.all_variables())
        else:
            # Map trainable variables to their moving-average names so the
            # averaged values from the checkpoint are loaded into them; every
            # other variable is looked up under its own op name.
            # NOTE(review): if adience.train creates moving-average shadow
            # variables, their op names may coincide with these average names
            # and overwrite entries in this dict -- verify. This saver is also
            # the one used for saving below.
            variable_averages = tf.train.ExponentialMovingAverage(
                adience.MOVING_AVERAGE_DECAY)
            trainable_variables = tf.trainable_variables()
            variables_to_restore = {}
            for v in tf.all_variables():
                if v in trainable_variables:
                    restore_name = variable_averages.average_name(v)
                else:
                    restore_name = v.op.name
                variables_to_restore[restore_name] = v
            saver = tf.train.Saver(variables_to_restore)

            ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
            if ckpt and ckpt.model_checkpoint_path:
                print("Checkpoint found")
                # Restores from checkpoint
                saver.restore(sess, ckpt.model_checkpoint_path)
                # Assuming model_checkpoint_path looks something like:
                #     /my-favorite-path/adience_train/model.ckpt-0,
                # extract the saved step from it and resume at the next one.
                saved_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
                load_step = int(saved_step) + 1
                print("Start from step: {}".format(load_step))
            else:
                print('No checkpoint file found')

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir,
                                                graph_def=sess.graph_def)

        # Report only when there is genuinely nothing left to train (a
        # for/else clause would also fire after every normal run).
        if load_step >= FLAGS.max_steps:
            print("Step already over limit: {}".format(FLAGS.max_steps))
            return

        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')

        for step in range(load_step, FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically and at the last step.
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                saver.save(sess, checkpoint_path, global_step=step)