def evaluate():
  """Eval CIFAR-10 for a number of steps."""
  with tf.Graph().as_default() as g:
    # Get images and labels for CIFAR-10.
    eval_data = FLAGS.eval_data == 'test'
    images, labels = cifar10.inputs(eval_data=eval_data)

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate predictions.
    top_k_op = tf.nn.in_top_k(logits, labels, 1)

    # Restore the moving average version of the learned variables for eval.
    variable_averages = tf.train.ExponentialMovingAverage(
        cifar10.MOVING_AVERAGE_DECAY)
    variables_to_restore = variable_averages.variables_to_restore()
    saver = tf.train.Saver(variables_to_restore)

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()

    summary_writer = tf.train.SummaryWriter(FLAGS.eval_dir, g)

    while True:
      for i in range(20):
        eval_once(saver, summary_writer, top_k_op, summary_op, i)
      if FLAGS.run_once:
        break
      time.sleep(FLAGS.eval_interval_secs)
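
# `eval_once` is called by the evaluate() snippets in this section but never
# defined here. A minimal sketch, modeled on the TensorFlow CIFAR-10
# tutorial's eval_once (FLAGS.checkpoint_dir and FLAGS.num_examples are
# assumed flags; math, numpy as np, and datetime are assumed imported at the
# top of the file as in the other snippets; the trailing index argument passed
# by the first evaluate() above is example-specific and ignored here):
def eval_once(saver, summary_writer, top_k_op, summary_op, i=None):
  """Restores the latest checkpoint and measures precision @ 1."""
  with tf.Session() as sess:
    ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
    if ckpt and ckpt.model_checkpoint_path:
      saver.restore(sess, ckpt.model_checkpoint_path)
      # Extract the global step from the path, e.g. '.../model.ckpt-1000'.
      global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
    else:
      print('No checkpoint file found')
      return

    # Start the input queue runners under a coordinator.
    coord = tf.train.Coordinator()
    threads = []
    try:
      for qr in tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS):
        threads.extend(qr.create_threads(sess, coord=coord, daemon=True,
                                         start=True))

      num_iter = int(math.ceil(FLAGS.num_examples / FLAGS.batch_size))
      true_count = 0  # Counts the number of correct predictions.
      total_sample_count = num_iter * FLAGS.batch_size
      step = 0
      while step < num_iter and not coord.should_stop():
        predictions = sess.run([top_k_op])
        true_count += np.sum(predictions)
        step += 1

      # float() avoids Python 2 integer division.
      precision = true_count / float(total_sample_count)
      print('%s: precision @ 1 = %.3f' % (datetime.now(), precision))

      # Attach the precision to the regular summaries.
      summary = tf.Summary()
      summary.ParseFromString(sess.run(summary_op))
      summary.value.add(tag='Precision @ 1', simple_value=precision)
      summary_writer.add_summary(summary, global_step)
    except Exception as e:  # pylint: disable=broad-except
      coord.request_stop(e)

    coord.request_stop()
    coord.join(threads, stop_grace_period_secs=10)
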
def tower_loss(scope):
  """Calculate the total loss on a single tower running the CIFAR model.
  Args:
    scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'
  Returns:
     Tensor of shape [] containing the total loss for a batch of data
  """
  # Get images and labels for CIFAR-10.
  images, labels = cifar10.distorted_inputs()
  # Build inference Graph.
  logits = cifar10.inference(images)
  # Build the portion of the Graph calculating the losses. Note that we will
  # assemble the total_loss using a custom function below.
  _ = cifar10.loss(logits, labels)
  # Assemble all of the losses for the current tower only.
  losses = tf.get_collection('losses', scope)
  # Calculate the total loss for the current tower.
  total_loss = tf.add_n(losses, name='total_loss')
  # Compute the moving average of all individual losses and the total loss.
  loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
  loss_averages_op = loss_averages.apply(losses + [total_loss])
  # Attach a scalar summary to all individual losses and the total loss; do the
  # same for the averaged version of the losses.
  for l in losses + [total_loss]:
    # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
    # session. This helps the clarity of presentation on tensorboard.
    loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name)
    # Name each loss as '(raw)' and name the moving average version of the loss
    # as the original loss name.
    tf.scalar_summary(loss_name + ' (raw)', l)
    tf.scalar_summary(loss_name, loss_averages.average(l))
  with tf.control_dependencies([loss_averages_op]):
    total_loss = tf.identity(total_loss)
  return total_loss
def tower_loss(scope):
  """Calculate the total loss on a single tower running the CIFAR model.

  Args:
    scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'

  Returns:
     Tensor of shape [] containing the total loss for a batch of data
  """
  # Get images and labels for CIFAR-10.
  images, labels = cifar10.distorted_inputs()

  # Build inference Graph.
  logits = cifar10.inference(images)

  # Build the portion of the Graph calculating the losses. Note that we will
  # assemble the total_loss using a custom function below.
  _ = cifar10.loss(logits, labels)

  # Assemble all of the losses for the current tower only.
  losses = tf.get_collection('losses', scope)

  # Calculate the total loss for the current tower.
  total_loss = tf.add_n(losses, name='total_loss')

  # Attach a scalar summary to all individual losses and the total loss; do the
  # same for the averaged version of the losses.
  for l in losses + [total_loss]:
    # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
    # session. This helps the clarity of presentation on tensorboard.
    loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name)
    tf.contrib.deprecated.scalar_summary(loss_name, l)

  return total_loss
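
# tower_loss() is consumed by a multi-GPU training loop. A minimal sketch of
# that loop and of cross-tower gradient averaging, modeled on the TensorFlow
# cifar10_multi_gpu_train.py tutorial (FLAGS.num_gpus and the optimizer `opt`
# are assumptions, not defined in this section):
def average_gradients(tower_grads):
  """Averages gradients across towers.

  Each entry of tower_grads is the (gradient, variable) list of one tower.
  """
  average_grads = []
  for grad_and_vars in zip(*tower_grads):
    # Average this variable's gradient over all towers.
    grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
    grad = tf.reduce_mean(tf.concat(0, grads), 0)
    # The variable itself is shared, so take it from the first tower.
    average_grads.append((grad, grad_and_vars[0][1]))
  return average_grads


tower_grads = []
for i in range(FLAGS.num_gpus):
  with tf.device('/gpu:%d' % i):
    with tf.name_scope('%s_%d' % (cifar10.TOWER_NAME, i)) as scope:
      loss = tower_loss(scope)
      # Reuse variables so every tower shares the same parameters.
      tf.get_variable_scope().reuse_variables()
      tower_grads.append(opt.compute_gradients(loss))
grads = average_gradients(tower_grads)
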
def evaluate():
  """Eval CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    # Get images and labels for CIFAR-10.
    eval_data = FLAGS.eval_data == 'test'
    images, labels = cifar10.inputs(eval_data=eval_data)

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate predictions.
    top_k_op = tf.nn.in_top_k(logits, labels, 1)

    # Restore the moving average version of the learned variables for eval;
    # non-trainable variables are restored under their own names.
    variable_averages = tf.train.ExponentialMovingAverage(
        cifar10.MOVING_AVERAGE_DECAY)
    variables_to_restore = {}
    for v in tf.all_variables():
      if v in tf.trainable_variables():
        restore_name = variable_averages.average_name(v)
      else:
        restore_name = v.op.name
      variables_to_restore[restore_name] = v
    saver = tf.train.Saver(variables_to_restore)

    while True:
      eval_once(saver, top_k_op)
      if FLAGS.run_once:
        break
      time.sleep(FLAGS.eval_interval_secs)
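
# The hand-built restore map above is equivalent to the one-line helper used
# in the first evaluate() of this section:
#
#   variables_to_restore = variable_averages.variables_to_restore()
#   saver = tf.train.Saver(variables_to_restore)
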
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.contrib.framework.get_or_create_global_step()

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()
    #images, labels = cifar10.inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    class _LoggerHook(tf.train.SessionRunHook):
      """Logs loss and runtime."""

      def begin(self):
        self._step = -1

      def before_run(self, run_context):
        self._step += 1
        self._start_time = time.time()
        return tf.train.SessionRunArgs(loss)  # Asks for loss value.

      def after_run(self, run_context, run_values):
        duration = time.time() - self._start_time
        loss_value = run_values.results
        if self._step % 10 == 0:
          num_examples_per_step = FLAGS.batch_size
          examples_per_sec = num_examples_per_step / duration
          sec_per_batch = float(duration)

          format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
          print (format_str % (datetime.now(), self._step, loss_value,
                               examples_per_sec, sec_per_batch))

    with tf.train.MonitoredTrainingSession(
        checkpoint_dir=FLAGS.train_dir,
        hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
               tf.train.NanTensorHook(loss),
               _LoggerHook()],
        config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)) as mon_sess:
      while not mon_sess.should_stop():
        mon_sess.run(train_op)
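
# A typical entry point for the train() snippets in this section, following
# the TensorFlow CIFAR-10 tutorial (a sketch; assumes the cifar10 module and
# the FLAGS used above):
def main(argv=None):  # pylint: disable=unused-argument
  cifar10.maybe_download_and_extract()
  if tf.gfile.Exists(FLAGS.train_dir):
    tf.gfile.DeleteRecursively(FLAGS.train_dir)
  tf.gfile.MakeDirs(FLAGS.train_dir)
  train()


if __name__ == '__main__':
  tf.app.run()
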
def evaluate():
    images, labels = cifar10.inputs(eval_data=True)
    logits = cifar10.inference(images)
    top_k_op = tf.nn.in_top_k(logits, labels, 1)

    variable_averages = tf.train.ExponentialMovingAverage(cifar10.MOVING_AVERAGE_DECAY)
    variables_to_restore = {}
    for v in tf.all_variables():
        if v in tf.trainable_variables():
            restore_name = variable_averages.average_name(v)
        else:
            restore_name = v.op.name
        variables_to_restore[restore_name] = v
    saver = tf.train.Saver(variables_to_restore)
    eval_once(saver, top_k_op)
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)
    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()
    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)
    # Calculate loss.
    loss = cifar10.loss(logits, labels)
    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)
    # Create a saver.
    saver = tf.train.Saver(tf.all_variables())
    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()
    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()
    # Start running operations on the Graph.
    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)
    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)
    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir,
                                            graph_def=sess.graph_def)
    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time
      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)
        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print (format_str % (datetime.now(), step, loss_value,
                             examples_per_sec, sec_per_batch))
      if step % 100 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)
      # Save the model checkpoint periodically.
      if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
def train():
    # ops
    global_step = tf.Variable(0, trainable=False)
    images, labels = cifar10.distorted_inputs()
    logits = cifar10.inference(tf.image.resize_images(
        images, cifar10.IMAGE_SIZE, cifar10.IMAGE_SIZE))
    loss = cifar10.loss(logits, labels)
    train_op = cifar10.train(loss, global_step)
    summary_op = tf.merge_all_summaries()

    with tf.Session() as sess:
        saver = tf.train.Saver(tf.all_variables(), max_to_keep=21)
        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir)

        # restore or initialize variables
        ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            sess.run(tf.initialize_all_variables())

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        start = sess.run(global_step)
        for step in xrange(start, FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            print '%d: %f (%.3f sec/batch)' % (step, loss_value, duration)

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)
            if step % 500 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    eval_data = FLAGS.eval_data == 'test'
    #timages, tlabels = cifar10.inputs(eval_data=eval_data)
    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    #tlogits = cifar10.inference(timages)
    # Calculate loss.
    top_k_op = tf.nn.in_top_k(logits, labels, 1)
    loss = cifar10.loss(logits, labels)
    #precision = tf.Variable(0.8, name='precision')
    #tf.scalar_summary('accuracy', precision)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    # Create a saver.
    saver = tf.train.Saver(tf.all_variables())

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()

    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()

    # Start running operations on the Graph.
    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)
    sess.graph.finalize()

    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 100 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)

        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print (format_str % (datetime.now(), step, loss_value,
                             examples_per_sec, sec_per_batch))

        # Measure precision @ 1 over a fixed number of batches.
        num_iter = int(math.ceil(FLAGS.num_examples / FLAGS.batch_size))
        true_count = 0  # Counts the number of correct predictions.
        total_sample_count = num_iter * FLAGS.batch_size
        i_step = 0
        while i_step < num_iter:
          predictions = sess.run([top_k_op])
          true_count += np.sum(predictions)
          i_step += 1

        # Compute precision @ 1; float() avoids Python 2 integer division.
        prec = true_count / float(total_sample_count)
        print(prec)

        # Attach the accuracy to the regular summaries.
        summary = tf.Summary()
        summary.ParseFromString(sess.run(summary_op))
        summary.value.add(tag='accuracy', simple_value=prec)
        summary_writer.add_summary(summary, step)

      # Save the model checkpoint periodically.
      if step % 100 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)


    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    # Create a saver.
    saver = tf.train.Saver(tf.all_variables())

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()

    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()

    # Start running operations on the Graph.
    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir)

    # One additional writer per snapshot directory
    # (FLAGS.train_dir0 .. FLAGS.train_dir19).
    snapshot_writers = [
        tf.train.SummaryWriter(getattr(FLAGS, 'train_dir%d' % i))
        for i in range(20)]


    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'


      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)

        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print (format_str % (datetime.now(), step, loss_value,
                             examples_per_sec, sec_per_batch))

      if step % 100 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)
        for writer in snapshot_writers:
          writer.add_summary(summary_str, step)

      # Save the model checkpoint periodically.
      # if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
      #   checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
      #   saver.save(sess, checkpoint_path, global_step=step/100)

      # Hard-coded snapshot schedule: every 100 steps from 100 to 2000, save
      # into the matching snapshot directory (train_dir0 .. train_dir19).
      if step % 100 == 0 and 100 <= step <= 2000:
        snapshot_dir = getattr(FLAGS, 'train_dir%d' % (step // 100 - 1))
        checkpoint_path = os.path.join(snapshot_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
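
# The snapshot directories above (FLAGS.train_dir0 .. train_dir19) are not
# defined in this section. A hypothetical set of flag definitions (the paths
# are placeholders; train_dir, batch_size, etc. are assumed defined as in the
# other snippets):
for i in range(20):
  tf.app.flags.DEFINE_string('train_dir%d' % i, '/tmp/cifar10_train%d' % i,
                             """Snapshot directory for step %d.""" % ((i + 1) * 100))
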
def train():
  """Train CIFAR-10 for a number of steps."""
  ps_hosts = FLAGS.ps_hosts.split(',')
  worker_hosts = FLAGS.worker_hosts.split(',')
  print ('PS hosts are: %s' % ps_hosts)
  print ('Worker hosts are: %s' % worker_hosts)

  server = tf.train.Server(
      {'ps': ps_hosts, 'worker': worker_hosts},
      job_name=FLAGS.job_name,
      task_index=FLAGS.task_id)

  if FLAGS.job_name == 'ps':
    # `ps` jobs wait for incoming connections from the workers.
    server.join()

  is_chief = (FLAGS.task_id == 0)
  if is_chief:
    if tf.gfile.Exists(FLAGS.train_dir):
      tf.gfile.DeleteRecursively(FLAGS.train_dir)
    tf.gfile.MakeDirs(FLAGS.train_dir)

  """Train CIFAR-10 for a number of steps."""
  cluster = tf.train.ClusterSpec({'ps': ps_hosts, 'worker': worker_hosts})
  device_setter = tf.train.replica_device_setter(cluster=cluster)
  with tf.device(device_setter):
    global_step = tf.Variable(0, trainable=False)

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / FLAGS.batch_size
    decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

    # Decay the learning rate exponentially based on the number of steps.
    lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                    global_step,
                                    decay_steps,
                                    LEARNING_RATE_DECAY_FACTOR,
                                    staircase=True)
    tf.scalar_summary('learning_rate', lr)
    opt = tf.train.GradientDescentOptimizer(lr)


    # Track the moving averages of all trainable variables.
    exp_moving_averager = tf.train.ExponentialMovingAverage(
        MOVING_AVERAGE_DECAY, global_step)
    variables_to_average = (
        tf.trainable_variables() + tf.moving_average_variables())

    opt = tf.train.SyncReplicasOptimizer(
        opt,
        replicas_to_aggregate=len(worker_hosts),
        replica_id=FLAGS.task_id,
        total_num_replicas=len(worker_hosts),
        variable_averages=exp_moving_averager,
        variables_to_average=variables_to_average)


    # Compute gradients with respect to the loss.
    grads = opt.compute_gradients(loss)

    # Add histograms for gradients.
    for grad, var in grads:
      if grad is not None:
        tf.histogram_summary(var.op.name + '/gradients', grad)

    apply_gradients_op = opt.apply_gradients(grads, global_step=global_step)

    with tf.control_dependencies([apply_gradients_op]):
      train_op = tf.identity(loss, name='train_op')


    chief_queue_runners = [opt.get_chief_queue_runner()]
    init_tokens_op = opt.get_init_tokens_op()

    saver = tf.train.Saver()
    # We run the summaries in the same thread as the training operations by
    # passing in None for summary_op to avoid a summary_thread being started.
    # Running summaries and training operations in parallel could run out of
    # GPU memory.
    sv = tf.train.Supervisor(is_chief=is_chief,
                             logdir=FLAGS.train_dir,
                             init_op=tf.initialize_all_variables(),
                             summary_op=tf.merge_all_summaries(),
                             global_step=global_step,
                             saver=saver,
                             save_model_secs=60)

    tf.logging.info('%s Supervisor' % datetime.now())

    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=FLAGS.log_device_placement)

    print ("Before session init")
    # Get a session.
    sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)
    print ("Before session init done")

    # Start the queue runners.
    queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
    sv.start_queue_runners(sess, queue_runners)
    print ('Started %d queues for processing input data.' % len(queue_runners))

    sv.start_queue_runners(sess, chief_queue_runners)
    sess.run(init_tokens_op)

    print ('Start training')
    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value, gs = sess.run([train_op, loss, global_step])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)

        format_str = ('%s: step %d (global_step %d), loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print (format_str % (datetime.now(), step, gs, loss_value,
                             examples_per_sec, sec_per_batch))

    if is_chief:
        saver.save(sess,
                   os.path.join(FLAGS.train_dir, 'model.ckpt'),
                   global_step=global_step)
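
# The distributed snippets rely on cluster flags that are not defined in this
# section. A hypothetical set of definitions (host lists are placeholders):
tf.app.flags.DEFINE_string('ps_hosts', 'localhost:2222',
                           """Comma-separated list of parameter server hosts.""")
tf.app.flags.DEFINE_string('worker_hosts', 'localhost:2223,localhost:2224',
                           """Comma-separated list of worker hosts.""")
tf.app.flags.DEFINE_string('job_name', 'worker',
                           """Either 'ps' or 'worker'.""")
tf.app.flags.DEFINE_integer('task_id', 0,
                            """Index of this task within its job.""")
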
    reader = tf.TextLineReader()
    key, value = reader.read(filename_queue)


    batch_size = 128
    min_fraction_of_examples_in_queue = 0.4
    num_examples_per_epoch = 50000
    min_queue_examples = int(num_examples_per_epoch *
                             min_fraction_of_examples_in_queue)
    images_batch, label_batch = tf.train.shuffle_batch_join(
        [read_example(value) for _ in range(9)],
        batch_size=batch_size,
        capacity=min_queue_examples + 3 * batch_size,
        min_after_dequeue=min_queue_examples)

    logits = cifar10.inference(images_batch)
    loss = cifar10.loss(logits, label_batch)

    global_step = tf.Variable(0, trainable=False)
    train_op = cifar10.train(loss, global_step)

    saver = tf.train.Saver(tf.all_variables())

    summary_op = tf.merge_all_summaries() 

    init = tf.initialize_all_variables()

    sess = tf.Session()


    summary_writer = tf.train.SummaryWriter('./train', sess.graph)
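
# `read_example` is used by the fragment above but never defined in this
# section. A hypothetical parser consistent with the shuffle_batch_join call
# (it assumes each CSV line holds an integer label followed by 32*32*3 float
# pixel values; the record layout is an assumption, not part of the original):
def read_example(value):
  record_defaults = [[0]] + [[0.0]] * (32 * 32 * 3)
  cols = tf.decode_csv(value, record_defaults=record_defaults)
  label = cols[0]
  # Pack the pixel columns back into an HWC image tensor.
  image = tf.reshape(tf.pack(cols[1:]), [32, 32, 3])
  return image, label
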
import os
import urllib

import tensorflow as tf
from flask import Flask, request

import cifar10

cifar10.NUM_CLASSES = 6

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('checkpoint_path', '/tmp/model.ckpt',
                           """Directory where to read model checkpoints.""")
tf.app.flags.DEFINE_string('download_url', 'http://',
                           """URL from which to download the model checkpoint.""")
tf.app.flags.DEFINE_integer('image_size', 32,
                           """Image size.""")
tf.app.flags.DEFINE_integer('port', 5000,
                           """Application port.""")

images = tf.placeholder(tf.float32, shape=(1, FLAGS.image_size, FLAGS.image_size, 3))
logits = tf.nn.softmax(cifar10.inference(images))

sess = tf.Session()
saver = tf.train.Saver(tf.all_variables())
if not os.path.isfile(FLAGS.checkpoint_path):
    print 'No checkpoint file found'
    print urllib.urlretrieve(FLAGS.download_url, FLAGS.checkpoint_path)
saver.restore(sess, FLAGS.checkpoint_path)


app = Flask(__name__)
app.debug = True

@app.route('/', methods=['POST'])
def api():
    results = []
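
# The api() handler above is truncated in the source. A hypothetical
# completion (the route name, upload field name, and decode graph below are
# assumptions; real preprocessing such as whitening is omitted):
raw_jpeg = tf.placeholder(tf.string)
decoded_image = tf.image.resize_images(
    tf.image.decode_jpeg(raw_jpeg, channels=3),
    FLAGS.image_size, FLAGS.image_size)

@app.route('/classify', methods=['POST'])
def classify():
    # Decode and resize the uploaded image, then run the softmax model.
    data = request.files['image'].read()
    img = sess.run(decoded_image, feed_dict={raw_jpeg: data})
    scores = sess.run(logits, feed_dict={images: [img]})[0]
    return ','.join('%f' % s for s in scores)
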
def train():
    """Train CIFAR-10 for a number of steps."""
    ps_hosts = FLAGS.ps_hosts.split(',')
    worker_hosts = FLAGS.worker_hosts.split(',')
    print ('PS hosts are: %s' % ps_hosts)
    print ('Worker hosts are: %s' % worker_hosts)

    server = tf.train.Server(
        {'ps': ps_hosts, 'worker': worker_hosts},
        job_name=FLAGS.job_name,
        task_index=FLAGS.task_id)

    if FLAGS.job_name == 'ps':
        server.join()

    is_chief = (FLAGS.task_id == 0)
    if is_chief:
        if tf.gfile.Exists(FLAGS.train_dir):
            tf.gfile.DeleteRecursively(FLAGS.train_dir)
        tf.gfile.MakeDirs(FLAGS.train_dir)
  
    device_setter = tf.train.replica_device_setter(ps_tasks=1)
    with tf.device('/job:worker/task:%d' % FLAGS.task_id):
        with tf.device(device_setter):
            global_step = tf.Variable(0, trainable=False)

            # Get images and labels for CIFAR-10.
            images, labels = cifar10.distorted_inputs()

            # Build a Graph that computes the logits predictions from the
            # inference model.
            logits = cifar10.inference(images)

            # Calculate loss.
            loss = cifar10.loss(logits, labels)
            train_op = cifar10.train(loss, global_step)

            saver = tf.train.Saver()
            # We run the summaries in the same thread as the training operations by
            # passing in None for summary_op to avoid a summary_thread being started.
            # Running summaries and training operations in parallel could run out of
            # GPU memory.
            sv = tf.train.Supervisor(is_chief=is_chief,
                                     logdir=FLAGS.train_dir,
                                     init_op=tf.initialize_all_variables(),
                                     summary_op=tf.merge_all_summaries(),
                                     global_step=global_step,
                                     saver=saver,
                                     save_model_secs=60)

            tf.logging.info('%s Supervisor' % datetime.now())

            sess_config = tf.ConfigProto(allow_soft_placement=True,
                                         log_device_placement=FLAGS.log_device_placement)

            print ("Before session init")
            # Get a session.
            sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)
            print ("Session init done")

            # Start the queue runners.
            queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
            sv.start_queue_runners(sess, queue_runners)
            print ('Started %d queues for processing input data.' % len(queue_runners))
  
            """Train CIFAR-10 for a number of steps."""
            for step in xrange(FLAGS.max_steps):
                start_time = time.time()
                _, loss_value, gs = sess.run([train_op, loss, global_step])
                duration = time.time() - start_time

                assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

                if step % 10 == 0:
                    num_examples_per_step = FLAGS.batch_size
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = float(duration)

                    format_str = ('%s: step %d (global_step %d), loss = %.2f (%.1f examples/sec; %.3f sec/batch)')
                    print (format_str % (datetime.now(), step, gs, loss_value, examples_per_sec, sec_per_batch))

    if is_chief:
        saver.save(sess, os.path.join(FLAGS.train_dir, 'model.ckpt'), global_step=global_step)
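
# Launching the distributed jobs above (hypothetical command lines; the
# script name, hosts, and ports are placeholders):
#   python cifar10_distributed_train.py --job_name=ps --task_id=0 \
#       --ps_hosts=localhost:2222 --worker_hosts=localhost:2223,localhost:2224
#   python cifar10_distributed_train.py --job_name=worker --task_id=0 \
#       --ps_hosts=localhost:2222 --worker_hosts=localhost:2223,localhost:2224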