def evaluate():
  """Eval CIFAR-10 for a number of steps."""
  with tf.Graph().as_default() as g:
    # Get images and labels for CIFAR-10.
    eval_data = FLAGS.eval_data == 'test'
    images, labels = cifar10.inputs(eval_data=eval_data)

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate predictions.
    top_k_op = tf.nn.in_top_k(logits, labels, 1)

    # Restore the moving average version of the learned variables for eval.
    variable_averages = tf.train.ExponentialMovingAverage(
        cifar10.MOVING_AVERAGE_DECAY)
    variables_to_restore = variable_averages.variables_to_restore()
    saver = tf.train.Saver(variables_to_restore)

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()

    summary_writer = tf.train.SummaryWriter(FLAGS.eval_dir, g)

    while True:
      for i in range(20):
        eval_once(saver, summary_writer, top_k_op, summary_op, i)
      if FLAGS.run_once:
        break
      time.sleep(FLAGS.eval_interval_secs)
def tower_loss(scope):
  """Calculate the total loss on a single tower running the CIFAR model.
  Args:
    scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'
  Returns:
     Tensor of shape [] containing the total loss for a batch of data
  """
  # Get images and labels for CIFAR-10.
  images, labels = cifar10.distorted_inputs()
  # Build inference Graph.
  logits = cifar10.inference(images)
  # Build the portion of the Graph calculating the losses. Note that we will
  # assemble the total_loss using a custom function below.
  _ = cifar10.loss(logits, labels)
  # Assemble all of the losses for the current tower only.
  losses = tf.get_collection('losses', scope)
  # Calculate the total loss for the current tower.
  total_loss = tf.add_n(losses, name='total_loss')
  # Compute the moving average of all individual losses and the total loss.
  loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
  loss_averages_op = loss_averages.apply(losses + [total_loss])
  # Attach a scalar summary to all individual losses and the total loss; do the
  # same for the averaged version of the losses.
  for l in losses + [total_loss]:
    # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
    # session. This helps the clarity of presentation on tensorboard.
    loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name)
    # Name each loss as '(raw)' and name the moving average version of the loss
    # as the original loss name.
    tf.scalar_summary(loss_name + ' (raw)', l)
    tf.scalar_summary(loss_name, loss_averages.average(l))
  with tf.control_dependencies([loss_averages_op]):
    total_loss = tf.identity(total_loss)
  return total_loss
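# A hedged sketch of how tower_loss() is typically consumed, following the
# canonical cifar10_multi_gpu_train.py pattern (FLAGS.num_gpus and the
# optimizer choice are assumptions, not part of the snippet above):
tower_grads = []
opt = tf.train.GradientDescentOptimizer(0.1)
for i in xrange(FLAGS.num_gpus):
  with tf.device('/gpu:%d' % i):
    with tf.name_scope('%s_%d' % (cifar10.TOWER_NAME, i)) as scope:
      # All towers share variables; each computes its own loss and gradients.
      loss = tower_loss(scope)
      tf.get_variable_scope().reuse_variables()
      tower_grads.append(opt.compute_gradients(loss))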
# Example 3
def evaluate():
    """Eval CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        # Get images and labels for CIFAR-10.
        eval_data = FLAGS.eval_data == 'test'
        images, labels = cifar10.inputs(eval_data=eval_data)
        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)
        # Calculate predictions.
        top_k_op = tf.nn.in_top_k(logits, labels, 1)
        # Restore the moving average version of the learned variables for eval.
        variable_averages = tf.train.ExponentialMovingAverage(
            cifar10.MOVING_AVERAGE_DECAY)
        variables_to_restore = variable_averages.variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)
        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()
        graph_def = tf.get_default_graph().as_graph_def()
        summary_writer = tf.train.SummaryWriter(FLAGS.eval_dir,
                                                graph_def=graph_def)
        while True:
            eval_once(saver, summary_writer, top_k_op, summary_op)
            if FLAGS.run_once:
                break
            time.sleep(FLAGS.eval_interval_secs)
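# `eval_once` is called throughout these snippets but never defined here; a
# minimal sketch following the canonical cifar10_eval.py tutorial (the
# FLAGS.checkpoint_dir flag is an assumption). Some snippets above pass an
# extra step index; this is the canonical four-argument form:
def eval_once(saver, summary_writer, top_k_op, summary_op):
    """Restore the latest checkpoint and compute precision @ 1 once."""
    with tf.Session() as sess:
        ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            # Extract the step from a path like .../model.ckpt-1000.
            global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
        else:
            print('No checkpoint file found')
            return
        # Start the queue runners and count correct predictions batch by batch.
        coord = tf.train.Coordinator()
        threads = []
        try:
            for qr in tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS):
                threads.extend(qr.create_threads(sess, coord=coord,
                                                 daemon=True, start=True))
            num_iter = int(math.ceil(FLAGS.num_examples / FLAGS.batch_size))
            true_count = 0  # Counts the number of correct predictions.
            total_sample_count = num_iter * FLAGS.batch_size
            step = 0
            while step < num_iter and not coord.should_stop():
                predictions = sess.run([top_k_op])
                true_count += np.sum(predictions)
                step += 1
            precision = true_count / float(total_sample_count)
            print('%s: precision @ 1 = %.3f' % (datetime.now(), precision))
            summary = tf.Summary()
            summary.ParseFromString(sess.run(summary_op))
            summary.value.add(tag='Precision @ 1', simple_value=precision)
            summary_writer.add_summary(summary, global_step)
        except Exception as e:  # pylint: disable=broad-except
            coord.request_stop(e)
        coord.request_stop()
        coord.join(threads, stop_grace_period_secs=10)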
# Example 4
def evaluate():
    """Eval CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        # Get images and labels for CIFAR-10.
        eval_data = FLAGS.eval_data == 'test'
        images, labels = cifar10.inputs(eval_data=eval_data)
        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)
        # Calculate predictions.
        top_k_op = tf.nn.in_top_k(logits, labels, 1)
        # Restore the moving average version of the learned variables for eval.
        variable_averages = tf.train.ExponentialMovingAverage(
            cifar10.MOVING_AVERAGE_DECAY)
        variables_to_restore = {}
        for v in tf.all_variables():
            if v in tf.trainable_variables():
                restore_name = variable_averages.average_name(v)
            else:
                restore_name = v.op.name
            variables_to_restore[restore_name] = v
        saver = tf.train.Saver(variables_to_restore)
        while True:
            eval_once(saver, top_k_op)
            if FLAGS.run_once:
                break
            time.sleep(FLAGS.eval_interval_secs)
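# Note: the restore-name loop above builds by hand the same mapping that
# variable_averages.variables_to_restore() returns in the neighboring
# snippets: trainable variables map to their moving-average names, all other
# variables keep their own names.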
def tower_loss(scope):
  """Calculate the total loss on a single tower running the CIFAR model.

  Args:
    scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'

  Returns:
     Tensor of shape [] containing the total loss for a batch of data
  """
  # Get images and labels for CIFAR-10.
  images, labels = cifar10.distorted_inputs()

  # Build inference Graph.
  logits = cifar10.inference(images)

  # Build the portion of the Graph calculating the losses. Note that we will
  # assemble the total_loss using a custom function below.
  _ = cifar10.loss(logits, labels)

  # Assemble all of the losses for the current tower only.
  losses = tf.get_collection('losses', scope)

  # Calculate the total loss for the current tower.
  total_loss = tf.add_n(losses, name='total_loss')

  # Attach a scalar summary to all individual losses and the total loss; do the
  # same for the averaged version of the losses.
  for l in losses + [total_loss]:
    # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
    # session. This helps the clarity of presentation on tensorboard.
    loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name)
    tf.contrib.deprecated.scalar_summary(loss_name, l)

  return total_loss
def evaluate():
	"""Eval CIFAR-10 for a number of steps."""
	with tf.Graph().as_default():
		# Get images and labels for CIFAR-10.
		eval_data = FLAGS.eval_data == 'test'
		images, labels = cifar10.inputs(eval_data=eval_data)
		# Build a Graph that computes the logits predictions from the
		# inference model.
		logits = cifar10.inference(images)
		# Calculate predictions.
		top_k_op = tf.nn.in_top_k(logits, labels, 1)
		# Restore the moving average version of the learned variables for eval.
		variable_averages = tf.train.ExponentialMovingAverage(
				cifar10.MOVING_AVERAGE_DECAY)
		variables_to_restore = {}
		for v in tf.all_variables():
			if v in tf.trainable_variables():
				restore_name = variable_averages.average_name(v)
			else:
				restore_name = v.op.name
			variables_to_restore[restore_name] = v
		saver = tf.train.Saver(variables_to_restore)
		while True:
			eval_once(saver, top_k_op)
			if FLAGS.run_once:
				break
			time.sleep(FLAGS.eval_interval_secs)
# Example 7
def evaluate():
    """Eval CIFAR-10 for a number of steps."""
    with tf.Graph().as_default() as g:
        # Get images and labels for CIFAR-10.
        eval_data = FLAGS.eval_data == 'test'
        images, labels = cifar10.inputs(eval_data=eval_data)

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate predictions.
        top_k_op = tf.nn.in_top_k(logits, labels, 1)

        # define predict function
        predict_function = tf.argmax(logits, 1)

        variable_averages = tf.train.ExponentialMovingAverage(
            cifar10.MOVING_AVERAGE_DECAY)
        variables_to_restore = variable_averages.variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)
        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()
        summary_writer = tf.train.SummaryWriter(FLAGS.eval_dir, g)

        predict()
def evaluate():
    """Eval CIFAR-10 for a number of steps."""
    with tf.Graph().as_default() as g:
        # Get images and labels for CIFAR-10.
        #eval_data = FLAGS.eval_data == 'test'
        #images, labels = cifar10.inputs(eval_data=eval_data)
        images = secondimage()
        # Build a Graph that computes the logits predictions from the
        # inference model.
        local4 = cifar10.inference(images)
        # with tf.Session() as sess:
        #     print (sess.run([images]))

        # Calculate predictions.
        #top_k_op = tf.nn.in_top_k(logits, labels, 1)

        # Restore the moving average version of the learned variables for eval.
        variable_averages = tf.train.ExponentialMovingAverage(
            cifar10.MOVING_AVERAGE_DECAY)
        variables_to_restore = variable_averages.variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        summary_writer = tf.train.SummaryWriter(FLAGS.eval_dir, g)

        return eval_once(saver, summary_writer, local4, summary_op, images)
# Example 9
def tower_loss(scope):
    """Calculate the total loss on a single tower running the CIFAR model.
  Args:
    scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'
  Returns:
     Tensor of shape [] containing the total loss for a batch of data
  """
    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()
    # Build inference Graph.
    logits = cifar10.inference(images)
    # Build the portion of the Graph calculating the losses. Note that we will
    # assemble the total_loss using a custom function below.
    _ = cifar10.loss(logits, labels)
    # Assemble all of the losses for the current tower only.
    losses = tf.get_collection('losses', scope)
    # Calculate the total loss for the current tower.
    total_loss = tf.add_n(losses, name='total_loss')
    # Compute the moving average of all individual losses and the total loss.
    loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
    loss_averages_op = loss_averages.apply(losses + [total_loss])
    # Attach a scalar summary to all individual losses and the total loss; do the
    # same for the averaged version of the losses.
    for l in losses + [total_loss]:
        # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
        # session. This helps the clarity of presentation on tensorboard.
        loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name)
        # Name each loss as '(raw)' and name the moving average version of the loss
        # as the original loss name.
        tf.scalar_summary(loss_name + ' (raw)', l)
        tf.scalar_summary(loss_name, loss_averages.average(l))
    with tf.control_dependencies([loss_averages_op]):
        total_loss = tf.identity(total_loss)
    return total_loss
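# Note on the closing pattern above: re-binding total_loss through
# tf.identity() under control_dependencies([loss_averages_op]) forces the
# moving-average update to run whenever the returned loss tensor is
# evaluated.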
# Example 10
def main():
    # Get images and labels for CIFAR-10.
    eval_data = FLAGS.eval_data == 'test'
    images, labels = cifar10.inputs(eval_data=eval_data)
    with tf.Session() as sess:
        # Build a Graph that computes the logits predictions from the
        # inference model.
        probabilities = tf.nn.softmax(cifar10.inference(images))

        # Restore the moving average version of the learned variables for eval.
        variable_averages = tf.train.ExponentialMovingAverage(
            cifar10.MOVING_AVERAGE_DECAY)
        variables_to_restore = variable_averages.variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)

        ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        if ckpt and ckpt.model_checkpoint_path:
            # Restores from checkpoint
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print('No checkpoint file found')
            return

        # Start the queue runners.
        coord = tf.train.Coordinator()
        try:
            threads = []
            for qr in tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS):
                threads.extend(
                    qr.create_threads(sess,
                                      coord=coord,
                                      daemon=True,
                                      start=True))

            num_iter = int(math.ceil(FLAGS.num_examples / FLAGS.batch_size))

            submission = []
            true_labels = []

            step = 0
            while step < num_iter and not coord.should_stop():
                submission_batch, true_labels_batch = sess.run(
                    [probabilities, labels])
                submission.append(submission_batch)
                true_labels.append(true_labels_batch)
                step += 1

            submission = np.vstack(submission)
            true_labels = np.concatenate(true_labels)

        except Exception as e:  # pylint: disable=broad-except
            coord.request_stop(e)

        coord.request_stop()
        coord.join(threads, stop_grace_period_secs=10)

    return submission, true_labels
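# A possible follow-up (an assumption, not part of the original): reduce the
# stacked softmax outputs returned by main() to a top-1 accuracy figure.
if __name__ == '__main__':
    probs, y_true = main()
    print('top-1 accuracy: %.4f' % np.mean(np.argmax(probs, axis=1) == y_true))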
# Example 11
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""
            def begin(self):
                self._step = -1

            def before_run(self, run_context):
                self._step += 1
                self._start_time = time.time()
                return tf.train.SessionRunArgs(loss)  # Asks for loss value.

            def after_run(self, run_context, run_values):
                duration = time.time() - self._start_time
                loss_value = run_values.results
                if self._step % 10 == 0:
                    num_examples_per_step = FLAGS.batch_size
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = float(duration)

                    format_str = (
                        '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[
                    tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                    tf.train.NanTensorHook(loss),
                    _LoggerHook()
                ],
                config=tf.ConfigProto(log_device_placement=FLAGS.
                                      log_device_placement)) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
# Example 12
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.contrib.framework.get_or_create_global_step()

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()
    #images, labels = cifar10.inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    class _LoggerHook(tf.train.SessionRunHook):
      """Logs loss and runtime."""

      def begin(self):
        self._step = -1

      def before_run(self, run_context):
        self._step += 1
        self._start_time = time.time()
        return tf.train.SessionRunArgs(loss)  # Asks for loss value.

      def after_run(self, run_context, run_values):
        duration = time.time() - self._start_time
        loss_value = run_values.results
        if self._step % 10 == 0:
          num_examples_per_step = FLAGS.batch_size
          examples_per_sec = num_examples_per_step / duration
          sec_per_batch = float(duration)

          format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
          print (format_str % (datetime.now(), self._step, loss_value,
                               examples_per_sec, sec_per_batch))

    with tf.train.MonitoredTrainingSession(
        checkpoint_dir=FLAGS.train_dir,
        hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
               tf.train.NanTensorHook(loss),
               _LoggerHook()],
        config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)) as mon_sess:
      while not mon_sess.should_stop():
        mon_sess.run(train_op)
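# Optional extension (a sketch, not from the original): summaries could be
# persisted from the monitored session by appending one more hook to the
# hooks list above, e.g.
#   tf.train.SummarySaverHook(save_steps=100, output_dir=FLAGS.train_dir,
#                             summary_op=tf.merge_all_summaries())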
def multilevel_train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        init = tf.initialize_all_variables()
        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        # train_op = cifar10.train(loss, global_step)
        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        # Create a saver.
        # saver = tf.train.Saver(tf.all_variables())
        model_dir_exp = os.path.expanduser(
            "/home/chenz/Workspace/Analysis/Parameters")
        ckpt_file = "model.ckpt-1500"
        meta_file = "model.ckpt-1500.meta"
        saver = tf.train.import_meta_graph(
            os.path.join(model_dir_exp, meta_file))
        #saver.restore(tf.get_default_session(), os.path.join(model_dir_exp, ckpt_file))
        saver.restore(sess, os.path.join(model_dir_exp, ckpt_file))
        #saver = load_model("/home/chenz/Workspace/Analysis/Parameters", "model.ckpt-1500.meta", "model.ckpt-1500")

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        # Build an initialization operation to run below.

        # Start running operations on the Graph.
        #sess = tf.Session(config=tf.ConfigProto(
        #    log_device_placement=FLAGS.log_device_placement))
        #sess.run(init)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        loss_value = sess.run(loss)
        print(loss_value)
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

    images, labels = cifar10.distorted_inputs()

    logits = cifar10.inference(images)
    
    ## EDIT: Softmax activation
    softmax = tf.nn.softmax(logits)

    loss = cifar10.loss(softmax, labels)
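    # Note: the canonical cifar10.loss() computes softmax cross-entropy from
    # its first argument, so feeding softmax(logits) here applies softmax
    # twice; the other train() snippets pass the raw logits instead.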
    train_op = cifar10.train(loss, global_step)
    saver = tf.train.Saver(tf.all_variables())
    summary_op = tf.merge_all_summaries()
    init = tf.initialize_all_variables()


    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)
    
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)

        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print (format_str % (datetime.now(), step, loss_value,
                             examples_per_sec, sec_per_batch))

      if step % 100 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)

      # Save the model checkpoint periodically.
      if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
# Example 15
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)
    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()
    print(labels)
    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)
    # Calculate loss.
    loss = cifar10.loss(logits, labels)
    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)
    # Create a saver.
    saver = tf.train.Saver(tf.all_variables())
    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()
    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()
    # Start running operations on the Graph.
    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)
    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)
    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir,
                                            graph_def=sess.graph_def)
    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time
      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)
        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print (format_str % (datetime.now(), step, loss_value,
                             examples_per_sec, sec_per_batch))
      if step % 100 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)
      # Save the model checkpoint periodically.
      if step % 10 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        print("module save")
        saver.save(sess, checkpoint_path, global_step=step)
# Example 16
def evaluate():
    images, labels = cifar10.inputs(eval_data=True)
    logits = cifar10.inference(images)
    top_k_op = tf.nn.in_top_k(logits, labels, 1)

    variable_averages = tf.train.ExponentialMovingAverage(cifar10.MOVING_AVERAGE_DECAY)
    variables_to_restore = {}
    for v in tf.all_variables():
        if v in tf.trainable_variables():
            restore_name = variable_averages.average_name(v)
        else:
            restore_name = v.op.name
        variables_to_restore[restore_name] = v
    saver = tf.train.Saver(variables_to_restore)
    eval_once(saver, top_k_op)
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)
    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()
    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)
    # Calculate loss.
    loss = cifar10.loss(logits, labels)
    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)
    # Create a saver.
    saver = tf.train.Saver(tf.all_variables())
    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()
    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()
    # Start running operations on the Graph.
    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)
    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)
    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir,
                                            graph_def=sess.graph_def)
    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time
      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)
        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print (format_str % (datetime.now(), step, loss_value,
                             examples_per_sec, sec_per_batch))
      if step % 100 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)
      # Save the model checkpoint periodically.
      if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

    #GETTING THE TRAINING IMAGES
    images, labels = cifar10.distorted_inputs()
    # DATA FOR GRAPH.
    logits = cifar10.inference(images)
    # LOSS FUNCTION
    loss = cifar10.loss(logits, labels)

    # CREATING AND RUNNING A TENSORBOARD GRAPH
    train_op = cifar10.train(loss, global_step)
    saver = tf.train.Saver(tf.all_variables())
    summary_op = tf.merge_all_summaries()
    init = tf.initialize_all_variables()
    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    # START THE QUEUE
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time
      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)
        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print (format_str % (datetime.now(), step, loss_value,
                             examples_per_sec, sec_per_batch))
      if step % 100 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)
      # SAVE CHECKPOINT TO EVALUATE PERIODICALLY
      if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
def evaluate():
    with tf.Graph().as_default() as g:
        # GET THE TEST IMAGES
        eval_data = FLAGS.eval_data == 'test'
        images, labels = cifar10.inputs(eval_data=eval_data)
        logits = cifar10.inference(images)
        top_k_op = tf.nn.in_top_k(logits, labels, 1)
        variable_averages = tf.train.ExponentialMovingAverage(
            cifar10.MOVING_AVERAGE_DECAY)
        variables_to_restore = variable_averages.variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)
        # SUMMARY FOR GRAPH
        summary_op = tf.merge_all_summaries()
        summary_writer = tf.train.SummaryWriter(FLAGS.eval_dir, g)

        while True:
            eval_once(saver, summary_writer, top_k_op, summary_op)
            if FLAGS.run_once:
                break
            time.sleep(FLAGS.eval_interval_secs)
# Example 20
def train():
    # ops
    global_step = tf.Variable(0, trainable=False)
    images, labels = cifar10.distorted_inputs()
    logits = cifar10.inference(tf.image.resize_images(images, cifar10.IMAGE_SIZE, cifar10.IMAGE_SIZE))
    loss = cifar10.loss(logits, labels)
    train_op = cifar10.train(loss, global_step)
    summary_op = tf.merge_all_summaries()

    with tf.Session() as sess:
        saver = tf.train.Saver(tf.all_variables(), max_to_keep=21)
        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir)

        # restore or initialize variables
        ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            sess.run(tf.initialize_all_variables())

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        start = sess.run(global_step)
        for step in xrange(start, FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            print('%d: %f (%.3f sec/batch)' % (step, loss_value, duration))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)
            if step % 500 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
def evaluate():
    """Eval CIFAR-10 for a number of steps."""
    f = open('/mnt/eval_output.log', 'w')
    f.write("TrainingStep\tPrecision\n")
    f.close()
    with tf.Graph().as_default() as g:
        # Get images and labels for CIFAR-10.
        eval_data = FLAGS.eval_data == 'test'
        images, labels = cifar10.inputs(eval_data=eval_data)

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate predictions.
        top_k_op = tf.nn.in_top_k(logits, labels, 1)

        # Restore the moving average version of the learned variables for eval.
        #variable_averages = tf.train.ExponentialMovingAverage(
        #   cifar10.MOVING_AVERAGE_DECAY)
        #variables_to_restore = variable_averages.variables_to_restore()
        #saver = tf.train.Saver(variables_to_restore)
        saver = tf.train.Saver(tf.global_variables())

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        summary_writer = tf.train.SummaryWriter(FLAGS.eval_dir, g)
        training_step = 0
        while True:
            eval_once(saver, summary_writer, top_k_op, summary_op,
                      training_step)
            training_step = training_step + FLAGS.checkpointing_step
            if (training_step > FLAGS.trained_steps):
                break
            if FLAGS.run_once:
                break
# Example 22
import os

cifar10.NUM_CLASSES = 6

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('checkpoint_path', '/tmp/model.ckpt',
                           """Path of the model checkpoint file to read.""")
tf.app.flags.DEFINE_string('download_url', 'http://',
                           """URL from which to download the checkpoint.""")
tf.app.flags.DEFINE_integer('image_size', 32,
                           """Image size.""")
tf.app.flags.DEFINE_integer('port', 5000,
                           """Application port.""")

images = tf.placeholder(tf.float32, shape=(1, FLAGS.image_size, FLAGS.image_size, 3))
logits = tf.nn.softmax(cifar10.inference(images))

sess = tf.Session()
saver = tf.train.Saver(tf.all_variables())
if not os.path.isfile(FLAGS.checkpoint_path):
    print('No checkpoint file found')
    print(urllib.urlretrieve(FLAGS.download_url, FLAGS.checkpoint_path))
saver.restore(sess, FLAGS.checkpoint_path)


app = Flask(__name__)
app.debug = True

@app.route('/', methods=['POST'])
def api():
    results = []
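    # Hypothetical continuation (an assumption; the source breaks off after
    # `results = []`): decode a POSTed image, resize it to the placeholder
    # shape, and return the class probabilities. `request`, `jsonify`,
    # `Image` (PIL) and `np` are assumed to be imported.
    img = Image.open(request.files['image']).resize(
        (FLAGS.image_size, FLAGS.image_size))
    batch = np.asarray(img, dtype=np.float32)[None, :, :, :]
    probs = sess.run(logits, feed_dict={images: batch})
    results.append(probs[0].tolist())
    return jsonify(results=results)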
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    eval_data = FLAGS.eval_data == 'test'
    #timages, tlabels = cifar10.inputs(eval_data=eval_data)
    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    #tlogits = cifar10.inference(timages)
    # Calculate loss.
    top_k_op = tf.nn.in_top_k(logits, labels, 1)
    loss = cifar10.loss(logits, labels)
    #precision = tf.Variable(0.8, name='precision')
    #tf.scalar_summary('accuracy', precision)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    # Create a saver.
    saver = tf.train.Saver(tf.all_variables())

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()

    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()

    # Start running operations on the Graph.
    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)
    sess.graph.finalize()

    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 100 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)

        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print (format_str % (datetime.now(), step, loss_value,
                             examples_per_sec, sec_per_batch))

        # Evaluate precision @ 1 over the eval set.
        num_iter = int(math.ceil(FLAGS.num_examples / FLAGS.batch_size))
        true_count = 0  # Counts the number of correct predictions.
        total_sample_count = num_iter * FLAGS.batch_size
        i_step = 0
        while i_step < num_iter:
          predictions = sess.run([top_k_op])
          true_count += np.sum(predictions)
          i_step += 1

        # Compute precision @ 1.
        prec = true_count / float(total_sample_count)
        print(prec)

        summary = tf.Summary()
        summary.ParseFromString(sess.run(summary_op))
        summary.value.add(tag='accuracy', simple_value=prec)
        summary_writer.add_summary(summary, step)

      # Save the model checkpoint periodically.
      if step % 100 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
# Example 24
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    
    global_step = tf.Variable(0, trainable=False)

    # Get images and labels for CIFAR-10.
    #with tf.device('/gpu:%d' % FLAGS.gpu_number):
    images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)
    loss_per_batch = cifar10.loss_per_batch(logits, labels)
    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step, FLAGS.gpu_number)

    # Create a saver.
    saver = tf.train.Saver(tf.all_variables(), max_to_keep=None)

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()

    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()

    # Start running operations on the Graph.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth=True
    config.allow_soft_placement=True
    config.log_device_placement=FLAGS.log_device_placement
    sess = tf.Session(config=config)
    
    tf.train.write_graph(sess.graph_def, FLAGS.train_dir,
                         "cifar10_train.pb", False)
    sess.run(init)

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

    train_start_time = time.time()
    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value, logits_value, loss_per_batch_value, labels_value = sess.run([train_op, loss, logits, loss_per_batch, labels])
      duration = time.time() - start_time
      #logits_str = print_logits(logits_value, labels_value, loss_per_batch_value)
      
      #with open(os.path.join(FLAGS.train_dir, 'logits_%d.log' % step),'w') as f:
      #  f.write("%s" % logits_str)

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)

        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        log_str  = (format_str % (datetime.now(), step, loss_value,
                                  examples_per_sec, sec_per_batch))
        print(log_str)
        with open(os.path.join(FLAGS.train_dir, 'train.log'),'a+') as f:
          f.write("%s\n" % log_str)

      if step % 500 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        save_path = saver.save(sess, checkpoint_path, global_step=step)
    train_duration = time.time() - train_start_time

    log_str = ("Finishing. Training %d batches of %d images took %fs\n" %
               (FLAGS.max_steps, FLAGS.batch_size, float(train_duration)))
    print(log_str)
    with open(os.path.join(FLAGS.train_dir, 'train.log'),'a+') as f:
      f.write("%s" % log_str)
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()

        # Start running operations on the Graph.
        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        if FLAGS.checkpoint_dir is not None:
            ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
            if ckpt and ckpt.model_checkpoint_path:
                print("checkpoint path is %s" % ckpt.model_checkpoint_path)
                tf.train.Saver().restore(sess, ckpt.model_checkpoint_path)

        # Start the queue runners.
        print("FLAGS.checkpoint_dir is %s" % FLAGS.checkpoint_dir)
        tf.train.start_queue_runners(sess=sess)
        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        cur_step = sess.run(global_step)
        print("current step is %s" % cur_step)
        interrupt_check_duration = 0.0
        elapsed_time = time.time()
        flag = 0
        for step in xrange(cur_step, FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time
            interrupt_check_duration += duration
            if float(interrupt_check_duration) > 5.0:
                print("checking for interruption: %s", interrupt_check_duration)
                if decision_for_migration():
                    print("have to migrate")
                    checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                    print("checkpoint path is %s" % checkpoint_path)
                    saver.save(sess, checkpoint_path, global_step=step)
                    random_id = generate_random_prefix()
                    start_new_instance(checkpoint_path, step, random_id)
                    upload_checkpoint_to_s3(checkpoint_path, step, "mj-bucket-1", random_id)
                    break
                else:
                    print("not interrupted")
                interrupt_check_duration = 0.0
            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                              'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
            elapsed = (int(time.time() - elapsed_time))
            if elapsed % 300 == 0 and flag == 0:
                print("uploading current status")
                uploading_current_status_to_rds(step)
                flag = 1
            elif elapsed % 300 != 0 and flag == 1:
                flag = 0
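# The migration helpers called above (decision_for_migration,
# generate_random_prefix, start_new_instance, upload_checkpoint_to_s3,
# uploading_current_status_to_rds) are external to this snippet. A minimal
# sketch of the interruption check, assuming an EC2 spot instance and an
# available `requests` import:
def decision_for_migration():
    try:
        r = requests.get(
            'http://169.254.169.254/latest/meta-data/spot/termination-time',
            timeout=0.5)
        # A 200 response means a termination time has been scheduled.
        return r.status_code == 200
    except requests.exceptions.RequestException:
        return False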
# Example 26
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        with tf.variable_scope("model") as scope:
            global_step = tf.Variable(0, trainable=False)

            # Get images and labels for CIFAR-10.
            images, labels = cifar10.distorted_inputs()
            images_eval, labels_eval = cifar10.inputs(eval_data=True)

            # Build a Graph that computes the logits predictions from the
            # inference model.
            logits = cifar10.inference(images)
            scope.reuse_variables()
            logits_eval = cifar10.inference(images_eval)

            # Calculate loss.
            loss = cifar10.loss(logits, labels)

            # For evaluation
            top_k = tf.nn.in_top_k(logits, labels, 1)
            top_k_eval = tf.nn.in_top_k(logits_eval, labels_eval, 1)

            # Add precision summary
            summary_train_prec = tf.placeholder(tf.float32)
            summary_eval_prec = tf.placeholder(tf.float32)
            tf.scalar_summary('precision/train', summary_train_prec)
            tf.scalar_summary('precision/eval', summary_eval_prec)

            # Build a Graph that trains the model with one batch of examples and
            # updates the model parameters.
            train_op = cifar10.train(loss, global_step)

            # Create a saver.
            saver = tf.train.Saver(tf.all_variables())

            # Build the summary operation based on the TF collection of Summaries.
            summary_op = tf.merge_all_summaries()

            # Build an initialization operation to run below.
            init = tf.initialize_all_variables()

            # Start running operations on the Graph.
            sess = tf.Session(config=tf.ConfigProto(
                log_device_placement=FLAGS.log_device_placement))
            sess.run(init)

            # Start the queue runners.
            tf.train.start_queue_runners(sess=sess)

            summary_writer = tf.train.SummaryWriter(FLAGS.train_dir,
                                                    graph_def=sess.graph_def)

            for step in xrange(FLAGS.max_steps):
                start_time = time.time()
                _, loss_value = sess.run([train_op, loss])
                duration = time.time() - start_time

                assert not np.isnan(
                    loss_value), 'Model diverged with loss = NaN'

                if step % 10 == 0:
                    num_examples_per_step = FLAGS.batch_size
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = float(duration)

                    format_str = (
                        '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
                    print(format_str % (datetime.now(), step, loss_value,
                                        examples_per_sec, sec_per_batch))

                EVAL_STEP = 10
                EVAL_NUM_EXAMPLES = 1024
                if step % EVAL_STEP == 0:
                    prec_train = evaluate_set(sess, top_k, EVAL_NUM_EXAMPLES)
                    prec_eval = evaluate_set(sess, top_k_eval,
                                             EVAL_NUM_EXAMPLES)
                    print('%s: precision train = %.3f' %
                          (datetime.now(), prec_train))
                    print('%s: precision eval  = %.3f' %
                          (datetime.now(), prec_eval))

                if step % 100 == 0:
                    summary_str = sess.run(summary_op,
                                           feed_dict={
                                               summary_train_prec: prec_train,
                                               summary_eval_prec: prec_eval
                                           })
                    summary_writer.add_summary(summary_str, step)

                # Save the model checkpoint periodically.
                if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                    checkpoint_path = os.path.join(FLAGS.train_dir,
                                                   'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=step)
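# `evaluate_set` is called above but not defined in this snippet; a minimal
# sketch, assuming it averages an in_top_k op over enough batches to cover
# num_examples:
def evaluate_set(sess, top_k_op, num_examples):
    num_iter = int(math.ceil(num_examples / float(FLAGS.batch_size)))
    true_count = 0  # Counts the number of correct predictions.
    for _ in xrange(num_iter):
        true_count += np.sum(sess.run([top_k_op]))
    return true_count / float(num_iter * FLAGS.batch_size)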
def train():
  ps_hosts = FLAGS.ps_hosts.split(',')
  worker_hosts = FLAGS.worker_hosts.split(',')
  print ('PS hosts are: %s' % ps_hosts)
  print ('Worker hosts are: %s' % worker_hosts)

  server = tf.train.Server(
      {'ps': ps_hosts, 'worker': worker_hosts},
      job_name = FLAGS.job_name,
      task_index=FLAGS.task_id)

  if FLAGS.job_name == 'ps':
    # `ps` jobs wait for incoming connections from the workers.
    server.join()

  is_chief = (FLAGS.task_id == 0)
  if is_chief:
    if tf.gfile.Exists(FLAGS.train_dir):
      tf.gfile.DeleteRecursively(FLAGS.train_dir)
    tf.gfile.MakeDirs(FLAGS.train_dir)

  """Train CIFAR-10 for a number of steps."""
  cluster = tf.train.ClusterSpec({'ps': ps_hosts, 'worker': worker_hosts})
  device_setter = tf.train.replica_device_setter(cluster=cluster)
  with tf.device(device_setter):
    global_step = tf.Variable(0, trainable=False)

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / FLAGS.batch_size
    decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

    # Decay the learning rate exponentially based on the number of steps.
    lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                    global_step,
                                    decay_steps,
                                    LEARNING_RATE_DECAY_FACTOR,
                                    staircase=True)
    tf.scalar_summary('learning_rate', lr)
    opt = tf.train.GradientDescentOptimizer(lr)


    # Track the moving averages of all trainable variables.
    exp_moving_averager = tf.train.ExponentialMovingAverage(
        MOVING_AVERAGE_DECAY, global_step)
    variables_to_average = (
        tf.trainable_variables() + tf.moving_average_variables())

    opt = tf.train.SyncReplicasOptimizer(
        opt,
        replicas_to_aggregate=len(worker_hosts),
        replica_id=FLAGS.task_id,
        total_num_replicas=len(worker_hosts),
        variable_averages=exp_moving_averager,
        variables_to_average=variables_to_average)


    # Compute gradients with respect to the loss.
    grads = opt.compute_gradients(loss)

    # Add histograms for gradients.
    for grad, var in grads:
      if grad is not None:
        tf.histogram_summary(var.op.name + '/gradients', grad)

    apply_gradients_op = opt.apply_gradients(grads, global_step=global_step)

    with tf.control_dependencies([apply_gradients_op]):
      train_op = tf.identity(loss, name='train_op')


    chief_queue_runners = [opt.get_chief_queue_runner()]
    init_tokens_op = opt.get_init_tokens_op()

    saver = tf.train.Saver()
    # We run the summaries in the same thread as the training operations by
    # passing in None for summary_op to avoid a summary_thread being started.
    # Running summaries and training operations in parallel could run out of
    # GPU memory.
    sv = tf.train.Supervisor(is_chief=is_chief,
                             logdir=FLAGS.train_dir,
                             init_op=tf.initialize_all_variables(),
                             summary_op=tf.merge_all_summaries(),
                             global_step=global_step,
                             saver=saver,
                             save_model_secs=60)

    tf.logging.info('%s Supervisor' % datetime.now())

    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=FLAGS.log_device_placement)

    print ("Before session init")
    # Get a session.
    sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)
    print ("Before session init done")

    # Start the queue runners.
    queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
    sv.start_queue_runners(sess, queue_runners)
    print ('Started %d queues for processing input data.' % len(queue_runners))

    sv.start_queue_runners(sess, chief_queue_runners)
    sess.run(init_tokens_op)

    print ('Start training')
    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value, gs = sess.run([train_op, loss, global_step])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)

        format_str = ('%s: step %d (global_step %d), loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print (format_str % (datetime.now(), step, gs, loss_value,
                             examples_per_sec, sec_per_batch))

    if is_chief:
        saver.save(sess,
                   os.path.join(FLAGS.train_dir, 'model.ckpt'),
                   global_step=global_step)
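# Hypothetical launch commands for the cluster flags above (the script name
# and host:port values are assumptions):
#   python cifar10_dist_train.py --job_name=ps --task_id=0 \
#     --ps_hosts=ps0:2222 --worker_hosts=worker0:2222,worker1:2222
#   python cifar10_dist_train.py --job_name=worker --task_id=0 \
#     --ps_hosts=ps0:2222 --worker_hosts=worker0:2222,worker1:2222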
def train():
    ps_hosts = FLAGS.ps_hosts.split(',')
    worker_hosts = FLAGS.worker_hosts.split(',')
    print('PS hosts are: %s' % ps_hosts)
    print('Worker hosts are: %s' % worker_hosts)

    server = tf.train.Server({
        'ps': ps_hosts,
        'worker': worker_hosts
    },
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_id)

    if FLAGS.job_name == 'ps':
        server.join()

    is_chief = (FLAGS.task_id == 0)
    if is_chief:
        if tf.gfile.Exists(FLAGS.train_dir):
            tf.gfile.DeleteRecursively(FLAGS.train_dir)
        tf.gfile.MakeDirs(FLAGS.train_dir)

    device_setter = tf.train.replica_device_setter(ps_tasks=1)
    with tf.device('/job:worker/task:%d' % FLAGS.task_id):
        with tf.device(device_setter):
            global_step = tf.Variable(0, trainable=False)

            # Get images and labels for CIFAR-10.
            images, labels = cifar10.distorted_inputs()

            # Build a Graph that computes the logits predictions from the
            # inference model.
            logits = cifar10.inference(images)

            # Calculate loss.
            loss = cifar10.loss(logits, labels)
            train_op = cifar10.train(loss, global_step)

            saver = tf.train.Saver()
            # We run the summaries in the same thread as the training operations by
            # passing in None for summary_op to avoid a summary_thread being started.
            # Running summaries and training operations in parallel could run out of
            # GPU memory.
            sv = tf.train.Supervisor(is_chief=is_chief,
                                     logdir=FLAGS.train_dir,
                                     init_op=tf.initialize_all_variables(),
                                     summary_op=tf.merge_all_summaries(),
                                     global_step=global_step,
                                     saver=saver,
                                     save_model_secs=60)

            tf.logging.info('%s Supervisor' % datetime.now())

            sess_config = tf.ConfigProto(
                allow_soft_placement=True,
                log_device_placement=FLAGS.log_device_placement)

            print("Before session init")
            # Get a session.
            sess = sv.prepare_or_wait_for_session(server.target,
                                                  config=sess_config)
            print("Session init done")

            # Start the queue runners.
            queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
            sv.start_queue_runners(sess, queue_runners)
            print('Started %d queues for processing input data.' %
                  len(queue_runners))
            """Train CIFAR-10 for a number of steps."""
            for step in xrange(FLAGS.max_steps):
                start_time = time.time()
                _, loss_value, gs = sess.run([train_op, loss, global_step])
                duration = time.time() - start_time

                assert not np.isnan(
                    loss_value), 'Model diverged with loss = NaN'

                if step % 10 == 0:
                    num_examples_per_step = FLAGS.batch_size
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = float(duration)

                    format_str = (
                        '%s: step %d (global_step %d), loss = %.2f (%.1f examples/sec; %.3f sec/batch)'
                    )
                    print(format_str % (datetime.now(), step, gs, loss_value,
                                        examples_per_sec, sec_per_batch))

    if is_chief:
        saver.save(sess,
                   os.path.join(FLAGS.train_dir, 'model.ckpt'),
                   global_step=global_step)
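
# A minimal sketch of the flag definitions the train() above assumes (ps_hosts,
# worker_hosts, job_name and task_id are referenced but defined elsewhere),
# plus an example launch; the script name is hypothetical.
import tensorflow as tf

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('ps_hosts', 'localhost:2222',
                           'Comma-separated parameter server host:port pairs.')
tf.app.flags.DEFINE_string('worker_hosts', 'localhost:2223,localhost:2224',
                           'Comma-separated worker host:port pairs.')
tf.app.flags.DEFINE_string('job_name', 'worker', "Either 'ps' or 'worker'.")
tf.app.flags.DEFINE_integer('task_id', 0, 'Index of this task within its job.')
# One process per cluster member, e.g.:
#   python cifar10_distributed.py --job_name=ps --task_id=0
#   python cifar10_distributed.py --job_name=worker --task_id=0
#   python cifar10_distributed.py --job_name=worker --task_id=1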
def main(unused_argv):
    cifar10.maybe_download_and_extract()
    if FLAGS.download_only:
        sys.exit(0)
    if FLAGS.job_name is None or FLAGS.job_name == "":
        raise ValueError("Must specify an explicit `job_name`")
    if FLAGS.task_index is None or FLAGS.task_index == "":
        raise ValueError("Must specify an explicit `task_index`")

    print("job name = %s" % FLAGS.job_name)
    print("task index = %d" % FLAGS.task_index)

    # Construct the cluster and start the server.
    ps_spec = FLAGS.ps_hosts.split(",")
    worker_spec = FLAGS.worker_hosts.split(",")

    # Approximation layers (the subset of layers to train).
    approx_layers = FLAGS.layers_to_train.split(",")
    len_approx_layers = len(approx_layers)

    # Get the number of workers.
    num_workers = len(worker_spec)
    num_ps = len(ps_spec)

    cluster = tf.train.ClusterSpec({"ps": ps_spec, "worker": worker_spec})

    if not FLAGS.existing_servers:
        # Not using existing servers. Create an in-process server.
        server = tf.train.Server(cluster,
                                 job_name=FLAGS.job_name,
                                 task_index=FLAGS.task_index)
        if FLAGS.job_name == "ps":
            server.join()

    is_chief = (FLAGS.task_index == 0)
    if FLAGS.num_gpus > 0:
        if FLAGS.num_gpus < num_workers:
            raise ValueError("number of gpus is less than number of workers")
        # Avoid gpu allocation conflict: now allocate task_num -> #gpu
        # for each worker in the corresponding machine
        gpu = (FLAGS.task_index % FLAGS.num_gpus)
        worker_device = "/job:worker/task:%d/gpu:%d" % (FLAGS.task_index, gpu)
    elif FLAGS.num_gpus == 0:
        # Just allocate the CPU to worker server
        cpu = 0
        worker_device = "/job:worker/task:%d/cpu:%d" % (FLAGS.task_index, cpu)
    else:
        raise ValueError("num_gpus must be non-negative, got %d" %
                         FLAGS.num_gpus)
    # The device setter will automatically place Variables ops on separate
    # parameter servers (ps). The non-Variable ops will be placed on the workers.
    # The ps use CPU and workers use corresponding GPU
    with tf.device(
            tf.train.replica_device_setter(
                worker_device=worker_device,
                ps_device="/job:ps/cpu:0",
                cluster=cluster,
                ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy(
                    num_ps, tf.contrib.training.byte_size_load_fn))):
        global_step = tf.Variable(0, name="global_step", trainable=False)

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        #train_op = cifar10.train(loss, global_step)

        # Create a saver.
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=None)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()
        # Variables that affect learning rate.
        num_batches_per_epoch = 50000 / FLAGS.batch_size
        decay_steps = int(num_batches_per_epoch * 350)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(0.1,
                                        global_step,
                                        decay_steps,
                                        0.1,
                                        staircase=True)

        opt = tf.train.GradientDescentOptimizer(lr)

        if FLAGS.sync_replicas:
            if FLAGS.replicas_to_aggregate is None:
                replicas_to_aggregate = num_workers
            else:
                replicas_to_aggregate = FLAGS.replicas_to_aggregate

            opt = tf.train.SyncReplicasOptimizerV2(
                opt,
                replicas_to_aggregate=replicas_to_aggregate,
                total_num_replicas=num_workers,
                name="cifar10_sync_replicas")

        #trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        train_step = opt.minimize(loss, global_step=global_step)

        # Approximation Training
        var_list = []
        for i in range(len_approx_layers):
            var_list = var_list + tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=approx_layers[i])

        train_step_approx = opt.minimize(loss,
                                         global_step=global_step,
                                         var_list=var_list)

        if FLAGS.sync_replicas:
            local_init_op = opt.local_step_init_op
            if is_chief:
                local_init_op = opt.chief_init_op

            ready_for_local_init_op = opt.ready_for_local_init_op

            # Initial token and chief queue runners required by the sync_replicas mode
            chief_queue_runner = opt.get_chief_queue_runner()
            sync_init_op = opt.get_init_tokens_op()

        init_op = tf.global_variables_initializer()
        train_dir = tempfile.mkdtemp(dir="/mnt",
                                     suffix="data",
                                     prefix="cifar10_train")

        if FLAGS.sync_replicas:
            sv = tf.train.Supervisor(
                is_chief=is_chief,
                logdir=train_dir,
                init_op=init_op,
                local_init_op=local_init_op,
                saver=None,
                summary_op=summary_op,
                save_summaries_secs=120,
                save_model_secs=600,
                checkpoint_basename='model.ckpt',
                ready_for_local_init_op=ready_for_local_init_op,
                recovery_wait_secs=1,
                global_step=global_step)
        else:
            sv = tf.train.Supervisor(is_chief=is_chief,
                                     logdir=train_dir,
                                     init_op=init_op,
                                     saver=None,
                                     summary_op=summary_op,
                                     save_summaries_secs=120,
                                     save_model_secs=600,
                                     checkpoint_basename='model.ckpt',
                                     recovery_wait_secs=1,
                                     global_step=global_step)

        sess_config = tf.ConfigProto(allow_soft_placement=True,
                                     log_device_placement=False,
                                     device_filters=[
                                         "/job:ps",
                                         "/job:worker/task:%d" %
                                         FLAGS.task_index
                                     ])

        # The chief worker (task_index==0) session will prepare the session,
        # while the remaining workers will wait for the preparation to complete.
        if is_chief:
            print("Worker %d: Initializing session..." % FLAGS.task_index)
        else:
            print("Worker %d: Waiting for session to be initialized..." %
                  FLAGS.task_index)

        if FLAGS.existing_servers:
            server_grpc_url = "grpc://" + worker_spec[FLAGS.task_index]
            print("Using existing server at: %s" % server_grpc_url)

            sess = sv.prepare_or_wait_for_session(server_grpc_url,
                                                  config=sess_config)
        else:
            sess = sv.prepare_or_wait_for_session(server.target,
                                                  config=sess_config)

        print("Worker %d: Session initialization complete." % FLAGS.task_index)

        if FLAGS.sync_replicas and is_chief:
            # Chief worker will start the chief queue runner and call the init op.
            sess.run(sync_init_op)
            sv.start_queue_runners(sess, [chief_queue_runner])

        # Restore from Checkpoint
        if FLAGS.checkpoint_restore > 0:
            checkpoint_directory = FLAGS.checkpoint_dir + str(
                FLAGS.checkpoint_restore)
            ckpt = tf.train.get_checkpoint_state(checkpoint_directory)
            if ckpt and ckpt.model_checkpoint_path:
                # Restores from checkpoint
                saver.restore(sess, ckpt.model_checkpoint_path)
                # Assuming model_checkpoint_path looks something like:
                #   /my-favorite-path/cifar10_train/model.ckpt-0,
                # extract global_step from it.
                #global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
            else:
                print('No checkpoint file found')
                return

        # Perform training
        time_begin = time.time()
        print("Training begins @ %f" % time_begin)

        local_step = 0
        num_examples_per_step = 128
        with open('/mnt/train_output.log', 'w') as f:
            f.write(
                "Duration\tWorker\tLocalStep\tGlobalStep\tLoss\tExamplesPerSec\n")
        last = time_begin
        while True:
            start_time = time.time()
            if local_step < FLAGS.approx_step:
                _, step, loss_value = sess.run([train_step, global_step, loss])
            else:
                if local_step % FLAGS.approx_interval == 0:
                    _, step, loss_value = sess.run(
                        [train_step_approx, global_step, loss])
                else:
                    _, step, loss_value = sess.run(
                        [train_step, global_step, loss])

            duration = time.time() - start_time
            local_step += 1
            if local_step % 10 == 0:
                now = time.time()
                examples_per_sec = 10 * num_examples_per_step / (now - last)
                print(
                    "%f: Worker %d: step %d (global step: %d of %d) "
                    "loss = %.2f examples_per_sec = %.2f"
                    % (now - last, FLAGS.task_index, local_step, step,
                       FLAGS.train_steps, loss_value, examples_per_sec))
                with open('/mnt/train_output.log', 'a') as f:
                    f.write(
                        str(now - last) + "\t" + str(FLAGS.task_index) + "\t" +
                        str(local_step) + "\t" + str(step) + "\t" +
                        str(loss_value) + "\t" + str(examples_per_sec) + "\n")
                last = now

            if step >= FLAGS.train_steps:
                break

            if sv.should_stop():
                print('Stopped due to abort')
                break
            # Save the model checkpoint periodically.
            #if is_chief and (step % 1000 == 0 or (step + 1) == FLAGS.train_steps):
            if (step % 1000 == 0 or (step + 1) == FLAGS.train_steps):
                print('Taking a Checkpoint @ Global Step ' + str(step))
                checkpoint_dir = "/mnt/checkpoint" + str(step)
                if tf.gfile.Exists(checkpoint_dir):
                    tf.gfile.DeleteRecursively(checkpoint_dir)
                tf.gfile.MakeDirs(checkpoint_dir)
                checkpoint_path = os.path.join(checkpoint_dir, "model.ckpt")
                saver.save(sess, checkpoint_path, global_step=step)

        time_end = time.time()
        print("Training ends @ %f" % time_end)
        training_time = time_end - time_begin
        print("Training elapsed time: %f s" % training_time)
        with open('/mnt/train_output.log', 'a') as f:
            f.write("Training elapsed time: " + str(training_time) + " s\n")
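
# Sketch: the loop above alternates train_step and train_step_approx based on
# local_step, FLAGS.approx_step and FLAGS.approx_interval. The schedule in
# isolation (string names stand in for the TensorFlow ops):
def pick_train_op(local_step, approx_step, approx_interval):
    # Full training until approx_step; afterwards every approx_interval-th
    # step updates only the approximation layers in FLAGS.layers_to_train.
    if local_step < approx_step:
        return 'train_step'
    if local_step % approx_interval == 0:
        return 'train_step_approx'
    return 'train_step'

# With approx_step=100, approx_interval=4: steps 0..99 -> train_step,
# step 100 -> train_step_approx, 101..103 -> train_step, 104 -> approx, ...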
def main(unused_argv):
  cifar10.maybe_download_and_extract()
  if FLAGS.download_only:
    sys.exit(0)
  if FLAGS.job_name is None or FLAGS.job_name == "":
    raise ValueError("Must specify an explicit `job_name`")
  if FLAGS.task_index is None or FLAGS.task_index == "":
    raise ValueError("Must specify an explicit `task_index`")

  print("job name = %s" % FLAGS.job_name)
  print("task index = %d" % FLAGS.task_index)

  # Construct the cluster and start the server.
  ps_spec = FLAGS.ps_hosts.split(",")
  worker_spec = FLAGS.worker_hosts.split(",")

  # Get the number of workers.
  num_workers = len(worker_spec)

  cluster = tf.train.ClusterSpec({
      "ps": ps_spec,
      "worker": worker_spec})

  if not FLAGS.existing_servers:
    # Not using existing servers. Create an in-process server.
    server = tf.train.Server(
        cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_index)
    if FLAGS.job_name == "ps":
      server.join()

  is_chief = (FLAGS.task_index == 0)
  if FLAGS.num_gpus > 0:
    if FLAGS.num_gpus < num_workers:
      raise ValueError("number of gpus is less than number of workers")
    # Avoid gpu allocation conflict: now allocate task_num -> #gpu 
    # for each worker in the corresponding machine
    gpu = (FLAGS.task_index % FLAGS.num_gpus)
    worker_device = "/job:worker/task:%d/gpu:%d" % (FLAGS.task_index, gpu)
  elif FLAGS.num_gpus == 0:
    # Just allocate the CPU to worker server
    cpu = 0
    worker_device = "/job:worker/task:%d/cpu:%d" % (FLAGS.task_index, cpu)
  # The device setter will automatically place Variables ops on separate
  # parameter servers (ps). The non-Variable ops will be placed on the workers.
  # The ps use CPU and workers use corresponding GPU
  with tf.device(
      tf.train.replica_device_setter(
          worker_device=worker_device,
          ps_device="/job:ps/cpu:0",
          cluster=cluster)):
    cifar10.maybe_download_and_extract()
    global_step = tf.Variable(0, name="global_step", trainable=False)

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    #train_op = cifar10.train(loss, global_step)

    # Create a saver.
    saver = tf.train.Saver(tf.global_variables())

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()
    # Variables that affect learning rate.
    num_batches_per_epoch = 50000 / FLAGS.batch_size
    decay_steps = int(num_batches_per_epoch * 350)

    # Decay the learning rate exponentially based on the number of steps.
    lr = tf.train.exponential_decay(0.1,
                                    global_step,
                                    decay_steps,
                                    0.1,
                                    staircase=True)

    # Generate moving averages of all losses and associated summaries.
    #loss_averages_op = _add_loss_summaries(total_loss)

    opt = tf.train.GradientDescentOptimizer(lr)
    
    #opt = tf.train.AdamOptimizer(FLAGS.learning_rate)

    if FLAGS.sync_replicas:
      if FLAGS.replicas_to_aggregate is None:
        replicas_to_aggregate = num_workers
      else:
        replicas_to_aggregate = FLAGS.replicas_to_aggregate

      opt = tf.train.SyncReplicasOptimizerV2(
          opt,
          replicas_to_aggregate=replicas_to_aggregate,
          total_num_replicas=num_workers,
          name="cifar10_sync_replicas")

    train_step = opt.minimize(loss, global_step=global_step)

    if FLAGS.sync_replicas:
      local_init_op = opt.local_step_init_op
      if is_chief:
        local_init_op = opt.chief_init_op

      ready_for_local_init_op = opt.ready_for_local_init_op

      # Initial token and chief queue runners required by the sync_replicas mode
      chief_queue_runner = opt.get_chief_queue_runner()
      sync_init_op = opt.get_init_tokens_op()

    init_op = tf.global_variables_initializer()
    train_dir = tempfile.mkdtemp(dir="/mnt")

    if FLAGS.sync_replicas:
      sv = tf.train.Supervisor(
          is_chief=is_chief,
          logdir=train_dir,
          init_op=init_op,
          local_init_op=local_init_op,
          ready_for_local_init_op=ready_for_local_init_op,
          recovery_wait_secs=1,
          global_step=global_step)
    else:
      sv = tf.train.Supervisor(
          is_chief=is_chief,
          logdir=train_dir,
          init_op=init_op,
          recovery_wait_secs=1,
          global_step=global_step)

    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
        device_filters=["/job:ps", "/job:worker/task:%d" % FLAGS.task_index])

    # The chief worker (task_index==0) session will prepare the session,
    # while the remaining workers will wait for the preparation to complete.
    if is_chief:
      print("Worker %d: Initializing session..." % FLAGS.task_index)
    else:
      print("Worker %d: Waiting for session to be initialized..." %
            FLAGS.task_index)

    if FLAGS.existing_servers:
      server_grpc_url = "grpc://" + worker_spec[FLAGS.task_index]
      print("Using existing server at: %s" % server_grpc_url)

      sess = sv.prepare_or_wait_for_session(server_grpc_url,
                                            config=sess_config)
    else:
      sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)

    print("Worker %d: Session initialization complete." % FLAGS.task_index)

    if FLAGS.sync_replicas and is_chief:
      # Chief worker will start the chief queue runner and call the init op.
      sess.run(sync_init_op)
      sv.start_queue_runners(sess, [chief_queue_runner])

    # Perform training
    time_begin = time.time()
    print("Training begins @ %f" % time_begin)

    local_step = 0
    while True:
      start_time = time.time()
      _, step = sess.run([train_step, global_step])
      duration = time.time() - start_time
      local_step += 1
      # assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      # if step % 10 == 0:
      #   num_examples_per_step = FLAGS.batch_size
      #   examples_per_sec = num_examples_per_step / duration
      #   sec_per_batch = float(duration)
      #   loss_value = 0
      #   format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
      #                 'sec/batch)')
      #   print(format_str % (datetime.now(), local_step, loss_value,
      #                       examples_per_sec, sec_per_batch))
      now = time.time()
      print("%f: Worker %d: training step %d done (global step: %d)" % (now, FLAGS.task_index, local_step, step))

      if step >= FLAGS.train_steps:
        break

      #if step % 100 == 0:
      #  summary_str = sess.run(summary_op)
      #  summary_writer.add_summary(summary_str, step)

      # Save the model checkpoint periodically.
      #if step % 1000 == 0 or (step + 1) == FLAGS.train_steps:
      #  checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
      #  saver.save(sess, checkpoint_path, global_step=step)

    time_end = time.time()
    print("Training ends @ %f" % time_end)
    training_time = time_end - time_begin
    print("Training elapsed time: %f s" % training_time)
Example #31
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()

        # Start running operations on the Graph.
        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir)

        # One writer per snapshot directory, FLAGS.train_dir0 .. train_dir19.
        snapshot_writers = [
            tf.train.SummaryWriter(getattr(FLAGS, 'train_dir%d' % i))
            for i in range(20)
        ]

        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)
                for writer in snapshot_writers:
                    writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            # if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
            #   checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
            #   saver.save(sess, checkpoint_path, global_step=step/100)

            # Hard-coded snapshot schedule: every 100 steps up to step 2000,
            # save a checkpoint into the matching FLAGS.train_dirN directory.
            if step % 100 == 0 and 100 <= step <= 2000:
                snapshot_dir = getattr(FLAGS, 'train_dir%d' % (step // 100 - 1))
                checkpoint_path = os.path.join(snapshot_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
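
# Sketch: each snapshot directory above can be inspected afterwards with
# tf.train.get_checkpoint_state, assuming the same FLAGS.train_dirN layout:
def list_snapshots():
    for i in range(20):
        ckpt = tf.train.get_checkpoint_state(getattr(FLAGS, 'train_dir%d' % i))
        if ckpt and ckpt.model_checkpoint_path:
            print('snapshot %2d: %s' % (i, ckpt.model_checkpoint_path))
        else:
            print('snapshot %2d: no checkpoint found' % i)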
Example #32
    reader = tf.TextLineReader()
    key, value = reader.read(filename_queue)


    batch_size = 128
    min_fraction_of_examples_in_queue = 0.4
    num_examples_per_epoch = 50000
    min_queue_examples = int(num_examples_per_epoch *
                             min_fraction_of_examples_in_queue)
    images_batch, label_batch =\
      tf.train.shuffle_batch_join([read_example(value) for _ in range(9)],
                                   batch_size=batch_size,
                                   capacity=min_queue_examples + 3*batch_size,
                                   min_after_dequeue=min_queue_examples)

    logits = cifar10.inference(images_batch)
    loss = cifar10.loss(logits, label_batch)

    global_step = tf.Variable(0, trainable=False)
    train_op = cifar10.train(loss, global_step)

    saver = tf.train.Saver(tf.all_variables())

    summary_op = tf.merge_all_summaries() 

    init = tf.initialize_all_variables()

    sess = tf.Session()


    summary_writer = tf.train.SummaryWriter('./train', sess.graph)
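
# Sketch: read_example is not shown in this fragment. A plausible parser,
# assuming each text line holds a label followed by 32*32*3 comma-separated
# pixel values (an assumption about the file format, not the actual helper).
# Note that the nine tuples above share one reader, so this pattern adds
# queue threads but not reader parallelism.
def read_example(value):
    record_defaults = [[0]] + [[0.0]] * (32 * 32 * 3)
    columns = tf.decode_csv(value, record_defaults=record_defaults)
    label = columns[0]
    image = tf.reshape(tf.stack(columns[1:]), [32, 32, 3])
    return image, label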
Example #33
def predict():
    """Restore a trained CIFAR-10 model and predict classes for test images."""
    with tf.Graph().as_default() as g:

        user_images = tf.placeholder(
            tf.float32, shape=[None, 32, 32, 3])  # None allows any batch size

        # x_image = tf.reshape(images, [-1, 32, 32, 3])
        user_logits = cifar10.inference(user_images)

        # define predict function
        predict_function = tf.argmax(user_logits, 1)

        variable_averages = tf.train.ExponentialMovingAverage(
            cifar10.MOVING_AVERAGE_DECAY)

        variables_to_restore = variable_averages.variables_to_restore()

        saver = tf.train.Saver(variables_to_restore)

    with tf.Session(graph=g) as sess:  # run against the graph built above

        ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        if ckpt and ckpt.model_checkpoint_path:
            # Restores from checkpoint
            saver.restore(sess, ckpt.model_checkpoint_path)
            # Assuming model_checkpoint_path looks something like:
            #   /my-favorite-path/cifar10_train/model.ckpt-0,
            # extract global_step from it.
            global_step = ckpt.model_checkpoint_path.split('/')[-1].split(
                '-')[-1]
        else:
            print('No checkpoint file found')
            return

        class_list = []
        class_dic = {
            0: "airplane",
            1: "automobile",
            2: "bird",
            3: "cat",
            4: "deer",
            5: "dog",
            6: "frog",
            7: "horse",
            8: "ship",
            9: "truck"
        }

        BATCH_SIZE = 128
        try:
            predicted_labels = []
            for batch_number in range(1, 31):
                # predicted_labels = []
                test_data = loadCIFAR10_Test(
                    '/mnt/hgfs/cs231n/cs231n/assignment2/cs231n/datasets/cifar-10-batches-py',
                    batch_number)
                test_label = predict_function.eval(
                    feed_dict={user_images: test_data})
                predicted_labels.extend(test_label)
                # for i in range(0, test_data.shape[0] // BATCH_SIZE):
                #     test_batch = test_data[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]
                #     predicted_labels[i * BATCH_SIZE: (i + 1) * BATCH_SIZE] = predict_function.eval(feed_dict={images: test_batch})
                #     print("test_batch length: %d" % len(test_batch))
            N = len(predicted_labels)
            for index in xrange(N):
                pre_class = class_dic[predicted_labels[index]]
                class_list.append(pre_class)
            np.savetxt('cifar10_CNN.csv',
                       np.c_[range(1, N + 1), class_list],
                       delimiter=',',
                       header='id,label',
                       comments='',
                       fmt='%s')
        except Exception as e:
            print('Prediction failed: %s' % e)
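
# Sketch: loadCIFAR10_Test is external to this snippet. A hypothetical
# stand-in, assuming the standard cifar-10-batches-py pickle layout ('data'
# is a uint8 [N, 3072] array in channel-first order) and that batch_number
# selects one of the 30 equal slices iterated above.
import numpy as np
from six.moves import cPickle as pickle

def loadCIFAR10_Test(data_dir, batch_number):
    with open(data_dir + '/test_batch', 'rb') as f:
        batch = pickle.load(f)
    data = batch['data'].reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1)
    chunk = len(data) // 30
    start = (batch_number - 1) * chunk
    return data[start:start + chunk].astype(np.float32)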
Example #34
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # #Adding dropout
        # keep_drop_prob = tf.placeholder(tf.float32)

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # ###########Changes for visualization ###############
        with tf.variable_scope('conv1') as scope_conv:
            tf.get_variable_scope().reuse_variables()
            weights = tf.get_variable('weights')
            grid_x = grid_y = 8  # to get a square grid for 64 conv1 features
            grid = put_kernels_on_grid(weights, (grid_y, grid_x))
            tf.image_summary('conv1/features', grid, max_images=1)
        # ####################################################


        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()

        # Start running operations on the Graph.
        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        for step in range(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                              'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
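
# Sketch: put_kernels_on_grid is not defined in this snippet. A simplified
# version of the usual TensorBoard kernel-grid trick, assuming conv1 weights
# of shape [H, W, C, N] with N == grid_y * grid_x (not necessarily the exact
# helper used above).
def put_kernels_on_grid(kernel, grid):
    grid_y, grid_x = grid
    # Normalize to [0, 1] so the image summary is viewable.
    k_min = tf.reduce_min(kernel)
    k_max = tf.reduce_max(kernel)
    kernel = (kernel - k_min) / (k_max - k_min)
    h, w, c, n = kernel.get_shape().as_list()
    x = tf.transpose(kernel, (3, 0, 1, 2))        # -> [N, H, W, C]
    x = tf.reshape(x, [grid_y, grid_x, h, w, c])  # -> [gy, gx, H, W, C]
    x = tf.transpose(x, (0, 2, 1, 3, 4))          # -> [gy, H, gx, W, C]
    return tf.reshape(x, [1, grid_y * h, grid_x * w, c])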
Example #35
def multilevel_train_1ord():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Accuracy
        top_k_op = tf.nn.in_top_k(logits, labels, 1)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()

        # Start running operations on the Graph.
        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            accuracy = sess.run(top_k_op)
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            output_list = []
            # Do something with intermediate data (intermediate)
            # Save data on iterations of 0, 1000, 2000, 3000
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                for v in tf.all_variables():
                    if "conv1/weights:" in v.name:
                        print(v.name)
                        output_list.append(
                            tf.get_default_graph().get_tensor_by_name(v.name))
                        break
                if (step == 0):
                    conv1_data_0 = sess.run(output_list)
                if (step == 1000):
                    conv1_data_1000 = sess.run(output_list)
                if (step == 2000):
                    conv1_data_2000 = sess.run(output_list)
                if (step == 3000):
                    conv1_data_3000 = sess.run(output_list)
                    (A, B, C, D, E) = np.array(conv1_data_3000).shape

            # do something.
            # do experiments
            if step == 3000 or (step + 1) == FLAGS.max_steps:
                print("************\n Chen process executing")
                _, new_data = process.exp_2_commMax(conv1_data_0,
                                                    conv1_data_1000,
                                                    conv1_data_2000,
                                                    conv1_data_3000)
                for v in tf.all_variables():
                    if "conv1/weights:" in v.name:
                        print("start assign: ")
                        sess.run(
                            tf.assign(
                                tf.get_default_graph().get_tensor_by_name(
                                    v.name), new_data[0]))
                        break
                value = sess.run(loss)
                pred = process.Count(accuracy)
                print("new loss value is: " + str(value) + " accuracy: " +
                      str(pred))

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)
                predict = process.Count(accuracy)
                format_str = (
                    '%s: step %d, loss = %.2f, accu = %.2f (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value, predict,
                                    examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
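
# Sketch: process.Count and process.exp_2_commMax come from an external
# module. Count appears to turn the boolean vector from tf.nn.in_top_k into
# an accuracy fraction; a minimal stand-in under that assumption:
import numpy as np

def Count(top_k_results):
    results = np.asarray(top_k_results).ravel()
    return np.sum(results) / float(results.size)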
def train():
    ps_hosts = FLAGS.ps_hosts.split(',')
    worker_hosts = FLAGS.worker_hosts.split(',')
    print('PS hosts are: %s' % ps_hosts)
    print('Worker hosts are: %s' % worker_hosts)

    server = tf.train.Server(
        {'ps': ps_hosts, 'worker': worker_hosts},
        job_name=FLAGS.job_name,
        task_index=FLAGS.task_id)

    if FLAGS.job_name == 'ps':
        server.join()

    is_chief = (FLAGS.task_id == 0)
    if is_chief:
        if tf.gfile.Exists(FLAGS.train_dir):
            tf.gfile.DeleteRecursively(FLAGS.train_dir)
        tf.gfile.MakeDirs(FLAGS.train_dir)
  
    device_setter = tf.train.replica_device_setter(ps_tasks=1)
    with tf.device('/job:worker/task:%d' % FLAGS.task_id):
        with tf.device(device_setter):
            global_step = tf.Variable(0, trainable=False)

            # Get images and labels for CIFAR-10.
            images, labels = cifar10.distorted_inputs()

            # Build a Graph that computes the logits predictions from the
            # inference model.
            logits = cifar10.inference(images)

            # Calculate loss.
            loss = cifar10.loss(logits, labels)
            train_op = cifar10.train(loss, global_step)

            saver = tf.train.Saver()
            # Note: passing summary_op=None would run the summaries in the same
            # thread as the training operations and avoid a separate summary
            # thread; running summaries and training in parallel can run out of
            # GPU memory. Here the merged summary op is instead handed to the
            # Supervisor, which runs it in its own thread.
            sv = tf.train.Supervisor(is_chief=is_chief,
                                     logdir=FLAGS.train_dir,
                                     init_op=tf.initialize_all_variables(),
                                     summary_op=tf.merge_all_summaries(),
                                     global_step=global_step,
                                     saver=saver,
                                     save_model_secs=60)

            tf.logging.info('%s Supervisor' % datetime.now())

            sess_config = tf.ConfigProto(allow_soft_placement=True,
                                         log_device_placement=FLAGS.log_device_placement)

            print ("Before session init")
            # Get a session.
            sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)
            print ("Session init done")

            # Start the queue runners.
            queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
            sv.start_queue_runners(sess, queue_runners)
            print('Started %d queues for processing input data.' %
                  len(queue_runners))

            # Train CIFAR-10 for a number of steps.
            for step in xrange(FLAGS.max_steps):
                start_time = time.time()
                _, loss_value, gs = sess.run([train_op, loss, global_step])
                duration = time.time() - start_time

                assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

                if step % 10 == 0:
                    num_examples_per_step = FLAGS.batch_size
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = float(duration)

                    format_str = ('%s: step %d (global_step %d), loss = %.2f '
                                  '(%.1f examples/sec; %.3f sec/batch)')
                    print(format_str % (datetime.now(), step, gs, loss_value,
                                        examples_per_sec, sec_per_batch))

    if is_chief:
        saver.save(sess, os.path.join(FLAGS.train_dir, 'model.ckpt'), global_step=global_step)
Example #37
def visualize_excitations():
    ''' Restore a trained model, and run one of the visualizations. '''
    with tf.Graph().as_default():
        # Get images for CIFAR-10.
        eval_data = FLAGS.eval_data == 'test'
        images, _ = cifar10.inputs(eval_data=eval_data)

        # Get conv2 and pool2 responses
        _, conv2, pool2 = cifar10.inference(images)

        # Restore the moving average version of the learned variables for eval.
        variable_averages = tf.train.ExponentialMovingAverage(
            cifar10.MOVING_AVERAGE_DECAY)
        variables_to_restore = variable_averages.variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)

        with tf.Session() as sess:
            ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
            if ckpt and ckpt.model_checkpoint_path:
                # Restores from checkpoint
                saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                print('No checkpoint file found')
                return

            if FLAGS.excitation_layer == 'conv2':
                # Channel indices 0, 31 and 63 of conv2's 64 output channels.
                channels = np.asarray([0, 31, 63])
                excitation_map = visualize_conv(sess,
                                                images,
                                                conv2,
                                                channels,
                                                half_receptive_field=5,
                                                accum_padding=0,
                                                stride=2,
                                                dst_height=96,
                                                num_images=FLAGS.num_examples)

            elif FLAGS.excitation_layer == 'pool2':
                neurons = np.asarray([
                    [0, 0, 0],  # top-left corner of first map
                    [5, 5, 63],  # bottom-right corner of last map
                    [3, 4, 5]
                ])  # in the middle of 5th map
                excitation_map = visualize_pooling(
                    sess,
                    images,
                    pool2,
                    neurons,
                    half_receptive_field=6,
                    accum_padding=0,
                    stride=4,
                    dst_height=96,
                    num_images=FLAGS.num_examples)

            else:
                raise Exception('add your own layers and parameters')

            excitation_map = cv2.cvtColor(excitation_map, cv2.COLOR_RGB2BGR)
            cv2.imshow('excitations', excitation_map)
            cv2.waitKey(-1)
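
# Sketch: visualize_conv and visualize_pooling are not shown. Their
# half_receptive_field, accum_padding and stride arguments suggest mapping
# activation coordinates back to input-pixel boxes; an assumed version of
# that mapping:
def receptive_field_box(y, x, half_receptive_field, accum_padding, stride):
    cy = y * stride - accum_padding
    cx = x * stride - accum_padding
    return (cy - half_receptive_field, cx - half_receptive_field,
            cy + half_receptive_field, cx + half_receptive_field)

# E.g. for pool2 above (stride=4, accum_padding=0, half_receptive_field=6),
# the neuron at (5, 5, 63) maps to input pixels (14, 14) .. (26, 26).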
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)


    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    # Create a saver.
    saver = tf.train.Saver(tf.all_variables())

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()

    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()

    # Start running operations on the Graph.
    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir)

    # One writer per snapshot directory, FLAGS.train_dir0 .. train_dir19.
    snapshot_writers = [
        tf.train.SummaryWriter(getattr(FLAGS, 'train_dir%d' % i))
        for i in range(20)
    ]


    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'


      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)

        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print(format_str % (datetime.now(), step, loss_value,
                            examples_per_sec, sec_per_batch))

      if step % 100 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)
        for writer in snapshot_writers:
          writer.add_summary(summary_str, step)

      # Save the model checkpoint periodically.
      # if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
      #   checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
      #   saver.save(sess, checkpoint_path, global_step=step/100)

      # Hard-coded snapshot schedule: every 100 steps up to step 2000,
      # save a checkpoint into the matching FLAGS.train_dirN directory.
      if step % 100 == 0 and 100 <= step <= 2000:
        snapshot_dir = getattr(FLAGS, 'train_dir%d' % (step // 100 - 1))
        checkpoint_path = os.path.join(snapshot_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)