def train():
    with tf.Graph().as_default():
        # get global step
        global_step = tf.train.get_or_create_global_step()
        # get data through cpu
        with tf.device('/cpu:0'):
            images, labels = cifar10.distorted_inputs()
        # get loss and logit
        # logits = cifar10.inference(images=images, r=low_ranks)
        logits = cifar10.inference(images=images, r=low_ranks)
        loss = cifar10.loss(logits=logits, labels=labels)
        # set train_op
        train_op = cifar10.train(loss, global_step)
        for v in tf.trainable_variables():
            print(v)
        nonzero = tf.count_nonzero(tf.get_collection('sparse_components')[-1])

        # define a LoggerHook to log something
        # clean_list = tf.get_collection('sparse_components')
        # clean_list = clean_s(clean_list)
        # clean_op = [c.op for c in clean_list]

        class _LoggerHook(tf.train.SessionRunHook):
            """
            log session and runtime info
            """
            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(loss)

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time
                    loss_value = run_values.results
                    example_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
                    sec_per_batch = float(duration / FLAGS.log_frequency)

                    format_str = (
                        '%s: step %d, loss = %.6f (%.1f examples/sec;'
                        '%.3f sec/batch)')
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        example_per_sec, sec_per_batch))

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[
                    tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                    tf.train.NanTensorHook(loss),
                    _LoggerHook()
                ],
                config=tf.ConfigProto(log_device_placement=FLAGS.
                                      log_device_placement)) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
def main(_):
    with tf.Graph().as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()

        images, labels = cifar10.distorted_inputs()

        logits = cifar10.inference(images)

        loss = cifar10.loss(logits, labels)

        train_op = cifar10.train(loss, global_step)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord)

            summary = tf.summary.merge_all()
            writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)

            print('Training started')
            for i in range(FLAGS.max_step):
                start_time = time.time()
                _, loss_value = sess.run([train_op, loss])
                duration = time.time() - start_time

                if i % 100 == 0:
                    print('Step %6d: loss = %.2f (%.3f sec)' %
                          (i, loss_value, duration))
                    summary_str = sess.run(summary)
                    writer.add_summary(summary_str, i)
                    writer.flush()

            coord.request_stop()
            coord.join(threads)
def evaluate():
  """Eval CIFAR-10 for a number of steps."""
  with tf.Graph().as_default() as g:
    # Get images and labels for CIFAR-10.
    eval_data = FLAGS.eval_data == 'test'
    images, labels = cifar10.distorted_inputs(eval_data=eval_data)

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate predictions.
    top_k_op = tf.nn.in_top_k(logits, labels, 1)

    # Normalize predictions with softmax
    preds = tf.nn.softmax(logits, name='predictions')

    # Restore the moving average version of the learned variables for eval.
    variable_averages = tf.train.ExponentialMovingAverage(
        cifar10.MOVING_AVERAGE_DECAY)
    variables_to_restore = variable_averages.variables_to_restore()
    saver = tf.train.Saver(variables_to_restore)

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()

    summary_writer = tf.train.SummaryWriter(FLAGS.eval_dir, g)

    while True:
      eval_once(saver, summary_writer, top_k_op, summary_op, preds, labels)
      if FLAGS.run_once:
        break
      time.sleep(FLAGS.eval_interval_secs)
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    testImg, testlabels = cifar10.inputs(eval_data=True)
    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)
    test_pre = cifar10.inference(testImg,test=True)
     
    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    # Create a saver.
    saver = tf.train.Saver(tf.all_variables())

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()

    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()

    # Start running operations on the Graph.
    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time

      if step % 10 == 0:
        print ('loss '+str(loss_value))

      if step % 100 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)

      # Save the model checkpoint periodically.
      if step % 10 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)

        #eval
      if step%10==0:
        cifar10.accuracy(test_pre,testlabels)
def evaluate():
    """Eval CIFAR-10 for a number of steps."""
    with tf.Graph().as_default() as g:
        # Get images and labels for CIFAR-10.
        eval_data = FLAGS.eval_data == 'test'
        images, labels = cifar10.distorted_inputs(eval_data=eval_data)

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate predictions.
        top_k_op = tf.nn.in_top_k(logits, labels, 1)

        # Normalize predictions with softmax
        preds = tf.nn.softmax(logits, name='predictions')

        # Restore the moving average version of the learned variables for eval.
        variable_averages = tf.train.ExponentialMovingAverage(
            cifar10.MOVING_AVERAGE_DECAY)
        variables_to_restore = variable_averages.variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        summary_writer = tf.train.SummaryWriter(FLAGS.eval_dir, g)

        while True:
            eval_once(saver, summary_writer, top_k_op, summary_op, preds,
                      labels)
            if FLAGS.run_once:
                break
            time.sleep(FLAGS.eval_interval_secs)
def tower_loss(scope):
  """Calculate the total loss on a single tower running the CIFAR model.

  Args:
    scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'

  Returns:
     Tensor of shape [] containing the total loss for a batch of data
  """
  # Get images and labels for CIFAR-10.
  images, labels = cifar10.distorted_inputs()

  # Build inference Graph.
  logits = cifar10.inference(images)

  # Build the portion of the Graph calculating the losses. Note that we will
  # assemble the total_loss using a custom function below.
  _ = cifar10.loss(logits, labels)

  # Assemble all of the losses for the current tower only.
  losses = tf.get_collection('losses', scope)

  # Calculate the total loss for the current tower.
  total_loss = tf.add_n(losses, name='total_loss')

  # Attach a scalar summary to all individual losses and the total loss; do the
  # same for the averaged version of the losses.
  for l in losses + [total_loss]:
    # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
    # session. This helps the clarity of presentation on tensorboard.
    loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name)
    tf.summary.scalar(loss_name, l)

  return total_loss
Exemple #7
0
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images, labels)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Calculate accuracy.
        accuracy = cifar10.accuracy(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""
            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(
                    accuracy)  # Asks for loss value.

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time

                    loss_value = run_values.results
                    examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
                    sec_per_batch = float(duration / FLAGS.log_frequency)

                    format_str = (
                        '%s: step %d, accuracy = %.4f, (%.1f examples/sec; %.3f '
                        'sec/batch)')
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[
                    tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                    tf.train.NanTensorHook(loss),
                    _LoggerHook()
                ],
                config=tf.ConfigProto(log_device_placement=FLAGS.
                                      log_device_placement)) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
def evaluate():
    """Eval CIFAR-10 for a number of steps."""
    with tf.Graph().as_default() as g:
        with tf.device('/cpu:0'):
            images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate predictions.
        top_k_op = tf.nn.in_top_k(logits, labels, 1)

        # Restore the moving average version of the learned variables for eval.
        variable_averages = tf.train.ExponentialMovingAverage(
            cifar10.MOVING_AVERAGE_DECAY)
        variables_to_restore = variable_averages.variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.summary.merge_all()

        summary_writer = tf.summary.FileWriter(FLAGS.train_dir, g)

        eval_once(saver, summary_writer, top_k_op, summary_op)
def tower_loss(scope):
    """Calculate the total loss on a single tower running the CIFAR model.
  Args:
    scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'
  Returns:
     Tensor of shape [] containing the total loss for a batch of data
  """
    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs(FLAGS.contrastcase)

    # Build inference Graph.
    logits = cifar10.inference1_2(images)

    # Build the portion of the Graph calculating the losses. Note that we will
    # assemble the total_loss using a custom function below.
    _ = cifar10.loss(logits, labels)

    # Assemble all of the losses for the current tower only.
    losses = tf.get_collection('losses', scope)

    # Calculate the total loss for the current tower.
    total_loss = tf.add_n(losses, name='total_loss')

    # Attach a scalar summary to all individual losses and the total loss; do the
    # same for the averaged version of the losses.
    for l in losses + [total_loss]:
        # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
        # session. This helps the clarity of presentation on tensorboard.
        loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name)
        tf.summary.scalar(loss_name, l)

    return total_loss
Exemple #10
0
def train():
    with tf.Graph().as_default():
      global_step = tf.train.get_or_create_global_step()
      images, labels = cifar10.distorted_inputs()
      logits = cifar10.inference(images, train = True)
      loss = cifar10.loss(logits, labels)
      accuracy = cifar10.accuracy(logits, labels)
      train_op = cifar10.train(loss, global_step)

      class _LoggerHook(tf.train.SessionRunHook):

        def begin(self):
          self._step = -1

        def before_run(self, run_context):
          self._step += 1
          return tf.train.SessionRunArgs([loss, accuracy])

        def after_run(self, run_context, run_values):
          if self._step % 10 == 0:
            loss_value, acc_value = run_values.results
            format_str = ('step %d, loss = %.2f, accuracy = %.2f ')
            print (format_str %(self._step, loss_value, acc_value))

      with tf.train.MonitoredTrainingSession(
          checkpoint_dir=train_dir,
          hooks=[tf.train.StopAtStepHook(last_step=max_step),
               tf.train.NanTensorHook(loss),
               _LoggerHook()],
          config=tf.ConfigProto(
            log_device_placement=False)) as mon_sess:
          while not mon_sess.should_stop():
            mon_sess.run(train_op)
Exemple #11
0
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():

    global_step = tf.Variable(0, trainable=False)
    images, labels = cifar10.distorted_inputs()
    logits = cifar10.inference(images)
    loss = cifar10.loss(logits, labels)
    # loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits, labels))
    # train_op = tf.train.GradientDescentOptimizer(1e-2).minimize(loss)
    train_op = cifar10.train(loss, global_step)
    top_k_op = tf.nn.in_top_k(logits, labels, 1)

    saver = tf.train.Saver(tf.all_variables())
    init = tf.initialize_all_variables()
    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)
    tf.train.start_queue_runners(sess=sess)

    true_count = 0
    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value, precisions = sess.run([train_op, loss, top_k_op])

      true_count += np.sum(precisions)

      if step % 10 == 0:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
        duration = time.time() - start_time
        print(' step %d, loss = %.3f, acc = %.3f, dur = %.2f' % 
             (step, loss_value, true_count/(FLAGS.batch_size*10), duration))
        true_count = 0
def tower_loss(scope):
  """Calculate the total loss on a single tower running the CIFAR model.
  Args:
    scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'
  Returns:
     Tensor of shape [] containing the total loss for a batch of data
  """
  # Get images and labels for CIFAR-10.
  images, labels = cifar10.distorted_inputs()
  # Build inference Graph.
  logits = cifar10.inference(images)
  # Build the portion of the Graph calculating the losses. Note that we will
  # assemble the total_loss using a custom function below.
  _ = cifar10.loss(logits, labels)
  # Assemble all of the losses for the current tower only.
  losses = tf.get_collection('losses', scope)
  # Calculate the total loss for the current tower.
  total_loss = tf.add_n(losses, name='total_loss')
  # Compute the moving average of all individual losses and the total loss.
  loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
  loss_averages_op = loss_averages.apply(losses + [total_loss])
  # Attach a scalar summmary to all individual losses and the total loss; do the
  # same for the averaged version of the losses.
  for l in losses + [total_loss]:
    # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
    # session. This helps the clarity of presentation on tensorboard.
    loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name)
    # Name each loss as '(raw)' and name the moving average version of the loss
    # as the original loss name.
    tf.scalar_summary(loss_name +' (raw)', l)
    tf.scalar_summary(loss_name, loss_averages.average(l))
  with tf.control_dependencies([loss_averages_op]):
    total_loss = tf.identity(total_loss)
  return total_loss
Exemple #13
0
def tower_loss(scope):
	images, labels = cifar10.distorted_inputs()
	logits = cifar10.inference(images)
	_ = cifar10.loss(logits, labels)
	losses = tf.get_collection('losses', scope)
	total_loss = tf.add_n(losses, name = 'total_loss')
	return total_loss
Exemple #14
0
def train():
    with tf.Graph().as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()

        # Get images and labels for training data.
        # Force input pipeline to CPU:0 to avoid operations sometimes ending up on
        # GPU and resulting in a slow down.
        with tf.device('/cpu:0'):
            images, labels = cifar10.distorted_inputs(
                custom_data_dir="./train_data")

        # Build a Graph that computes the logits predictions from the
        # inference model.
        # ccen: there're 128 classes instead of 10
        logits = cifar10.inference(images, 128)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""
            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(loss)  # Asks for loss value.

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time

                    loss_value = run_values.results
                    examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
                    sec_per_batch = float(duration / FLAGS.log_frequency)

                    format_str = (
                        '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[
                    tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                    tf.train.NanTensorHook(loss),
                    _LoggerHook()
                ],
                config=tf.ConfigProto(log_device_placement=FLAGS.
                                      log_device_placement)) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
Exemple #15
0
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.contrib.framework.get_or_create_global_step()

    # Get images and labels for CIFAR-10.
    # Force input pipeline to CPU:0 to avoid operations sometimes ending up on
    # GPU and resulting in a slow down.
    with tf.device('/cpu:0'):
      images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    saver = tf.train.Saver(tf.global_variables())

    # Build an initialization operation to run below.
    init = tf.global_variables_initializer()

    # Start running operations on the Graph. allow_soft_placement must be set to
    # True to build towers on GPU, as some of the ops do not have GPU
    # implementations.
    sess = tf.Session(config=tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = duration

        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print (format_str % (datetime.now(), step, loss_value,
                             examples_per_sec, sec_per_batch))

      # Save the model checkpoint periodically.
      if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
        evaluate()
Exemple #16
0
def tower_loss(scope):
    """Calculate the total loss on a single tower running the CIFAR model.
    Args:
      scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'
    Returns:
       Tensor of shape [] containing the total loss for a batch of data
    """
    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()
    # Build inference Graph.
    logits = cifar10.inference(images)
    # Build the portion of the Graph calculating the losses. Note that we will
    # assemble the total_loss using a custom function below.
    _ = cifar10.loss(logits, labels)
    # Assemble all of the losses for the current tower only.
    losses = tf.get_collection('losses', scope)
    # Calculate the total loss for the current tower.
    total_loss = tf.add_n(losses, name='total_loss')
    # Compute the moving average of all individual losses and the total loss.
    # loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
    # loss_averages_op = loss_averages.apply(losses + [total_loss])
    # Attach a scalar summary to all individual losses and the total loss; do the
    # same for the averaged version of the losses.
    # for l in losses + [total_loss]:
    # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
    # session. This helps the clarity of presentation on tensorboard.
    # loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name)
    # Name each loss as '(raw)' and name the moving average version of the loss
    # as the original loss name.
    # tf.scalar_summary(loss_name +' (raw)', l)
    # tf.scalar_summary(loss_name, loss_averages.average(l))
    # with tf.control_dependencies([loss_averages_op]):
    # total_loss = tf.identity(total_loss)
    return total_loss
Exemple #17
0
def train():
    with tf.Graph().as_default():
        images, labels = cifar10.distorted_inputs()

        logits = cifar10.inference(images)

        loss = cifar10.loss(logits, labels)

        global_step = tf.Variable(0, trainable=False)
        train_op = cifar10.train(loss, global_step=global_step)

        summary_op = tf.merge_all_summaries()

        init = tf.global_variables_initializer()

        sess = tf.Session()
        sess.run(init)

        tf.train.start_queue_runners(sess=sess)
        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        trainable_var = tf.trainable_variables()
        print([v.name for v in trainable_var])
        var = trainable_var[0]
        res = sess.run(var)
        print(type(res))
Exemple #18
0
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    # with tf.variable_scope("cifar10", reuse=tf.AUTO_REUSE) as scope:
    global_step = tf.train.get_or_create_global_step()

    # Get images and labels for CIFAR-10.
    train_flag = tf.placeholder(tf.bool, shape = ())

    trX, trY = cifar10.distorted_inputs()
    teX, teY = cifar10.inputs(eval_data = True)

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(trX)
    # Calculate accuracy
    tr_acc = cifar10.accuracy(logits, trY)[1]
    print(tr_acc, "tr_acc\n")
    # tr_acc_sum = tf.summary.scalar('train/accuracy', tr_acc)
    # Calculate loss.
    loss = cifar10.loss(logits, trY)
    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    tf.get_variable_scope().reuse_variables()
    eval_logits = cifar10.inference(teX)
    te_acc = cifar10.accuracy(eval_logits, teY)[1]
    print(te_acc, "te_acc\n")
    # te_acc_sum = tf.summary.scalar('test/accuracy', te_acc)

    accuracy = tf.cond(train_flag, lambda: tr_acc, lambda: te_acc)
    tf.summary.scalar("accuracy", accuracy)

    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter('tmp/cifar10/train')
    test_writer = tf.summary.FileWriter('tmp/cifar10/test')

    print("Training Starts")

    # Configs
    config = tf.ConfigProto(log_device_placement=FLAGS.log_device_placement)
    config.gpu_options.allow_growth=True
                
    mon_sess = tf.train.MonitoredTrainingSession(
            hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                   tf.train.NanTensorHook(loss)],config=config)
    step = -1
    while not mon_sess.should_stop():
      step += 1
      _,loss_value = mon_sess.run([train_op,loss])
      if step % FLAGS.log_frequency == 0:
          tr_acc,summary = mon_sess.run([accuracy,merged], feed_dict = {train_flag : True})
          train_writer.add_summary(summary, step)
          te_acc, summary = mon_sess.run([accuracy, merged], feed_dict = {train_flag : False})
          test_writer.add_summary(summary, step)

          format_str = ('%s: step %d, loss = %.2f, test accuracy = %.2f, train accuracy = %.2f')
          print (format_str % (datetime.now(), step, loss_value, te_acc, tr_acc))
def train():
    """Train CIFAR-10 for a number of steps."""

    g1 = tf.Graph()
    with g1.as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)
        grads = cifar10.train_part1(loss, global_step)

        only_gradients = [g for g, _ in grads]
        only_vars = [v for _, v in grads]
        placeholder_gradients = []

        #with tf.device("/gpu:0"):
        for grad_var in grads:
            placeholder_gradients.append(
                (tf.placeholder('float',
                                shape=grad_var[0].get_shape()), grad_var[1]))

        feed_dict = {}

        for i, grad_var in enumerate(grads):
            feed_dict[placeholder_gradients[i][0]] = np.zeros(
                placeholder_gradients[i][0].shape)

        train_op = cifar10.train_part2(global_step, placeholder_gradients)

        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
        feeds = []
        print("Reached here")
        for i, grad_var in enumerate(grads):
            feeds.append(placeholder_gradients[i][0])
        # Partial Run
        print("Reached here", len(feeds))
        for x in feeds:
            print(x, )
        h = sess.partial_run_setup([only_gradients, train_op], feeds)
        print("Reached here")

        for i in xrange(10):
            res_grads = sess.partial_run(h,
                                         only_gradients,
                                         feed_dict=feed_dict)

            feed_dict = {}
            for i, grad_var in enumerate(res_grads):
                feed_dict[placeholder_gradients[i][0]] = res_grads[i]

            res_train_op = sess.partial_run(h, train_op, feed_dict=feed_dict)
Exemple #20
0
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()

        # Start running operations on the Graph.
        sess = tf.Session(config=tf.ConfigProto(log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), "Model diverged with loss = NaN"

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = "%s: step %d, loss = %.2f (%.1f examples/sec; %.3f " "sec/batch)"
                print(format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, "model.ckpt")
                saver.save(sess, checkpoint_path, global_step=step)
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""
            def begin(self):
                self._step = -1

            def before_run(self, run_context):
                self._step += 1
                self._start_time = time.time()
                return tf.train.SessionRunArgs(loss)  # Asks for loss value.

            def after_run(self, run_context, run_values):
                duration = time.time() - self._start_time
                loss_value = run_values.results
                if self._step % 10 == 0:
                    num_examples_per_step = FLAGS.batch_size
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = float(duration)

                    format_str = (
                        '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))
                    global step_no
                    step_no = self._step

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[
                    tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                    tf.train.NanTensorHook(loss),
                    _LoggerHook()
                ],
                config=tf.ConfigProto(
                    log_device_placement=FLAGS.log_device_placement,
                    inter_op_parallelism_threads=4,
                    intra_op_parallelism_threads=0)) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
                """run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
def train():
    """ 多步训练CIFAR-10数据集 """
    with tf.Graph().as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()

        # 指定 CPU:0 进行数据的变形处理
        with tf.device('/cpu:0'):
            images, labels = cifar10.distorted_inputs()

        # inference 前向预测,定义在cifar10.py文件
        logits = cifar10.inference(images)

        # loss.
        loss = cifar10.loss(logits, labels)

        # train
        train_op = cifar10.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """记录 loss 和 runtime."""
            def __init__(self):
                self._start_time = time.time()
                self._step = -1

            def begin(self):
                pass

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(
                    loss)  # 将loss 操作加到Session.run()调用

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time

                    loss_value = run_values.results
                    examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
                    sec_per_batch = float(duration / FLAGS.log_frequency)

                    format_str = '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch)'
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[
                    tf.train.StopAtStepHook(
                        last_step=FLAGS.max_steps),  # 在运行指定步长后停止
                    tf.train.NanTensorHook(loss),  # 监视loss,如果loss==NaN,停止训练
                    _LoggerHook()
                ],
                # log_device_placement是否打印设备分配日志
                config=tf.ConfigProto(log_device_placement=FLAGS.
                                      log_device_placement)) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
def tower_loss(scope):
  """Calculate the total loss on a single tower running the CIFAR model.

  Args:
    scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'

  Returns:
     Tensor of shape [] containing the total loss for a batch of data
  """
  # Get images and labels for CIFAR-10.
  images, labels = cifar10.distorted_inputs()

  # Build inference Graph.
  if FLAGS.model == 'small_cnn':
    logits = cifar10.inference_small(images)
  elif FLAGS.model == 'resnet_18':
    logits = cifar10.inference_resnet(images, is_train=True)
  elif FLAGS.model == 'lcnn_resnet_18':
    logits = cifar10.inference_lcnn_resnet(images, is_train=True)
  elif FLAGS.model == 'resnet_lcnn_hybrid':
    logits = cifar10.inference_resnet_lcnn_hybrid(images, is_train=True)
  else:
    raise Exception('Unknown model type: {}'.format(FLAGS.model))

  # Build the portion of the Graph calculating the losses. Note that we will
  # assemble the total_loss using a custom function below.
  _ = cifar10.loss(logits, labels)

  # Assemble all of the losses for the current tower only.
  losses = tf.get_collection('losses', scope)

  # Calculate the total loss for the current tower.
  total_loss = tf.add_n(losses, name='total_loss')
  l1_loss = tf.add_n([0.] + [l for l in losses if l.op.name.find('l1_loss') != -1],
                     name='l1_sum_loss')
  l2_loss = tf.add_n([0.] + [l for l in losses if l.op.name.find('weight_loss') != -1],
                     name='l2_sum_loss')
  all_losses = losses + [total_loss, l1_loss, l2_loss]

  # Compute the moving average of all individual losses and the total loss.
  loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
  loss_averages_op = loss_averages.apply(all_losses)

  # Attach a scalar summary to all individual losses and the total loss; do the
  # same for the averaged version of the losses.
  for l in all_losses:
    # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
    # session. This helps the clarity of presentation on tensorboard.
    loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name)
    # Name each loss as '(raw)' and name the moving average version of the loss
    # as the original loss name.
    tf.scalar_summary(loss_name +' (raw)', l)
    tf.scalar_summary(loss_name, loss_averages.average(l))

  update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 
  updates = tf.group(*update_ops) # For correct batch norm execution
  with tf.control_dependencies([updates, loss_averages_op]):
    total_loss = tf.identity(total_loss)
  return total_loss
Exemple #24
0
def build_model_func():

    images, labels = cifar10.distorted_inputs()

    logits = cifar10.inference(images)

    loss = cifar10.loss(logits, labels)

    return loss
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.train.get_or_create_global_step()

    # Get images and labels for CIFAR-10.
    # Force input pipeline to CPU:0 to avoid operations sometimes ending up on
    # GPU and resulting in a slow down.
    with tf.device('/cpu:0'):
      images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    class _LoggerHook(tf.train.SessionRunHook):
      """Logs loss and runtime."""

      def begin(self):
        self._step = -1
        self._start_time = time.time()

      def before_run(self, run_context):
        self._step += 1
        return tf.train.SessionRunArgs(loss)  # Asks for loss value.

      def after_run(self, run_context, run_values):
        if self._step % FLAGS.log_frequency == 0:
          current_time = time.time()
          duration = current_time - self._start_time
          self._start_time = current_time

          loss_value = run_values.results
          examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
          sec_per_batch = float(duration / FLAGS.log_frequency)

          format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
          print (format_str % (datetime.now(), self._step, loss_value,
                               examples_per_sec, sec_per_batch))

    with tf.train.MonitoredTrainingSession(
        checkpoint_dir=FLAGS.train_dir,
        hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
               tf.train.NanTensorHook(loss),
               _LoggerHook()],
        config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)) as mon_sess:
      while not mon_sess.should_stop():
        mon_sess.run(train_op)
def tower_loss(scope):
    """Calculate the total loss on a single tower running the CIFAR model.
    Args:
        scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'
    Returns:
         Tensor of shape [] containing the total loss for a batch of data
    """
    # Build a Graph that computes the logits predictions from the
    # inference model.
    if tfFLAGS.network == 1:
        images, labels = cifar10.distorted_inputs()
        logits, fc1_w, fc1_b, fc2_w, fc2_b = MyModel.inference(images)
    else:
        images, labels = cifar10.distorted_inputs()
        logits, fc1_w, fc1_b, fc2_w, fc2_b = MyModel2.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

        # L2 regularization for the fully connected parameters.
    regularizers = (tf.nn.l2_loss(fc1_w) + tf.nn.l2_loss(fc1_b) + tf.nn.l2_loss(fc2_w) + tf.nn.l2_loss(fc2_b))

    # Add the regularization term to the loss.
    loss += 5e-4 * regularizers

    # Build the portion of the Graph calculating the losses. Note that we will
    # assemble the total_loss using a custom function below.
    _ = cifar10.loss(logits, labels)

    # Assemble all of the losses for the current tower only.
    losses = tf.get_collection('losses', scope)

    # Calculate the total loss for the current tower.
    total_loss = tf.add_n(losses, name='total_loss')

    # Attach a scalar summary to all individual losses and the total loss; do the
    # same for the averaged version of the losses.
    for l in losses + [total_loss]:
        # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
        # session. This helps the clarity of presentation on tensorboard.
        loss_name = re.sub('%s_[0-9]*/' % tfFLAGS.TOWER_NAME, '', l.op.name)
        tf.summary.scalar(loss_name, l)

    return total_loss
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.contrib.framework.get_or_create_global_step()

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    class _LoggerHook(tf.train.SessionRunHook):
      """Logs loss and runtime."""

      def begin(self):
        self._step = -1

      def before_run(self, run_context):
        self._step += 1
        self._start_time = time.time()
        return tf.train.SessionRunArgs(loss)  # Asks for loss value.

      def after_run(self, run_context, run_values):
        duration = time.time() - self._start_time
        loss_value = run_values.results
        if self._step % 10 == 0:
          num_examples_per_step = FLAGS.batch_size
          examples_per_sec = num_examples_per_step / duration
          sec_per_batch = float(duration)

          format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
          print (format_str % (datetime.now(), self._step, loss_value,
                               examples_per_sec, sec_per_batch))
          global step_no 
          step_no = self._step

    with tf.train.MonitoredTrainingSession(
        checkpoint_dir=FLAGS.train_dir,
        hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
               tf.train.NanTensorHook(loss),
               _LoggerHook()],
        config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement, inter_op_parallelism_threads=4,intra_op_parallelism_threads=0)) as mon_sess:
      while not mon_sess.should_stop():
        mon_sess.run(train_op)
        """run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
Exemple #28
0
def train():
    with g.as_default():

        global_step = tf.train.get_or_create_global_step()

        with tf.device('/cpu:0'):
            images, labels = cifar10.distorted_inputs()

        logits = cifar10.inference3model(images)
        stats_graph(g)
        loss = cifar10.loss(logits, labels)
        train_op = cifar10.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            def begin(self):

                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):

                self._step += 1
                # 在这里返回你想在运行过程中产看的信息,以list的形式传递,如:[loss, accuracy]
                return tf.train.SessionRunArgs(loss)

            def after_run(self, run_context, run_values):

                if self._step % FLAGS.log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time

                    # results返回的是上面before_run()的返回结果,上面是loss所以返回loss
                    # 如若上面返回的是个list,则返回的也是个list
                    loss_value = run_values.results
                    examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
                    sec_per_batch = float(duration / FLAGS.log_frequency)

                    print(
                        '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)'
                        % (datetime.now(), self._step, loss_value,
                           examples_per_sec, sec_per_batch))

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[
                    tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                    tf.train.NanTensorHook(loss),
                    _LoggerHook()
                ],
                save_checkpoint_secs=60,
                config=tf.ConfigProto(log_device_placement=FLAGS.
                                      log_device_placement)) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
Exemple #29
0
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()

    #为CIFAR-10获取(图像,label)样本对
    images, labels = cifar10.distorted_inputs()

    #创建图Graph用于从推断模型计算logits predictions
    logits = cifar10.inference(images)

    #计算损失
    loss = cifar10.loss(logits, labels)

    #创建图:用一个batch的样本来训练模型
    #更新模型参数
    train_op = cifar10.train(loss, global_step)

    class _LoggerHook(tf.train.SessionRunHook):
        """Logs loss and runtime"""
        def begin(self):
            self._step = -1
            self._start_time = time.time()

        def before_run(self, run_context):
            self._step += 1
            return tf.train.SessionRunArgs(loss)  #请求loss值

        def after_run(self, run_context, run_values):
            if self._step % FLAGS.log_frequency == 0:
                current_time = time.time()
                duration = current_time - _start_time
                self._start_time = current_time

                loss_value = run_values.results
                examples_per_sec = FLAGS.batch_size * log_frequency / duration
                sec_per_batch = float(duration / log_frequency)

                format_str = (
                    '%s: step %d, loss = %.2f(%.1f examples/sec, %.3f sec/batch)'
                )
                print(format_str % (datetime.now(), self._step, loss_value,
                                    examples_per_sec, sec_per_batch))

    with tf.train.MonitoredTrainingSession(
            checkpoint_dir=FLAGS.train_dir,
            hooks=[
                tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                tf.train.NanTensorHook(loss),
                _LoggerHook()
            ],
            config=tf.ConfigProto(
                log_device_placement=FLAGS.log_device_placement)) as mon_sess:
        while not mon_sess.should_stop():
            mon_sess.run(train_op)
Exemple #30
0
def train():
    """ CIFAR10训练函数 """
    with tf.Graph().as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()

        # 从数据中读取图像和标签.
        images, labels = cifar10.distorted_inputs()

        # 建立一个图用来计算模型的预测结果
        logits = cifar10.inference(images)

        # 计算误差
        loss = cifar10.loss(logits, labels)

        # 构建图用一个batch的训练数据来训练模型并更新参数
        train_op = cifar10.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """ 记录误差和运行时间 """
            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(loss)  # 计算误差值

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time

                    loss_value = run_values.results
                    examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
                    sec_per_batch = float(duration / FLAGS.log_frequency)

                    format_str = (
                        '%s: step %d, loss = %.5f (%.1f examples/sec; %.3f sec/batch)'
                    )
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[
                    tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                    tf.train.NanTensorHook(loss),
                    _LoggerHook()
                ],
                config=tf.ConfigProto(log_device_placement=FLAGS.
                                      log_device_placement)) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
def train():
    """
    Train CIFAR-10 for a number of steps
    :return:
    """
    with tf.Graph().as_default():
        global_step = tf.train.get_or_create_global_step()

        # 获取CIFAR-10的images和labels
        # 让CPU专注流输入,避免GPU处理完,导致停顿
        with tf.device('/cpu:0'):
            images, labels = cifar10.distorted_inputs()

        # 预测结果 以及 loss
        logits = cifar10.inference(images)
        loss = cifar10.loss(logits, labels)

        train_op = cifar10.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """ log loss and runtime. """

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(loss)

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time

                    loss_value = run_values.results
                    examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
                    sec_per_batch = float(duration / FLAGS.log_frequency)

                    format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                                  'sec/batch)')

                    print(format_str % (datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                       tf.train.NanTensorHook(loss),
                       _LoggerHook()],
                config=tf.ConfigProto(log_device_placement=FLAGS.log_device_placement)) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
Exemple #32
0
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.train.get_or_create_global_step()

        # Get images and labels for CIFAR-10.
        # Force input pipeline to CPU:0 to avoid operations sometimes ending up on
        # GPU and resulting in a slow down.
        with tf.device('/cpu:0'):
            images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""
            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(loss)  # Asks for loss value.

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency == 0:
                    current_time = time.time()
                    self._start_time = current_time

        for step in range(0, FLAGS.max_steps + 1, FLAGS.log_frequency):
            print(str(step))
            with tf.train.MonitoredTrainingSession(
                    checkpoint_dir=FLAGS.train_dir,
                    hooks=[
                        tf.train.StopAtStepHook(last_step=step),
                        tf.train.NanTensorHook(loss),
                        _LoggerHook()
                    ],
                    config=tf.ConfigProto(log_device_placement=FLAGS.
                                          log_device_placement)) as mon_sess:
                while not mon_sess.should_stop():
                    mon_sess.run(train_op)
            # evaluate test data
            cifar10_eval.evaluate()
            # evaluate train data
            evaluate()
Exemple #33
0
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.contrib.framework.get_or_create_global_step()

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    class _LoggerHook(tf.train.SessionRunHook):
      """Logs loss and runtime."""

      def begin(self):
        self._step = -1

      def before_run(self, run_context):
        self._step += 1
        self._start_time = time.time()
        return tf.train.SessionRunArgs([loss, logits, labels])  # Asks for loss value.

      def after_run(self, run_context, run_values):
        duration = time.time() - self._start_time
        loss_value, predictions, targets = run_values.results
        if self._step % FLAGS.print_eval_interval == 0:
          num_examples_per_step = FLAGS.batch_size
          examples_per_sec = num_examples_per_step / duration
          sec_per_batch = float(duration)
          accuracy = 100.0 * np.mean(np.argmax(predictions, 1) == targets)

          format_str = ('%s: step %d, loss = %.2f, train acc = %.1f%%, '
                        '(%.1f examples/sec; %.3f sec/batch)')
          print (format_str % (datetime.now(), self._step, loss_value,
                               accuracy, examples_per_sec, sec_per_batch))

    with tf.train.MonitoredTrainingSession(
        checkpoint_dir=FLAGS.train_dir,
        hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
               tf.train.NanTensorHook(loss),
               _LoggerHook()],
        save_checkpoint_secs=FLAGS.save_checkpoint_secs,
        config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)) as mon_sess:
      while not mon_sess.should_stop():
        mon_sess.run(train_op)
Exemple #34
0
def train():
    global updated_batch_size_num
    global passed_info
    global shall_update
    ps_hosts = FLAGS.ps_hosts.split(',')
    worker_hosts = FLAGS.worker_hosts.split(',')
    print ('PS hosts are: %s' % ps_hosts)
    print ('Worker hosts are: %s' % worker_hosts)
    issync = FLAGS.sync

    #server = tf.train.Server({'ps': ps_hosts, 'worker': worker_hosts},
    #                         job_name = FLAGS.job_name,
    #                         task_index=FLAGS.task_id)
    cluster = tf.train.ClusterSpec({'ps': ps_hosts, 'worker': worker_hosts})
    server = tf.train.Server(cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_index)


#    batchSizeManager = BatchSizeManager(FLAGS.batch_size, len(worker_hosts))

    if FLAGS.job_name == 'ps':
#	rpcServer = batchSizeManager.create_rpc_server(ps_hosts[0].split(':')[0])
#        rpcServer.serve()
        server.join()
    elif FLAGS.job_name == "worker":
        time.sleep(10)


#    rpcClient = batchSizeManager.create_rpc_client(ps_hosts[0].split(':')[0])
    is_chief = (FLAGS.task_index == 0)
    if is_chief:
        if tf.gfile.Exists(FLAGS.train_dir):
            tf.gfile.DeleteRecursively(FLAGS.train_dir)
        tf.gfile.MakeDirs(FLAGS.train_dir)

# modified by faye
#    device_setter = tf.train.replica_device_setter(ps_tasks=1)
#    with tf.device('/job:worker/task:%d' % FLAGS.task_index):
#        with tf.device(device_setter):
#            global_step = tf.Variable(0, trainable=False)
    with tf.device(tf.train.replica_device_setter(
            worker_device="/job:worker/task:%d" % FLAGS.task_index,
            cluster=cluster
            )):

            global_step = tf.get_variable(
                    'global_step', [],
                    initializer=tf.constant_initializer(0), trainable=False)

        decay_steps = 50000*350.0/FLAGS.batch_size
        batch_size = tf.placeholder(dtype=tf.int32, shape=(), name='batch_size')
            images, labels = cifar10.distorted_inputs(batch_size)
            print('zx0')
            print(images.get_shape().as_list())
Exemple #35
0
def train():
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)
        num_batches_per_epoch = cifar10.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / batch_size
        decay_steps = int(num_batches_per_epoch * cifar10.NUM_EPOCHS_PER_DECAY)
        lr = tf.train.exponential_decay(cifar10.INITIAL_LEARNING_RATE,
                                        global_step,
                                        decay_steps,
                                        cifar10.LEARNING_RATE_DECAY_FACTOR,
                                        staircase=True)
        opt = tf.train.GradientDescentOptimizer(lr)

        images, labels = cifar10.distorted_inputs()
        # 使用预加载的队列,使用这个会速度可以提高五倍,亲测
        batch_queue = tf.contrib.slim.prefetch_queue.prefetch_queue(
            [images, labels], capacity=2 * FLAGS.num_gpus)
        tower_grads = []
        for i in range(num_gpus):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('%s_%d' % (cifar10.TOWER_NAME, i)) as scope:
                    loss = tower_loss(scope, images, labels)
                    tf.get_variable_scope().reuse_variables()
                    grads = opt.compute_gradients(loss)
                    tower_grads.append(grads)

        grads = average_gradients(tower_grads)
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        saver = tf.train.Saver(tf.all_variables())
        init = tf.global_variables_initializer()
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        sess.run(init)
        tf.train.start_queue_runners(sess=sess)

        for step in range(max_steps):
            start_time = time.time()
            _, loss_value = sess.run([apply_gradient_op, loss])
            duration = time.time() - start_time

            if step % 10 == 0:
                num_examples_per_step = batch_size * num_gpus
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = duration / num_gpus

                format_str = ('step %d, loss= %.2f (%.1f example/sec; %.3f'
                              'sec/batch)')
                print(format_str %
                      (step, loss_value, examples_per_sec, sec_per_batch))

                if step % 1000 == 0 or (step + 1) == max_steps:
                    saver.save(sess, '/', global_step=step)
def train():
    with tf.Graph().as_default():
        images, labels = cifar10.distorted_inputs()

        sess = tf.Session()
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord, sess=sess)
        for i in range(10):
            im, label = sess.run([images, labels])
            print(im.shape)
        coord.request_stop()
        coord.join(threads)
Exemple #37
0
def train():
    # todo:这句话啥意识
    with tf.Graph().as_default():
        # todo:啥意思
        global_step = tf.train.get_or_create_global_step()
        with tf.device('/cpu:0'):
            images, labels = cifar10.distorted_inputs()
        # 推测模型
        logits = cifar10.inference(images)
        loss = cifar10.loss(logits, labels)
        train_op = cifar10.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """
            logs loss and runtime
            """
            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(loss)

            def after_run(
                    self,
                    run_context,  # pylint: disable=unused-argument
                    run_values):
                if self._step % FLAGS.log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time

                    loss_value = run_values.results
                    example_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
                    sec_per_batch = float(duration / FLAGS.log_frequency)
                    format_str = (
                        '%s: step %d,loss=%.2f (%.1f example/sec);%.3f sec/batch'
                    )
                    print(format_str)

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[
                    tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                    tf.train.NanTensorHook(loss),
                    _LoggerHook()
                ],
                config=tf.ConfigProto(log_device_placement=FLAGS.
                                      log_device_placement)) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
def run_training():
    with tf.Graph().as_default(), tf.device('/gpu:0'):
        global_step = tf.Variable(0, trainable=False)

        with tf.device('/cpu:0'):
            images, labels = cifar10.distorted_inputs()

        images_placeholder, labels_placeholder = placeholder_inputs(
            FLAGS.batch_size, cifar10.IMAGE_SIZE)

        logits = cifar10.inference(images_placeholder)
        losses_dict = cifar10.loss(logits, labels_placeholder)

        moving_averages_op = cifar10.add_summaries_and_moving_avgs(
            losses_dict, global_step)

        lbfgs_optimizer = customized_optimizer.CustomizedOptimizerInterface(
            global_step=global_step,
            loss_dict=losses_dict,
            data_fetches=[images, labels],
            data_placeholders=(images_placeholder, labels_placeholder),
            maxiter=FLAGS.max_steps)

        saver = tf.train.Saver(tf.global_variables(), max_to_keep=25)

        summary_op = tf.summary.merge_all()

        init = tf.global_variables_initializer()

        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True,
                log_device_placement=FLAGS.log_device_placement,
                gpu_options=tf.GPUOptions(
                    per_process_gpu_memory_fraction=0.4))) as sess:
            sess.run(init)

            coordinator = tf.train.Coordinator()
            try:
                threads = tf.train.start_queue_runners(sess=sess,
                                                       coord=coordinator)

                lbfgs_optimizer.minimize(session=sess,
                                         moving_averages_op=moving_averages_op,
                                         summary_op=summary_op,
                                         saver=saver,
                                         step_callback=step_callback)

            except Exception as e:
                coordinator.request_stop(e)

            coordinator.request_stop()
            coordinator.join(threads, stop_grace_period_secs=10)
Exemple #39
0
def train():
    """Train CIFAR-10 for a number of steps."""
    sess = tf.InteractiveSession()
    #sess = tf_debug.LocalCLIDebugWrapperSession(sess)

    # Get images and labels for CIFAR-10.
    with tf.device('/cpu:0'):
        images, labels = cifar10.distorted_inputs()

    # Define all the fixed point variables we will be using later
    cifar10.initialize_fix_point_variables()

    # Build a Graph that computes the logits predictions from the inference model
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, 0.05)

    # Update fixed point conversion parameters when needed
    update_fix_pt_ops = cifar10.update_fix_point_accuracy()

    # Merge all the summaries and write them out to
    # FLAGS.log_dir
    merged_summary = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(FLAGS.log_dir + '/train', sess.graph)
    test_writer = tf.summary.FileWriter(FLAGS.log_dir + '/test')

    # init all variables
    tf.global_variables_initializer().run()
    # create a saver for checkpoints
    saver = tf.train.Saver(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))
    # needed on interactive session so it doesn't hang
    tf.train.start_queue_runners()

    for i in range(FLAGS.max_steps):
        summary, _ = sess.run([merged_summary, train_op])
        train_writer.add_summary(summary, i)  # summary

        if (i % 10 == 0):
            saver.save(sess, FLAGS.log_dir + '/checkpoint', global_step=i)

        if (i % 5 == 0):
            sess.run([update_fix_pt_ops])
            print('Step: %s, Loss: %s' % (i, loss.eval()))

    train_writer.close()
def train():
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        images, labels = cifar10.distorted_inputs()

        logits = cifar10.inference(images)

        loss = cifar10.loss(logits, labels)

        train_op = cifar10.train(loss, global_step)

        saver = tf.train.Saver(tf.all_variables())

        summary_op = tf.merge_all_summaries()

        init = tf.initialize_all_variables()

        sess = tf.Session(config=tf.ConfigProto(log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, graph_def=sess.graph_def)

        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), "Model diverged with loss = NaN"

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = "%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)"
                print(format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, "model.ckpt")
                saver.save(sess, checkpoint_path, global_step=step)
Exemple #41
0
def train_vgg16():
    with tf.Graph().as_default():
        image_size = 224  # 输入图像尺寸
        # 生成随机数测试是否能跑通
        #images = tf.Variable(tf.random_normal([batch_size, image_size, image_size, 3], dtype=tf.float32, stddev=1e-1))
        with tf.device('/cpu:0'):
            images, labels = cifar10.distorted_inputs()
        keep_prob = tf.placeholder(tf.float32)
        prediction,softmax,fc8,p = inference_op(images,keep_prob)
        init = tf.global_variables_initializer()
        sess = tf.Session()
        sess.run(init)
        time_tensorflow_run(sess, prediction,{keep_prob:1.0}, "Forward")
        # 用以模拟训练的过程
        objective = tf.nn.l2_loss(fc8)  # 给一个loss
        grad = tf.gradients(objective, p)  # 相对于loss的 所有模型参数的梯度
        time_tensorflow_run(sess, grad, {keep_prob:0.5},"Forward-backward")
def train():
  """Train CIFAR-10 for a number of steps."""



  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)


    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    # Create a saver.
    saver = tf.train.Saver(tf.all_variables())

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()


    # # Visualize conv1 features
    # with tf.variable_scope('conv1') as scope_conv:
    #     #tf.get_variable_scope().reuse_variables()
    #     scope_conv.reuse_variables()
    #     weights = tf.get_variable('weights')
    #     grid_x = grid_y = 8   # to get a square grid for 64 conv1 features
    #     grid = put_kernels_on_grid (weights, (grid_y, grid_x))
    #     tf.image_summary('conv1/features', grid, max_images=1)


    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()

    # Start running operations on the Graph.
    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)



    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir,
                                            graph_def=sess.graph_def)


    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time


      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / float(duration)
        sec_per_batch = float(duration)

        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print (format_str % (datetime.now(), step, loss_value,
                             examples_per_sec, sec_per_batch))

      if step % 100 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)

      # Save the model checkpoint periodically.
      if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
Exemple #43
0
def train(lambs):
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)
        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        lambs = tf.constant(lambs, dtype=tf.float32)
        loss = infor.loss(logits, labels, lambs)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op, lr_op = cifar10.train(loss, global_step)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        if ckpt and ckpt.model_checkpoint_path:
            # Restores from checkpoint
            saver.restore(sess, ckpt.model_checkpoint_path)
            # Assuming model_checkpoint_path looks something like:
            #   /my-favorite-path/cifar10_train/model.ckpt-0,
            # extract global_step from it.
            previous_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
            print ('Start training from previous')
        else:
            print('Strart training from step 0')
            previous_step = 0
            init = tf.initialize_all_variables()
            sess.run(init)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir,
                                                graph_def=sess.graph_def)

        step = previous_step
        while step <= FLAGS.max_steps:
            start_time = time.time()
            _, loss_value, lr_value = sess.run([train_op, loss, lr_op])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = ('%s: step %d, loss = %.2f, lr_value = %.4f (%.1f examples/sec; %.3f '
                              'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value, lr_value,
                                    examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)

            step += 1
def train():
  """Train a model for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

    # Get images and labels for a segmentation model.
    images, labels, ground_truth = cifar10.distorted_inputs()
    tf.histogram_summary('label_hist/with_ignore', labels)
    tf.histogram_summary('label_hist/ground_truth', ground_truth)
    
    # Build a Graph that computes the logits predictions from the
    # inference model.
    print("before inference")
    print(images.get_shape())
    logits, nr_params = cifar10.inference(images)
    print("nr_params: "+str(nr_params) )
    print("after inference")
    # Calculate loss.
    loss = cifar10.loss(logits, labels)
    accuracy, precision, cat_accs = cifar10.accuracy(logits, ground_truth)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    # Create a saver.
    saver = tf.train.Saver(tf.all_variables())
#    tf.image_summary('images2', images)
    print (logits)
#    tf.image_summary('predictions', logits)

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()

    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()

    # Start running operations on the Graph.
    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
    if ckpt and ckpt.model_checkpoint_path:
      # Restores from checkpoint
      saver.restore(sess, ckpt.model_checkpoint_path)
      # Assuming model_checkpoint_path looks something like:
      #   /my-favorite-path/cifar10_train/model.ckpt-0,
      # extract global_step from it.
      global_step = int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
    else:
      print('No checkpoint file found')
      print('Initializing new model')
      sess.run(init)
      global_step = 0


    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

    for step in xrange(global_step, FLAGS.max_steps):
      start_time = time.time()
      _, loss_value, accuracy_value, precision_value, cat_accs_val  = sess.run([train_op,
                                                                                loss,
                                                                                accuracy,
                                                                                precision,
                                                                                cat_accs])
                                                                  
      duration = time.time() - start_time

      print (precision_value)
      print (cat_accs_val)
      
      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      #precision_value = [0 if np.isnan(p) else p for p in precision_value]
      #print (precision_value)
      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)

        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)\n Accuracy = %.4f, mean average precision = %.4f')
        print (format_str % (datetime.now(), step, loss_value,
                             examples_per_sec, sec_per_batch,
                             accuracy_value, np.mean(precision_value)))

      if step % 100 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)

        summary = tf.Summary()
        summary.value.add(tag='Accuracy (raw)', simple_value=float(accuracy_value))
        for i,s in enumerate(CLASSES):
          summary.value.add(tag="precision/"+s+" (raw)",simple_value=float(precision_value[i]))
          summary.value.add(tag="accs/"+s+" (raw)",simple_value=float(cat_accs_val[i]))
#        summary.value.add(tag='Human precision (raw)', simple_value=float(precision_value))
        summary_writer.add_summary(summary, step)
        print("hundred steps")
      # Save the model checkpoint periodically.
      if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
        print("thousand steps")
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
def main(_):

    class _LoggerHook(tf.train.SessionRunHook):
        """Logs loss and runtime."""

        def begin(self):
            self._step = -1

        def before_run(self, run_context):
            self._step += 1
            self._start_time = time.time()
            return tf.train.SessionRunArgs(loss)  # Asks for loss value.

        def after_run(self, run_context, run_values):
            duration = time.time() - self._start_time
            loss_value = run_values.results
            if self._step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                            'sec/batch)')
                print (format_str % (datetime.now(), self._step, loss_value,
                                    examples_per_sec, sec_per_batch))
    ps_hosts = FLAGS.ps_hosts.split(",")
    worker_hosts = FLAGS.worker_hosts.split(",")

    # Create a cluster from the parameter server and worker hosts.
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})

    # Create and start a server for the local task.
    server = tf.train.Server(cluster,
                            job_name=FLAGS.job_name,
                            task_index=FLAGS.task_index)

    if FLAGS.job_name == "ps":
        server.join()
    elif FLAGS.job_name == "worker":

        # Assigns ops to the local worker by default.
        with tf.device(tf.train.replica_device_setter(
            worker_device="/job:worker/task:%d" % FLAGS.task_index,
            cluster=cluster)):

            global_step = tf.contrib.framework.get_or_create_global_step()

            # Get images and labels for CIFAR-10.
            images, labels = cifar10.distorted_inputs()

            # Build inference Graph.
            logits = cifar10.inference(images)

            # Build the portion of the Graph calculating the losses. Note that we will
            # assemble the total_loss using a custom function below.
            loss = cifar10.loss(logits, labels)

            # Build a Graph that trains the model with one batch of examples and
            # updates the model parameters.
            train_op = cifar10.train(loss,global_step)

        # The StopAtStepHook handles stopping after running given steps.
        hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps), _LoggerHook()]

        # The MonitoredTrainingSession takes care of session initialization,
        # restoring from a checkpoint, saving to a checkpoint, and closing when done
        # or an error occurs.
        with tf.train.MonitoredTrainingSession(master=server.target,
                                                is_chief=(FLAGS.task_index == 0),
                                                checkpoint_dir=FLAGS.train_dir,
                                                save_checkpoint_secs=60,
                                                hooks=hooks) as mon_sess:
            while not mon_sess.should_stop():
                # Run a training step asynchronously.
                # See `tf.train.SyncReplicasOptimizer` for additional details on how to
                # perform *synchronous* training.
                # mon_sess.run handles AbortedError in case of preempted PS.
                mon_sess.run(train_op)
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        if tfFLAGS.network == 1:
            images, labels = cifar10.distorted_inputs()
            logits, fc1_w, fc1_b, fc2_w, fc2_b = MyModel.inference(images)
        else:
            images, labels = cifar10.distorted_inputs()
            logits, fc1_w, fc1_b, fc2_w, fc2_b = MyModel2.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

            # L2 regularization for the fully connected parameters.
        regularizers = (tf.nn.l2_loss(fc1_w) + tf.nn.l2_loss(fc1_b) + tf.nn.l2_loss(fc2_w) + tf.nn.l2_loss(fc2_b))

        # Add the regularization term to the loss.
        loss += 5e-4 * regularizers

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(loss)    # Asks for loss value.

            def after_run(self, run_context, run_values):
                if self._step % tfFLAGS.log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time

                    loss_value = run_values.results
                    examples_per_sec = tfFLAGS.log_frequency * tfFLAGS.batch_size / duration
                    sec_per_batch = float(duration / tfFLAGS.log_frequency)

                    format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                                                'sec/batch)')
                    print_(format_str % (datetime.now(), self._step, loss_value, examples_per_sec, sec_per_batch))
        
        texts = ['conv1:', 'conv1Biases:', 'conv2:', 'conv2Biases:', 'local3:', 'local3Biases:', 'local4:', 'local4Biases:', 'softmax:', 'softmaxBiases:']
        total_parameters = 0; count = 0
        for variable in tf.trainable_variables():
            variable_parametes = 1
            for dim in variable.get_shape():
                    variable_parametes *= dim.value
            print('Number of hidden parameters of ' + texts[count], variable_parametes)
            total_parameters += variable_parametes
            count += 1
        print('Total Number of hidden parameters:', total_parameters)

        with tf.train.MonitoredTrainingSession(checkpoint_dir=tfFLAGS.train_dir,
                hooks=[tf.train.StopAtStepHook(last_step=tfFLAGS.max_steps), tf.train.NanTensorHook(loss),_LoggerHook()],
                config=tf.ConfigProto( device_count = {'GPU': 0}, log_device_placement=tfFLAGS.log_device_placement)) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
def main_fun(argv, ctx):
  import tensorflow as tf
  import cifar10

  sys.argv = argv
  FLAGS = tf.app.flags.FLAGS
  tf.app.flags.DEFINE_string('train_dir', '/tmp/cifar10_train',
                             """Directory where to write event logs """
                             """and checkpoint.""")
  tf.app.flags.DEFINE_integer('max_steps', 1000000,
                              """Number of batches to run.""")
  tf.app.flags.DEFINE_boolean('log_device_placement', False,
                              """Whether to log device placement.""")
  tf.app.flags.DEFINE_boolean('rdma', False, """Whether to use rdma.""")

  # cifar10.maybe_download_and_extract()
  if tf.gfile.Exists(FLAGS.train_dir):
    tf.gfile.DeleteRecursively(FLAGS.train_dir)
  tf.gfile.MakeDirs(FLAGS.train_dir)

  cluster_spec, server = TFNode.start_cluster_server(ctx, 1, FLAGS.rdma)

  # Train CIFAR-10 for a number of steps.
  with tf.Graph().as_default():
    global_step = tf.contrib.framework.get_or_create_global_step()

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    class _LoggerHook(tf.train.SessionRunHook):
      """Logs loss and runtime."""

      def begin(self):
        self._step = -1

      def before_run(self, run_context):
        self._step += 1
        self._start_time = time.time()
        return tf.train.SessionRunArgs(loss)  # Asks for loss value.

      def after_run(self, run_context, run_values):
        duration = time.time() - self._start_time
        loss_value = run_values.results
        if self._step % 10 == 0:
          num_examples_per_step = FLAGS.batch_size
          examples_per_sec = num_examples_per_step / duration
          sec_per_batch = float(duration)

          format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
          print (format_str % (datetime.now(), self._step, loss_value,
                               examples_per_sec, sec_per_batch))

    with tf.train.MonitoredTrainingSession(
        checkpoint_dir=FLAGS.train_dir,
        hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
               tf.train.NanTensorHook(loss),
               _LoggerHook()],
        config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)) as mon_sess:
      while not mon_sess.should_stop():
        mon_sess.run(train_op)
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default(), tf.device('/cpu:0'):
    # Create a variable to count the number of train() calls. This equals the
    # number of batches processed * FLAGS.num_gpus.
    global_step = tf.get_variable(
        'global_step', [],
        initializer=tf.constant_initializer(0), trainable=False)

    # Calculate the learning rate schedule.
    num_batches_per_epoch = (cifar10.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
                             FLAGS.batch_size)
    decay_steps = int(num_batches_per_epoch * cifar10.NUM_EPOCHS_PER_DECAY)

    # Decay the learning rate exponentially based on the number of steps.
    lr = tf.train.exponential_decay(cifar10.INITIAL_LEARNING_RATE,
                                    global_step,
                                    decay_steps,
                                    cifar10.LEARNING_RATE_DECAY_FACTOR,
                                    staircase=True)

    # Create an optimizer that performs gradient descent.
    opt = tf.train.GradientDescentOptimizer(lr)

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()
    batch_queue = tf.contrib.slim.prefetch_queue.prefetch_queue(
          [images, labels], capacity=2 * FLAGS.num_gpus)
    # Calculate the gradients for each model tower.
    tower_grads = []
    with tf.variable_scope(tf.get_variable_scope()):
      for i in xrange(FLAGS.num_gpus):
        with tf.device('/gpu:%d' % i):
          with tf.name_scope('%s_%d' % (cifar10.TOWER_NAME, i)) as scope:
            # Dequeues one batch for the GPU
            image_batch, label_batch = batch_queue.dequeue()
            # Calculate the loss for one tower of the CIFAR model. This function
            # constructs the entire CIFAR model but shares the variables across
            # all towers.
            loss = tower_loss(scope, image_batch, label_batch)

            # Reuse variables for the next tower.
            tf.get_variable_scope().reuse_variables()

            # Retain the summaries from the final tower.
            summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)

            # Calculate the gradients for the batch of data on this CIFAR tower.
            grads = opt.compute_gradients(loss)

            # Keep track of the gradients across all towers.
            tower_grads.append(grads)

    # We must calculate the mean of each gradient. Note that this is the
    # synchronization point across all towers.
    grads = average_gradients(tower_grads)

    # Add a summary to track the learning rate.
    summaries.append(tf.summary.scalar('learning_rate', lr))

    # Add histograms for gradients.
    for grad, var in grads:
      if grad is not None:
        summaries.append(tf.summary.histogram(var.op.name + '/gradients', grad))

    # Apply the gradients to adjust the shared variables.
    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

    # Add histograms for trainable variables.
    for var in tf.trainable_variables():
      summaries.append(tf.summary.histogram(var.op.name, var))

    # Track the moving averages of all trainable variables.
    variable_averages = tf.train.ExponentialMovingAverage(
        cifar10.MOVING_AVERAGE_DECAY, global_step)
    variables_averages_op = variable_averages.apply(tf.trainable_variables())

    # Group all updates to into a single train op.
    train_op = tf.group(apply_gradient_op, variables_averages_op)

    # Create a saver.
    saver = tf.train.Saver(tf.global_variables())

    # Build the summary operation from the last tower summaries.
    summary_op = tf.summary.merge(summaries)

    # Build an initialization operation to run below.
    init = tf.global_variables_initializer()

    # Start running operations on the Graph. allow_soft_placement must be set to
    # True to build towers on GPU, as some of the ops do not have GPU
    # implementations.
    sess = tf.Session(config=tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = duration / FLAGS.num_gpus

        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print (format_str % (datetime.now(), step, loss_value,
                             examples_per_sec, sec_per_batch))

      if step % 100 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)

      # Save the model checkpoint periodically.
      if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
def train():

  # # debug

  # true_classes = np.ndarray(shape=(FLAGS.batch_size, 1), dtype=int)
  # true_classes.fill(2)


  # # Create a pair of constant ops, add the numpy 
  # # array matrices.
  # true_classes_tf_matrix = tf.constant(true_classes, dtype=tf.int64)

  # # playing with introducing the sampler
  # classes_sampler = tf.nn.learned_unigram_candidate_sampler(
  #                                     true_classes_tf_matrix, 
  #                                     1,                # true_classes
  #                                     5,                # num_sampled
  #                                     False,            # unique
  #                                     10,               # range_max
  #                                     seed=None, 
  #                                     name="my_classes_sampler")

  # # print(classes_sampler)
  # # print("debug")
  # # print(classes_sampler.set_sampler)
  # # exit()

  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # print("images")
    # print(images)
    # images = tf.Print(images, [images])
    # print()
    # print(images[1])

    print("------------------- train calling interference ---------------------")
    print(cifar10.__file__)

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    # Create a saver.
    saver = tf.train.Saver(tf.all_variables())

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()

    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()

    # Start running operations on the Graph.
    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

    for step in xrange(FLAGS.max_steps):

      # manually load the contents of images and labels
      # before calling this sess.run()
      # 1. have Cifar10 dataset in memory
      # 2. create a mini-batch
      # 3. set the placeholders/vars to the the mini-batch data
      # 4. run one forward-backward step

      # print("training step: " + str(step))

      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)

        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print (format_str % (datetime.now(), step, loss_value,
                             examples_per_sec, sec_per_batch))

      # debug, temp change, go back to the one below
      summary_str = sess.run(summary_op)
      # print("summary: " + summary_str)
      summary_writer.add_summary(summary_str, step)
      summary_writer.flush()

      # if step % 100 == 0:
      #   summary_str = sess.run(summary_op)
      #   # print("summary: " + summary_str)
      #   summary_writer.add_summary(summary_str, step)
      #   summary_writer.flush()

      # Save the model checkpoint periodically.
      if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)