Example #1
def tower_loss(scope):
  """Calculate the total loss on a single tower running the CIFAR model.
  Args:
    scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'
  Returns:
     Tensor of shape [] containing the total loss for a batch of data
  """
  # Get images and labels for CIFAR-10.
  images, labels = cifar10.distorted_inputs()
  # Build inference Graph.
  logits = cifar10.inference(images)
  # Build the portion of the Graph calculating the losses. Note that we will
  # assemble the total_loss using a custom function below.
  _ = cifar10.loss(logits, labels)
  # Assemble all of the losses for the current tower only.
  losses = tf.get_collection('losses', scope)
  # Calculate the total loss for the current tower.
  total_loss = tf.add_n(losses, name='total_loss')
  # Compute the moving average of all individual losses and the total loss.
  loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
  loss_averages_op = loss_averages.apply(losses + [total_loss])
  # Attach a scalar summary to all individual losses and the total loss; do the
  # same for the averaged version of the losses.
  for l in losses + [total_loss]:
    # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
    # session. This helps the clarity of presentation on tensorboard.
    loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name)
    # Name each loss as '(raw)' and name the moving average version of the loss
    # as the original loss name.
    tf.scalar_summary(loss_name + ' (raw)', l)
    tf.scalar_summary(loss_name, loss_averages.average(l))
  with tf.control_dependencies([loss_averages_op]):
    total_loss = tf.identity(total_loss)
  return total_loss
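The 'scope' argument matters because this same graph-building code is run once per GPU. Below is a minimal sketch, not taken from the original example, of how a tower_loss() like the one above is typically driven; FLAGS.num_gpus and the plain SGD optimizer are assumptions, while cifar10.TOWER_NAME is the constant the example itself references.

# Hedged sketch: per-GPU loop around tower_loss(); FLAGS.num_gpus and the
# optimizer are assumptions, not part of the example above.
opt = tf.train.GradientDescentOptimizer(0.1)
tower_grads = []
with tf.variable_scope(tf.get_variable_scope()):
  for i in xrange(FLAGS.num_gpus):
    with tf.device('/gpu:%d' % i):
      with tf.name_scope('%s_%d' % (cifar10.TOWER_NAME, i)) as scope:
        # Build this tower's sub-graph; its losses land in the 'losses'
        # collection under this name scope, which is what tower_loss reads.
        loss = tower_loss(scope)
        # Share model variables with the next tower.
        tf.get_variable_scope().reuse_variables()
        tower_grads.append(opt.compute_gradients(loss))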
Example #2
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():

    global_step = tf.Variable(0, trainable=False)
    images, labels = cifar10.distorted_inputs()
    logits = cifar10.inference(images)
    loss = cifar10.loss(logits, labels)
    # loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits, labels))
    # train_op = tf.train.GradientDescentOptimizer(1e-2).minimize(loss)
    train_op = cifar10.train(loss, global_step)
    top_k_op = tf.nn.in_top_k(logits, labels, 1)

    saver = tf.train.Saver(tf.all_variables())
    init = tf.initialize_all_variables()
    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)
    tf.train.start_queue_runners(sess=sess)

    true_count = 0
    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value, precisions = sess.run([train_op, loss, top_k_op])

      true_count += np.sum(precisions)

      if step % 10 == 0:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
        duration = time.time() - start_time
        print(' step %d, loss = %.3f, acc = %.3f, dur = %.2f' %
              (step, loss_value, true_count / float(FLAGS.batch_size * 10), duration))
        true_count = 0
def tower_loss(scope, images, labels):
  """Calculate the total loss on a single tower running the CIFAR model.

  Args:
    scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'
    images: Images. 4D tensor of shape [batch_size, height, width, 3].
    labels: Labels. 1D tensor of shape [batch_size].

  Returns:
     Tensor of shape [] containing the total loss for a batch of data
  """

  # Build inference Graph.
  logits = cifar10.inference(images)

  # Build the portion of the Graph calculating the losses. Note that we will
  # assemble the total_loss using a custom function below.
  _ = cifar10.loss(logits, labels)

  # Assemble all of the losses for the current tower only.
  losses = tf.get_collection('losses', scope)

  # Calculate the total loss for the current tower.
  total_loss = tf.add_n(losses, name='total_loss')

  # Attach a scalar summary to all individual losses and the total loss; do the
  # same for the averaged version of the losses.
  for l in losses + [total_loss]:
    # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
    # session. This helps the clarity of presentation on tensorboard.
    loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name)
    tf.summary.scalar(loss_name, l)

  return total_loss
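Unlike Example #1, this variant expects the caller to pass per-tower batches in. A rough sketch of that call pattern follows, assuming the input batch is split evenly across FLAGS.num_gpus; the flag and the splitting strategy are assumptions, not from the original example.

# Hedged sketch: feeding per-tower slices to tower_loss(scope, images, labels).
images, labels = cifar10.distorted_inputs()
# Split the input batch along the batch axis, one slice per GPU.
image_splits = tf.split(images, FLAGS.num_gpus, axis=0)
label_splits = tf.split(labels, FLAGS.num_gpus, axis=0)
for i in xrange(FLAGS.num_gpus):
  with tf.device('/gpu:%d' % i):
    with tf.name_scope('tower_%d' % i) as scope:
      loss = tower_loss(scope, image_splits[i], label_splits[i])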
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    testImg, testlabels = cifar10.inputs(eval_data=True)
    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)
    test_pre = cifar10.inference(testImg, test=True)
     
    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    # Create a saver.
    saver = tf.train.Saver(tf.all_variables())

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()

    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()

    # Start running operations on the Graph.
    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time

      if step % 10 == 0:
        print('loss ' + str(loss_value))

      if step % 100 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)

      # Save the model checkpoint periodically.
      if step % 10 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)

      # eval
      if step % 10 == 0:
        # Note: this only builds a new accuracy op on each pass; it is never
        # run, so no accuracy value is actually reported here.
        cifar10.accuracy(test_pre, testlabels)
Example #5
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()

        # Start running operations on the Graph.
        sess = tf.Session(config=tf.ConfigProto(log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), "Model diverged with loss = NaN"

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = "%s: step %d, loss = %.2f (%.1f examples/sec; %.3f " "sec/batch)"
                print(format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, "model.ckpt")
                saver.save(sess, checkpoint_path, global_step=step)
Example #6
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.train.get_or_create_global_step()

    # Get images and labels for CIFAR-10.
    # Force input pipeline to CPU:0 to avoid operations sometimes ending up on
    # GPU and resulting in a slow down.
    with tf.device('/cpu:0'):
      images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    class _LoggerHook(tf.train.SessionRunHook):
      """Logs loss and runtime."""

      def begin(self):
        self._step = -1
        self._start_time = time.time()

      def before_run(self, run_context):
        self._step += 1
        return tf.train.SessionRunArgs(loss)  # Asks for loss value.

      def after_run(self, run_context, run_values):
        if self._step % FLAGS.log_frequency == 0:
          current_time = time.time()
          duration = current_time - self._start_time
          self._start_time = current_time

          loss_value = run_values.results
          examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
          sec_per_batch = float(duration / FLAGS.log_frequency)

          format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
          print (format_str % (datetime.now(), self._step, loss_value,
                               examples_per_sec, sec_per_batch))

    with tf.train.MonitoredTrainingSession(
        checkpoint_dir=FLAGS.train_dir,
        hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
               tf.train.NanTensorHook(loss),
               _LoggerHook()],
        config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)) as mon_sess:
      while not mon_sess.should_stop():
        mon_sess.run(train_op)
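For context, these train() functions are normally invoked from a small entry point that fetches the dataset and resets the training directory, as in the TensorFlow CIFAR-10 tutorial. The sketch below follows that pattern; the exact flags and helper names may differ from the surrounding examples.

# Hedged sketch of the usual entry point, modeled on the CIFAR-10 tutorial.
def main(argv=None):  # pylint: disable=unused-argument
  cifar10.maybe_download_and_extract()
  if tf.gfile.Exists(FLAGS.train_dir):
    tf.gfile.DeleteRecursively(FLAGS.train_dir)
  tf.gfile.MakeDirs(FLAGS.train_dir)
  train()


if __name__ == '__main__':
  tf.app.run()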
def tower_loss(scope):
    """Calculate the total loss on a single tower running the CIFAR model.
    Args:
        scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'
    Returns:
         Tensor of shape [] containing the total loss for a batch of data
    """
    # Build a Graph that computes the logits predictions from the
    # inference model.
    if tfFLAGS.network == 1:
        images, labels = cifar10.distorted_inputs()
        logits, fc1_w, fc1_b, fc2_w, fc2_b = MyModel.inference(images)
    else:
        images, labels = cifar10.distorted_inputs()
        logits, fc1_w, fc1_b, fc2_w, fc2_b = MyModel2.inference(images)

    # Build the portion of the Graph calculating the losses. Note that we will
    # assemble the total_loss using a custom function below; this call also adds
    # the cross-entropy loss to the 'losses' collection for this tower.
    _ = cifar10.loss(logits, labels)

    # L2 regularization for the fully connected parameters.
    regularizers = (tf.nn.l2_loss(fc1_w) + tf.nn.l2_loss(fc1_b) +
                    tf.nn.l2_loss(fc2_w) + tf.nn.l2_loss(fc2_b))

    # Add the regularization term to the 'losses' collection so it is included
    # in the total loss assembled below.
    tf.add_to_collection('losses', 5e-4 * regularizers)

    # Assemble all of the losses for the current tower only.
    losses = tf.get_collection('losses', scope)

    # Calculate the total loss for the current tower.
    total_loss = tf.add_n(losses, name='total_loss')

    # Attach a scalar summary to all individual losses and the total loss; do the
    # same for the averaged version of the losses.
    for l in losses + [total_loss]:
        # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
        # session. This helps the clarity of presentation on tensorboard.
        loss_name = re.sub('%s_[0-9]*/' % tfFLAGS.TOWER_NAME, '', l.op.name)
        tf.summary.scalar(loss_name, l)

    return total_loss
def tower_loss(scope, images, labels):
    """Calculate the total loss on a single tower running the CIFAR model.
  Args:
    scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'
    images: Images. 4D tensor of shape [batch_size, height, width, 3].
    labels: Labels. 1D tensor of shape [batch_size].
  Returns:
     Tensor of shape [] containing the total loss for a batch of data
  """

    # Build inference Graph.
    print('>>>>> input original = ', images)
    # in case of images less than or greater than 227x227
    #  images = tf.image.resize_images(images, [227,227] )
    #  print('>>>>> input resized = ',images)
    logits = cifar10.inference(images, 0.5)

    # Build the portion of the Graph calculating the losses. Note that we will
    # assemble the total_loss using a custom function below.
    _ = cifar10.loss(logits, labels)

    # Assemble all of the losses for the current tower only.
    losses = tf.get_collection('losses', scope)

    # to calculate accuracy for batch
    top_k_op = tf.nn.in_top_k(logits, labels, 1)
    top_k_op = tf.cast(top_k_op, tf.int32)
    acc_batch = tf.reduce_sum(top_k_op)
    tf.summary.scalar(name="acc_batch", tensor=acc_batch / FLAGS.batch_size)

    # 1-off accuracy for batch
    print('labels = ', labels)
    labels = tf.cast(labels, tf.int64)
    #tf.summary.text(name="labels",tensor=tf.as_string(labels))
    #tf.summary.text(name="logits",tensor=tf.as_string(logits))
    argmaxlogits = tf.argmax(logits, axis=-1)
    #tf.summary.text(name="argmaxlogits ",tensor=tf.as_string(argmaxlogits ))
    print('>> argmaxlogits = ', argmaxlogits)
    absdiff = (tf.abs(labels - argmaxlogits) <= 1)
    #tf.summary.text(name="absdiff", tensor=tf.as_string(absdiff))
    acc_1off = tf.reduce_sum(tf.cast(absdiff, tf.int64))
    tf.summary.scalar(name="acc_1off", tensor=acc_1off / FLAGS.batch_size)

    # Calculate the total loss for the current tower.
    total_loss = tf.add_n(losses, name='total_loss')

    # Attach a scalar summary to all individual losses and the total loss; do the
    # same for the averaged version of the losses.
    for l in losses + [total_loss]:
        # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
        # session. This helps the clarity of presentation on tensorboard.
        loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name)
        tf.summary.scalar(loss_name, l)

    return total_loss
Example #9
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()
        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""
            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(loss)  # Asks for loss value.

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time

                    loss_value = run_values.results
                    examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
                    sec_per_batch = float(duration / FLAGS.log_frequency)

                    format_str = (
                        '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[
                    tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                    tf.train.NanTensorHook(loss),
                    _LoggerHook()
                ],
                config=tf.ConfigProto(log_device_placement=FLAGS.
                                      log_device_placement)) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
Example #10
def train():
    with tf.Graph().as_default():
        # get global step
        global_step = tf.train.get_or_create_global_step()
        # get data through cpu
        with tf.device('/cpu:0'):
            images, labels = cifar10.distorted_inputs()
        # get loss and logit
        # logits = cifar10.inference(images=images, r=low_ranks)
        logits = cifar10.inference(images=images,r=low_ranks)
        loss = cifar10.loss(logits=logits, labels=labels)
        # set train_op
        train_op = cifar10.train(loss, global_step)
        for v in tf.trainable_variables():
            print(v)
        nonzero = tf.count_nonzero(tf.get_collection('sparse_components')[-1])
        # define a LoggerHook to log something
        # clean_list = tf.get_collection('sparse_components')
        # clean_list = clean_s(clean_list)
        # clean_op = [c.op for c in clean_list]

        class _LoggerHook(tf.train.SessionRunHook):
            """
            log session and runtime info
            """
            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(loss)

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time
                    loss_value = run_values.results
                    example_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
                    sec_per_batch = float(duration / FLAGS.log_frequency)

                    format_str = ('%s: step %d, loss = %.6f (%.1f examples/sec;'
                                  '%.3f sec/batch)')
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        example_per_sec, sec_per_batch))

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                       tf.train.NanTensorHook(loss),
                       _LoggerHook()],
                config=tf.ConfigProto(log_device_placement=FLAGS.log_device_placement)) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.contrib.framework.get_or_create_global_step()

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    class _LoggerHook(tf.train.SessionRunHook):
      """Logs loss and runtime."""

      def begin(self):
        self._step = -1

      def before_run(self, run_context):
        self._step += 1
        self._start_time = time.time()
        return tf.train.SessionRunArgs(loss)  # Asks for loss value.

      def after_run(self, run_context, run_values):
        duration = time.time() - self._start_time
        loss_value = run_values.results
        if self._step % 10 == 0:
          num_examples_per_step = FLAGS.batch_size
          examples_per_sec = num_examples_per_step / duration
          sec_per_batch = float(duration)

          format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
          print (format_str % (datetime.now(), self._step, loss_value,
                               examples_per_sec, sec_per_batch))
          global step_no 
          step_no = self._step

    with tf.train.MonitoredTrainingSession(
        checkpoint_dir=FLAGS.train_dir,
        hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
               tf.train.NanTensorHook(loss),
               _LoggerHook()],
        config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement, inter_op_parallelism_threads=4,intra_op_parallelism_threads=0)) as mon_sess:
      while not mon_sess.should_stop():
        mon_sess.run(train_op)
        """run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
def train():
  """Train CIFAR-10 for a number of steps."""

  g1 = tf.Graph()
  with g1.as_default():
    global_step = tf.contrib.framework.get_or_create_global_step()

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)
    grads  = cifar10.train_part1(loss, global_step)

    only_gradients = [g for g,_ in grads]
    only_vars = [v for _,v in grads]
    placeholder_gradients = []

    #with tf.device("/gpu:0"):
    for grad_var in grads:
        placeholder_gradients.append(
            (tf.placeholder('float', shape=grad_var[0].get_shape()), grad_var[1]))

    feed_dict = {}

    for i, grad_var in enumerate(grads):
        feed_dict[placeholder_gradients[i][0]] = np.zeros(placeholder_gradients[i][0].shape)
  
    train_op = cifar10.train_part2(global_step,placeholder_gradients)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    feeds = []
    print("Reached here")
    for i,grad_var in enumerate(grads): 
        feeds.append(placeholder_gradients[i][0])
    # Partial Run
    print("Reached here", len(feeds))
    for x in feeds:
        print(x,)
    h = sess.partial_run_setup([only_gradients, train_op], feeds)
    print("Reached here")


    for i in xrange(10):
        res_grads = sess.partial_run(h, only_gradients, feed_dict = feed_dict)

        feed_dict = {}
        for i,grad_var in enumerate(res_grads): 
           feed_dict[placeholder_gradients[i][0]] = res_grads[i]

        res_train_op = sess.partial_run(h, train_op, feed_dict=feed_dict)
Example #13
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.train.get_or_create_global_step()

        # Get images and labels for CIFAR-10.
        # Force input pipeline to CPU:0 to avoid operations sometimes ending up on
        # GPU and resulting in a slow down.
        with tf.device('/cpu:0'):
            images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""
            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(loss)  # Asks for loss value.

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency == 0:
                    current_time = time.time()
                    self._start_time = current_time

        for step in range(0, FLAGS.max_steps + 1, FLAGS.log_frequency):
            print(str(step))
            with tf.train.MonitoredTrainingSession(
                    checkpoint_dir=FLAGS.train_dir,
                    hooks=[
                        tf.train.StopAtStepHook(last_step=step),
                        tf.train.NanTensorHook(loss),
                        _LoggerHook()
                    ],
                    config=tf.ConfigProto(log_device_placement=FLAGS.
                                          log_device_placement)) as mon_sess:
                while not mon_sess.should_stop():
                    mon_sess.run(train_op)
            # evaluate test data
            cifar10_eval.evaluate()
            # evaluate train data
            evaluate()
Example #14
def tower_loss(scope, images, labels):
    """Calculate the total loss on a single tower running the CIFAR model.

    Args:
      scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'
      images: Images. 4D tensor of shape [batch_size, height, width, 3].
      labels: Labels. 1D tensor of shape [batch_size].

    Returns:
      Tensor of shape [] containing the total loss for a batch of data
    """
    # Build inference Graph.
    logits = cifar10.inference(images)

    # Build the portion of the Graph calculating the losses. Note that we will
    # assemble the total_loss using a custom function below.
    _ = cifar10.loss(logits, labels)

    # Assemble all of the losses for the current tower only.
    losses = tf.get_collection('losses', scope)

    # Calculate the total loss for the current tower.
    total_loss = tf.add_n(losses, name='total_loss')

    # Attach a scalar summary to all individual losses and the total loss; do the
    # same for the averaged version of the losses.
    for l in losses + [total_loss]:
        # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
        # session. This helps the clarity of presentation on tensorboard.
        loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name)
        tf.summary.scalar(loss_name, l)

    return total_loss
Example #15
def train():
    """
    Train CIFAR-10 for a number of steps
    :return:
    """
    with tf.Graph().as_default():
        global_step = tf.train.get_or_create_global_step()

        # Get images and labels for CIFAR-10.
        # Keep the input pipeline on the CPU so it can stream data without
        # stalling the GPU between batches.
        with tf.device('/cpu:0'):
            images, labels = cifar10.distorted_inputs()

        # Predictions (logits) and loss.
        logits = cifar10.inference(images)
        loss = cifar10.loss(logits, labels)

        train_op = cifar10.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """ log loss and runtime. """

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(loss)

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time

                    loss_value = run_values.results
                    examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
                    sec_per_batch = float(duration / FLAGS.log_frequency)

                    format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                                  'sec/batch)')

                    print(format_str % (datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                       tf.train.NanTensorHook(loss),
                       _LoggerHook()],
                config=tf.ConfigProto(log_device_placement=FLAGS.log_device_placement)) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
Example #16
def tower_loss(scope):
    """
    :param scope: 我们需要为每个GPU生成单独的结构完全一致的网络,由scope标识
    :return:
    """
    images, labels = cifar10.distored_inputs()
    logits = cifar10.inference(images)
    _ = cifar10.loss(logits, labels)
    losses = tf.get_collection('losses', scope)
    total_loss = tf.add_n(losses, name='total_loss')
    return total_loss
Example #17
def train():
    """ CIFAR10训练函数 """
    with tf.Graph().as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()

        # Read images and labels from the dataset.
        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """ 记录误差和运行时间 """
            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(loss)  # Asks for the loss value.

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time

                    loss_value = run_values.results
                    examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
                    sec_per_batch = float(duration / FLAGS.log_frequency)

                    format_str = (
                        '%s: step %d, loss = %.5f (%.1f examples/sec; %.3f sec/batch)'
                    )
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[
                    tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                    tf.train.NanTensorHook(loss),
                    _LoggerHook()
                ],
                config=tf.ConfigProto(log_device_placement=FLAGS.
                                      log_device_placement)) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
def run_training():
    with tf.Graph().as_default(), tf.device('/gpu:0'):
        global_step = tf.Variable(0, trainable=False)

        with tf.device('/cpu:0'):
            images, labels = cifar10.distorted_inputs()

        images_placeholder, labels_placeholder = placeholder_inputs(
            FLAGS.batch_size, cifar10.IMAGE_SIZE)

        logits = cifar10.inference(images_placeholder)
        losses_dict = cifar10.loss(logits, labels_placeholder)

        moving_averages_op = cifar10.add_summaries_and_moving_avgs(
            losses_dict, global_step)

        lbfgs_optimizer = customized_optimizer.CustomizedOptimizerInterface(
            global_step=global_step,
            loss_dict=losses_dict,
            data_fetches=[images, labels],
            data_placeholders=(images_placeholder, labels_placeholder),
            maxiter=FLAGS.max_steps)

        saver = tf.train.Saver(tf.global_variables(), max_to_keep=25)

        summary_op = tf.summary.merge_all()

        init = tf.global_variables_initializer()

        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True,
                log_device_placement=FLAGS.log_device_placement,
                gpu_options=tf.GPUOptions(
                    per_process_gpu_memory_fraction=0.4))) as sess:
            sess.run(init)

            coordinator = tf.train.Coordinator()
            try:
                threads = tf.train.start_queue_runners(sess=sess,
                                                       coord=coordinator)

                lbfgs_optimizer.minimize(session=sess,
                                         moving_averages_op=moving_averages_op,
                                         summary_op=summary_op,
                                         saver=saver,
                                         step_callback=step_callback)

            except Exception as e:
                coordinator.request_stop(e)

            coordinator.request_stop()
            coordinator.join(threads, stop_grace_period_secs=10)
Example #19
def train():
    # TODO: clarify what this line does.
    with tf.Graph().as_default():
        # TODO: what does this mean?
        global_step = tf.train.get_or_create_global_step()
        with tf.device('/cpu:0'):
            images, labels = cifar10.distorted_inputs()
        # Inference model.
        logits = cifar10.inference(images)
        loss = cifar10.loss(logits, labels)
        train_op = cifar10.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """
            logs loss and runtime
            """
            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(loss)

            def after_run(
                    self,
                    run_context,  # pylint: disable=unused-argument
                    run_values):
                if self._step % FLAGS.log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time

                    loss_value = run_values.results
                    example_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
                    sec_per_batch = float(duration / FLAGS.log_frequency)
                    format_str = (
                        '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)'
                    )
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        example_per_sec, sec_per_batch))

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[
                    tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                    tf.train.NanTensorHook(loss),
                    _LoggerHook()
                ],
                config=tf.ConfigProto(log_device_placement=FLAGS.
                                      log_device_placement)) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
Example #20
def train():
    """Train CIFAR-10 for a number of steps."""
    sess = tf.InteractiveSession()
    #sess = tf_debug.LocalCLIDebugWrapperSession(sess)

    # Get images and labels for CIFAR-10.
    with tf.device('/cpu:0'):
        images, labels = cifar10.distorted_inputs()

    # Define all the fixed point variables we will be using later
    cifar10.initialize_fix_point_variables()

    # Build a Graph that computes the logits predictions from the inference model
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, 0.05)

    # Update fixed point conversion parameters when needed
    update_fix_pt_ops = cifar10.update_fix_point_accuracy()

    # Merge all the summaries and write them out to
    # FLAGS.log_dir
    merged_summary = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(FLAGS.log_dir + '/train', sess.graph)
    test_writer = tf.summary.FileWriter(FLAGS.log_dir + '/test')

    # init all variables
    tf.global_variables_initializer().run()
    # create a saver for checkpoints
    saver = tf.train.Saver(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))
    # needed on interactive session so it doesn't hang
    tf.train.start_queue_runners()

    for i in range(FLAGS.max_steps):
        summary, _ = sess.run([merged_summary, train_op])
        train_writer.add_summary(summary, i)  # summary

        if (i % 10 == 0):
            saver.save(sess, FLAGS.log_dir + '/checkpoint', global_step=i)

        if (i % 5 == 0):
            sess.run([update_fix_pt_ops])
            print('Step: %s, Loss: %s' % (i, loss.eval()))

    train_writer.close()
Example #21
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)
        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()
        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)
        # Calculate loss.
        loss = cifar10.loss(logits, labels)
        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)
        # Create a saver.
        saver = tf.train.Saver(tf.global_variables())
        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.summary.merge_all()
        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()
        # Start running operations on the Graph.
        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)
        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)
        summary_writer = tf.summary.FileWriter(FLAGS.train_dir,
                                               graph_def=sess.graph_def)
        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time
            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)
                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))
            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)
            # Save the model checkpoint periodically.
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
Example #22
def tower_loss(scope):
    """Calculate the total loss on a single tower running the CIFAR model.

  Args:
    scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'

  Returns:
     Tensor of shape [] containing the total loss for a batch of data
  """
    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build inference Graph.
    logits = cifar10.inference(images)

    # Build the portion of the Graph calculating the losses. Note that we will
    # assemble the total_loss using a custom function below.
    _ = cifar10.loss(logits, labels)

    # Assemble all of the losses for the current tower only.
    losses = tf.get_collection('losses', scope)

    # Calculate the total loss for the current tower.
    total_loss = tf.add_n(losses, name='total_loss')

    # Compute the moving average of all individual losses and the total loss.
    # Apply once over all of them; applying repeatedly to the same tensors
    # would fail because their moving averages already exist.
    loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
    loss_averages_op = loss_averages.apply(losses + [total_loss])

    # Attach a scalar summary to all individual losses and the total loss; do the
    # same for the averaged version of the losses.
    for l in losses + [total_loss]:
        # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
        # session. This helps the clarity of presentation on tensorboard.
        loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name)
        # Name each loss as '(raw)' and name the moving average version of the loss
        # as the original loss name.
        tf.summary.scalar(loss_name + ' (raw)', l)
        tf.summary.scalar(loss_name, loss_averages.average(l))

    with tf.control_dependencies([loss_averages_op]):
        total_loss = tf.identity(total_loss)
    return total_loss
def train():
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)
        images, labels = cifar10.distorted_inputs()

        logits = cifar10.inference(images)
        loss = cifar10.loss(logits, labels)
        train_op = cifar10.train(loss, global_step)

        saver = tf.train.Saver(tf.all_variables())

        summary_op = tf.merge_all_summaries()

        init = tf.initialize_all_variables()

        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))

        sess.run(init)

        tf.train.start_queue_runners(sess=sess)
        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_model, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
Example #24
def evaluate():
  """Eval CIFAR-10 for a number of steps."""
  with tf.Graph().as_default() as g:
    eval_data = FLAGS.eval_data == 'test'
    images, labels = cifar10.inputs(eval_data=eval_data)
    logits = cifar10.inference(images)
    top_k_op = tf.nn.in_top_k(logits, labels, 1)
    loss = cifar10.loss(logits, labels)
    saver = tf.train.Saver(tf.all_variables())
    
    while True:
      eval_once(saver, top_k_op, loss)
      if FLAGS.run_once:
        break
      time.sleep(FLAGS.eval_interval_secs)
Example #25
    def __init__(self):
        global pwd
        os.chdir(pwd)
        train_data = np.load('trainingdata.npz')
        self.images = train_data['images']
        self.labels = train_data['labels']
        self.counter = 0
        self.session = None

        with tf.Graph().as_default():
            global_step = tf.contrib.framework.get_or_create_global_step()

            self.images_ph = gen_ph(self.images[0], name='images_ph')
            self.labels_ph = gen_ph(self.labels[0], name='labels_ph')

            logits = cifar10.inference_divided(self.images_ph)
            self.loss = cifar10.loss(logits, self.labels_ph)

            num_batches_per_epoch = (cifar10.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
                                     FLAGS.batch_size)
            decay_steps = int(num_batches_per_epoch *
                              cifar10.NUM_EPOCHS_PER_DECAY)

            lr = tf.train.exponential_decay(cifar10.INITIAL_LEARNING_RATE,
                                            global_step,
                                            decay_steps,
                                            cifar10.LEARNING_RATE_DECAY_FACTOR,
                                            staircase=True)
            opt = tf.train.GradientDescentOptimizer(lr)
            #            opt = tf.train.MomentumOptimizer(lr, 0.9)
            grads_pair_list = opt.compute_gradients(self.loss)
            #            self.grads = [i[0] for i in grads_pair_list]

            #            self.phs = [gen_ph(i[1]) for i in grads_pair_list]
            #            variables = [i[1] for i in grads_pair_list]

            #            self.train_op = opt.apply_gradients(zip(self.phs, variables), global_step=global_step)
            self.train_op = opt.apply_gradients(grads_pair_list,
                                                global_step=global_step)

            self.mon_sess = tf.Session(config=tf.ConfigProto(
                log_device_placement=FLAGS.log_device_placement))
            tf.global_variables_initializer().run(session=self.mon_sess)
            tf.summary.FileWriter("tb", self.mon_sess.graph)

            self.variable = ray.experimental.TensorFlowVariables(
                self.loss, self.mon_sess)
Example #26
def train():
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        images, labels = cifar10.distorted_inputs()

        logits = cifar10.inference(images)

        loss = cifar10.loss(logits, labels)

        train_op = cifar10.train(loss, global_step)

        saver = tf.train.Saver(tf.all_variables())

        summary_op = tf.merge_all_summaries()

        init = tf.initialize_all_variables()

        sess = tf.Session(config=tf.ConfigProto(log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, graph_def=sess.graph_def)

        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), "Model diverged with loss = NaN"

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = "%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)"
                print(format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, "model.ckpt")
                saver.save(sess, checkpoint_path, global_step=step)
Example #27
def tower_loss(scope):
    # Get the distorted (augmented) images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build the convolutional inference graph; model parameters are shared across GPUs.
    logits = cifar10.inference(images)

    # Compute the loss.
    _ = cifar10.loss(logits, labels)

    # Collect the losses for the current GPU (tower) only.
    losses = tf.get_collection('losses', scope)

    # Sum the individual losses to get the total loss.
    total_loss = tf.add_n(losses, name='total_loss')

    return total_loss
Example #28
def train():
    with tf.Graph().as_default():
        images, labels = cifar10.distorted_inputs()

        logits = cifar10.inference(images)

        loss = cifar10.loss(logits, labels)

        init = tf.global_variables_initializer()

        sess = tf.Session()
        sess.run(init)

        tf.train.start_queue_runners(sess=sess)
        for step in xrange(FLAGS.max_steps):
            los = sess.run([loss])
            #print(type(los))
            print(los[0])
Example #29
def train():
    with tf.Graph().as_default():
        images, labels = cifar10.distorted_inputs()

        logits = cifar10.inference(images)

        loss = cifar10.loss(logits, labels)

        global_step = tf.Variable(0, trainable=False)
        train_op = cifar10.train(loss, global_step=global_step)

        summary_op = tf.merge_all_summaries()

        init = tf.global_variables_initializer()
        memlim = tf.ConfigProto()
        memlim.gpu_options.allow_growth = True

        sess = tf.Session(config=memlim)
        sess.run(init)

        tf.train.start_queue_runners(sess=sess)
        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value)

            if step % 5 == 0:
                examples_per_sec = FLAGS.batch_size / duration
                sec_per_batch = float(duration)

                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)'
                )
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)
Example #30
def tower_loss(scope, images, labels):
    '''
    Calculate the total loss for a single tower running the CIFAR model.
    @param scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'
    @param images: input images for this tower
    @param labels: ground-truth labels for this tower
    @return: the total loss for one batch of data
    '''
    logits = cifar10.inference(images)
    _ = cifar10.loss(logits, labels)
    # Collect all elements of the 'losses' collection for the current tower into a list.
    losses = tf.get_collection('losses', scope)
    # tf.add_n([p1, p2, p3, ...]) sums the elements of a list of tensors.
    total_loss = tf.add_n(losses, name='total_loss')
    for l in losses + [total_loss]:
        loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name)
        tf.summary.scalar(loss_name, l)

    return total_loss
Example #31
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.train.get_or_create_global_step()

        # Get images and labels for CIFAR-10.
        # Force input pipeline to CPU:0 to avoid operations sometimes ending up on
        # GPU and resulting in a slow down.

        with tf.device('/cpu:0'):
            images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        sess = tf.InteractiveSession()
        saver = tf.train.Saver()
        tf.global_variables_initializer().run()
        # start thread queue to speed up for data augmentation
        tf.train.start_queue_runners()

        start = time.time()
        for step in range(FLAGS.max_steps):
            _loss, _ = sess.run([loss, train_op])
            if step % FLAGS.log_frequency == 0:
                duration = time.time() - start
                sec_per_batch = float(duration) / FLAGS.log_frequency
                print('{}: step:{:6d}  loss:{:.2f}  {:.2f} sec/batch'.format(
                    datetime.now(), step, _loss, sec_per_batch))
                start = time.time()
            if step % 100 == 0:
                ckpt = os.path.join(
                    FLAGS.train_dir,
                    'model_step_{}_loss_{:.1f}'.format(step, _loss))
                saver.save(sess, ckpt)
Example #32
def evaluate():
    """Eval CIFAR-10 for a number of steps."""
    with tf.Graph().as_default() as g:
        # Get images and labels for CIFAR-10.
        eval_data = FLAGS.eval_data == "test"
        print(eval_data)
        images, labels, ground_truth = cifar10.inputs(eval_data=eval_data)
        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits, _ = cifar10.inference(images)
        print(logits)
        print(logits.get_shape())
        print("after inference node creation")
        loss = cifar10.loss(logits, labels)
        accuracy, precision, accuracies = cifar10.accuracy(logits, ground_truth)
        labels = tf.cast(labels, tf.int64)

        label_shape = labels.get_shape().as_list()
        reshaped_labels = tf.reshape(labels, [label_shape[0] * label_shape[1] * label_shape[2]])
        logits_shape = logits.get_shape().as_list()
        reshaped_logits = tf.reshape(logits, [logits_shape[0] * logits_shape[1] * logits_shape[2], logits_shape[3]])

        # Calculate predictions.
        # top_k_op = tf.nn.in_top_k(logits, labels, 1)
        # top_k_op = tf.nn.in_top_k(reshaped_logits, reshaped_labels, 1)

        # Restore the moving average version of the learned variables for eval.
        variable_averages = tf.train.ExponentialMovingAverage(cifar10.MOVING_AVERAGE_DECAY)
        variables_to_restore = variable_averages.variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        summary_writer = tf.train.SummaryWriter(FLAGS.eval_dir, g)

        while True:
            print("evaluate:")
            eval_once(saver, summary_writer, summary_op, accuracy, precision, accuracies)
            if FLAGS.run_once:
                break
            time.sleep(FLAGS.eval_interval_secs)
Example #33
def tower_loss(scope, model):
    """Calculate the total loss on a single tower running the CIFAR model.
  Args:
    scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'
  Returns:
     Tensor of shape [] containing the total loss for a batch of data
  """
    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build inference Graph.
    logits = cifar10.inference(images)

    # Build the portion of the Graph calculating the losses. Note that we will
    # assemble the total_loss using a custom function below.
    _ = cifar10.loss(logits, labels)

    total_loss = model.loss

    return total_loss
def tower_loss(scope):
  """Calculate the total loss on a single tower.

  Args:
    scope: unique prefix string identifying a CIFAR tower, ex: 'tower_0'

  Returns:
    Tensor of shape [] containing the total loss for a batch of data
  """
  # Get images and labels for CIFAR-10
  images, labels = cifar10.distorted_inputs()

  # Build inference graph
  logits = cifar10.inference(images)

  # Build the portion of the graph calculating the losses
  _ = cifar10.loss(logits, labels)

  # Assemble all of the losses for the current tower only
  losses = tf.get_collection('losses', scope)

  # Calculate the total loss for the current tower
  total_loss = tf.add_n(losses, name='total_loss')

  # Compute the moving average of all individual losses and the total loss
  loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
  loss_averages_op = loss_averages.apply(losses + [total_loss])

  # Attach a scalar summary to all individual losses and the total loss
  # Do the same for the average version of the losses
  for l in losses + [total_loss]:
    # Remove 'tower_[0-9]/' from the name in case this is multi-GPU training
    # This helps the clarity of the presentation in tensorboard
    loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name)
    # Name each loss as '(raw)' and name the moving average version of the loss
    # as the original loss name
    tf.scalar_summary(loss_name + ' (raw)', l)
    tf.scalar_summary(loss_name, loss_averages.average(l))

  with tf.control_dependencies([loss_averages_op]):
    total_loss = tf.identity(total_loss)
  return total_loss
Example #35
0
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.train.get_or_create_global_step()

        # Get images and labels for CIFAR-10.
        # Force input pipeline to CPU:0 to avoid operations sometimes ending up on
        # GPU and resulting in a slow down.
        with tf.device('/cpu:0'):
            # images:128x24x24x3 float32 labels:128 int32
            images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        # logits:128x10
        logits = cifar10.inference(images)

        # Calculate loss.

        loss = cifar10.loss(logits, labels)
        """
Example #36
0
    def setup_model(self):

        # setup tensorflow model structure

        self.x_pl = tf.placeholder(tf.float32,
                                   shape=(None, 24, 24, 3),
                                   name='input_x')
        self.y_pl = tf.placeholder(tf.int32, shape=(None, ), name='output_y')
        self.lr_pl = tf.placeholder(tf.float32, shape=(), name='learning_rate')

        self.y_logits = cifar10.inference(self.x_pl)  # construct model
        self.loss = cifar10.loss(self.y_logits, self.y_pl)

        self.y_pred = tf.cast(tf.argmax(self.y_logits, 1), tf.int32)
        self.correct_prediction = tf.equal(self.y_pred,
                                           self.y_pl)  # used for accuracy
        self.num_correct = tf.reduce_sum(
            tf.cast(self.correct_prediction, tf.int64))
        self.accuracy = tf.reduce_mean(
            tf.cast(self.correct_prediction, tf.float32))

        self.optimizer = tf.train.GradientDescentOptimizer(self.lr_pl)
        self.train_op = self.optimizer.minimize(self.loss)
        self.opt_reset_op = tf.variables_initializer(
            self.optimizer.variables())

        # import ipdb; ipdb.set_trace() # check self.optimizer.variables()

        self.metrics = {  # used by self.eval()
            'loss': self.loss,
            'correct': self.num_correct,
            # and add more...
        }

        # transform ops
        self.x_tr_pl = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
        # with tf.device('/cpu:0'):
        self.train_transform_op = train_transform(self.x_tr_pl)
        self.test_transform_op = test_transform(self.x_tr_pl)
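train_transform and test_transform are referenced above but not defined in this snippet. A plausible sketch, assuming they mirror the usual CIFAR-10 preprocessing (random 24x24 crop, flip and color distortion for training; central 24x24 crop for eval; per-image standardization in both cases), could look like this; the 24x24 target size is taken from the x_pl placeholder above.

def train_transform(images):
    """Hypothetical sketch: per-image training distortions for a [N, 32, 32, 3] batch."""
    def _distort(img):
        img = tf.random_crop(img, [24, 24, 3])
        img = tf.image.random_flip_left_right(img)
        img = tf.image.random_brightness(img, max_delta=63)
        img = tf.image.random_contrast(img, lower=0.2, upper=1.8)
        return tf.image.per_image_standardization(img)
    return tf.map_fn(_distort, images)

def test_transform(images):
    """Hypothetical sketch: deterministic eval preprocessing for a [N, 32, 32, 3] batch."""
    def _prep(img):
        img = tf.image.resize_image_with_crop_or_pad(img, 24, 24)
        return tf.image.per_image_standardization(img)
    return tf.map_fn(_prep, images)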
Example #37
0
def tower_loss(scope, images, labels):
    # Build inference Graph.
    logits = cifar10.inference(images, train=True)

    # Build the portion of the Graph calculating the losses. Note that we will
    # assemble the total_loss using a custom function below.
    _ = cifar10.loss(logits, labels)

    # Assemble all of the losses for the current tower only.
    losses = tf.get_collection('losses', scope)

    # Calculate the total loss for the current tower.
    total_loss = tf.add_n(losses, name='total_loss')

    # Attach a scalar summary to all individual losses and the total loss; do the
    # same for the averaged version of the losses.
    for l in losses + [total_loss]:
        # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
        # session. This helps the clarity of presentation on tensorboard.
        loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name)
        tf.summary.scalar(loss_name, l)

    return total_loss
def tower_loss(scope):
    """Calculate the total loss on a single tower running the CIFAR model.

    Args:
      scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'

    Returns:
       Tensor of shape [] containing the total loss for a batch of data
    """
    # ref: https://github.com/tensorflow/models/issues/1264
    with tf.device('/cpu:0'):  # FIXME: TRICK
        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

    # Build inference Graph.
    logits = cifar10.inference(images)

    # Build the portion of the Graph calculating the losses. Note that we will
    # assemble the total_loss using a custom function below.
    _ = cifar10.loss(logits, labels)

    # Assemble all of the losses for the current tower only.
    losses = tf.get_collection('losses', scope)

    # Calculate the total loss for the current tower.
    total_loss = tf.add_n(losses, name='total_loss')

    # Attach a scalar summary to all individual losses and the total loss; do the
    # same for the averaged version of the losses.
    for l in losses + [total_loss]:
        # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
        # session. This helps the clarity of presentation on tensorboard.
        loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name)
        tf.summary.scalar(loss_name, l)

    return total_loss
Example #39
0
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        with tf.variable_scope("model") as scope:
            global_step = tf.Variable(0, trainable=False)

            # Get images and labels for CIFAR-10.
            # Force input pipeline to CPU:0 to avoid operations sometimes ending up on
            # GPU and resulting in a slow down.
            with tf.device('/cpu:0'):
                images_train, labels_train = cifar10.distorted_inputs()
                images_test, labels_test = cifar10.inputs(eval_data=True)

            # Build a Graph that computes the logits predictions from the
            # inference model.
            logits_train = cifar10.inference(images_train)
            # Calculate loss.
            loss = cifar10.loss(logits_train, labels_train)
            # Build a Graph that trains the model with one batch of examples and
            # updates the model parameters.
            train_op = cifar10.train(loss, global_step)
        with tf.variable_scope("model", reuse=True):
            logits_test = cifar10.inference(images_test)

            # For evaluation
            top_k = tf.nn.in_top_k(logits_train, labels_train, 1)
            top_k_test = tf.nn.in_top_k(logits_test, labels_test, 1)

            summary_train_prec = tf.placeholder(
                tf.float32)  # summary writer for training data
            summary_test_prec = tf.placeholder(
                tf.float32)  # summary writer for testing data
            tf.summary.scalar('accuracy/train',
                              summary_train_prec)  # train accuracy
            tf.summary.scalar('accuracy/test',
                              summary_test_prec)  # test accuracy

            model_saver = tf.train.Saver(
                tf.all_variables())  # save the model by creating checkpoint
            summary_op = tf.summary.merge_all()  # merge all the summaries
            init = tf.initialize_all_variables()  # init the variables

            # Start running operations on the Graph.
            sess = tf.Session(config=tf.ConfigProto(
                log_device_placement=FLAGS.log_device_placement))
            sess.run(init)

            # Start the queue runners.
            tf.train.start_queue_runners(sess=sess)
            train_summary_writer = tf.summary.FileWriter(
                FLAGS.train_dir, sess.graph)

            for step in range(FLAGS.max_steps):  # iterate through no of steps
                start_time = time.time()
                _, loss_value = sess.run([train_op, loss])
                duration = time.time() - start_time
                assert not np.isnan(
                    loss_value), 'Model diverged with loss = NaN'

                # output the step loss after 10 batches
                if step % 10 == 0:
                    num_examples_per_step = FLAGS.batch_size
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = float(duration)

                    format_str = (
                        '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
                    print(format_str % (datetime.now(), step, loss_value,
                                        examples_per_sec, sec_per_batch))

                    prec_train = evaluate_set(sess, top_k,
                                              1024)  # get train accuracy
                    prec_test = evaluate_set(sess, top_k_test,
                                             1024)  # get test accuracy
                    print('%s: accuracy train = %.5f' %
                          (datetime.now(), prec_train))
                    print('%s: accuracy test  = %.5f' %
                          (datetime.now(), prec_test))
                    print(
                        "---------------------------------------------------------------------------------"
                    )

                # log the summary after every 100 steps
                if step % 100 == 0:
                    summary = sess.run(summary_op,
                                       feed_dict={
                                           summary_train_prec: prec_train,
                                           summary_test_prec: prec_test
                                       })
                    train_summary_writer.add_summary(
                        summary, step
                    )  # create summary for testing and training accuracy

                # save the model after 1000 steps
                if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                    create_checkpoint = os.path.join(FLAGS.train_dir,
                                                     'model.ckpt')
                    model_saver.save(sess, create_checkpoint, global_step=step)
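evaluate_set is called above but not shown. Assuming it repeatedly runs the given in_top_k op and returns precision@1 over roughly num_examples inputs (relying on math, numpy and FLAGS.batch_size as in the stock scripts), a minimal sketch might be:

def evaluate_set(sess, top_k_op, num_examples):
    """Hypothetical sketch: estimate precision@1 over approximately num_examples inputs."""
    num_iter = int(math.ceil(num_examples / float(FLAGS.batch_size)))
    true_count = 0
    for _ in range(num_iter):
        predictions = sess.run(top_k_op)   # boolean vector, one entry per example
        true_count += np.sum(predictions)
    return true_count / float(num_iter * FLAGS.batch_size)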
def train():
    ps_hosts = FLAGS.ps_hosts.split(',')
    worker_hosts = FLAGS.worker_hosts.split(',')
    print ('PS hosts are: %s' % ps_hosts)
    print ('Worker hosts are: %s' % worker_hosts)
    server = tf.train.Server({'ps': ps_hosts, 'worker': worker_hosts},
                             job_name = FLAGS.job_name,
                             task_index=FLAGS.task_id)
    if FLAGS.job_name == 'ps':
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
        server.join()
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    is_chief = (FLAGS.task_id == 0)
    if is_chief:
        if tf.gfile.Exists(FLAGS.train_dir):
            tf.gfile.DeleteRecursively(FLAGS.train_dir)
        tf.gfile.MakeDirs(FLAGS.train_dir)
    device_setter = tf.train.replica_device_setter(ps_tasks=len(ps_hosts))
    with tf.device('/job:worker/task:%d' % FLAGS.task_id):
        with tf.device(device_setter):
            global_step = tf.Variable(0, trainable=False)
            decay_steps = 50000 * 350.0 / FLAGS.batch_size
            batch_size = tf.placeholder(dtype=tf.int32, shape=(), name='batch_size')
            images, labels = cifar10.distorted_inputs(batch_size)
            logits = cifar10.inference(images, batch_size)
            loss = cifar10.loss(logits, labels, batch_size)
            lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                            global_step,
                                            decay_steps,
                                            LEARNING_RATE_DECAY_FACTOR,
                                            staircase=True)
            opt = tf.train.GradientDescentOptimizer(lr)
            exp_moving_averager = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
            variables_to_average = (tf.trainable_variables() + tf.moving_average_variables())
            opt = tf.train.SyncReplicasOptimizer(
                opt,
                replicas_to_aggregate=len(worker_hosts),
                total_num_replicas=len(worker_hosts),
                variable_averages=exp_moving_averager,
                variables_to_average=variables_to_average)
            naive_grads = opt.compute_gradients(loss)
            grads = [(tf.scalar_mul(tf.cast(batch_size / FLAGS.batch_size, tf.float32), grad), var)
                     for grad, var in naive_grads]
            apply_gradients_op = opt.apply_gradients(grads, global_step=global_step)
            with tf.control_dependencies([apply_gradients_op]):
                train_op = tf.identity(loss, name='train_op')
            chief_queue_runners = [opt.get_chief_queue_runner()]
            init_tokens_op = opt.get_init_tokens_op()
            saver = tf.train.Saver()
            sv = tf.train.Supervisor(is_chief=is_chief,
                                     logdir=FLAGS.train_dir,
                                     init_op=tf.group(tf.global_variables_initializer(),
                                                      tf.local_variables_initializer()),
                                     summary_op=None,
                                     global_step=global_step,
                                     saver=saver,
                                     recovery_wait_secs=1,
                                     save_model_secs=60)
            tf.logging.info('%s Supervisor' % datetime.now())
            sess_config = tf.ConfigProto(allow_soft_placement=True,
                                         log_device_placement=False)
            sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)
            queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
            sv.start_queue_runners(sess, queue_runners)

            if is_chief:
                sv.start_queue_runners(sess, chief_queue_runners)
                sess.run(init_tokens_op)

            """Train CIFAR-10 for a number of steps."""
	    batch_size_num = FLAGS.batch_size
            for step in range(FLAGS.max_steps):
                start_time = time.time()
      		run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
      		run_metadata = tf.RunMetadata()
                _, loss_value, gs = sess.run([train_op, loss, global_step], feed_dict={batch_size: batch_size_num},  options=run_options, run_metadata=run_metadata)

                duration = time.time() - start_time
                num_examples_per_step = batch_size_num
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)
                format_str = ('%s: step %d (global_step %d), loss = %.2f (%.1f examples/sec; %.3f sec/batch)')
                tf.logging.info(format_str % (datetime.now(), step, gs, loss_value, examples_per_sec, sec_per_batch))
Example #41
0
def train():
  """Train a model for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

    # Get images and labels for a segmentation model.
    images, labels, ground_truth = cifar10.distorted_inputs()
    tf.histogram_summary('label_hist/with_ignore', labels)
    tf.histogram_summary('label_hist/ground_truth', ground_truth)
    
    # Build a Graph that computes the logits predictions from the
    # inference model.
    print("before inference")
    print(images.get_shape())
    logits, nr_params = cifar10.inference(images)
    print("nr_params: "+str(nr_params) )
    print("after inference")
    # Calculate loss.
    loss = cifar10.loss(logits, labels)
    accuracy, precision, cat_accs = cifar10.accuracy(logits, ground_truth)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    # Create a saver.
    saver = tf.train.Saver(tf.all_variables())
#    tf.image_summary('images2', images)
    print (logits)
#    tf.image_summary('predictions', logits)

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()

    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()

    # Start running operations on the Graph.
    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
    if ckpt and ckpt.model_checkpoint_path:
      # Restores from checkpoint
      saver.restore(sess, ckpt.model_checkpoint_path)
      # Assuming model_checkpoint_path looks something like:
      #   /my-favorite-path/cifar10_train/model.ckpt-0,
      # extract global_step from it.
      global_step = int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
    else:
      print('No checkpoint file found')
      print('Initializing new model')
      sess.run(init)
      global_step = 0


    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

    for step in xrange(global_step, FLAGS.max_steps):
      start_time = time.time()
      _, loss_value, accuracy_value, precision_value, cat_accs_val  = sess.run([train_op,
                                                                                loss,
                                                                                accuracy,
                                                                                precision,
                                                                                cat_accs])
                                                                  
      duration = time.time() - start_time

      print (precision_value)
      print (cat_accs_val)
      
      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      #precision_value = [0 if np.isnan(p) else p for p in precision_value]
      #print (precision_value)
      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)

        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)\n Accuracy = %.4f, mean average precision = %.4f')
        print (format_str % (datetime.now(), step, loss_value,
                             examples_per_sec, sec_per_batch,
                             accuracy_value, np.mean(precision_value)))

      if step % 100 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)

        summary = tf.Summary()
        summary.value.add(tag='Accuracy (raw)', simple_value=float(accuracy_value))
        for i,s in enumerate(CLASSES):
          summary.value.add(tag="precision/"+s+" (raw)",simple_value=float(precision_value[i]))
          summary.value.add(tag="accs/"+s+" (raw)",simple_value=float(cat_accs_val[i]))
#        summary.value.add(tag='Human precision (raw)', simple_value=float(precision_value))
        summary_writer.add_summary(summary, step)
        print("hundred steps")
      # Save the model checkpoint periodically.
      if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
        print("thousand steps")
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
def train():
    """Train CIFAR-10 for a number of steps."""
    #print '---------'
    ps_spec = FLAGS.ps_hosts.split(',')
    worker_spec = FLAGS.worker_hosts.split(',')
    issync = FLAGS.sync

    num_worker = len(worker_spec)
    cluster = tf.train.ClusterSpec({'ps': ps_spec, 'worker': worker_spec})
    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_index)
    print("number of workers:%d" % num_worker)

    if FLAGS.job_name == 'ps':
        server.join()
    elif FLAGS.job_name == "worker":
        time.sleep(10)

    is_chief = (FLAGS.task_index == 0)
    # worker_device = '/job:worker/task%d/cpu:0' % FLAGS.task_index
    with tf.device(
            tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % FLAGS.task_index,
                cluster=cluster)):
        #with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Create a variable to count the number of train() calls. This equals the
        # number of batches processed * FLAGS.num_gpus.
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        # Calculate the learning rate schedule.
        num_batches_per_epoch = (cifar10.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
                                 FLAGS.batch_size)
        decay_steps = int(num_batches_per_epoch * cifar10.NUM_EPOCHS_PER_DECAY)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(cifar10.INITIAL_LEARNING_RATE,
                                        global_step,
                                        decay_steps,
                                        cifar10.LEARNING_RATE_DECAY_FACTOR,
                                        staircase=True)

        # Create an optimizer that performs gradient descent.
        opt = tf.train.GradientDescentOptimizer(lr)

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()
        batch_queue = tf.contrib.slim.prefetch_queue.prefetch_queue(
            [images, labels], capacity=2 * num_worker)
        # Calculate the gradients for each model tower.
        ##    tower_grads = []
        ##    with tf.variable_scope(tf.get_variable_scope()):
        #      for i in xrange(FLAGS.num_gpus):
        #        with tf.device('/gpu:%d' % i):
        ##          with tf.name_scope('%s_%d' % (cifar10.TOWER_NAME, FLAGS.task_index)) as scope:
        # Dequeues one batch for the GPU
        image_batch, label_batch = batch_queue.dequeue()
        # Calculate the loss for one tower of the CIFAR model. This function
        # constructs the entire CIFAR model but shares the variables across
        # all towers.
        ##            loss = tower_loss(scope, image_batch, label_batch,tower_grads)
        logits = cifar10.inference(image_batch)
        loss = cifar10.loss(logits, label_batch)

        # Reuse variables for the next tower.
        ##            tf.get_variable_scope().reuse_variables()

        # Retain the summaries from the final tower.
        ##            summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES)

        # Calculate the gradients for the batch of data on this CIFAR tower.
        grads = opt.compute_gradients(loss)

        # Keep track of the gradients across all towers.
        ##            tower_grads.append(grads)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        ##    grads = average_gradients(tower_grads)

        # Add a summary to track the learning rate.
        summaries.append(tf.summary.scalar('learning_rate', lr))

        # Add histograms for gradients.
        ##    for grad, var in grads:
        ##      if grad is not None:
        ##        summaries.append(tf.summary.histogram(var.op.name + '/gradients', grad))

        # added by faye
        if issync == 1:
            syn_opt = tf.train.SyncReplicasOptimizer(
                opt,
                replicas_to_aggregate=num_worker,
                #replica_id=FLAGS.task_index,
                total_num_replicas=num_worker)
            #use_locking=True)

            # Apply the gradients to adjust the shared variables.
            apply_gradient_op = syn_opt.apply_gradients(
                grads, global_step=global_step)
            # Newly added
            if is_chief:
                local_init_op = syn_opt.chief_init_op
            else:
                local_init_op = syn_opt.local_step_init_op

            ready_for_local_init_op = syn_opt.ready_for_local_init_op

            init_token_op = syn_opt.get_init_tokens_op()
            chief_queue_runner = syn_opt.get_chief_queue_runner()

        else:
            apply_gradient_op = opt.apply_gradients(grads,
                                                    global_step=global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            summaries.append(tf.summary.histogram(var.op.name, var))

        # Track the moving averages of all trainable variables.
        variable_averages = tf.train.ExponentialMovingAverage(
            cifar10.MOVING_AVERAGE_DECAY, global_step)
        variables_averages_op = variable_averages.apply(
            tf.trainable_variables())

        # Group all updates to into a single train op.
        train_op = tf.group(apply_gradient_op, variables_averages_op)

        # Create a saver.
        saver = tf.train.Saver(tf.global_variables())

        # Build the summary operation from the last tower summaries.
        summary_op = tf.summary.merge(summaries)

        # Build an initialization operation to run below.
        #    init = tf.global_variables_initializer()

        init_op = tf.initialize_all_variables()

        # Start running operations on the Graph. allow_soft_placement must be set to
        # True to build towers on GPU, as some of the ops do not have GPU
        # implementations.
        #sess = tf.Session(config=tf.ConfigProto(
        #    allow_soft_placement=True,
        #    log_device_placement=FLAGS.log_device_placement))
        #sess = tf.Session("grpc://%s" % FLAGS.ps_hosts)
        #sess.run(init)

        if issync == 1:
            sv = tf.train.Supervisor(
                is_chief=is_chief,
                logdir=FLAGS.train_dir,
                init_op=init_op,
                local_init_op=local_init_op,
                ready_for_local_init_op=ready_for_local_init_op,
                summary_op=summary_op,
                saver=saver,
                global_step=global_step,
                save_model_secs=600)
        else:
            sv = tf.train.Supervisor(is_chief=is_chief,
                                     logdir=FLAGS.train_dir,
                                     init_op=init_op,
                                     summary_op=summary_op,
                                     saver=saver,
                                     global_step=global_step,
                                     save_model_secs=600)

        sess = sv.prepare_or_wait_for_session(server.target)

        #    sess = tf.train.MonitoredTrainingSession(master=server.target,
        #                                           is_chief=is_chief)

        # Start the queue runners. Modified by faye
        if is_chief and issync == 1:
            sess.run(init_token_op)
            #	tf.train.start_queue_runners(sess, [chief_queue_runner])
            sv.start_queue_runners(sess, [chief_queue_runner])
        else:
            sv.start_queue_runners(sess=sess)


#	tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

        step = 0
        g_step = 0
        while g_step <= FLAGS.max_steps:
            #    for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time
            g_step = int(tf.train.global_step(sess, global_step))
            print('worker %d: step %d, global step %d: loss = %.2f' %
                  (FLAGS.task_index, step, g_step, loss_value))

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size * num_worker
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = duration / num_worker

                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                    'sec/batch, global_step %d)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch, g_step))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            #if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
            #  checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
            #  saver.save(sess, checkpoint_path, global_step=step)

            step += 1

        sv.stop()
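The commented-out average_gradients(tower_grads) call above comes from the multi-GPU version of this script. For reference, a sketch of that helper is reproduced here: each element of tower_grads is the list of (gradient, variable) pairs from one tower, and the gradients for each shared variable are stacked and averaged.

def average_gradients(tower_grads):
    """Average the gradients for each shared variable across all towers (sketch)."""
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # grad_and_vars is ((grad0_gpu0, var0), ..., (grad0_gpuN, var0)) for one variable.
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
        grad = tf.reduce_mean(tf.concat(grads, 0), 0)
        # The variable is shared across towers, so the first tower's pointer is enough.
        average_grads.append((grad, grad_and_vars[0][1]))
    return average_grads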
Example #43
0
def train():
  """Train CIFAR-10 for a number of steps."""



  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)


    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    # Create a saver.
    saver = tf.train.Saver(tf.all_variables())

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()


    # # Visualize conv1 features
    # with tf.variable_scope('conv1') as scope_conv:
    #     #tf.get_variable_scope().reuse_variables()
    #     scope_conv.reuse_variables()
    #     weights = tf.get_variable('weights')
    #     grid_x = grid_y = 8   # to get a square grid for 64 conv1 features
    #     grid = put_kernels_on_grid (weights, (grid_y, grid_x))
    #     tf.image_summary('conv1/features', grid, max_images=1)


    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()

    # Start running operations on the Graph.
    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)



    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir,
                                            graph_def=sess.graph_def)


    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time


      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / float(duration)
        sec_per_batch = float(duration)

        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print (format_str % (datetime.now(), step, loss_value,
                             examples_per_sec, sec_per_batch))

      if step % 100 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)

      # Save the model checkpoint periodically.
      if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
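The commented-out conv1 visualization above depends on a put_kernels_on_grid helper that is not included in this example. A simpler, hedged alternative that writes individual first-layer kernels as image summaries (assuming the stock cifar10 model, whose conv1/weights tensor has shape [5, 5, 3, 64]) would be:

    # Hypothetical sketch: add after cifar10.inference(images) so 'conv1/weights' exists.
    with tf.variable_scope('conv1') as scope_conv:
        scope_conv.reuse_variables()
        weights = tf.get_variable('weights')              # [height, width, in_ch, n_filters]
    kernels = tf.transpose(weights, [3, 0, 1, 2])         # -> [n_filters, height, width, in_ch]
    kernels = (kernels - tf.reduce_min(kernels)) / (
        tf.reduce_max(kernels) - tf.reduce_min(kernels))  # rescale to [0, 1] for display
    tf.image_summary('conv1/filters', kernels, max_images=16)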
def main_fun(argv, ctx):
  import tensorflow as tf
  import cifar10

  sys.argv = argv
  FLAGS = tf.app.flags.FLAGS
  tf.app.flags.DEFINE_string('train_dir', '/tmp/cifar10_train',
                             """Directory where to write event logs """
                             """and checkpoint.""")
  tf.app.flags.DEFINE_integer('max_steps', 1000000,
                              """Number of batches to run.""")
  tf.app.flags.DEFINE_boolean('log_device_placement', False,
                              """Whether to log device placement.""")
  tf.app.flags.DEFINE_boolean('rdma', False, """Whether to use rdma.""")

  # cifar10.maybe_download_and_extract()
  if tf.gfile.Exists(FLAGS.train_dir):
    tf.gfile.DeleteRecursively(FLAGS.train_dir)
  tf.gfile.MakeDirs(FLAGS.train_dir)

  cluster_spec, server = TFNode.start_cluster_server(ctx, 1, FLAGS.rdma)

  # Train CIFAR-10 for a number of steps.
  with tf.Graph().as_default():
    global_step = tf.contrib.framework.get_or_create_global_step()

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    class _LoggerHook(tf.train.SessionRunHook):
      """Logs loss and runtime."""

      def begin(self):
        self._step = -1

      def before_run(self, run_context):
        self._step += 1
        self._start_time = time.time()
        return tf.train.SessionRunArgs(loss)  # Asks for loss value.

      def after_run(self, run_context, run_values):
        duration = time.time() - self._start_time
        loss_value = run_values.results
        if self._step % 10 == 0:
          num_examples_per_step = FLAGS.batch_size
          examples_per_sec = num_examples_per_step / duration
          sec_per_batch = float(duration)

          format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
          print (format_str % (datetime.now(), self._step, loss_value,
                               examples_per_sec, sec_per_batch))

    with tf.train.MonitoredTrainingSession(
        checkpoint_dir=FLAGS.train_dir,
        hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
               tf.train.NanTensorHook(loss),
               _LoggerHook()],
        config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)) as mon_sess:
      while not mon_sess.should_stop():
        mon_sess.run(train_op)
def train():

  # # debug

  # true_classes = np.ndarray(shape=(FLAGS.batch_size, 1), dtype=int)
  # true_classes.fill(2)


  # # Create a pair of constant ops, add the numpy 
  # # array matrices.
  # true_classes_tf_matrix = tf.constant(true_classes, dtype=tf.int64)

  # # playing with introducing the sampler
  # classes_sampler = tf.nn.learned_unigram_candidate_sampler(
  #                                     true_classes_tf_matrix, 
  #                                     1,                # true_classes
  #                                     5,                # num_sampled
  #                                     False,            # unique
  #                                     10,               # range_max
  #                                     seed=None, 
  #                                     name="my_classes_sampler")

  # # print(classes_sampler)
  # # print("debug")
  # # print(classes_sampler.set_sampler)
  # # exit()

  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # print("images")
    # print(images)
    # images = tf.Print(images, [images])
    # print()
    # print(images[1])

    print("------------------- train calling interference ---------------------")
    print(cifar10.__file__)

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    # Create a saver.
    saver = tf.train.Saver(tf.all_variables())

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()

    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()

    # Start running operations on the Graph.
    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

    for step in xrange(FLAGS.max_steps):

      # manually load the contents of images and labels
      # before calling this sess.run()
      # 1. have Cifar10 dataset in memory
      # 2. create a mini-batch
      # 3. set the placeholders/vars to the mini-batch data
      # 4. run one forward-backward step

      # print("training step: " + str(step))

      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)

        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print (format_str % (datetime.now(), step, loss_value,
                             examples_per_sec, sec_per_batch))

      # debug, temp change, go back to the one below
      summary_str = sess.run(summary_op)
      # print("summary: " + summary_str)
      summary_writer.add_summary(summary_str, step)
      summary_writer.flush()

      # if step % 100 == 0:
      #   summary_str = sess.run(summary_op)
      #   # print("summary: " + summary_str)
      #   summary_writer.add_summary(summary_str, step)
      #   summary_writer.flush()

      # Save the model checkpoint periodically.
      if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        if tfFLAGS.network == 1:
            images, labels = cifar10.distorted_inputs()
            logits, fc1_w, fc1_b, fc2_w, fc2_b = MyModel.inference(images)
        else:
            images, labels = cifar10.distorted_inputs()
            logits, fc1_w, fc1_b, fc2_w, fc2_b = MyModel2.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # L2 regularization for the fully connected parameters.
        regularizers = (tf.nn.l2_loss(fc1_w) + tf.nn.l2_loss(fc1_b) + tf.nn.l2_loss(fc2_w) + tf.nn.l2_loss(fc2_b))

        # Add the regularization term to the loss.
        loss += 5e-4 * regularizers

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(loss)    # Asks for loss value.

            def after_run(self, run_context, run_values):
                if self._step % tfFLAGS.log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time

                    loss_value = run_values.results
                    examples_per_sec = tfFLAGS.log_frequency * tfFLAGS.batch_size / duration
                    sec_per_batch = float(duration / tfFLAGS.log_frequency)

                    format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                                                'sec/batch)')
                    print_(format_str % (datetime.now(), self._step, loss_value, examples_per_sec, sec_per_batch))
        
        texts = ['conv1:', 'conv1Biases:', 'conv2:', 'conv2Biases:', 'local3:', 'local3Biases:', 'local4:', 'local4Biases:', 'softmax:', 'softmaxBiases:']
        total_parameters = 0
        count = 0
        for variable in tf.trainable_variables():
            variable_parameters = 1
            for dim in variable.get_shape():
                variable_parameters *= dim.value
            print('Number of hidden parameters of ' + texts[count], variable_parameters)
            total_parameters += variable_parameters
            count += 1
        print('Total number of hidden parameters:', total_parameters)

        with tf.train.MonitoredTrainingSession(checkpoint_dir=tfFLAGS.train_dir,
                hooks=[tf.train.StopAtStepHook(last_step=tfFLAGS.max_steps), tf.train.NanTensorHook(loss),_LoggerHook()],
                config=tf.ConfigProto( device_count = {'GPU': 0}, log_device_placement=tfFLAGS.log_device_placement)) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
def main(_):

    class _LoggerHook(tf.train.SessionRunHook):
        """Logs loss and runtime."""

        def begin(self):
            self._step = -1

        def before_run(self, run_context):
            self._step += 1
            self._start_time = time.time()
            return tf.train.SessionRunArgs(loss)  # Asks for loss value.

        def after_run(self, run_context, run_values):
            duration = time.time() - self._start_time
            loss_value = run_values.results
            if self._step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                            'sec/batch)')
                print (format_str % (datetime.now(), self._step, loss_value,
                                    examples_per_sec, sec_per_batch))
    ps_hosts = FLAGS.ps_hosts.split(",")
    worker_hosts = FLAGS.worker_hosts.split(",")

    # Create a cluster from the parameter server and worker hosts.
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})

    # Create and start a server for the local task.
    server = tf.train.Server(cluster,
                            job_name=FLAGS.job_name,
                            task_index=FLAGS.task_index)

    if FLAGS.job_name == "ps":
        server.join()
    elif FLAGS.job_name == "worker":

        # Assigns ops to the local worker by default.
        with tf.device(tf.train.replica_device_setter(
            worker_device="/job:worker/task:%d" % FLAGS.task_index,
            cluster=cluster)):

            global_step = tf.contrib.framework.get_or_create_global_step()

            # Get images and labels for CIFAR-10.
            images, labels = cifar10.distorted_inputs()

            # Build inference Graph.
            logits = cifar10.inference(images)

            # Build the portion of the Graph calculating the losses. Note that we will
            # assemble the total_loss using a custom function below.
            loss = cifar10.loss(logits, labels)

            # Build a Graph that trains the model with one batch of examples and
            # updates the model parameters.
            train_op = cifar10.train(loss,global_step)

        # The StopAtStepHook handles stopping after running given steps.
        hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps), _LoggerHook()]

        # The MonitoredTrainingSession takes care of session initialization,
        # restoring from a checkpoint, saving to a checkpoint, and closing when done
        # or an error occurs.
        with tf.train.MonitoredTrainingSession(master=server.target,
                                                is_chief=(FLAGS.task_index == 0),
                                                checkpoint_dir=FLAGS.train_dir,
                                                save_checkpoint_secs=60,
                                                hooks=hooks) as mon_sess:
            while not mon_sess.should_stop():
                # Run a training step asynchronously.
                # See `tf.train.SyncReplicasOptimizer` for additional details on how to
                # perform *synchronous* training.
                # mon_sess.run handles AbortedError in case of preempted PS.
                mon_sess.run(train_op)
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():

    global_step = tf.Variable(0, trainable=False)

    # Get images and labels for CIFAR-10.
    # images, labels = cifar10.standard_distorted_inputs()
    inputs = cifar10.ram_inputs(unit_variance=True, is_train=True)
    images = inputs['images']
    labels = inputs['labels']

    # Batch generator
    batcher = cifar10.Cifar10BatchGenerator(
        inputs['data_images'], inputs['data_labels'], True,
        FLAGS.max_epochs)

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images, 3, use_batchnorm=True,
        use_nrelu=False, id_decay=False, add_shortcuts=True, is_train=True)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    # Create a saver.
    saver = tf.train.Saver(tf.all_variables())

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()

    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()

    # Start running operations on the Graph.
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)

    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement,
        gpu_options=gpu_options))
  
    sess.run(init)

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

    step = -1
    while not batcher.is_done():
      step += 1

      batch_im, batch_labs = batcher.next_batch()
      feed_dict = {
          inputs['images_pl']: batch_im,
          inputs['labels_pl']: batch_labs,
        }

      start_time = time.time()
      _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)

      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)

        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print (format_str % (datetime.now(), step, loss_value,
                             examples_per_sec, sec_per_batch))

      if step % 10 == 0:
        summary_str = sess.run(summary_op, feed_dict=feed_dict)
        summary_writer.add_summary(summary_str, step)

      # Save the model checkpoint periodically.
      if step % 10 == 0 or batcher.is_done():
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
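cifar10.ram_inputs and Cifar10BatchGenerator are custom to this example and not shown. A minimal sketch of what the batch generator might look like (shuffling in-memory arrays and handing out fixed-size mini-batches until max_epochs is reached; constructor arguments inferred from the call above) is:

class Cifar10BatchGenerator(object):
    """Hypothetical sketch of an in-memory mini-batch generator."""

    def __init__(self, images, labels, shuffle, max_epochs):
        self._images = images
        self._labels = labels
        self._shuffle = shuffle
        self._max_epochs = max_epochs
        self._epoch = 0
        self._pos = 0
        self._order = np.arange(len(images))
        if shuffle:
            np.random.shuffle(self._order)

    def is_done(self):
        return self._epoch >= self._max_epochs

    def next_batch(self, batch_size=128):      # FLAGS.batch_size in the example above
        idx = self._order[self._pos:self._pos + batch_size]
        self._pos += batch_size
        if self._pos >= len(self._order):      # epoch finished: rewind and reshuffle
            self._pos = 0
            self._epoch += 1
            if self._shuffle:
                np.random.shuffle(self._order)
        return self._images[idx], self._labels[idx]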