Example no. 1
def train():
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        images, labels = cifar10.distorted_inputs()

        logits = cifar10_resnet(images)

        loss = cifar10.loss(logits, labels)

        train_op = cifar10.train(loss, global_step)

        summary_op = tf.merge_all_summaries()

        init = tf.initialize_all_variables()

        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        tf.train.start_queue_runners(sess=sess)

        for step in xrange(FLAGS.max_steps):
            _, loss_value = sess.run([train_op, loss])

            if step % 10 == 0:
                print('step %d, loss = %.3f' % (step, loss_value))
Example no. 2
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference6(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    # Create a saver.
    saver = tf.train.Saver(tf.all_variables())

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()

    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()

    # Start running operations on the Graph.
    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)

        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print (format_str % (datetime.now(), step, loss_value,
                             examples_per_sec, sec_per_batch))

      if step % 100 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
Example no. 3
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()
        print('Finished getting images & labels')

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = modified_inference(images)
        print('Finished building inference graph')

        # Calculate loss.
        loss = cifar10.loss(logits, labels)
        print('Finished building loss graph')

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)
        print('Finished building train graph')

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""
            def begin(self):
                self._step = -1

            def before_run(self, run_context):
                self._step += 1
                self._start_time = time.time()
                return tf.train.SessionRunArgs(loss)  # Asks for loss value.

            def after_run(self, run_context, run_values):
                duration = time.time() - self._start_time
                loss_value = run_values.results
                if self._step % 10 == 0:
                    num_examples_per_step = FLAGS.batch_size
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = float(duration)

                    format_str = (
                        '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))

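        # MonitoredTrainingSession takes care of variable initialization,
        # checkpointing under checkpoint_dir, and invoking the hooks around
        # every run() call.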
        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[
                    tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                    tf.train.NanTensorHook(loss),
                    _LoggerHook()
                ],
                config=tf.ConfigProto(
                    log_device_placement=FLAGS.log_device_placement)) as mon_sess:
            print('Hooks attached, starting training')
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
Example no. 4
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.contrib.framework.get_or_create_global_step()

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()
    #images, labels = cifar10.inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    class _LoggerHook(tf.train.SessionRunHook):
      """Logs loss and runtime."""

      def begin(self):
        self._step = -1

      def before_run(self, run_context):
        self._step += 1
        self._start_time = time.time()
        return tf.train.SessionRunArgs(loss)  # Asks for loss value.

      def after_run(self, run_context, run_values):
        duration = time.time() - self._start_time
        loss_value = run_values.results
        if self._step % 10 == 0:
          num_examples_per_step = FLAGS.batch_size
          examples_per_sec = num_examples_per_step / duration
          sec_per_batch = float(duration)

          format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
          print (format_str % (datetime.now(), self._step, loss_value,
                               examples_per_sec, sec_per_batch))

    with tf.train.MonitoredTrainingSession(
        checkpoint_dir=FLAGS.train_dir,
        hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
               tf.train.NanTensorHook(loss),
               _LoggerHook()],
        config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)) as mon_sess:
      while not mon_sess.should_stop():
        mon_sess.run(train_op)
Example no. 5
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

    images, labels = cifar10.distorted_inputs()

    logits = cifar10.inference(images)
    
    ## EDIT: Softmax activation
    softmax = tf.nn.softmax(logits)
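    # NOTE: cifar10.loss computes softmax cross-entropy internally, so
    # feeding it softmax outputs applies the softmax twice; raw logits
    # would be the numerically stable input.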

    loss = cifar10.loss(softmax, labels)
    train_op = cifar10.train(loss, global_step)
    saver = tf.train.Saver(tf.all_variables())
    summary_op = tf.merge_all_summaries()
    init = tf.initialize_all_variables()


    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)
    
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)

        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print (format_str % (datetime.now(), step, loss_value,
                             examples_per_sec, sec_per_batch))

      if step % 100 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)

      # Save the model checkpoint periodically.
      if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
Example no. 6
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)
    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()
    print(labels)
    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)
    # Calculate loss.
    loss = cifar10.loss(logits, labels)
    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)
    # Create a saver.
    saver = tf.train.Saver(tf.all_variables())
    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()
    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()
    # Start running operations on the Graph.
    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)
    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)
    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir,
                                            graph_def=sess.graph_def)
    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time
      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)
        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print (format_str % (datetime.now(), step, loss_value,
                             examples_per_sec, sec_per_batch))
      if step % 100 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)
      # Save the model checkpoint periodically.
      if step % 10 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        print("module save")
        saver.save(sess, checkpoint_path, global_step=step)
Example no. 7
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

    #GETTING THE TRAINING IMAGES
    images, labels = cifar10.distorted_inputs()
    # DATA FOR GRAPH.
    logits = cifar10.inference(images)
    # LOSS FUNCTION
    loss = cifar10.loss(logits, labels)

    # CREATING AND RUNNING A TENSORBOARD GRAPH
    train_op = cifar10.train(loss, global_step)
    saver = tf.train.Saver(tf.all_variables())
    summary_op = tf.merge_all_summaries()
    init = tf.initialize_all_variables()
    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    # START THE QUEUE
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time
      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)
        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print (format_str % (datetime.now(), step, loss_value,
                             examples_per_sec, sec_per_batch))
      if step % 100 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)
      # SAVE CHECKPOINT TO EVALUATE PERIODICALLY
      if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
Example no. 8
def train():
    # ops
    global_step = tf.Variable(0, trainable=False)
    images, labels = cifar10.distorted_inputs()
    logits = cifar10.inference(
        tf.image.resize_images(images, cifar10.IMAGE_SIZE, cifar10.IMAGE_SIZE))
    loss = cifar10.loss(logits, labels)
    train_op = cifar10.train(loss, global_step)
    summary_op = tf.merge_all_summaries()

    with tf.Session() as sess:
        saver = tf.train.Saver(tf.all_variables(), max_to_keep=21)
        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir)

        # restore or initialize variables
        ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            sess.run(tf.initialize_all_variables())

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

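        # Resume the step counter from the restored global_step so training
        # continues where the last checkpoint left off.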
        start = sess.run(global_step)
        for step in xrange(start, FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            print('%d: %f (%.3f sec/batch)' % (step, loss_value, duration))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)
            if step % 500 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
Example no. 9
def train():
    ps_hosts = FLAGS.ps_hosts.split(',')
    worker_hosts = FLAGS.worker_hosts.split(',')
    print('PS hosts are: %s' % ps_hosts)
    print('Worker hosts are: %s' % worker_hosts)

    server = tf.train.Server({
        'ps': ps_hosts,
        'worker': worker_hosts
    },
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_id)

    if FLAGS.job_name == 'ps':
        server.join()

    is_chief = (FLAGS.task_id == 0)
    if is_chief:
        if tf.gfile.Exists(FLAGS.train_dir):
            tf.gfile.DeleteRecursively(FLAGS.train_dir)
        tf.gfile.MakeDirs(FLAGS.train_dir)

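    # replica_device_setter pins variables to the parameter-server job while
    # the enclosing worker device scope keeps the compute ops local.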
    device_setter = tf.train.replica_device_setter(ps_tasks=1)
    with tf.device('/job:worker/task:%d' % FLAGS.task_id):
        with tf.device(device_setter):
            global_step = tf.Variable(0, trainable=False)

            # Get images and labels for CIFAR-10.
            images, labels = cifar10.distorted_inputs()

            # Build a Graph that computes the logits predictions from the
            # inference model.
            logits = cifar10.inference(images)

            # Calculate loss.
            loss = cifar10.loss(logits, labels)
            train_op = cifar10.train(loss, global_step)

            saver = tf.train.Saver()
            # We run the summaries in the same thread as the training operations by
            # passing in None for summary_op to avoid a summary_thread being started.
            # Running summaries and training operations in parallel could run out of
            # GPU memory.
            sv = tf.train.Supervisor(is_chief=is_chief,
                                     logdir=FLAGS.train_dir,
                                     init_op=tf.initialize_all_variables(),
                                     summary_op=tf.merge_all_summaries(),
                                     global_step=global_step,
                                     saver=saver,
                                     save_model_secs=60)

            tf.logging.info('%s Supervisor' % datetime.now())

            sess_config = tf.ConfigProto(
                allow_soft_placement=True,
                log_device_placement=FLAGS.log_device_placement)

            print("Before session init")
            # Get a session.
            sess = sv.prepare_or_wait_for_session(server.target,
                                                  config=sess_config)
            print("Session init done")

            # Start the queue runners.
            queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
            sv.start_queue_runners(sess, queue_runners)
            print('Started %d queues for processing input data.' %
                  len(queue_runners))
            """Train CIFAR-10 for a number of steps."""
            for step in xrange(FLAGS.max_steps):
                start_time = time.time()
                _, loss_value, gs = sess.run([train_op, loss, global_step])
                duration = time.time() - start_time

                assert not np.isnan(
                    loss_value), 'Model diverged with loss = NaN'

                if step % 10 == 0:
                    num_examples_per_step = FLAGS.batch_size
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = float(duration)

                    format_str = (
                        '%s: step %d (global_step %d), loss = %.2f (%.1f examples/sec; %.3f sec/batch)'
                    )
                    print(format_str % (datetime.now(), step, gs, loss_value,
                                        examples_per_sec, sec_per_batch))

    if is_chief:
        saver.save(sess,
                   os.path.join(FLAGS.train_dir, 'model.ckpt'),
                   global_step=global_step)
Example no. 10
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()

        # Start running operations on the Graph.
        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir)

        # One SummaryWriter per snapshot directory
        # (FLAGS.train_dir0 .. FLAGS.train_dir19).
        summary_writers = [
            tf.train.SummaryWriter(getattr(FLAGS, 'train_dir%d' % i))
            for i in xrange(20)]

        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)
                for writer in summary_writers:
                    writer.add_summary(summary_str, step)

            # Hard-coded snapshot schedule: every 100 steps up to step 2000,
            # save a checkpoint into its own directory (train_dir0 holds the
            # step-100 model, train_dir1 the step-200 model, and so on).
            if step % 100 == 0 and 100 <= step <= 2000:
                snapshot_dir = getattr(FLAGS,
                                       'train_dir%d' % (step // 100 - 1))
                checkpoint_path = os.path.join(snapshot_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
Example no. 11
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)


    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    # Create a saver.
    saver = tf.train.Saver(tf.all_variables())

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()

    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()

    # Start running operations on the Graph.
    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir)

    # One SummaryWriter per snapshot directory
    # (FLAGS.train_dir0 .. FLAGS.train_dir19).
    summary_writers = [
        tf.train.SummaryWriter(getattr(FLAGS, 'train_dir%d' % i))
        for i in xrange(20)]

    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'


      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)

        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print (format_str % (datetime.now(), step, loss_value,
                             examples_per_sec, sec_per_batch))

      if step % 100 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)
        for writer in summary_writers:
          writer.add_summary(summary_str, step)

      # Hard-coded snapshot schedule: every 100 steps up to step 2000, save a
      # checkpoint into its own directory (train_dir0 holds the step-100
      # model, train_dir1 the step-200 model, and so on).
      if step % 100 == 0 and 100 <= step <= 2000:
        snapshot_dir = getattr(FLAGS, 'train_dir%d' % (step // 100 - 1))
        checkpoint_path = os.path.join(snapshot_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
Example no. 12
def train():
    ps_hosts = FLAGS.ps_hosts.split(',')
    worker_hosts = FLAGS.worker_hosts.split(',')
    print ('PS hosts are: %s' % ps_hosts)
    print ('Worker hosts are: %s' % worker_hosts)

    server = tf.train.Server(
        {'ps': ps_hosts, 'worker': worker_hosts},
        job_name=FLAGS.job_name,
        task_index=FLAGS.task_id)

    if FLAGS.job_name == 'ps':
        server.join()

    is_chief = (FLAGS.task_id == 0)
    if is_chief:
        if tf.gfile.Exists(FLAGS.train_dir):
            tf.gfile.DeleteRecursively(FLAGS.train_dir)
        tf.gfile.MakeDirs(FLAGS.train_dir)
  
    device_setter = tf.train.replica_device_setter(ps_tasks=1)
    with tf.device('/job:worker/task:%d' % FLAGS.task_id):
        with tf.device(device_setter):
            global_step = tf.Variable(0, trainable=False)

            # Get images and labels for CIFAR-10.
            images, labels = cifar10.distorted_inputs()

            # Build a Graph that computes the logits predictions from the
            # inference model.
            logits = cifar10.inference(images)

            # Calculate loss.
            loss = cifar10.loss(logits, labels)
            train_op = cifar10.train(loss, global_step)

            saver = tf.train.Saver()
            # We run the summaries in the same thread as the training operations by
            # passing in None for summary_op to avoid a summary_thread being started.
            # Running summaries and training operations in parallel could run out of
            # GPU memory.
            sv = tf.train.Supervisor(is_chief=is_chief,
                                     logdir=FLAGS.train_dir,
                                     init_op=tf.initialize_all_variables(),
                                     summary_op=tf.merge_all_summaries(),
                                     global_step=global_step,
                                     saver=saver,
                                     save_model_secs=60)

            tf.logging.info('%s Supervisor' % datetime.now())

            sess_config = tf.ConfigProto(allow_soft_placement=True,
                                         log_device_placement=FLAGS.log_device_placement)

            print ("Before session init")
            # Get a session.
            sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)
            print ("Session init done")

            # Start the queue runners.
            queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
            sv.start_queue_runners(sess, queue_runners)
            print ('Started %d queues for processing input data.' % len(queue_runners))
  
            """Train CIFAR-10 for a number of steps."""
            for step in xrange(FLAGS.max_steps):
                start_time = time.time()
                _, loss_value, gs = sess.run([train_op, loss, global_step])
                duration = time.time() - start_time

                assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

                if step % 10 == 0:
                    num_examples_per_step = FLAGS.batch_size
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = float(duration)

                    format_str = ('%s: step %d (global_step %d), loss = %.2f (%.1f examples/sec; %.3f sec/batch)')
                    print (format_str % (datetime.now(), step, gs, loss_value, examples_per_sec, sec_per_batch))

    if is_chief:
        saver.save(sess, os.path.join(FLAGS.train_dir, 'model.ckpt'), global_step=global_step)
Example no. 13
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    
    global_step = tf.Variable(0, trainable=False)

    # Get images and labels for CIFAR-10.
    #with tf.device('/gpu:%d' % FLAGS.gpu_number):
    images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)
    loss_per_batch = cifar10.loss_per_batch(logits, labels)
    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step, FLAGS.gpu_number)

    # Create a saver.
    saver = tf.train.Saver(tf.all_variables(), max_to_keep=None)

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()

    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()

    # Start running operations on the Graph.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True
    config.log_device_placement = FLAGS.log_device_placement
    sess = tf.Session(config=config)
    
    tf.train.write_graph(sess.graph_def, FLAGS.train_dir,
                         "cifar10_train.pb", False)
    sess.run(init)

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

    train_start_time = time.time()
    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value, logits_value, loss_per_batch_value, labels_value = sess.run(
          [train_op, loss, logits, loss_per_batch, labels])
      duration = time.time() - start_time
      #logits_str = print_logits(logits_value, labels_value, loss_per_batch_value)
      
      #with open(os.path.join(FLAGS.train_dir, 'logits_%d.log' % step),'w') as f:
      #  f.write("%s" % logits_str)

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)

        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        log_str  = (format_str % (datetime.now(), step, loss_value,
                                  examples_per_sec, sec_per_batch))
        print(log_str)
        with open(os.path.join(FLAGS.train_dir, 'train.log'),'a+') as f:
          f.write("%s\n" % log_str)

      if step % 500 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        save_path = saver.save(sess, checkpoint_path, global_step=step)
    train_duration = time.time() - train_start_time

    log_str = ("Finishing. Training %d batches of %d images took %fs\n" %
               (FLAGS.max_steps, FLAGS.batch_size, float(train_duration)))
    print(log_str)
    with open(os.path.join(FLAGS.train_dir, 'train.log'),'a+') as f:
      f.write("%s" % log_str)
Example no. 14
def run_training():
    """Train MNIST for a number of steps."""

    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)
        # Input images and labels.
        images, labels = inputs(train=True,
                                batch_size=BATCH_SIZE,
                                num_epochs=FLAGS.num_epochs)

        print('images', images)

        logits = calc_inference(images)
        print('logits', logits)
        print('labels', labels)
        # Calculate loss.
        loss = calc_loss(logits, labels)
        print(loss)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()
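        # NOTE: reading with a fixed num_epochs creates local variables that
        # initialize_all_variables() does not cover in older TF, so grouping
        # in local initialization avoids an uninitialized-variable error,
        # e.g. tf.group(tf.initialize_all_variables(),
        #               tf.initialize_local_variables()).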

        # Start running operations on the Graph.
        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            print(step)
            print(train_op)
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                num_examples_per_step = BATCH_SIZE
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
Example no. 15
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # #Adding dropout
        # keep_drop_prob = tf.placeholder(tf.float32)

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # ###########Changes for visualization ###############
        with tf.variable_scope('conv1') as scope_conv:
            tf.get_variable_scope().reuse_variables()
            weights = tf.get_variable('weights')
            grid_x = grid_y = 8  # to get a square grid for 64 conv1 features
            grid = put_kernels_on_grid(weights, (grid_y, grid_x))
            tf.image_summary('conv1/features', grid, max_images=1)
        # ####################################################


        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()

        # Start running operations on the Graph.
        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        for step in range(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                              'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
Example no. 16
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        with tf.variable_scope("model") as scope:
            global_step = tf.Variable(0, trainable=False)

            # Get images and labels for CIFAR-10.
            images, labels = cifar10.distorted_inputs()
            images_eval, labels_eval = cifar10.inputs(eval_data=True)

            # Build a Graph that computes the logits predictions from the
            # inference model.
            logits = cifar10.inference(images)
            scope.reuse_variables()
            logits_eval = cifar10.inference(images_eval)

            # Calculate loss.
            loss = cifar10.loss(logits, labels)

            # For evaluation
            top_k = tf.nn.in_top_k(logits, labels, 1)
            top_k_eval = tf.nn.in_top_k(logits_eval, labels_eval, 1)

            # Add precision summary
            summary_train_prec = tf.placeholder(tf.float32)
            summary_eval_prec = tf.placeholder(tf.float32)
            tf.scalar_summary('precision/train', summary_train_prec)
            tf.scalar_summary('precision/eval', summary_eval_prec)
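            # These placeholders let precision values computed in Python be
            # fed into the merged summary op at run time (see the feed_dict
            # below).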

            # Build a Graph that trains the model with one batch of examples and
            # updates the model parameters.
            train_op = cifar10.train(loss, global_step)

            # Create a saver.
            saver = tf.train.Saver(tf.all_variables())

            # Build the summary operation based on the TF collection of Summaries.
            summary_op = tf.merge_all_summaries()

            # Build an initialization operation to run below.
            init = tf.initialize_all_variables()

            # Start running operations on the Graph.
            sess = tf.Session(config=tf.ConfigProto(
                log_device_placement=FLAGS.log_device_placement))
            sess.run(init)

            # Start the queue runners.
            tf.train.start_queue_runners(sess=sess)

            summary_writer = tf.train.SummaryWriter(FLAGS.train_dir,
                                                    graph_def=sess.graph_def)

            for step in xrange(FLAGS.max_steps):
                start_time = time.time()
                _, loss_value = sess.run([train_op, loss])
                duration = time.time() - start_time

                assert not np.isnan(
                    loss_value), 'Model diverged with loss = NaN'

                if step % 10 == 0:
                    num_examples_per_step = FLAGS.batch_size
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = float(duration)

                    format_str = (
                        '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
                    print(format_str % (datetime.now(), step, loss_value,
                                        examples_per_sec, sec_per_batch))

                EVAL_STEP = 10
                EVAL_NUM_EXAMPLES = 1024
                if step % EVAL_STEP == 0:
                    prec_train = evaluate_set(sess, top_k, EVAL_NUM_EXAMPLES)
                    prec_eval = evaluate_set(sess, top_k_eval,
                                             EVAL_NUM_EXAMPLES)
                    print('%s: precision train = %.3f' %
                          (datetime.now(), prec_train))
                    print('%s: precision eval  = %.3f' %
                          (datetime.now(), prec_eval))

                if step % 100 == 0:
                    summary_str = sess.run(summary_op,
                                           feed_dict={
                                               summary_train_prec: prec_train,
                                               summary_eval_prec: prec_eval
                                           })
                    summary_writer.add_summary(summary_str, step)

                # Save the model checkpoint periodically.
                if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                    checkpoint_path = os.path.join(FLAGS.train_dir,
                                                   'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=step)
Example no. 17
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()

        # Start running operations on the Graph.
        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        if FLAGS.checkpoint_dir is not None:
            ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
            print("checkpoint path is %s" % ckpt.model_checkpoint_path)
            tf.train.Saver().restore(sess, ckpt.model_checkpoint_path)

        # Start the queue runners.
        print("FLAGS.checkpoint_dir is %s" % FLAGS.checkpoint_dir)
        tf.train.start_queue_runners(sess=sess)
        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        cur_step = sess.run(global_step)
        print("current step is %s" % cur_step)
        interrupt_check_duration = 0.0
        elapsed_time = time.time()
        flag = 0
        for step in xrange(cur_step, FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time
            interrupt_check_duration += duration
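            # After roughly five seconds of accumulated step time, check
            # whether this instance is about to be interrupted; if so, save
            # a checkpoint, hand it off to a fresh instance, and stop.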
            if float(interrupt_check_duration) > 5.0:
                print("checking for interruption: %s", interrupt_check_duration)
                if decision_for_migration():
                    print("have to migrate")
                    checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                    print("checkpoint path is %s" % checkpoint_path)
                    saver.save(sess, checkpoint_path, global_step=step)
                    random_id = generate_random_prefix()
                    start_new_instance(checkpoint_path, step, random_id)
                    upload_checkpoint_to_s3(checkpoint_path, step, "mj-bucket-1", random_id)
                    break
                else:
                    print("not interrupted")
                interrupt_check_duration = 0.0
            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                              'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
            elapsed = (int(time.time() - elapsed_time))
            if elapsed % 300 == 0 and flag == 0:
                print("uploading current status")
                uploading_current_status_to_rds(step)
                flag = 1
            elif elapsed % 300 != 0 and flag == 1:
                flag = 0
Example no. 18
			# Get images and labels for CIFAR-10.
			images, labels = cifar10.distorted_inputs()
			test_images, test_labels = cifar10.inputs(eval_data='test')

			# Build a Graph that computes the logits predictions from the
			# inference model.
			logits = test_model.predict(images)
			logit_test = test_model.predict(test_images)

			# Calculate loss.
			loss = cifar10.loss(logits, labels)

			# Build a Graph that trains the model with one batch of examples and
			# updates the model parameters.
			train_op = cifar10.train(loss, global_step)


			top_k_op = tf.nn.in_top_k(logit_test, test_labels, 1)


			# Build an initialization operation to run below.
			init = tf.initialize_all_variables()

			# Start running operations on the Graph.
			#sess = tf.Session(config=tf.ConfigProto(
			#    log_device_placement=FLAGS.log_device_placement))
			
			with tf.Session(config=tf.ConfigProto(
				log_device_placement=False)) as sess:
				sess.run(init)
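
Example n. 18 is a fragment that stops right after initialization; its point is wiring a separate test pipeline (cifar10.inputs(eval_data='test')) next to the training pipeline and scoring it with tf.nn.in_top_k. That op returns a [batch_size] bool tensor, True where the true label is among the top-k logits, so counting correct predictions is a cast-and-sum, as in this small illustration using the tensors built above:

# in_top_k marks each test example as correctly classified or not; summing
# the casts counts the correct predictions in the batch.
correct = tf.nn.in_top_k(logit_test, test_labels, 1)
num_correct = tf.reduce_sum(tf.cast(correct, tf.int32))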
Example n. 19
0
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    top_k_op = tf.nn.in_top_k(logits, labels, 1)
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    # Create a saver.
    saver = tf.train.Saver(tf.all_variables())

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()

    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()

    # Start running operations on the Graph.
    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)
    sess.graph.finalize()

    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 100 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)

        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print(format_str % (datetime.now(), step, loss_value,
                            examples_per_sec, sec_per_batch))

        # Calculate predictions: measure precision @ 1 over a full pass of
        # the (distorted training) input pipeline.
        num_iter = int(math.ceil(FLAGS.num_examples / FLAGS.batch_size))
        true_count = 0  # Counts the number of correct predictions.
        total_sample_count = num_iter * FLAGS.batch_size
        i_step = 0
        while i_step < num_iter:
          predictions = sess.run([top_k_op])
          true_count += np.sum(predictions)
          i_step += 1

        # Compute precision @ 1; float() avoids integer division on Python 2.
        prec = float(true_count) / total_sample_count
        print(prec)

        # Inject the measured precision into the merged summaries as a plain
        # scalar. Building a tf.Summary proto adds no ops, so it is safe with
        # the finalized graph above.
        summary = tf.Summary()
        summary.ParseFromString(sess.run(summary_op))
        summary.value.add(tag='accuracy', simple_value=prec)
        summary_writer.add_summary(summary, step)

      # Save the model checkpoint periodically.
      if step % 100 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
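
Two details in Example n. 19 are worth calling out. First, sess.graph.finalize() makes any accidental op creation inside the training loop raise immediately instead of silently growing the graph. Second, the hand-built tf.Summary proto is what keeps the accuracy logging compatible with that finalized graph: it records a plain Python float without adding any ops. A self-contained sketch of the pattern (the log directory here is an arbitrary stand-in for FLAGS.train_dir):

import tensorflow as tf

writer = tf.train.SummaryWriter('/tmp/cifar10_train')
summary = tf.Summary()
summary.value.add(tag='accuracy', simple_value=0.82)  # any Python float
writer.add_summary(summary, global_step=100)
writer.flush()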
Example n. 20
0
		def SGDBead(self, bead, thresh, maxindex):
			
			finalerror = 0.
			

			# Parameters
			learning_rate = 0.001
			training_epochs = 15
			batch_size = 100
			display_step = 1
			# max_steps and num_examples are used below but were not defined in
			# the original snippet; these values are assumptions (10000 is the
			# CIFAR-10 test-set size).
			max_steps = 100000
			num_examples = 10000
			
			curWeights, curBiases = self.AllBeads[bead]
			#test_model = multilayer_perceptron(w=curWeights, b=curBiases)
			test_model = convnet(w=curWeights, b=curBiases)

			
			with test_model.g.as_default():

				global_step = tf.Variable(0, trainable=False)

				# Get images and labels for CIFAR-10.
				images, labels = cifar10.distorted_inputs()
				test_images, test_labels = cifar10.inputs(eval_data='test')

				# Build a Graph that computes the logits predictions from the
				# inference model.
				logits = test_model.predict(images)
				logit_test = test_model.predict(test_images)

				# Calculate loss.
				loss = cifar10.loss(logits, labels)

				# Build a Graph that trains the model with one batch of examples and
				# updates the model parameters.
				train_op = cifar10.train(loss, global_step)


				top_k_op = tf.nn.in_top_k(logit_test, test_labels, 1)


				# Build an initialization operation to run below.
				init = tf.initialize_all_variables()

				# Start running operations on the Graph.

				with tf.Session(config=tf.ConfigProto(
					log_device_placement=False)) as sess:
					sess.run(init)

					tf.train.start_queue_runners(sess=sess)

					step = 0
					stopcond = True
					while step < max_steps and stopcond:


						start_time = time.time()
						_, loss_value = sess.run([train_op, loss])
						duration = time.time() - start_time

						assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

						if step % 10 == 0:
							num_examples_per_step = batch_size
							examples_per_sec = num_examples_per_step / duration
							sec_per_batch = float(duration)

							format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
									  'sec/batch)')
							print (format_str % (datetime.now(), step, loss_value,
											 examples_per_sec, sec_per_batch))

						if step % 100 == 0:

							num_iter = int(math.ceil(num_examples / batch_size))
							true_count = 0  # Counts the number of correct predictions.
							total_sample_count = num_iter * batch_size
							stepp = 0
							while stepp < num_iter:
								predictions = sess.run([top_k_op])
								true_count += np.sum(predictions)
								stepp += 1


							# Compute precision @ 1; float() avoids integer
							# division on Python 2.
							precision = float(true_count) / total_sample_count
							print('%s: precision @ 1 = %.3f' % (datetime.now(), precision))

							if precision > 1 - thresh:
								stopcond = False
								test_model.params = sess.run(test_model.weightslist), sess.run(test_model.biaseslist)
								self.AllBeads[bead]=test_model.params
								finalerror = 1 - precision
								print ("Final bead error: ",str(finalerror))
								
						step += 1        
				return finalerror
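
The stopping rule in SGDBead is an error threshold in disguise: training halts once test precision exceeds 1 - thresh, i.e. once top-1 error drops below thresh, and that final error is what the method returns (0.0 if the step cap is reached first, since finalerror is only set inside the stop branch). A hypothetical call site, since the enclosing class is not shown in the snippet:

# Hypothetical usage; `optimizer` stands for the enclosing object holding
# AllBeads, which the snippet does not show.
err = optimizer.SGDBead(bead=3, thresh=0.05, maxindex=10)
print('bead 3 reached top-1 error %.3f' % err)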
Example n. 21
0
def multilevel_train_1ord():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Accuracy: in_top_k yields one boolean per example in the batch.
        top_k_op = tf.nn.in_top_k(logits, labels, 1)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()

        # Start running operations on the Graph.
        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            # Note: this extra run pulls its own input batch, separate from
            # the batch consumed by the training step above.
            accuracy = sess.run(top_k_op)
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            output_list = []
            # Snapshot the conv1 kernel weights at steps 0, 1000, 2000 and
            # 3000; they are compared and processed at step 3000 below.
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                for v in tf.all_variables():
                    if "conv1/weights:" in v.name:
                        print(v.name)
                        output_list.append(
                            tf.get_default_graph().get_tensor_by_name(v.name))
                        break
                if (step == 0):
                    conv1_data_0 = sess.run(output_list)
                if (step == 1000):
                    conv1_data_1000 = sess.run(output_list)
                if (step == 2000):
                    conv1_data_2000 = sess.run(output_list)
                if (step == 3000):
                    conv1_data_3000 = sess.run(output_list)
                    (A, B, C, D, E) = np.array(conv1_data_3000).shape

            # At step 3000 (or the final step), run the external experiment on
            # the four kernel snapshots and write the result back into conv1.
            if step == 3000 or (step + 1) == FLAGS.max_steps:
                print("************\n Chen process executing")
                _, new_data = process.exp_2_commMax(conv1_data_0,
                                                    conv1_data_1000,
                                                    conv1_data_2000,
                                                    conv1_data_3000)
                for v in tf.all_variables():
                    if "conv1/weights:" in v.name:
                        print("start assign: ")
                        sess.run(
                            tf.assign(
                                tf.get_default_graph().get_tensor_by_name(
                                    v.name), new_data[0]))
                        break
                value = sess.run(loss)
                pred = process.Count(accuracy)
                print("new loss value is: " + str(value) + " accuracy: " +
                      str(pred))

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)
                predict = process.Count(accuracy)
                format_str = (
                    '%s: step %d, loss = %.2f, accu = %.2f (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value, predict,
                                    examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
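
A note on Example n. 21: both the get_tensor_by_name lookups and the tf.assign call sit inside the training loop, so every snapshot or overwrite adds fresh ops to the graph. A common alternative, sketched here under the same variable naming as the snippet, is to build one placeholder-fed assign op before the loop and feed it when needed:

# Built once, before the training loop: a reusable assign op for the conv1
# kernel, fed through a placeholder so repeated overwrites add no new ops.
conv1_var = [v for v in tf.all_variables() if 'conv1/weights:' in v.name][0]
new_kernel = tf.placeholder(tf.float32, shape=conv1_var.get_shape())
assign_conv1 = conv1_var.assign(new_kernel)

# Inside the loop, overwriting is then a single feed:
#   sess.run(assign_conv1, feed_dict={new_kernel: new_data[0]})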
Example n. 22
0
def run_training():
    """Train MNIST for a number of steps."""

    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)
        # Input images and labels.
        images, labels = inputs(train=True, batch_size=BATCH_SIZE,
                                num_epochs=FLAGS.num_epochs)

        # Build inference on the input batch. (The original snippet printed
        # the images/logits/labels tensors here as a shape sanity check.)
        logits = calc_inference(images)

        # Calculate loss.
        loss = calc_loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()

        # Start running operations on the Graph.
        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                num_examples_per_step = BATCH_SIZE
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                              'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
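
One caveat for Example n. 22: calling inputs(...) with num_epochs makes the pipeline create a local variable for the epoch counter and end by raising tf.errors.OutOfRangeError, yet the snippet neither initializes local variables nor catches that error. A sketch of the usual guards, following the standard TF reader pattern and assuming the same pipeline:

# num_epochs requires local-variable initialization in addition to
# tf.initialize_all_variables(), and the end of data arrives as an exception.
sess.run(tf.initialize_local_variables())

coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
try:
    step = 0
    while not coord.should_stop():
        _, loss_value = sess.run([train_op, loss])
        step += 1
except tf.errors.OutOfRangeError:
    print('Done training for %d epochs, %d steps.' % (FLAGS.num_epochs, step))
finally:
    # Ask the queue threads to stop and wait for them to finish.
    coord.request_stop()
    coord.join(threads)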