Example #1
def evaluate(dataset):
  """Eval CIFAR-10 for a number of steps."""
  with tf.Graph().as_default() as g:
    # Get images and labels for CIFAR-10.
    eval_data = FLAGS.eval_data == 'test'
    images, labels, filenames = image_processing.distorted_inputs(dataset)

    # Build a Graph that computes the logits predictions from the
    # inference model.
    #logits = cifar10.inference(images)
    # Build inference Graph.
    logits = DAGResnet.inference(images)
    logits = tf.nn.softmax(logits)
    shape = logits.get_shape().as_list()
    label_predict = tf.argmax(logits, axis=len(shape) - 1)
    # Calculate predictions.
    #top_k_op = tf.nn.in_top_k(logits, labels, 1)

    # Restore the moving average version of the learned variables for eval.
    variable_averages = tf.train.ExponentialMovingAverage(
        DAGResnet.MOVING_AVERAGE_DECAY)
    variables_to_restore = variable_averages.variables_to_restore()
    saver = tf.train.Saver(variables_to_restore)

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.summary.merge_all()

    summary_writer = tf.summary.FileWriter(FLAGS.eval_dir, g)

    while True:
      eval_once(saver, summary_writer, label_predict, summary_op, labels, images, filenames, logits)
      if FLAGS.run_once:
        break
      time.sleep(FLAGS.eval_interval_secs)
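
The eval_once helper called above is defined elsewhere in the original file. A minimal sketch of such a routine, assuming a FLAGS.checkpoint_dir flag and keeping the argument list from the call above, could look like this:

import tensorflow as tf

def eval_once(saver, summary_writer, label_predict, summary_op,
              labels, images, filenames, logits):
  """Sketch: restore the latest checkpoint and run one evaluation pass."""
  with tf.Session() as sess:
    # FLAGS.checkpoint_dir is an assumed flag; the original may differ.
    ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
    if not (ckpt and ckpt.model_checkpoint_path):
      print('No checkpoint file found')
      return
    saver.restore(sess, ckpt.model_checkpoint_path)

    # Queue runners must be started for distorted_inputs to yield batches.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
      predictions, label_values = sess.run([label_predict, labels])
      summary_writer.add_summary(sess.run(summary_op))
    finally:
      coord.request_stop()
      coord.join(threads)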
Example #2
  def build_inputs(self):
    """Input prefetching, preprocessing and batching.

    Outputs:
      inputs: images as a 4-D Tensor [batch_size, height, width, channels]
      labels: angle-class label for each image
    """
#   if self.mode == "inference":
#     # In inference mode, images are fed via placeholder.
#     with tf.variable_scope('images'):
#       self.images = tf.placeholder(dtype=tf.float32,
#         shape=[None, self.num_frames, self.image_size, self.image_size, 3])

    if self.mode == 'train':
      with tf.variable_scope('images_and_labels'):
        self.images, self.labels = image_processing.distorted_inputs(
                                       batch_size=self.batch_size,
                                       num_preprocess_threads=self.num_preprocess_threads)
        #   self.images = tf.random_normal([self.batch_size, self.image_size, self.image_size, 3], dtype=tf.float32)
        #   self.labels = tf.random_uniform(shape=[self.batch_size, self.num_classes], maxval=2, dtype=tf.int32)

    elif self.mode == 'validation':
      with tf.variable_scope('images_and_labels'):
        self.images, self.labels = image_processing.inputs(
                                          batch_size=self.batch_size_val,
                                          num_preprocess_threads=self.num_preprocess_threads)
        # self.images = tf.random_normal([self.batch_size, self.image_size, self.image_size, 3], dtype=tf.float32)
        # self.labels = tf.random_uniform(shape=[self.batch_size, self.num_classes], maxval=2, dtype=tf.int32)

    else:
      with tf.variable_scope('images_and_labels'):
        self.images = tf.placeholder(dtype=tf.float32,
                                     shape=[1, FLAGS.image_size, FLAGS.image_size, 3])

    print('Finished building inputs.')
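
In the final branch self.images is a placeholder, so inference callers must feed a preprocessed image themselves. A hypothetical usage sketch (`model` and the `model.logits` output name are assumptions, not part of the class above):

import numpy as np
import tensorflow as tf

# `model` is assumed to be an instance of the class above whose mode is
# neither 'train' nor 'validation', after build_inputs() and model build.
image = np.zeros((1, FLAGS.image_size, FLAGS.image_size, 3), dtype=np.float32)
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  # `model.logits` is an assumed name for the network output tensor.
  outputs = sess.run(model.logits, feed_dict={model.images: image})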
Example #3
def train_distributed(args):
    res, cluster_spec = build_cluster_spec(args)
    if not res:
        ata_log("build_cluster_spec error")
        return

    res = DistributedConfig(args, cluster_spec)
    is_chief = (args.task_id == 0)
    # Derive the parameter-server count used by the device chooser below.
    num_parameter_servers = len(cluster_spec.as_dict()['ps'])

    # Ops are assigned to worker by default.
    with tf.device('/job:worker/task:%d' % args.task_id):
        # Variables and its related init/assign ops are assigned to ps.
        with slim.scopes.arg_scope(
                                   [slim.variables.variable, slim.variables.global_step],
                                   device=slim.variables.VariableDeviceChooser(num_parameter_servers)):

            #####################
            # data fetch config #
            #####################
            # Note: `dataset` is assumed to be provided by the surrounding module.
            images, labels = image_processing.distorted_inputs(
                                dataset,
                                batch_size=args.batch_size,
                                num_preprocess_threads=args.num_preprocess_threads)

            #########################
            # LearningModule config #
            #########################
            LearningModuleConfig(args)

            ##################
            # Start Training #
            ##################
            StartTraining(args)
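
build_cluster_spec is not shown; given that its result is unpacked as `res, cluster_spec`, a hypothetical sketch (the `args.ps_hosts`/`args.worker_hosts` attribute names are assumptions) might be:

import tensorflow as tf

def build_cluster_spec(args):
    """Hypothetical: turn comma-separated host lists into a ClusterSpec."""
    try:
        cluster_spec = tf.train.ClusterSpec({
            'ps': args.ps_hosts.split(','),
            'worker': args.worker_hosts.split(','),
        })
        return True, cluster_spec
    except (TypeError, ValueError):
        return False, None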
Example #4
def run_training():
    #tf.reset_default_graph()
    #data_files_ = TRAIN_FILE
    #data_files_ = VALIDATION_FILE
    data_files_ = data_files()
    images, labels = image_processing.distorted_inputs(
        data_files_, FLAGS.num_epochs, batch_size=FLAGS.batch_size)
    labels = tf.one_hot(labels, 1000)   
    logits = inference(images)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
        logits=logits, labels=labels))
    tf.summary.scalar('loss', loss)
    correct_pred = tf.equal(tf.argmax(logits, 1), tf.argmax(labels, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    tf.summary.scalar('accuracy', accuracy)
    merged_summary_op = tf.summary.merge_all()
    train_op = tf.train.AdamOptimizer(epsilon=0.1).minimize(loss)
    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
    sess = tf.Session()
    sess.run(init_op)
    summary_writer = tf.summary.FileWriter(FLAGS.log_dir)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    #save/restore model
    d={}
    l = ['w1', 'b1', 'w2', 'b2', 'w3', 'b3', 'w4', 'b4', 'w5', 'b5', 'w_fc1', 'b_fc1', 'w_fc2', 'b_fc2', 'w_output', 'b_output']
    for i in l:
        d[i] = [v for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) if v.name == i+':0'][0]
    saver = tf.train.Saver(d)
    saver.restore(sess, FLAGS.model_path)
    try:
        step = 0
        start_time = time.time()
        while not coord.should_stop():
            start_batch = time.time()
            #train             
            _, loss_value, pred, acc = sess.run(
                [train_op, loss, correct_pred, accuracy])
            duration = time.time() - start_batch
            if step % 10 == 0:
                print('Step %d | loss = %.2f | accuracy = %.2f (%.3f sec/batch)' % (
                    step, loss_value, acc, duration))
            if step % 500 == 0:
                summary = sess.run(merged_summary_op)
                summary_writer.add_summary(summary, step*FLAGS.batch_size)
            if step % 5000 == 0:
                saver.save(sess, FLAGS.model_path)

            step += 1
    except tf.errors.OutOfRangeError:
        print('Done training for %d epochs, %d steps, %.1f min.' % (FLAGS.num_epochs, step, (time.time()-start_time)/60))
    finally:
        coord.request_stop()

    coord.join(threads)
    sess.close()
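
The loop that builds `d` above matches `v.name` (which carries the `:0` output suffix) against bare names; the same mapping is usually written with `v.op.name`, which has no suffix. A standalone illustration:

import tensorflow as tf

# Standalone sketch of the name-to-variable map handed to tf.train.Saver.
w1 = tf.Variable(tf.zeros([3]), name='w1')
b1 = tf.Variable(tf.zeros([3]), name='b1')
wanted = ['w1', 'b1']
# v.op.name is v.name without the ':0' output suffix.
d = {v.op.name: v
     for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
     if v.op.name in wanted}
saver = tf.train.Saver(d)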
Example #5
def tower_loss(scope):
  """Calculate the total loss on a single tower running the baxNet model.

  Args:
    scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'

  Returns:
     Tensor of shape [] containing the total loss for a batch of data
  """
  dataset = ImagenetData(subset='train')
  assert dataset.data_files()
#  if tf.gfile.Exists(FLAGS.eval_dir):
#    tf.gfile.DeleteRecursively(FLAGS.eval_dir)
#  tf.gfile.MakeDirs(FLAGS.eval_dir)
  
  num_preprocess_threads = FLAGS.num_preprocess_threads * FLAGS.num_gpus
  images, labels = image_processing.distorted_inputs(dataset,
                                                     num_preprocess_threads=num_preprocess_threads)

  # Build inference Graph.
  logits = baxNet.inference(images)

  # Build the portion of the Graph calculating the losses. Note that we will
  # assemble the total_loss using a custom function below.
  _ = baxNet.loss(logits, labels)

  # Assemble all of the losses for the current tower only.
  losses = tf.get_collection('losses', scope)

  # Calculate the total loss for the current tower.
  total_loss = tf.add_n(losses, name='total_loss')

  # Compute the moving average of all individual losses and the total loss.
  loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
  loss_averages_op = loss_averages.apply(losses + [total_loss])

  # Attach a scalar summary to all individual losses and the total loss; do the
  # same for the averaged version of the losses.
  for l in losses + [total_loss]:
    # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
    # session. This helps the clarity of presentation on tensorboard.
    loss_name = re.sub('%s_[0-9]*/' % baxNet.TOWER_NAME, '', l.op.name)
    # Name each loss as '(raw)' and name the moving average version of the loss
    # as the original loss name.
    tf.summary.scalar(loss_name + ' (raw)', l)
    tf.summary.scalar(loss_name, loss_averages.average(l))

  with tf.control_dependencies([loss_averages_op]):
    total_loss = tf.identity(total_loss)
  return total_loss
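
tower_loss relies on baxNet.loss registering each loss term in the 'losses' collection so that tf.get_collection('losses', scope) can gather them per tower. A minimal sketch of that convention (the loss itself is illustrative, not baxNet's actual code):

import tensorflow as tf

def example_loss(logits, labels):
  """Register a loss term in the 'losses' collection (sketch)."""
  cross_entropy = tf.reduce_mean(
      tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                     labels=labels),
      name='cross_entropy')
  tf.add_to_collection('losses', cross_entropy)
  # Weight-decay terms added to the same collection are summed into
  # total_loss by the tf.add_n call in tower_loss.
  return cross_entropy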
Example #6
def build_train_graph(config, dataset):
    with tf.device('/cpu:0'):
        inputs, labels = image_processing.distorted_inputs(
            dataset,
            batch_size=config['parameters']['batch_size'],
            height=config['input']['height'],
            width=config['input']['width'],
            channels=config['input']['channels'],
            add_variations=config['parameters']['additional_variations'],
            num_preprocess_threads=8)

    with tf.device('/gpu:0'):
        logits, endpoints = cnn_architectures.create_model(
            config['model']['architecture'],
            inputs,
            is_training=True,
            num_classes=config['input']['classes'],
            reuse=None)

    if config['parameters']['loss'] == 'regression':
        labels = tf.cast(labels - config['parameters']['label_mean'],
                         tf.float32)  # if needed, change to type int64
        mean_squared_error = tf.losses.mean_squared_error(labels=labels,
                                                          predictions=logits)
        loss = tf.add_n([mean_squared_error] +
                        tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES),
                        name='total_loss')
        accuracy = tf.constant(0, shape=[], dtype=tf.float32)
    elif config['parameters']['loss'] == 'classification':
        labels = tf.cast(labels // 5, tf.int64)

        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=labels)
        cross_entropy_mean = tf.reduce_mean(cross_entropy)
        loss = tf.add_n([cross_entropy_mean] +
                        tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES),
                        name='total_loss')

        correct_prediction = tf.equal(tf.argmax(logits, 1), labels)
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    tf.summary.scalar('loss', loss, collections=['train'])
    tf.summary.scalar('accuracy', accuracy, collections=['train'])

    if config['output']['trainable_variables_to_summary']:
        for var in tf.trainable_variables():
            tf.summary.histogram(var.op.name, var, collections=['train'])

    return loss, accuracy, tf.summary.merge_all(key='train')
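
build_train_graph expects a nested config mapping; a hypothetical minimal instance, with every key inferred from the accesses above and all values illustrative:

config = {
    'parameters': {
        'batch_size': 32,
        'additional_variations': True,
        'loss': 'classification',  # or 'regression'
        'label_mean': 0,           # used only by the regression branch
    },
    'input': {'height': 224, 'width': 224, 'channels': 3, 'classes': 20},
    'model': {'architecture': 'resnet_v2_50'},
    'output': {'trainable_variables_to_summary': False},
}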
Example #7
def val(train_loss, dataset):
    with tf.name_scope("val_process"):
        with tf.device('/cpu:0'):
            val_images, val_labels = image_processing.distorted_inputs(
                dataset, num_preprocess_threads=FLAGS.num_preprocess_threads)
        val_logits = _logits(val_images)
        val_loss = _loss(val_logits, val_labels)

        val_acc = tf.nn.in_top_k(val_logits, val_labels, 1)
        val_acc_sum = tf.cast(val_acc, tf.float32)
        val_acc_sum = tf.reduce_mean(val_acc_sum)

    with tf.name_scope("loss"):
        tf.summary.scalar('train_loss', train_loss)
        tf.summary.scalar('val_loss', val_loss)
    return val_acc_sum
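
tf.nn.in_top_k yields one boolean per example; casting and reducing it, as above, gives the batch accuracy (a mean, despite the `_sum` suffix). A self-contained check:

import tensorflow as tf

logits = tf.constant([[0.1, 0.9], [0.8, 0.2]])
labels = tf.constant([1, 1])
acc = tf.reduce_mean(tf.cast(tf.nn.in_top_k(logits, labels, 1), tf.float32))
with tf.Session() as sess:
    print(sess.run(acc))  # 0.5: first prediction correct, second wrong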
Example #8
def train(train_dir, batch_size, num_batches, log_dir, dataset=FilmData('train')):
  # Calculate the learning rate schedule.
  num_batches_per_epoch = (dataset.num_examples_per_epoch() /
                           FLAGS.batch_size)

  images, labels = image_processing.distorted_inputs(
      dataset)


  predictions = simple(images[0])

  slim.losses.softmax_cross_entropy(predictions, labels[0])
  total_loss = slim.losses.get_total_loss()
  tf.summary.scalar('loss', total_loss)

  optimizer = tf.train.RMSPropOptimizer(0.001, 0.9)
  train_op = slim.learning.create_train_op(total_loss, optimizer, summarize_gradients=True)

  # The original snippet uses `sess`, `saver`, `summary_op` and `summary_writer`
  # without defining them; a minimal setup would be:
  summary_op = tf.summary.merge_all()
  saver = tf.train.Saver()
  sess = tf.Session()
  sess.run(tf.global_variables_initializer())
  tf.train.start_queue_runners(sess=sess)
  summary_writer = tf.summary.FileWriter(log_dir, sess.graph)

  for step in range(FLAGS.max_steps):
    start_time = time.time()
    _, loss_value = sess.run([train_op, total_loss])
    duration = time.time() - start_time

    assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

    if step % 10 == 0:
      examples_per_sec = FLAGS.batch_size / float(duration)
      format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                    'sec/batch)')
      print(format_str % (datetime.now(), step, loss_value,
                          examples_per_sec, duration))

    if step % 100 == 0:
      summary_str = sess.run(summary_op)
      summary_writer.add_summary(summary_str, step)

    # Save the model checkpoint periodically.
    if step % 5000 == 0 or (step + 1) == FLAGS.max_steps:
      checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
      saver.save(sess, checkpoint_path, global_step=step)
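
The snippet drives the slim train_op by hand; tf.contrib.slim also ships a managed loop that owns the session, queue runners, summaries, and checkpoints. A sketch of the managed alternative, reusing train_op and the log_dir argument from above:

from tensorflow.contrib import slim

slim.learning.train(
    train_op,
    logdir=log_dir,
    number_of_steps=FLAGS.max_steps,
    save_summaries_secs=60,
    save_interval_secs=600)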
Example #9
def main(argv=None):
    ps_hosts = FLAGS.ps_hosts.split(',')
    worker_hosts = FLAGS.worker_hosts.split(',')
    tf.logging.info('PS hosts are: %s' % ps_hosts)
    tf.logging.info('Worker hosts are: %s' % worker_hosts)
    cluster_spec = tf.train.ClusterSpec({
        'ps': ps_hosts,
        'worker': worker_hosts
    })
    server = tf.train.Server({
        'ps': ps_hosts,
        'worker': worker_hosts
    },
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_id,
                             protocol=FLAGS.protocol)

    sspManager = SspManager(len(worker_hosts), 5)
    if FLAGS.job_name == 'ps':
        if FLAGS.task_id == 0:
            rpcServer = sspManager.create_rpc_server(ps_hosts[0].split(':')[0])
            rpcServer.serve()
        server.join()

    time.sleep(5)
    rpcClient = sspManager.create_rpc_client(ps_hosts[0].split(':')[0])

    dataset = ImagenetData(subset=FLAGS.subset)
    assert dataset.data_files()
    is_chief = (FLAGS.task_id == 0)
    if is_chief:
        if not tf.gfile.Exists(FLAGS.train_dir):
            tf.gfile.MakeDirs(FLAGS.train_dir)

    num_workers = len(cluster_spec.as_dict()['worker'])
    num_parameter_servers = len(cluster_spec.as_dict()['ps'])

    with tf.device('/job:worker/task:%d' % FLAGS.task_id):
        with slim.scopes.arg_scope(
            [slim.variables.variable, slim.variables.global_step],
                device=slim.variables.VariableDeviceChooser(
                    num_parameter_servers)):
            '''Prepare Input'''
            global_step = slim.variables.global_step()
            batch_size = tf.placeholder(dtype=tf.int32,
                                        shape=(),
                                        name='batch_size')
            images, labels = image_processing.distorted_inputs(
                dataset,
                batch_size,
                num_preprocess_threads=FLAGS.num_preprocess_threads)
            num_classes = dataset.num_classes() + 1
            '''Inference'''
            logits = inception.inference(images,
                                         num_classes,
                                         for_training=True)
            '''Loss'''
            inception.loss(logits, labels, batch_size)
            losses = tf.get_collection(slim.losses.LOSSES_COLLECTION)
            losses += tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            total_loss = tf.add_n(losses, name='total_loss')
            if is_chief:
                loss_averages = tf.train.ExponentialMovingAverage(0.9,
                                                                  name='avg')
                loss_averages_op = loss_averages.apply(losses + [total_loss])
                with tf.control_dependencies([loss_averages_op]):
                    total_loss = tf.identity(total_loss)
            '''Optimizer'''
            exp_moving_averager = tf.train.ExponentialMovingAverage(
                inception.MOVING_AVERAGE_DECAY, global_step)
            variables_to_average = (tf.trainable_variables() +
                                    tf.moving_average_variables())
            num_batches_per_epoch = (dataset.num_examples_per_epoch() /
                                     FLAGS.batch_size)
            decay_steps = int(num_batches_per_epoch *
                              FLAGS.num_epochs_per_decay / num_workers)
            lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                            global_step,
                                            decay_steps,
                                            FLAGS.learning_rate_decay_factor,
                                            staircase=True)
            opt = tf.train.RMSPropOptimizer(lr,
                                            RMSPROP_DECAY,
                                            momentum=RMSPROP_MOMENTUM,
                                            epsilon=RMSPROP_EPSILON)
            '''Train Operation'''
            batchnorm_updates = tf.get_collection(
                slim.ops.UPDATE_OPS_COLLECTION)
            assert batchnorm_updates, 'Batchnorm updates are missing'
            batchnorm_updates_op = tf.group(*batchnorm_updates)
            with tf.control_dependencies([batchnorm_updates_op]):
                total_loss = tf.identity(total_loss)
            naive_grads = opt.compute_gradients(total_loss)
            grads = [(tf.scalar_mul(
                tf.cast(batch_size / FLAGS.batch_size, tf.float32), grad), var)
                     for grad, var in naive_grads]
            apply_gradients_op = opt.apply_gradients(grads,
                                                     global_step=global_step)
            with tf.control_dependencies([apply_gradients_op]):
                train_op = tf.identity(total_loss, name='train_op')
            '''Supervisor and Session'''
            saver = tf.train.Saver()
            init_op = tf.global_variables_initializer()
            sv = tf.train.Supervisor(is_chief=is_chief,
                                     logdir=FLAGS.train_dir,
                                     init_op=init_op,
                                     summary_op=None,
                                     global_step=global_step,
                                     recovery_wait_secs=1,
                                     saver=saver,
                                     save_model_secs=FLAGS.save_interval_secs)
            tf.logging.info('%s Supervisor' % datetime.now())
            sess_config = tf.ConfigProto(
                allow_soft_placement=True,
                log_device_placement=FLAGS.log_device_placement)
            sess = sv.prepare_or_wait_for_session(server.target,
                                                  config=sess_config)
            queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
            '''Start Training'''
            sv.start_queue_runners(sess, queue_runners)
            tf.logging.info('Started %d queues for processing input data.',
                            len(queue_runners))

            batch_size_num = FLAGS.batch_size
            for step in range(FLAGS.max_steps):
                start_time = time.time()
                run_options = tf.RunOptions(
                    trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()
                loss_value, gs = sess.run(
                    [train_op, global_step],
                    feed_dict={batch_size: batch_size_num},
                    options=run_options,
                    run_metadata=run_metadata)

                assert not np.isnan(
                    loss_value), 'Model diverged with loss = NaN'

                duration = time.time() - start_time
                examples_per_sec = batch_size_num / float(duration)
                sec_per_batch = float(duration)
                format_str = (
                    "time: " + str(time.time()) +
                    '; %s: step %d (gs %d), loss= %.2f (%.1f samples/s; %.3f s/batch)'
                )
                tf.logging.info(format_str %
                                (datetime.now(), step, gs, loss_value,
                                 examples_per_sec, sec_per_batch))
                rpcClient.check_staleness(FLAGS.task_id, step)
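
Since batch_size is fed at run time, the naive gradients above are rescaled by the ratio of the fed batch size to the nominal FLAGS.batch_size before being applied. The rescaling in isolation, with illustrative values:

import tensorflow as tf

nominal_batch_size = 256.0
actual_batch_size = tf.placeholder(tf.float32, shape=(), name='actual_bs')
grad = tf.constant([0.5, 1.0, 1.5])
# Scale the gradient by actual/nominal, as in the training code above.
scaled_grad = tf.scalar_mul(actual_batch_size / nominal_batch_size, grad)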
Example #10
def main(_):
    # print(FLAGS.num_preprocess_threads)
    trainset = GoodsData('train')
    assert trainset.data_files()
    validationset = GoodsData('validation')
    assert validationset.data_files()

    # lables_output=load_labels(FLAGS.labels_file)
    # lables_output.append('unknown')
    # get_tuned_variables()
    # get_trainable_variables()

    num_batches_per_epoch = (trainset.num_examples_per_epoch() /
                             FLAGS.batch_size)
    num_preprocess_threads = FLAGS.num_preprocess_threads * FLAGS.num_gpus

    if FLAGS.from_official:
        train_batch_size = FLAGS.batch_size * 4
    else:
        train_batch_size = FLAGS.batch_size

    print('train_batch_size', train_batch_size)

    images_train, labels_train = image_processing.distorted_inputs(
        trainset,
        batch_size=train_batch_size,
        num_preprocess_threads=num_preprocess_threads)
    images_validation, labels_validation = image_processing.distorted_inputs(
        validationset,
        batch_size=64,
        num_preprocess_threads=num_preprocess_threads)
    # images_splits = tf.split(axis=0, num_or_size_splits=FLAGS.num_gpus, value=images)
    # labels_splits = tf.split(axis=0, num_or_size_splits=FLAGS.num_gpus, value=labels)

    input_summaries = copy.copy(tf.get_collection(tf.GraphKeys.SUMMARIES))

    # Number of classes in the Dataset label set plus 1.
    # Label 0 is reserved for an (unused) background class.
    num_classes = trainset.num_classes() + 1
    # print(images_train.shape)
    # print(labels_train.shape)
    images = tf.placeholder(
        tf.float32, [None, images_train.shape[1], images_train.shape[2], 3],
        name="input_images")
    labels = tf.placeholder(tf.int64, [None], name="labels")
    with slim.arg_scope(inception_v3.inception_v3_arg_scope()):
        logits, _ = inception_v3.inception_v3(images, num_classes=num_classes)

    if FLAGS.from_official:
        tuned_variables = get_tuned_variables()
        trainable_variables = get_trainable_variables()
        checkpoint_path = FLAGS.official_checkpoint_path
    else:
        tuned_variables = get_all_variables()
        trainable_variables = get_all_variables()
        checkpoint_path = FLAGS.pretrained_model_checkpoint_path

    # print(trainable_variables)
    # Get the variables that need to be trained:
    # trainable_variables = get_trainable_variables()
    # Define the cross-entropy loss and optimize it.
    loss = tf.losses.softmax_cross_entropy(tf.one_hot(labels, num_classes),
                                           logits,
                                           weights=1.0)
    tf.summary.scalar('loss', loss)
    optimizer = tf.train.AdamOptimizer()
    # loss = tf.losses.get_total_loss()
    train_step = optimizer.minimize(loss, var_list=trainable_variables)

    # total_loss=tf.losses.softmax_cross_entropy(tf.one_hot(labels, num_classes), logits, weights=1.0)
    # train_step = tf.train.RMSPropOptimizer(FLAGS.initial_learning_rate).minimize(total_loss)

    # Compute the accuracy
    with tf.name_scope("evaluation"):
        correct_prediction = tf.equal(tf.argmax(logits, 1), labels)
        evaluation_step = tf.reduce_mean(
            tf.cast(correct_prediction, tf.float32))
        tf.summary.scalar('validation_accuracy', evaluation_step)
    # Load the pre-trained weights
    checkpoint_path = tf.train.latest_checkpoint(checkpoint_path)
    load_fn = slim.assign_from_checkpoint_fn(checkpoint_path,
                                             tuned_variables,
                                             ignore_missing_vars=True)
    # Saver for storing the fine-tuned weights
    # print(get_tuned_variables())
    saver = tf.train.Saver()

    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=FLAGS.log_device_placement)
    config.gpu_options.allow_growth = True

    sess = tf.Session(config=config)

    # with tf.Session(config=config) as sess:
    # sess.as_default()
    init = tf.global_variables_initializer()
    sess.run(init)

    merged = tf.summary.merge_all()
    writer = tf.summary.FileWriter("logs/", sess.graph)

    print("loading tuned variables from %s" % checkpoint_path)
    load_fn(sess)
    # sess.run(load_fn)
    # coord = tf.train.Coordinator()
    # threads = tf.train.start_queue_runners(coord=coord)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    # tf.train.batch
    # start = 0
    # end = FLAGS.batch_size

    if tf.gfile.Exists(FLAGS.train_dir):
        tf.gfile.DeleteRecursively(FLAGS.train_dir)
    tf.gfile.MakeDirs(FLAGS.train_dir)
    checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
    saver.save(sess, checkpoint_path, global_step=0)
    for step in range(FLAGS.max_steps):
        # print(0)
        start_time = time.time()
        # print(1)
        # image_batch = sess.run(images_train[start:end])
        # print(2)
        # # label_batch = sess.run(labels_train[start:end])
        # label_batch = labels_train[start:end]
        #
        # print(3)
        # images_train, labels_train = image_processing.distorted_inputs(trainset,
        #                                                                num_preprocess_threads=num_preprocess_threads)
        # images_validation, labels_validation = image_processing.distorted_inputs(validationset,
        #                                                                          num_preprocess_threads=num_preprocess_threads)

        image_batch, label_batch = sess.run([images_train, labels_train])
        # print(3)
        # sess.run(train_step, feed_dict={
        #     images: image_batch,
        #     labels: label_batch
        # })
        # print(4)
        # print(1)
        # print(label_batch)
        # loss_tensor = tf.losses.get_total_loss()
        sess.run(train_step,
                 feed_dict={
                     images: image_batch,
                     labels: label_batch
                 })
        # loss_now=sess.run(loss)
        # print(2)
        duration = time.time() - start_time

        # assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

        if step % 5 == 0:
            examples_per_sec = FLAGS.batch_size / float(duration)
            format_str = (
                '%s: step %d,'  # loss = %.2f 
                '(%.1f examples/sec; %.3f '
                'sec/batch)')
            print(format_str % (
                datetime.now(),
                step,  # loss_now,
                examples_per_sec,
                duration))
            # Note: building tf.one_hot here adds a new graph node on every
            # logging step; it should be constructed once outside the loop.
            tempvar = tf.one_hot(labels, num_classes)
            print(
                sess.run(labels,
                         feed_dict={
                             images: image_batch,
                             labels: label_batch
                         }))
            print(
                sess.run(tempvar,
                         feed_dict={
                             images: image_batch,
                             labels: label_batch
                         }))
            print(
                sess.run(tf.nn.softmax(logits),
                         feed_dict={
                             images: image_batch,
                             labels: label_batch
                         }))
        if step % 50 == 0:
            image_batch, label_batch = sess.run(
                [images_validation, labels_validation])
            validation_accuracy = sess.run(evaluation_step,
                                           feed_dict={
                                               images: image_batch,
                                               labels: label_batch
                                           })

            result = sess.run(merged,
                              feed_dict={
                                  images: image_batch,
                                  labels: label_batch
                              })
            writer.add_summary(result, step)
            print('Step %d: Validation accuracy = %.1f%%' %
                  (step, validation_accuracy * 100.0))

        # Save the model checkpoint periodically.
        if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
            checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
            saver.save(sess, checkpoint_path)
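
The tf.one_hot node built inside the loop above grows the graph on every logging step; a sketch of hoisting it out, reusing the snippet's `labels` placeholder and `num_classes`:

# Build the op once, next to the loss definition:
one_hot_labels = tf.one_hot(labels, num_classes)

# Then inside the loop, reuse the same tensor instead of creating a new one:
# print(sess.run(one_hot_labels,
#                feed_dict={images: image_batch, labels: label_batch}))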
Example #11
def train(dataset):
    """Train on dataset for a number of steps."""
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Create a variable to count the number of train() calls. This equals the
        # number of batches processed * FLAGS.num_gpus.
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        # Calculate the learning rate schedule.
        num_batches_per_epoch = (dataset.num_examples_per_epoch() /
                                 FLAGS.batch_size)
        decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                        global_step,
                                        decay_steps,
                                        FLAGS.learning_rate_decay_factor,
                                        staircase=True)

        # Create an optimizer that performs gradient descent.
        opt = tf.train.RMSPropOptimizer(lr,
                                        RMSPROP_DECAY,
                                        momentum=RMSPROP_MOMENTUM,
                                        epsilon=RMSPROP_EPSILON)

        # Get images and labels for ImageNet and split the batch across GPUs.
        assert FLAGS.batch_size % FLAGS.num_gpus == 0, (
            'Batch size must be divisible by number of GPUs')
        split_batch_size = int(FLAGS.batch_size / FLAGS.num_gpus)

        # Override the number of preprocessing threads to account for the increased
        # number of GPU towers.
        num_preprocess_threads = FLAGS.num_preprocess_threads * FLAGS.num_gpus
        images, labels = image_processing.distorted_inputs(
            dataset, num_preprocess_threads=num_preprocess_threads)

        input_summaries = copy.copy(tf.get_collection(tf.GraphKeys.SUMMARIES))

        # Number of classes in the Dataset label set plus 1.
        # Label 0 is reserved for an (unused) background class.
        num_classes = dataset.num_classes() + 1

        # Split the batch of images and labels for towers.
        images_splits = tf.split(axis=0,
                                 num_or_size_splits=FLAGS.num_gpus,
                                 value=images)
        labels_splits = tf.split(axis=0,
                                 num_or_size_splits=FLAGS.num_gpus,
                                 value=labels)

        # Calculate the gradients for each model tower.
        tower_grads = []
        reuse_variables = None
        for i in range(FLAGS.num_gpus):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('%s_%d' %
                                   (inception.TOWER_NAME, i)) as scope:
                    # Force all Variables to reside on the CPU.
                    with slim.arg_scope([slim.variables.variable],
                                        device='/cpu:0'):
                        # Calculate the loss for one tower of the ImageNet model. This
                        # function constructs the entire ImageNet model but shares the
                        # variables across all towers.
                        loss = _tower_loss(images_splits[i], labels_splits[i],
                                           num_classes, scope, reuse_variables)

                    # Reuse variables for the next tower.
                    reuse_variables = True

                    # Retain the summaries from the final tower.
                    summaries = tf.get_collection(tf.GraphKeys.SUMMARIES,
                                                  scope)

                    # Retain the Batch Normalization updates operations only from the
                    # final tower. Ideally, we should grab the updates from all towers
                    # but these stats accumulate extremely fast so we can ignore the
                    # other stats from the other towers without significant detriment.
                    batchnorm_updates = tf.get_collection(
                        slim.ops.UPDATE_OPS_COLLECTION, scope)

                    # Calculate the gradients for the batch of data on this ImageNet
                    # tower.
                    grads = opt.compute_gradients(loss)

                    # Keep track of the gradients across all towers.
                    tower_grads.append(grads)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = _average_gradients(tower_grads)

        # Add summaries for the input processing and global_step.
        summaries.extend(input_summaries)

        # Add a summary to track the learning rate.
        summaries.append(tf.summary.scalar('learning_rate', lr))

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                summaries.append(
                    tf.summary.histogram(var.op.name + '/gradients', grad))

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            summaries.append(tf.summary.histogram(var.op.name, var))

        # Track the moving averages of all trainable variables.
        # Note that we maintain a "double-average" of the BatchNormalization
        # global statistics. This is more complicated than it needs to be, but
        # we keep it for backward-compatibility with our previous models.
        variable_averages = tf.train.ExponentialMovingAverage(
            inception.MOVING_AVERAGE_DECAY, global_step)

        # Another possibility is to use tf.slim.get_variables().
        variables_to_average = (tf.trainable_variables() +
                                tf.moving_average_variables())
        variables_averages_op = variable_averages.apply(variables_to_average)

        # Group all updates into a single train op.
        batchnorm_updates_op = tf.group(*batchnorm_updates)
        train_op = tf.group(apply_gradient_op, variables_averages_op,
                            batchnorm_updates_op)

        # Create a saver.
        saver = tf.train.Saver(tf.global_variables())

        # Build the summary operation from the last tower summaries.
        summary_op = tf.summary.merge(summaries)

        # Build an initialization operation to run below.
        init = tf.global_variables_initializer()

        # Start running operations on the Graph. allow_soft_placement must be set to
        # True to build towers on GPU, as some of the ops do not have GPU
        # implementations.
        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        if FLAGS.pretrained_model_checkpoint_path:
            assert tf.gfile.Exists(FLAGS.pretrained_model_checkpoint_path)
            variables_to_restore = tf.get_collection(
                slim.variables.VARIABLES_TO_RESTORE)
            restorer = tf.train.Saver(variables_to_restore)
            restorer.restore(sess, FLAGS.pretrained_model_checkpoint_path)
            print('%s: Pre-trained model restored from %s' %
                  (datetime.now(), FLAGS.pretrained_model_checkpoint_path))

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.summary.FileWriter(FLAGS.train_dir,
                                               graph=sess.graph)

        for step in range(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                examples_per_sec = FLAGS.batch_size / float(duration)
                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, duration))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 5000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
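
_average_gradients is called above but defined elsewhere; the standard multi-GPU implementation from the TensorFlow model examples looks roughly like this (a guard for None gradients may be needed for variables untouched by some towers):

import tensorflow as tf

def _average_gradients(tower_grads):
    """Average gradients across towers (sketch of the standard pattern).

    Args:
      tower_grads: per-tower lists of (gradient, variable) pairs.
    Returns:
      A single list of (gradient, variable) pairs, averaged over towers.
    """
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # grad_and_vars is ((grad0_gpu0, var0), ..., (grad0_gpuN, var0)).
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
        grad = tf.reduce_mean(tf.concat(grads, 0), 0)
        # The variable is shared across towers; take the first tower's copy.
        average_grads.append((grad, grad_and_vars[0][1]))
    return average_grads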
Example #12
    model_file = "retrained_graph.pb"
    label_file = "retrained_labels.txt"
    trainset = GoodsData('train')
    assert trainset.data_files()
    validationset = GoodsData('validation')
    assert validationset.data_files()

    lables_output = load_labels(FLAGS.labels_file)

    # get_tuned_variables()
    # get_trainable_variables()

    num_batches_per_epoch = (trainset.num_examples_per_epoch() /
                             FLAGS.batch_size)
    num_preprocess_threads = FLAGS.num_preprocess_threads * FLAGS.num_gpus
    images_train, labels_train = image_processing.distorted_inputs(
        trainset, num_preprocess_threads=num_preprocess_threads)
    images_validation, labels_validation = image_processing.distorted_inputs(
        validationset, num_preprocess_threads=num_preprocess_threads)
    # images_splits = tf.split(axis=0, num_or_size_splits=FLAGS.num_gpus, value=images)
    # labels_splits = tf.split(axis=0, num_or_size_splits=FLAGS.num_gpus, value=labels)
    input_summaries = copy.copy(tf.get_collection(tf.GraphKeys.SUMMARIES))
    num_classes = trainset.num_classes() + 1
    print(images_train.shape)
    print(labels_train.shape)
    images = tf.placeholder(
        tf.float32, [None, images_train.shape[1], images_train.shape[2], 3],
        name="input_images")
    labels = tf.placeholder(tf.int64, [None], name="labels")
    with slim.arg_scope(inception_v3.inception_v3_arg_scope()):
        logits, endpoints = inception_v3.inception_v3(images,
                                                      num_classes=num_classes)
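
load_labels is not shown; a plausible sketch that reads one label per line (the file format is an assumption):

def load_labels(labels_file):
    """Read class labels from a text file, one label per line (assumed)."""
    with open(labels_file) as f:
        return [line.strip() for line in f if line.strip()]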
Example #13
def train(dataset):
  """Train on dataset for a number of steps."""
  # with tf.Graph().as_default(), tf.device('/cpu:0'):
  with tf.Graph().as_default():

    # ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
    # if ckpt and ckpt.model_checkpoint_path:
    #     global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]

    global_step = tf.Variable(0, trainable=False)
    # global_step = tf.contrib.framework.get_or_create_global_step()

    num_preprocess_threads = FLAGS.num_preprocess_threads * FLAGS.num_gpus
    with tf.device('/cpu:0'):
      images, pitchs, yaws, rolls, names = image_processing.distorted_inputs(
        dataset,
        num_preprocess_threads=num_preprocess_threads)
    
    p = tf.expand_dims(pitchs,1)
    y = tf.expand_dims(yaws,1)
    r = tf.expand_dims(rolls,1)
    labels = tf.concat([p, y, r],1)

    train_output = model.inference(images, FLAGS.is_training)
    train_loss = model.losses(train_output, labels)
    add_global = global_step.assign_add(1)
    train_op = model.trainning(train_loss, FLAGS.learning_rate, global_step)
   
    summary_op = tf.summary.merge_all()
    sess = tf.Session()
    train_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)
    saver = tf.train.Saver()
    
    # ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
    # if ckpt and ckpt.model_checkpoint_path:
    #   if os.path.isabs(ckpt.model_checkpoint_path):
    #     # Restores from checkpoint with absolute path.
    #     saver.restore(sess, ckpt.model_checkpoint_path)
    #   else:
    #     # Restores from checkpoint with relative path.
    #     saver.restore(sess, os.path.join(FLAGS.checkpoint_dir,
    #                                      ckpt.model_checkpoint_path))

      # Assuming model_checkpoint_path looks something like:
      #   /my-favorite-path/imagenet_train/model.ckpt-0,
      # extract global_step from it.
      # global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
      # print('Successfully loaded model from %s at step=%s.' %
      #       (ckpt.model_checkpoint_path, global_step))
    # else:
    #   print('No checkpoint file found')
    #   return

    sess.run(tf.global_variables_initializer())
    
    """
    these codes get the variable in conv1

    print(sess.run(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)))
    w = tf.contrib.framework.get_variables('conv1')
    t = tf.nn.l2_loss(w[0])
    print(sess.run(t))
    """

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    
    try:
        for step in np.arange(FLAGS.max_steps):
            if coord.should_stop():
                    break
            _, _, tra_loss = sess.run([add_global, train_op, train_loss])

            if step % 50 == 0:
                print('Step %d, train loss = %.2f' % (step, tra_loss))
                summary_str = sess.run(summary_op)
                train_writer.add_summary(summary_str, step)
            
            if step % 2000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
                
    except tf.errors.OutOfRangeError:
        print('Done training -- epoch limit reached')
    finally:
        coord.request_stop()
        
    coord.join(threads)
    sess.close()
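
The label assembly above turns three length-N angle vectors into an [N, 3] tensor; a self-contained check of the expand_dims/concat pattern:

import tensorflow as tf

pitchs = tf.constant([0.1, 0.2])
yaws = tf.constant([0.3, 0.4])
rolls = tf.constant([0.5, 0.6])
labels = tf.concat([tf.expand_dims(pitchs, 1),
                    tf.expand_dims(yaws, 1),
                    tf.expand_dims(rolls, 1)], 1)
with tf.Session() as sess:
    print(sess.run(labels))  # [[0.1 0.3 0.5], [0.2 0.4 0.6]]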
Example #14
def main(_):
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        dataset = ImagenetData(subset=FLAGS.subset)
        assert dataset.data_files()
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        # Calculate the learning rate schedule.
        num_batches_per_epoch = (dataset.num_examples_per_epoch() /
                                 FLAGS.batch_size)
        decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay)

        # Decay the learning rate exponentially based on the number of steps.
        learning_rate = tf.train.exponential_decay(
            FLAGS.learning_rate,
            global_step,
            decay_steps,
            FLAGS.learning_rate_decay_factor,
            staircase=True)

        tf.summary.scalar('lr', learning_rate)

        is_training = tf.placeholder(tf.bool)

        #opt = tf.train.AdamOptimizer(learning_rate)
        opt = tf.train.RMSPropOptimizer(learning_rate,
                                        RMSPROP_DECAY,
                                        momentum=RMSPROP_MOMENTUM,
                                        epsilon=RMSPROP_EPSILON)

        with tf.name_scope("create_inputs"):
            #if tf.gfile.Exists(FLAGS.SNAPSHOT_DIR):
            #    tf.gfile.DeleteRecursively(FLAGS.SNAPSHOT_DIR)
            #tf.gfile.MakeDirs(FLAGS.SNAPSHOT_DIR)

            # Get images and labels for ImageNet and split the batch across GPUs.
            assert FLAGS.batch_size % FLAGS.gpu_nums == 0, (
                'Batch size must be divisible by number of GPUs')
            split_batch_size = int(FLAGS.batch_size / FLAGS.gpu_nums)

            # Override the number of preprocessing threads to account for the increased
            # number of GPU towers.
            num_preprocess_threads = FLAGS.num_preprocess_threads * FLAGS.gpu_nums
            images, labels = image_processing.distorted_inputs(
                dataset, num_preprocess_threads=num_preprocess_threads)
            #tf.summary.image('images', images, max_outputs = 10)

            images_splits = tf.split(axis=0,
                                     num_or_size_splits=FLAGS.gpu_nums,
                                     value=images)
            labels_splits = tf.split(axis=0,
                                     num_or_size_splits=FLAGS.gpu_nums,
                                     value=tf.one_hot(indices=labels,
                                                      depth=FLAGS.num_classes))

        multi_grads = []
        with tf.variable_scope(tf.get_variable_scope()):
            for i in range(FLAGS.gpu_nums):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('%s_%d' % ('ImageNet', i)) as scope:

                        graph = Model_Graph(num_class=FLAGS.num_classes,
                                            is_training=is_training)

                        model = graph._build_defaut_graph(
                            images=images_splits[i])

                        # Top-1 accuracy
                        top1acc = tf.reduce_mean(
                            tf.cast(
                                tf.nn.in_top_k(
                                    model.logits,
                                    tf.argmax(labels_splits[i], axis=1), 1),
                                tf.float32))
                        # Top-n accuracy
                        topnacc = tf.reduce_mean(
                            tf.cast(
                                tf.nn.in_top_k(
                                    model.logits,
                                    tf.argmax(labels_splits[i], axis=1),
                                    FLAGS.top_k), tf.float32))

                        tf.summary.scalar('top1acc_{}'.format(i), top1acc)
                        tf.summary.scalar('topkacc_{}'.format(i), topnacc)

                        all_trainable = [v for v in tf.trainable_variables()]

                        loss = tf.nn.softmax_cross_entropy_with_logits(
                            logits=model.logits, labels=labels_splits[i])

                        l2_losses = [
                            FLAGS.weight_decay * tf.nn.l2_loss(v)
                            for v in tf.trainable_variables()
                            if 'weights' in v.name
                        ]
                        reduced_loss = tf.reduce_mean(loss) + tf.add_n(
                            l2_losses)

                        tf.summary.scalar('loss_{}'.format(i), reduced_loss)

                        tf.get_variable_scope().reuse_variables()

                        #batchnorm_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope)
                        batchnorm_updates = tf.get_collection(
                            tf.GraphKeys.UPDATE_OPS)

                        grads = opt.compute_gradients(reduced_loss,
                                                      all_trainable)
                        multi_grads.append(grads)

        grads = average_gradients(multi_grads)

        # Track the moving averages of all trainable variables.
        # Note that we maintain a "double-average" of the BatchNormalization
        # global statistics. This is more complicated than it needs to be, but
        # we keep it for backward-compatibility with our previous models.
        variable_averages = tf.train.ExponentialMovingAverage(
            FLAGS.MOVING_AVERAGE_DECAY, global_step)

        variables_to_average = (tf.trainable_variables() +
                                tf.moving_average_variables())
        variables_averages_op = variable_averages.apply(variables_to_average)

        # Group all updates into a single train op.
        batchnorm_updates_op = tf.group(*batchnorm_updates)
        train_op = tf.group(opt.apply_gradients(grads, global_step),
                            variables_averages_op, batchnorm_updates_op)

        #grads_value = list(zip(grads, all_trainable))
        #for grad, var in grads_value:
        #    tf.summary.histogram(var.name + '/gradient', grad)

        summary_op = tf.summary.merge_all()

        # Set up tf session and initialize variables.
        config = tf.ConfigProto()
        config.allow_soft_placement = True
        sess = tf.Session(config=config)
        init = tf.global_variables_initializer()

        sess.run(init)

        # Saver for storing checkpoints of the model.
        saver = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=2)

        restore_var = [v for v in tf.trainable_variables()] + [
            v for v in tf.global_variables() if 'moving_mean' in v.name
            or 'moving_variance' in v.name or 'global_step' in v.name
        ]

        ckpt = tf.train.get_checkpoint_state(FLAGS.SNAPSHOT_DIR)
        if ckpt and ckpt.model_checkpoint_path:
            loader = tf.train.Saver(var_list=restore_var)
            load(loader, sess, ckpt.model_checkpoint_path)
        else:
            print('No checkpoint file found.')
            load_step = 0

        summary_writer = tf.summary.FileWriter(FLAGS.SNAPSHOT_DIR,
                                               graph=sess.graph)

        # Iterate over training steps.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord, sess=sess)

        for step in range(FLAGS.num_steps):
            start_time = time.time()

            feed_dict = {is_training: True}
            if step % 50000 == 0 and step != 0:
                loss_value, _ = sess.run([reduced_loss, train_op],
                                         feed_dict=feed_dict)
                save(saver, sess, FLAGS.SNAPSHOT_DIR, step)
            elif step % 100 == 0:
                summary_str, loss_value, _ = sess.run(
                    [summary_op, reduced_loss, train_op], feed_dict=feed_dict)
                duration = time.time() - start_time
                summary_writer.add_summary(summary_str, step)
                summary_writer.flush()
                print('step {:d} \t loss = {:.3f}, ({:.3f} sec/step)'.format(
                    step, loss_value, duration))
            else:
                loss_value, _ = sess.run([reduced_loss, train_op],
                                         feed_dict=feed_dict)

        coord.request_stop()
        coord.join(threads)
Example #15
def train(dataset, dataset_val=None):
    """Train CIFAR-10 for a number of steps."""
    #with tf.variable_scope("CRRN", reuse=None):
    with tf.Graph().as_default(), tf.device('/cpu:0'):

        # Create a variable to count the number of train() calls. This equals the
        # number of batches processed * FLAGS.num_gpus.
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        # Calculate the learning rate schedule.
        num_batches_per_epoch = (dataset.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
                                 FLAGS.batch_size)
        decay_steps = int(num_batches_per_epoch *
                          DAGResnet.NUM_EPOCHS_PER_DECAY)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(DAGResnet.INITIAL_LEARNING_RATE,
                                        global_step,
                                        decay_steps,
                                        DAGResnet.LEARNING_RATE_DECAY_FACTOR,
                                        staircase=True)

        # Create an optimizer that performs gradient descent.
        opt = tf.train.GradientDescentOptimizer(lr)

        # images, labels = cifar10.distorted_inputs()
        assert FLAGS.batch_size % FLAGS.num_gpus == 0, (
            'Batch size must be divisible by number of GPUs')
        split_batch_size = int(FLAGS.batch_size / FLAGS.num_gpus)

        # Override the number of preprocessing threads to account for the increased
        # number of GPU towers.
        num_preprocess_threads = FLAGS.num_preprocess_threads * FLAGS.num_gpus
        images, labels, filenames = image_processing.distorted_inputs(
            dataset, num_preprocess_threads=num_preprocess_threads)

        # Split the batch of images and labels for towers.
        # images_splits = tf.split(0, FLAGS.num_gpus, images)
        # labels_splits = tf.split(0, FLAGS.num_gpus, labels)
        # Modify because of different version of TF. Date June 15, 2017
        images_splits = tf.split(images, FLAGS.num_gpus, 0)
        labels_splits = tf.split(labels, FLAGS.num_gpus, 0)

        if dataset_val is not None:
            images_val, labels_val, filenames_val = image_processing.distorted_inputs(
                dataset_val, num_preprocess_threads=num_preprocess_threads)
            # Use the new tf.split argument order, matching the change above.
            images_val_splits = tf.split(images_val, FLAGS.num_gpus, 0)
            labels_val_splits = tf.split(labels_val, FLAGS.num_gpus, 0)

        # Calculate the gradients for each model tower.
        tower_grads = []
        loss_val = []
        pixel_accuracy = []
        for i in range(FLAGS.num_gpus):
            gpu_idx = i + FLAGS.start_gpu_idx
            with tf.device('/gpu:%d' % gpu_idx):
                with tf.name_scope('%s_%d' %
                                   (DAGResnet.TOWER_NAME, gpu_idx)) as scope:
                    # Calculate the loss for one tower of the CIFAR model. This function
                    # constructs the entire CIFAR model but shares the variables across
                    # all towers.
                    loss = tower_loss(images_splits[i], labels_splits[i],
                                      scope)

                    # Reuse variables for the next tower.
                    tf.get_variable_scope().reuse_variables()

                    # Retain the summaries from the final tower.
                    summaries = tf.get_collection(tf.GraphKeys.SUMMARIES,
                                                  scope)

                    # Calculate the gradients for the batch of data on this CIFAR tower.
                    grads = opt.compute_gradients(loss)
                    # grads = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in grads]

                    # grads = [(tf.clip_by_average_norm(grad, 5), var) for grad, var in grads]

                    # Keep track of the gradients across all towers.
                    tower_grads.append(grads)

                    if dataset_val is not None:
                        #with tf.name_scope('Validation'):
                        logits_val = DAGResnet.inference(images_val_splits[i])

                        # Build the portion of the Graph calculating the losses. Note that we will
                        # assemble the total_loss using a custom function below.
                        loss_val.append(
                            DAGResnet.loss(logits_val, labels_val_splits[i]))

                        label_val = labels_val_splits[i]
                        shape = logits_val.get_shape().as_list()
                        label_predict = tf.argmax(logits_val,
                                                  axis=len(shape) - 1)
                        pixel_labeled = tf.reduce_sum(
                            tf.to_float(label_val > 0))
                        pixel_correct = tf.reduce_sum(
                            tf.to_float(
                                tf.equal(tf.cast(label_val, tf.int64),
                                         label_predict)) *
                            tf.to_float(label_val > 0))
                        pixel_accuracy.append(
                            tf.div(tf.scalar_mul(1.0, pixel_correct),
                                   pixel_labeled))

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = average_gradients(tower_grads)
        loss_val = tf.reduce_mean(loss_val)
        pixel_accuracy = tf.reduce_mean(pixel_accuracy)

        # Add a summary to track the learning rate.
        summaries.append(tf.summary.scalar('learning_rate', lr))

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                summaries.append(
                    tf.summary.histogram(var.op.name + '/gradients', grad))

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            summaries.append(tf.summary.histogram(var.op.name, var))

        with tf.variable_scope(tf.get_variable_scope(), reuse=None):
            # Track the moving averages of all trainable variables.
            variable_averages = tf.train.ExponentialMovingAverage(
                DAGResnet.MOVING_AVERAGE_DECAY, global_step)
            variables_averages_op = variable_averages.apply(
                tf.trainable_variables())

            # Group all updates into a single train op.
            train_op = tf.group(apply_gradient_op, variables_averages_op)


        # train_op = tf.group(apply_gradient_op)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation from the last tower summaries.
        summary_op = tf.summary.merge(summaries)

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()

        # Start running operations on the Graph. allow_soft_placement must be set to
        # True to build towers on GPU, as some of the ops do not have GPU
        # implementations.
        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        # Restore from a previous checkpoint if one exists.
        if not tf.gfile.Exists(FLAGS.train_dir):
            tf.gfile.MakeDirs(FLAGS.train_dir)
        else:
            ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)
                print('%s: Model restored from %s' %
                      (datetime.now(), ckpt.model_checkpoint_path))
            #global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]

        # Load a pre-trained checkpoint, if provided.
        if FLAGS.pretrained_model_checkpoint_path:
            try:
                if tf.gfile.Exists(FLAGS.pretrained_model_checkpoint_path):
                    t_vars = tf.trainable_variables()
                    variables_to_restore = [
                        var for var in t_vars
                        if not ('FC_V' in var.name or 'upscore' in var.name)
                    ]
                    restorer = tf.train.Saver(variables_to_restore)
                    restorer.restore(sess,
                                     FLAGS.pretrained_model_checkpoint_path)
                    print('%s: Pre-trained model restored from %s' %
                          (datetime.now(),
                           FLAGS.pretrained_model_checkpoint_path))
            except ValueError:
                print('No checkpoint is loaded')

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)
        f = open(FLAGS.train_dir + '/' + 'log.txt', 'w')

        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = duration / FLAGS.num_gpus

                format_str = (
                    '%s: step %d, loss = %.6f (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

            if dataset_val is not None and step > 0 and step % FLAGS.do_val == 0:
                format_str = (
                    '%s: step %d, [VALIDATION] loss = %.6f  pixel acc = %.6f')
                loss_value_val, pixelAcc = sess.run([loss_val, pixel_accuracy])
                print(format_str %
                      (datetime.now(), step, loss_value_val, pixelAcc))
                f.write(format_str %
                        (datetime.now(), step, loss_value_val, pixelAcc))
                f.write('\n')

            if step % 100000 == 0 and step > 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
        f.close()
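
Both this snippet and the later multi-GPU examples call an average_gradients helper that the excerpts omit. A minimal sketch in the style of the classic multi-tower CIFAR-10 tutorial, assuming each entry of tower_grads is a list of (gradient, variable) pairs with variables in the same order on every tower; this is an assumption, not the authors' exact helper:

import tensorflow as tf

def average_gradients(tower_grads):
    """Average per-variable gradients across towers (sketch, see note above)."""
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # grad_and_vars is ((grad_gpu0, var), (grad_gpu1, var), ...) for one
        # shared variable.
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
        grad = tf.reduce_mean(tf.concat(grads, 0), 0)
        # The variable is shared across towers, so the first reference suffices.
        average_grads.append((grad, grad_and_vars[0][1]))
    return average_grads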
Beispiel #16
0
def train():
    print('[Dataset Configuration]')
    #print('\tCIFAR-100 dir: %s' % FLAGS.data_dir)
    print('\tNumber of classes: %d' % FLAGS.num_classes)
    print('\tNumber of training images: %d' % FLAGS.num_train_instance)
    print('\tNumber of test images: %d' % FLAGS.num_test_instance)

    print('[Network Configuration]')
    #print('\tBatch size: %d' % FLAGS.batch_size)
    print('\tResidual blocks per group: %d' % FLAGS.num_residual_units)
    print('\tNetwork width multiplier: %d' % FLAGS.k)

    print('[Optimization Configuration]')
    print('\tL2 loss weight: %f' % FLAGS.l2_weight)
    print('\tThe momentum optimizer: %f' % FLAGS.momentum)
    print('\tInitial learning rate: %f' % FLAGS.initial_lr)
    print('\tEpochs per lr step: %f' % FLAGS.lr_step_epoch)
    print('\tLearning rate decay: %f' % FLAGS.lr_decay)

    print('[Training Configuration]')
    print('\tTrain dir: %s' % FLAGS.train_dir)
    print('\tTraining max steps: %d' % FLAGS.max_steps)
    print('\tSteps per displaying info: %d' % FLAGS.display)
    print('\tSteps per testing: %d' % FLAGS.test_interval)
    print('\tSteps during testing: %d' % FLAGS.test_iter)
    print('\tSteps per saving checkpoints: %d' % FLAGS.checkpoint_interval)
    print('\tGPU memory fraction: %f' % FLAGS.gpu_fraction)
    print('\tLog device placement: %d' % FLAGS.log_device_placement)

    sys.stdout.flush()

    with tf.Graph().as_default():
        init_step = 0
        global_step = tf.Variable(0, trainable=False, name='global_step')

        # Get images and labels for ImageNet.
        with tf.variable_scope('train_image'):
            train_images, train_labels = image_processing.distorted_inputs(
                dataset.Dataset('imagenet', 'train'), num_preprocess_threads=4)
        with tf.variable_scope('test_image'):
            test_images, test_labels = image_processing.distorted_inputs(
                dataset.Dataset('imagenet', 'validation'),
                num_preprocess_threads=4)

        # Build a Graph that computes the predictions from the inference model.
        images = tf.placeholder(
            tf.float32,
            [FLAGS.batch_size, FLAGS.image_size, FLAGS.image_size, 3])
        labels = tf.placeholder(tf.int32, [FLAGS.batch_size])

        # Build model
        decay_step = FLAGS.lr_step_epoch * FLAGS.num_train_instance / FLAGS.batch_size
        hp = resnet.HParams(batch_size=FLAGS.batch_size,
                            num_classes=FLAGS.num_classes,
                            num_residual_units=FLAGS.num_residual_units,
                            k=FLAGS.k,
                            weight_decay=FLAGS.l2_weight,
                            initial_lr=FLAGS.initial_lr,
                            decay_step=decay_step,
                            lr_decay=FLAGS.lr_decay,
                            momentum=FLAGS.momentum)
        network = resnet.ResNet(hp, images, labels, global_step)
        network.build_model()
        network.build_train_op()
        network.count_trainable_params()

        # Summaries(training)
        train_summary_op = tf.summary.merge_all()

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()

        # Start running operations on the Graph.
        sess = tf.Session(config=tf.ConfigProto(
            gpu_options=tf.GPUOptions(
                per_process_gpu_memory_fraction=FLAGS.gpu_fraction),
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables(), max_to_keep=10000)
        ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
        if ckpt and ckpt.model_checkpoint_path:
            print('\tRestore from %s' % ckpt.model_checkpoint_path)
            # Restores from checkpoint
            saver.restore(sess, ckpt.model_checkpoint_path)
            init_step = int(
                ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
        else:
            print('No checkpoint file found. Starting from scratch.')
        sys.stdout.flush()

        # Start queue runners & summary_writer
        tf.train.start_queue_runners(sess=sess)
        if not os.path.exists(FLAGS.train_dir):
            os.mkdir(FLAGS.train_dir)
        summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

        # Training!
        test_best_acc = 0.0
        for step in range(init_step, FLAGS.max_steps):
            # Test
            if step % FLAGS.test_interval == 0:
                test_loss, test_acc = 0.0, 0.0
                for i in range(FLAGS.test_iter):
                    test_images_val, test_labels_val = sess.run(
                        [test_images, test_labels])
                    test_labels_val -= 1
                    loss_value, acc_value = sess.run(
                        [network.loss, network.acc],
                        feed_dict={
                            network.is_train: False,
                            images: test_images_val,
                            labels: test_labels_val
                        })
                    test_loss += loss_value
                    test_acc += acc_value
                test_loss /= FLAGS.test_iter
                test_acc /= FLAGS.test_iter
                test_best_acc = max(test_best_acc, test_acc)
                format_str = ('%s: (Test)     step %d, loss=%.4f, acc=%.4f')
                print(format_str % (datetime.now(), step, test_loss, test_acc))
                sys.stdout.flush()

                test_summary = tf.Summary()
                test_summary.value.add(tag='test/loss', simple_value=test_loss)
                test_summary.value.add(tag='test/acc', simple_value=test_acc)
                test_summary.value.add(tag='test/best_acc',
                                       simple_value=test_best_acc)
                summary_writer.add_summary(test_summary, step)
                # test_loss_summary = tf.Summary()
                # test_loss_summary.value.add(tag='test/loss', simple_value=test_loss)
                # summary_writer.add_summary(test_loss_summary, step)
                # test_acc_summary = tf.Summary()
                # test_acc_summary.value.add(tag='test/acc', simple_value=test_acc)
                # summary_writer.add_summary(test_acc_summary, step)
                # test_best_acc_summary = tf.Summary()
                # test_best_acc_summary.value.add(tag='test/best_acc', simple_value=test_best_acc)
                # summary_writer.add_summary(test_best_acc_summary, step)
                summary_writer.flush()

            # Train
            start_time = time.time()
            train_images_val, train_labels_val = sess.run(
                [train_images, train_labels])
            train_labels_val -= 1
            _, lr_value, loss_value, acc_value, train_summary_str = \
                    sess.run([network.train_op, network.lr, network.loss, network.acc, train_summary_op],
                        feed_dict={network.is_train:True, images:train_images_val, labels:train_labels_val})
            duration = time.time() - start_time

            assert not np.isnan(loss_value)

            # Display & Summary(training)
            if step % FLAGS.display == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)
                format_str = (
                    '%s: (Training) step %d, loss=%.4f, acc=%.4f, lr=%f (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(format_str %
                      (datetime.now(), step, loss_value, acc_value, lr_value,
                       examples_per_sec, sec_per_batch))
                sys.stdout.flush()
                summary_writer.add_summary(train_summary_str, step)

            # Save the model checkpoint periodically.
            if (step > init_step and step % FLAGS.checkpoint_interval
                    == 0) or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
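
The restore branch above recovers the training step by parsing the checkpoint filename. The same pattern as a small standalone sketch (resume_step is a hypothetical helper name; it assumes tf.train.Saver's default model.ckpt-<step> naming):

import tensorflow as tf

def resume_step(train_dir):
    # Hypothetical helper: return the step encoded in the newest checkpoint
    # name under train_dir, or 0 if no checkpoint has been written yet.
    ckpt = tf.train.get_checkpoint_state(train_dir)
    if ckpt and ckpt.model_checkpoint_path:
        return int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
    return 0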
Beispiel #17
0
def main():
    """Create the model and start the training."""
    args = get_arguments()

    h, w = map(int, args.input_size.split(','))
    input_size = (h, w)

    # Create queue coordinator.
    coord = tf.train.Coordinator()

    # Load reader.
    #with tf.name_scope("create_inputs"):
    #    reader = ImageReader(
    #        args.data_dir,
    #        args.data_list,
    #        input_size,
    #        args.random_scale,
    #        coord)
    #    image_batch, label_batch = reader.dequeue(args.batch_size)

    num_preprocess_threads = 4
    dataset = SensorFusionData("train")
    data_files = dataset.data_files()
    print("Found {} data files!".format(len(data_files)))

    #with tf.name_scope("create_inputs"):
    image_batch, label_batch = image_processing.distorted_inputs(
        dataset, num_preprocess_threads=num_preprocess_threads)
    num_classes = dataset.num_classes() + 1

    # Create network.
    net = DeepLabResNetModel({'data': image_batch},
                             is_training=args.is_training)
    # For a small batch size, it is better to keep
    # the statistics of the BN layers (running means and variances)
    # frozen, and to not update the values provided by the pre-trained model.
    # If is_training=True, the statistics will be updated during the training.
    # Note that is_training=False still updates BN parameters gamma (scale) and beta (offset)
    # if they are presented in var_list of the optimiser definition.

    # Predictions.
    raw_output = net.layers['fc1_voc12']
    # Which variables to load. Running means and variances are not trainable,
    # thus all_variables() should be restored.
    restore_var = tf.global_variables()
    trainable = tf.trainable_variables()

    prediction = tf.reshape(raw_output, [-1, num_classes])
    label_proc = prepare_label(label_batch,
                               tf.stack(raw_output.get_shape()[1:3]))
    gt = tf.reshape(label_proc, [-1, num_classes])

    # Pixel-wise softmax loss.
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=prediction,
                                                   labels=gt)
    reduced_loss = tf.reduce_mean(loss)

    # Processed predictions.
    raw_output_up = tf.image.resize_bilinear(raw_output,
                                             tf.shape(image_batch)[1:3])
    raw_output_up = tf.argmax(raw_output_up, dimension=3)
    pred = tf.expand_dims(raw_output_up, dim=3)

    # Image summary.
    images_summary = tf.py_func(inv_preprocess,
                                [image_batch, args.save_num_images], tf.uint8)
    labels_summary = tf.py_func(decode_labels,
                                [label_batch, args.save_num_images], tf.uint8)
    preds_summary = tf.py_func(decode_labels, [pred, args.save_num_images],
                               tf.uint8)

    total_summary = tf.summary.image(
        'images',
        tf.concat([images_summary, labels_summary, preds_summary], 2),
        max_outputs=args.save_num_images)  # Concatenate side by side along width.
    summary_writer = tf.summary.FileWriter(args.snapshot_dir)

    # Define loss and optimisation parameters.
    optimiser = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
    optim = optimiser.minimize(reduced_loss, var_list=trainable)

    # Set up tf session and initialize variables.
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    init = tf.global_variables_initializer()

    print("Running Session...")
    sess.run(init)

    # Saver for storing checkpoints of the model.
    saver = tf.train.Saver(var_list=restore_var, max_to_keep=40)

    # Load variables if the checkpoint is provided.
    if args.restore_from is not None:
        loader = tf.train.Saver(var_list=restore_var)
        load(loader, sess, args.restore_from)

    print("Starting queue runners...")
    # Start queue threads.
    threads = tf.train.start_queue_runners(sess=sess)

    # Iterate over training steps.
    for step in range(args.num_steps):
        start_time = time.time()

        if step % args.save_pred_every == 0:
            loss_value, images, labels, preds, summary, _ = sess.run([
                reduced_loss, image_batch, label_batch, pred, total_summary,
                optim
            ])
            summary_writer.add_summary(summary, step)
            save(saver, sess, args.snapshot_dir, step)
        else:
            loss_value, _ = sess.run([reduced_loss, optim])
        duration = time.time() - start_time
        print('step {:d} \t loss = {:.3f}, ({:.3f} sec/step)'.format(
            step, loss_value, duration))
    coord.request_stop()
    coord.join(threads)
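
The batch-normalization comment in this snippet notes that gamma (scale) and beta (offset) still train whenever they appear in the optimiser's var_list. A hedged sketch of excluding them, assuming the conventional '.../gamma' and '.../beta' variable naming; this is not the repository's code:

import tensorflow as tf

# Keep BN scale/offset out of the update, assuming standard naming.
all_trainable = tf.trainable_variables()
conv_trainable = [v for v in all_trainable
                  if 'gamma' not in v.name and 'beta' not in v.name]
# optimiser.minimize(reduced_loss, var_list=conv_trainable)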
Beispiel #18
0
def train(dataset):
    """Train on dataset for a number of steps."""
    # with tf.Graph().as_default(), tf.device('/cpu:0'):
    with tf.Graph().as_default():

        # ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        # if ckpt and ckpt.model_checkpoint_path:
        #     global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]

        global_step = tf.Variable(0, trainable=False)
        # global_step = tf.contrib.framework.get_or_create_global_step()

        decay_steps = 7500
        LEARNING_RATE_DECAY_FACTOR = 0.1
        INITIAL_LEARNING_RATE = 0.000001

        lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                        global_step,
                                        decay_steps,
                                        LEARNING_RATE_DECAY_FACTOR,
                                        staircase=True)
        opt = tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.1)

        num_preprocess_threads = FLAGS.num_preprocess_threads * FLAGS.num_gpus
        with tf.device('/cpu:0'):
            images, pitchs, yaws, rolls, names = image_processing.distorted_inputs(
                dataset, num_preprocess_threads=num_preprocess_threads)

        p = tf.expand_dims(pitchs, 1)
        y = tf.expand_dims(yaws, 1)
        r = tf.expand_dims(rolls, 1)
        labels = tf.concat([p, y, r], 1)

        batch_queue = tf.contrib.slim.prefetch_queue.prefetch_queue(
            [images, labels], capacity=2 * FLAGS.num_gpus)

        tower_grads = []

        with tf.variable_scope(tf.get_variable_scope()):
            for i in range(FLAGS.num_gpus):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('tower_%d' % i) as scope:
                        image_batch, label_batch = batch_queue.dequeue()
                        loss = tower_loss(scope, image_batch, label_batch)

                        tf.get_variable_scope().reuse_variables()

                        grads = opt.compute_gradients(loss)

                        tower_grads.append(grads)

        grads = average_gradients(tower_grads)
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        variable_averages = tf.train.ExponentialMovingAverage(
            0.9999, global_step)

        variable_averages_op = variable_averages.apply(
            tf.trainable_variables())

        train_op = tf.group(apply_gradient_op, variable_averages_op)

        saver = tf.train.Saver(tf.global_variables())

        init = tf.global_variables_initializer()

        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        sess.run(init)

        tf.train.start_queue_runners(sess=sess)

        for step in np.arange(FLAGS.max_steps):

            _, loss_value = sess.run([train_op, loss])

            if step % 50 == 0:
                print('Step %d, train loss = %.2f' % (step, loss_value))

            if step % 2000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
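
The label assembly above turns three [batch] pose vectors into one [batch, 3] regression target. A toy version with illustrative values:

import tensorflow as tf

# Three [batch] vectors -> one [batch, 3] tensor; values are illustrative.
pitchs = tf.constant([0.1, 0.2])
yaws = tf.constant([0.3, 0.4])
rolls = tf.constant([0.5, 0.6])
labels = tf.concat([tf.expand_dims(pitchs, 1),
                    tf.expand_dims(yaws, 1),
                    tf.expand_dims(rolls, 1)], 1)  # shape [2, 3]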
Beispiel #19
0
  def train(self):
    """Train DCGAN"""
    with tf.Graph().as_default(), tf.device('/cpu:0'):
      # Override the number of preprocessing threads to account for the increased
      # number of GPU towers.
      num_preprocess_threads = FLAGS.num_preprocess_threads
      images, labels = image_processing.distorted_inputs(self.dataset, num_preprocess_threads=num_preprocess_threads)
  
      with tf.device('/gpu:0'):
        # Set weight_decay for weights in Conv and FC layers.
        
        self.build_model(FLAGS.batch_size, images, labels, 12, True, False)
            
        d_opt = tf.train.AdamOptimizer(FLAGS.learning_rate, beta1=FLAGS.beta1) \
                      .minimize(self.d_loss, var_list=self.d_vars)
        g_opt = tf.train.AdamOptimizer(FLAGS.learning_rate, beta1=FLAGS.beta1) \
                      .minimize(self.g_loss, var_list=self.g_vars)
                      
        train_op = tf.group(d_opt, g_opt)

        batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION)

      # Add summaries for the input processing and global_step.
      summaries = tf.get_collection(tf.GraphKeys.SUMMARIES)

      # Group all updates into a single train op.
      batchnorm_updates_op = tf.group(*batchnorm_updates)
      train_op = tf.group(train_op, batchnorm_updates_op)
  
      # Create a saver.
      saver = tf.train.Saver(tf.all_variables())
  
      summary_op = tf.merge_summary(summaries)
  
      # Build an initialization operation to run below.
      init = tf.initialize_all_variables()
  
      # Start running operations on the Graph. allow_soft_placement must be set to
      # True to build towers on GPU, as some of the ops do not have GPU
      # implementations.
      sess = tf.Session(config=tf.ConfigProto(
          allow_soft_placement=True,
          log_device_placement=FLAGS.log_device_placement))
      sess.run(init)
  
  
      ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
      if ckpt and ckpt.model_checkpoint_path:
        variables_to_restore = tf.get_collection(
            slim.variables.VARIABLES_TO_RESTORE)
        restorer = tf.train.Saver(variables_to_restore)
        restorer.restore(sess, ckpt.model_checkpoint_path)
        print('%s: Pre-trained model restored from %s' %
              (datetime.now(), FLAGS.checkpoint_dir))
  
      # Start the queue runners.
      tf.train.start_queue_runners(sess=sess)
      summary_writer = tf.train.SummaryWriter(
          FLAGS.log_dir,
          graph=sess.graph)
  
      for step in xrange(FLAGS.max_steps):
        start_time = time.time()
        sess.run([train_op])
        duration = time.time() - start_time
  
        if step % 10 == 0:
          examples_per_sec = FLAGS.batch_size / float(duration)
          format_str = ('%s: step %d(%.1f examples/sec; %.3f '
                        'sec/batch)')
          print(format_str % (datetime.now(), step, examples_per_sec, duration))
  
        if step % 100 == 0:
          summary_str = sess.run(summary_op)
          summary_writer.add_summary(summary_str, step)
          samples = sess.run(self.G)
          save_images(samples, './%s/%d' % (FLAGS.sample_dir, step))
  
        # Save the model checkpoint periodically.
        if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
          checkpoint_path = os.path.join(FLAGS.checkpoint_dir, 'model.ckpt')
          saver.save(sess, checkpoint_path, global_step=step)
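
Grouping d_opt and g_opt into one train_op fires both updates from a single sess.run call. A common alternative, sketched here with the snippet's names and not taken from the original code, alternates the two updates so each network sees the other's latest weights:

# Sketch only, reusing names from the snippet above.
# for step in xrange(FLAGS.max_steps):
#     sess.run(d_opt)  # discriminator step against the current generator
#     sess.run(g_opt)  # generator step against the updated discriminator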
Beispiel #20
0
def train(dataset):
    print('START')
    if FLAGS.issync:
        raise ValueError("Please set 'issync' to False for non-distributed training")
    global_step = tf.Variable(0,
                              dtype=tf.int32,
                              name='global_step',
                              trainable=False)
    lr = _lr(global_step)
    with tf.name_scope("train_process"):
        with tf.device('/cpu:0'):
            images, labels = image_processing.distorted_inputs(
                dataset, num_preprocess_threads=FLAGS.num_preprocess_threads)
        logits = _logits(images)
        loss = _loss(logits, labels)
        train_op = _optimization(loss, global_step, lr, FLAGS.issync)


#    with tf.name_scope("global_step"):
#        tf.summary.scalar('global_step', global_step)
    val_step = int(
        math.ceil(arg_parsing.NUM_EXAMPLES_PER_EPOCH_FOR_EVAL /
                  FLAGS.batch_size))
    val_acc_sum = val(loss, dataset)
    all_hooks = [tf.train.NanTensorHook(loss)]
    if FLAGS.debug:
        all_hooks.append(tfdbg.LocalCLIDebugHook(ui_type='curses'))
    if FLAGS.finetune:
        print('Finetune from %s' % FLAGS.finetune)
        saver = tf.train.Saver()
    config = tf.ConfigProto(log_device_placement=FLAGS.log_device_placement)
    config.gpu_options.allow_growth = True
    with tf.train.MonitoredTrainingSession(checkpoint_dir=FLAGS.model_dir,
                                           hooks=all_hooks,
                                           config=config,
                                           save_summaries_steps=100,
                                           save_summaries_secs=None,
                                           log_step_count_steps=None) as sess:
        if FLAGS.finetune:
            print('Load Pre-trained model...')
            ckpt = tf.train.get_checkpoint_state(FLAGS.finetune)
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                raise ValueError('Failed to load model.')
            print('-------------------------')
        total_loss = 0
        start_time = time.time()
        for i in range(1, FLAGS.max_steps + 1):
            _, loss_value = sess.run([train_op, loss])
            total_loss += loss_value
            if i % FLAGS.log_frequency == 0:
                current_time = time.time()
                duration = current_time - start_time
                eg_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
                sec_per_batch = float(duration / FLAGS.log_frequency)
                avg_loss = total_loss / i
                print(
                    '%s: training step %d cur loss = %.4f avg loss = %.4f (%.1f images/sec %.3f sec/batch)'
                    % (datetime.now(), i, loss_value, avg_loss, eg_per_sec,
                       sec_per_batch))
                start_time = time.time()
            if i % FLAGS.steps_to_val == 0:
                total_val_accu = 0
                for j in range(val_step):
                    total_val_accu += sess.run(val_acc_sum)
                print(
                    '%s: validation total accuracy = %.4f (%.3f sec %d batches)'
                    % (datetime.now(), total_val_accu / float(val_step),
                       float(time.time() - start_time), val_step))
                start_time = time.time()
Beispiel #21
0
def train(target, dataset, cluster_spec):
    """Train Inception on a dataset for a number of steps."""
    # The numbers of workers and parameter servers are inferred from the
    # worker and ps hosts strings.
    num_workers = len(cluster_spec.as_dict()['worker'])
    num_parameter_servers = len(cluster_spec.as_dict()['ps'])
    # If no value is given, num_replicas_to_aggregate defaults to the number
    # of workers.
    if FLAGS.num_replicas_to_aggregate == -1:
        num_replicas_to_aggregate = num_workers
    else:
        num_replicas_to_aggregate = FLAGS.num_replicas_to_aggregate

    # Both should be greater than 0 in a distributed training.
    assert num_workers > 0 and num_parameter_servers > 0, (
        'num_workers and num_parameter_servers must be > 0.')

    # Choose worker 0 as the chief. Note that any worker could be the chief
    # but there should be only one chief.
    is_chief = (FLAGS.task_id == 0)

    # Ops are assigned to worker by default.
    with tf.device('/job:worker/task:%d' % FLAGS.task_id):
        # Variables and its related init/assign ops are assigned to ps.
        with slim.scopes.arg_scope(
            [slim.variables.variable, slim.variables.global_step],
                device=slim.variables.VariableDeviceChooser(
                    num_parameter_servers)):
            # Create a variable to count the number of train() calls. This equals the
            # number of updates applied to the variables.
            global_step = slim.variables.global_step()

            # Calculate the learning rate schedule.
            num_batches_per_epoch = (dataset.num_examples_per_epoch() /
                                     FLAGS.batch_size)
            # Decay steps need to be divided by the number of replicas to aggregate.
            decay_steps = int(num_batches_per_epoch *
                              FLAGS.num_epochs_per_decay /
                              num_replicas_to_aggregate)
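            # Illustrative arithmetic (not from the source): 1,281,167 train
            # images / batch 32 ~= 40,036 batches per epoch; with 30 epochs
            # per decay and 4 aggregating replicas, decay_steps ~= 300,270.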

            # Decay the learning rate exponentially based on the number of steps.
            lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                            global_step,
                                            decay_steps,
                                            FLAGS.learning_rate_decay_factor,
                                            staircase=True)
            # Add a summary to track the learning rate.
            tf.summary.scalar('learning_rate', lr)

            # Create an optimizer that performs gradient descent.
            opt = tf.train.RMSPropOptimizer(lr,
                                            RMSPROP_DECAY,
                                            momentum=RMSPROP_MOMENTUM,
                                            epsilon=RMSPROP_EPSILON)

            images, labels = image_processing.distorted_inputs(
                dataset,
                batch_size=FLAGS.batch_size,
                num_preprocess_threads=FLAGS.num_preprocess_threads)

            # Number of classes in the Dataset label set plus 1.
            # Label 0 is reserved for an (unused) background class.
            num_classes = dataset.num_classes() + 1
            logits = inception.inference(images,
                                         num_classes,
                                         for_training=True)
            # Add classification loss.
            inception.loss(logits, labels)

            # Gather all of the losses including regularization losses.
            losses = tf.get_collection(slim.losses.LOSSES_COLLECTION)
            losses += tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)

            total_loss = tf.add_n(losses, name='total_loss')

            if is_chief:
                # Compute the moving average of all individual losses and the
                # total loss.
                loss_averages = tf.train.ExponentialMovingAverage(0.9,
                                                                  name='avg')
                loss_averages_op = loss_averages.apply(losses + [total_loss])

                # Attach a scalar summary to all individual losses and the total loss;
                # do the same for the averaged version of the losses.
                for l in losses + [total_loss]:
                    loss_name = l.op.name
                    # Name each loss as '(raw)' and name the moving average version of the
                    # loss as the original loss name.
                    tf.summary.scalar(loss_name + ' (raw)', l)
                    tf.summary.scalar(loss_name, loss_averages.average(l))

                # Add dependency to compute loss_averages.
                with tf.control_dependencies([loss_averages_op]):
                    total_loss = tf.identity(total_loss)

            # Track the moving averages of all trainable variables.
            # Note that we maintain a 'double-average' of the BatchNormalization
            # global statistics.
            # This is not needed when the number of replicas is small, but it is
            # important for synchronous distributed training with tens of workers/replicas.
            exp_moving_averager = tf.train.ExponentialMovingAverage(
                inception.MOVING_AVERAGE_DECAY, global_step)

            variables_to_average = (tf.trainable_variables() +
                                    tf.moving_average_variables())

            # Add histograms for model variables.
            for var in variables_to_average:
                tf.summary.histogram(var.op.name, var)

            # Create synchronous replica optimizer.
            opt = tf.train.SyncReplicasOptimizer(
                opt,
                replicas_to_aggregate=num_replicas_to_aggregate,
                replica_id=FLAGS.task_id,
                total_num_replicas=num_workers,
                variable_averages=exp_moving_averager,
                variables_to_average=variables_to_average)

            batchnorm_updates = tf.get_collection(
                slim.ops.UPDATE_OPS_COLLECTION)
            assert batchnorm_updates, 'Batchnorm updates are missing'
            batchnorm_updates_op = tf.group(*batchnorm_updates)
            # Add dependency to compute batchnorm_updates.
            with tf.control_dependencies([batchnorm_updates_op]):
                total_loss = tf.identity(total_loss)

            # Compute gradients with respect to the loss.
            grads = opt.compute_gradients(total_loss)

            # Add histograms for gradients.
            for grad, var in grads:
                if grad is not None:
                    tf.summary.histogram(var.op.name + '/gradients', grad)

            apply_gradients_op = opt.apply_gradients(grads,
                                                     global_step=global_step)

            with tf.control_dependencies([apply_gradients_op]):
                train_op = tf.identity(total_loss, name='train_op')

            # Get chief queue_runners, init_tokens and clean_up_op, which is used to
            # synchronize replicas.
            # More details can be found in sync_replicas_optimizer.
            chief_queue_runners = [opt.get_chief_queue_runner()]
            init_tokens_op = opt.get_init_tokens_op()
            clean_up_op = opt.get_clean_up_op()

            # Create a saver.
            saver = tf.train.Saver()

            # Build the summary operation based on the TF collection of Summaries.
            summary_op = tf.merge_all_summaries()

            # Build an initialization operation to run below.
            init_op = tf.initialize_all_variables()

            # We run the summaries in the same thread as the training operations by
            # passing in None for summary_op to avoid a summary_thread being started.
            # Running summaries and training operations in parallel could run out of
            # GPU memory.
            sv = tf.train.Supervisor(is_chief=is_chief,
                                     logdir=FLAGS.train_dir,
                                     init_op=init_op,
                                     summary_op=None,
                                     global_step=global_step,
                                     saver=saver,
                                     save_model_secs=FLAGS.save_interval_secs)

            tf.logging.info('%s Supervisor' % datetime.now())

            sess_config = tf.ConfigProto(
                allow_soft_placement=True,
                log_device_placement=FLAGS.log_device_placement)

            # Get a session.
            sess = sv.prepare_or_wait_for_session(target, config=sess_config)

            # Start the queue runners.
            queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
            sv.start_queue_runners(sess, queue_runners)
            tf.logging.info('Started %d queues for processing input data.',
                            len(queue_runners))

            if is_chief:
                sv.start_queue_runners(sess, chief_queue_runners)
                sess.run(init_tokens_op)

            # Train, checking for Nans. Concurrently run the summary operation at a
            # specified interval. Note that the summary_op and train_op never run
            # simultaneously in order to prevent running out of GPU memory.
            next_summary_time = time.time() + FLAGS.save_summaries_secs
            while not sv.should_stop():
                try:
                    start_time = time.time()
                    loss_value, step = sess.run([train_op, global_step])
                    assert not np.isnan(
                        loss_value), 'Model diverged with loss = NaN'
                    if step > FLAGS.max_steps:
                        break
                    duration = time.time() - start_time

                    if step % 30 == 0:
                        examples_per_sec = FLAGS.batch_size / float(duration)
                        format_str = ('Worker %d: %s: step %d, loss = %.2f'
                                      '(%.1f examples/sec; %.3f  sec/batch)')
                        tf.logging.info(
                            format_str %
                            (FLAGS.task_id, datetime.now(), step, loss_value,
                             examples_per_sec, duration))

                    # Determine if the summary_op should be run on the chief worker.
                    if is_chief and next_summary_time < time.time():
                        tf.logging.info(
                            'Running Summary operation on the chief.')
                        summary_str = sess.run(summary_op)
                        sv.summary_computed(sess, summary_str)
                        tf.logging.info('Finished running Summary operation.')

                        # Determine the next time for running the summary.
                        next_summary_time += FLAGS.save_summaries_secs
                except:
                    if is_chief:
                        tf.logging.info('About to execute sync_clean_up_op!')
                        sess.run(clean_up_op)
                    raise

            # Stop the supervisor.  This also waits for service threads to finish.
            sv.stop()

            # Save after the training ends.
            if is_chief:
                saver.save(sess,
                           os.path.join(FLAGS.train_dir, 'model.ckpt'),
                           global_step=global_step)
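
This snippet wraps RMSProp in SyncReplicasOptimizer so gradients from several workers are aggregated into one shared update. A stripped-down sketch with illustrative values; the replica_id argument used above belongs to the older pre-1.0 API, while the TF 1.x constructor looks like this:

import tensorflow as tf

# Synchronous-replica wrapping, minimal form (values illustrative).
base_opt = tf.train.GradientDescentOptimizer(0.1)
sync_opt = tf.train.SyncReplicasOptimizer(base_opt,
                                          replicas_to_aggregate=4,
                                          total_num_replicas=4)
# Workers call sync_opt.compute_gradients / apply_gradients as usual; the
# chief also runs the queue runner and init-tokens op the wrapper provides.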
Beispiel #22
0
def train(dataset):
  #sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
  """Train on dataset for a number of steps."""
  with tf.Graph().as_default(), tf.device('/cpu:0'):
    # Create a variable to count the number of train() calls. This equals the
    # number of batches processed * FLAGS.num_gpus.
    tf.set_random_seed(time.time())
    tf.set_random_seed(198918)  # Overrides the time-based seed above.
    global_step = tf.get_variable(
        'global_step', [],
        initializer=tf.constant_initializer(0), trainable=False)

    bits_ph = []
    for i in range(18):
        bits_ph.append(tf.placeholder(tf.int32))

    nm = norm_monitor.norm_monitor(FLAGS.digits, len(bits_ph), FLAGS.rel_res, FLAGS.interval, FLAGS.stride)
    if FLAGS.layerinfo_file:
      assert tf.gfile.Exists(FLAGS.layerinfo_file)
      tmp = pickle.load(open(FLAGS.layerinfo_file,'rb'))
      nm.set_layerinfo(tmp[-1])
      print("Restore layerinfo")
      print(nm.get_layerinfo())

    # Calculate the learning rate schedule.
    num_batches_per_epoch = (dataset.num_examples_per_epoch() / FLAGS.batch_size)
    decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay)
    print("num_batches_per_epoch: {}".format(num_batches_per_epoch))
    print("use bitpack: {}".format(FLAGS.use_bitpack))
    print("learning rate: {}".format(FLAGS.initial_learning_rate))
    print("produce trace: {}".format(FLAGS.profile))
    print("digits: {}".format(FLAGS.digits))
    print("rel_res: {}".format(FLAGS.rel_res))
    print("interval: {}".format(FLAGS.interval))
    print("stride: {}".format(FLAGS.stride))

    # Decay the learning rate exponentially based on the number of steps.
    lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                    global_step,
                                    decay_steps,
                                    FLAGS.learning_rate_decay_factor,
                                    staircase=True)

    # Create an optimizer that performs gradient descent.
    opt = tf.train.RMSPropOptimizer(lr, RMSPROP_DECAY, momentum=RMSPROP_MOMENTUM, epsilon=RMSPROP_EPSILON)

    # Get images and labels for ImageNet and split the batch across GPUs.
    assert FLAGS.batch_size % FLAGS.num_gpus == 0, (
        'Batch size must be divisible by number of GPUs')
    split_batch_size = int(FLAGS.batch_size / FLAGS.num_gpus)

    # Override the number of preprocessing threads to account for the increased
    # number of GPU towers.
    num_preprocess_threads = FLAGS.num_preprocess_threads * FLAGS.num_gpus
    images, labels = image_processing.distorted_inputs(
        dataset,
        num_preprocess_threads=num_preprocess_threads)

    input_summaries = copy.copy(tf.get_collection(tf.GraphKeys.SUMMARIES))

    # Number of classes in the Dataset label set plus 1.
    # Label 0 is reserved for an (unused) background class.
    num_classes = dataset.num_classes() + 1

    # Split the batch of images and labels for towers.
    images_splits = tf.split(axis=0, num_or_size_splits=FLAGS.num_gpus, value=images)
    labels_splits = tf.split(axis=0, num_or_size_splits=FLAGS.num_gpus, value=labels)

    # Calculate the gradients for each model tower.
    tower_norms  = []
    tower_grads  = []
    tower_preds_1  = []
    tower_preds_5  = []
    tower_losses = []

    reuse_variables = None
    for i in range(FLAGS.num_gpus):
      with tf.device('/gpu:%d' % i):
        with tf.name_scope('%s_%d' % (inception.TOWER_NAME, i)) as scope:
          # Force all Variables to reside on the CPU.
          # Calculate the loss for one tower of the ImageNet model. This
          # function constructs the entire ImageNet model but shares the
          # variables across all towers.
          #print(images_splits[i])
          #print(labels_splits[i])
          loss, norms, logits_split = _tower_loss(images_splits[i], labels_splits[i], num_classes, scope, reuse_variables, bits_ph)
          top_1_correct = tf.nn.in_top_k(logits_split, labels_splits[i], 1)
          top_5_correct = tf.nn.in_top_k(logits_split, labels_splits[i], 5)
          # Reuse variables for the next tower.
          reuse_variables = True

          # Retain the summaries from the final tower.
          summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)

          # Retain the Batch Normalization updates operations only from the
          # final tower. Ideally, we should grab the updates from all towers
          # but these stats accumulate extremely fast so we can ignore the
          # other stats from the other towers without significant detriment.
          #batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION, scope)
          batchnorm_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

          # Calculate the gradients for the batch of data on this ImageNet
          # tower.
          grads = opt.compute_gradients(loss)

          # Keep track of the gradients across all towers.
          tower_grads.append(grads)
          tower_norms.append(norms)
          tower_preds_1.append(tf.reduce_sum(tf.cast(top_1_correct, tf.int32)))
          tower_preds_5.append(tf.reduce_sum(tf.cast(top_5_correct, tf.int32)))
          tower_losses.append(loss)

    # We must calculate the mean of each gradient. Note that this is the
    # synchronization point across all towers.
    grads = _average_gradients(tower_grads)

    top_1_sum = tf.add_n(tower_preds_1)
    top_5_sum = tf.add_n(tower_preds_5)
    losses_sum = tf.add_n(tower_losses)
    # Add summaries for the input processing and global_step.
    summaries.extend(input_summaries)

    # Add a summary to track the learning rate.
    summaries.append(tf.summary.scalar('learning_rate', lr))

    # Add histograms for gradients.
    for grad, var in grads:
      if grad is not None:
        summaries.append(
            tf.summary.histogram(var.op.name + '/gradients', grad))

    # Apply the gradients to adjust the shared variables.
    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

    # Add histograms for trainable variables.
    for var in tf.trainable_variables():
      summaries.append(tf.summary.histogram(var.op.name, var))

    # Track the moving averages of all trainable variables.
    # Note that we maintain a "double-average" of the BatchNormalization
    # global statistics. This is more complicated than it needs to be, but we
    # keep it for backward compatibility with our previous models.
    variable_averages = tf.train.ExponentialMovingAverage(
        inception.MOVING_AVERAGE_DECAY, global_step)

    # Another possibility is to use tf.slim.get_variables().
    variables_to_average = (tf.trainable_variables() + tf.moving_average_variables())
    variables_averages_op = variable_averages.apply(variables_to_average)

    # Group all updates into a single train op.
    batchnorm_updates_op = tf.group(*batchnorm_updates)
    train_op = tf.group(apply_gradient_op, variables_averages_op, 
            batchnorm_updates_op) 

    # Create a saver.
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)

    # Build the summary operation from the last tower summaries.
    summary_op = tf.summary.merge(summaries)

    # Build an initialization operation to run below.
    init = tf.global_variables_initializer()

    # Start running operations on the Graph. allow_soft_placement must be set to
    # True to build towers on GPU, as some of the ops do not have GPU
    # implementations.
    sess = tf.Session(config=tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    if FLAGS.pretrained_model_checkpoint_path:
      assert tf.gfile.Exists(FLAGS.pretrained_model_checkpoint_path)
      #variables_to_restore = tf.get_collection(slim.variables.VARIABLES_TO_RESTORE)
      restorer = tf.train.Saver(tf.global_variables(), max_to_keep=100)
      restorer.restore(sess, FLAGS.pretrained_model_checkpoint_path)
      print('%s: Pre-trained model restored from %s' %
            (datetime.now(), FLAGS.pretrained_model_checkpoint_path))
    #for v in tf.all_variables():
    #  print("%s %s %s %s" % (v.name, v.get_shape(), v.dtype, v.device))
    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.summary.FileWriter(
        FLAGS.train_dir,
        graph=sess.graph)

    bits_dict = dict()
    #run_metadata = tf.RunMetadata()
    elapse = []

    #gweights = []
    glayerinfo = []
    #wnp_name = 'weights_norm_{}_{}_{}_{}_{}_{}_{}.dat'.format(9, 2048, 0, FLAGS.digits, FLAGS.stride, FLAGS.interval, FLAGS.use_bitpack)
    lip_name = 'layerinfo_{}_{}_{}_{}_{}_{}_{}.dat'.format(9, 4096, 0, FLAGS.digits, FLAGS.stride, FLAGS.interval, FLAGS.use_bitpack)

    for step in range(FLAGS.max_steps):
      run_metadata = tf.RunMetadata()
      start_time = time.time()
      info = nm.get_layerinfo()
      for i, bits in enumerate(bits_ph):
        bits_dict[bits] = info[i][0]
      if FLAGS.profile is False:
        _, loss_value, norms, top_1, top_5 = sess.run([train_op, losses_sum, tower_norms, top_1_sum, top_5_sum], feed_dict=bits_dict)
      else:
        _, loss_value, norms = sess.run([train_op, loss, tower_norms], 
                                 feed_dict=bits_dict, 
                                 options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE), 
                                 run_metadata=run_metadata)
        top_1 = 5   # Dummy values; the accuracy fetches are skipped when profiling.
        top_5 = 25

      nm.adjust_digits(norms)
      duration = time.time() - start_time
      #gweights.append(norms)
      #glayerinfo.append(copy.deepcopy(nm.get_layerinfo()))
      elapse.append(duration)

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        glayerinfo.append(copy.deepcopy(nm.get_layerinfo()))
        # Print layerinfo
        print(info)
        examples_per_sec = FLAGS.batch_size / float(duration)
        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch) elapse %.5f s top_1 %.5f top_5 %.5f')
        pred_1 = top_1 / (FLAGS.batch_size*FLAGS.num_gpus)
        pred_5 = top_5 / (FLAGS.batch_size*FLAGS.num_gpus)
        print(format_str % (datetime.now(), step, loss_value, examples_per_sec, duration, sum(elapse), pred_1, pred_5))
        sys.stdout.flush()
        tl = timeline.Timeline(run_metadata.step_stats)
        if FLAGS.profile is True:
          if FLAGS.use_bitpack is False:
            trace_file = tf.gfile.Open(name='timeline%03d.json' % step, mode='w')
          else:
            trace_file = tf.gfile.Open(name='bitpack_timeline%03d.json' % step, mode='w')
          trace_file.write(tl.generate_chrome_trace_format(show_memory=True))

      if step % 100 == 0:
        summary_str = sess.run(summary_op, feed_dict=bits_dict)
        summary_writer.add_summary(summary_str, step)

      # Save the model checkpoint periodically.
      if step % 4000 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)

  glayerinfo.append(copy.deepcopy(nm.get_layerinfo()))
  #pickle.dump(gweights, open(wnp_name,'wb'))
  pickle.dump(glayerinfo, open(lip_name,'wb'))
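
The profiling branch above records a Chrome trace via RunMetadata and the timeline module. The pattern in isolation (sess and fetches stand in for your own graph, so the run lines stay commented):

import tensorflow as tf
from tensorflow.python.client import timeline

run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
run_metadata = tf.RunMetadata()
# sess.run(fetches, options=run_options, run_metadata=run_metadata)
# tl = timeline.Timeline(run_metadata.step_stats)
# with open('timeline.json', 'w') as f:
#     f.write(tl.generate_chrome_trace_format(show_memory=True))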
Beispiel #23
0
def train():
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Get images and labels for CIFAR-10.
        #dataset = CIFARData(subset='train')
        dataset = ImagenetData(subset='train')
        assert dataset.data_files()

        #test_set = CIFARData(subset='validation')
        test_set = ImagenetData(subset='validation')
        assert test_set.data_files()

        epoch1 = .5 * helper.MAX_EPOCHS
        epoch2 = .75 * helper.MAX_EPOCHS
        step1 = dataset.num_examples_per_epoch() * epoch1 // (
            helper.BATCH_SIZE)
        step2 = dataset.num_examples_per_epoch() * epoch2 // (
            helper.BATCH_SIZE)
        print('Reducing learning rate at step ' + str(step1) + ' and step ' +
              str(step2) + ' and ending at ' + str(helper.MAX_STEPS))

        # Create a variable to count the number of train() calls. This equals the
        # number of batches processed * FLAGS.num_gpus.
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        # Learning rate
        lr = .1

        #learning_rate = tf.placeholder(tf.float32, shape=[], name='learning_rate')
        dropout = tf.placeholder(tf.float32, shape=[], name='dropout')
        is_training = tf.placeholder(tf.bool, shape=[], name='is_training')

        boundaries = [step1, step2]
        values = [lr, lr / 10, lr / 100]

        learning_rate = tf.train.piecewise_constant(global_step,
                                                    boundaries,
                                                    values,
                                                    name=None)
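        # Illustrative behaviour (not from the source): the rate is lr up to
        # step1, lr/10 between step1 and step2, and lr/100 afterwards.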

        decayed_lr = tf.train.polynomial_decay(lr,
                                               global_step,
                                               helper.MAX_STEPS,
                                               end_learning_rate=0.0001,
                                               power=4.0,
                                               cycle=False,
                                               name=None)

        # Create an optimizer that performs gradient descent.
        with tf.name_scope('Optimizer'):
            opt = tf.train.MomentumOptimizer(learning_rate=decayed_lr,
                                             momentum=0.9,
                                             use_nesterov=True)
            #opt = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=0.9, use_nesterov=True)

        tf.summary.scalar('decayed_learning_rate', decayed_lr)
        tf.summary.scalar('learning_rate', learning_rate)

        # Override the number of preprocessing threads to account for the increased
        # number of GPU towers.
        num_preprocess_threads = helper.NUM_THREADS * helper.N_GPUS
        distorted_images, distorted_labels = image_processing.distorted_inputs(
            dataset,
            batch_size=helper.SPLIT_BATCH_SIZE,
            num_preprocess_threads=num_preprocess_threads)

        #images, labels = image_processing.inputs(dataset, batch_size=helper.BATCH_SIZE, num_preprocess_threads=num_preprocess_threads)
        test_images, test_labels = image_processing.inputs(
            test_set,
            batch_size=helper.SPLIT_BATCH_SIZE,
            num_preprocess_threads=num_preprocess_threads)

        input_summaries = copy.copy(tf.get_collection(tf.GraphKeys.SUMMARIES))

        # Split the batch of images and labels for towers.
        #images_splits = tf.split(axis=0, num_or_size_splits=helper.N_GPUS, value=distorted_images)
        #labels_splits = tf.split(axis=0, num_or_size_splits=helper.N_GPUS, value=distorted_labels)

        batch_queue = tf.contrib.slim.prefetch_queue.prefetch_queue(
            [distorted_images, distorted_labels], capacity=2 * helper.N_GPUS)

        # Calculate the gradients for each model tower.
        tower_grads = []
        with tf.variable_scope(tf.get_variable_scope()):
            for i in range(helper.N_GPUS):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('%s_%d' %
                                       (helper.TOWER_NAME, i)) as scope:
                        # Calculate the loss for one tower of the CIFAR model. This function
                        # constructs the entire CIFAR model but shares the variables across
                        # all towers.
                        image_batch, label_batch = batch_queue.dequeue()
                        loss = tower_loss(scope,
                                          image_batch,
                                          label_batch,
                                          dropout=dropout,
                                          is_training=is_training)
                        #loss = tower_loss(scope, images_splits[i], labels_splits[i], dropout=dropout, is_training=is_training)

                        # Retain the summaries from the final tower.
                        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES,
                                                      scope)

                        tf.get_variable_scope().reuse_variables()

                        grads = opt.compute_gradients(loss)

                        tower_grads.append(grads)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = average_gradients(tower_grads)

        # Add summaries for the input processing and global_step.
        summaries.extend(input_summaries)

        # Apply the gradients to adjust the shared variables.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            apply_gradient_op = opt.apply_gradients(grads,
                                                    global_step=global_step)

            # Track the moving averages of all trainable variables.
            variable_averages = tf.train.ExponentialMovingAverage(
                helper.MOVING_AVERAGE_DECAY, global_step)
            variables_averages_op = variable_averages.apply(
                tf.trainable_variables())

            # Group all updates to into a single train op.
            #train_op = apply_gradient_op
            train_op = tf.group(apply_gradient_op, variables_averages_op)

        # Add histograms for trainable variables.
        #for var in tf.trainable_variables():
        #    summaries.append(tf.summary.histogram(var.op.name, var))

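        # Add histograms for the variables that receive gradients.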
        for grad, var in grads:
            summaries.append(tf.summary.histogram(var.op.name, var))
            #summaries.append(tf.summary.histogram(var.op.name + '_gradient', grad))

        # Create a saver.
        saver = tf.train.Saver(tf.global_variables())

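        # The 'cross_entropies' and 'accuracy' collections are assumed to be
        # populated per tower inside tower_loss(); here they are averaged
        # across towers for logging.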
        cross_entropy_op = tf.reduce_mean(tf.get_collection('cross_entropies'),
                                          name='cross_entropy')

        accuracy_op = tf.reduce_mean(tf.get_collection('accuracy'),
                                     name='accuracies')
        summaries.append(tf.summary.scalar('cross_entropy', cross_entropy_op))
        summaries.append(tf.summary.scalar('accuracy', accuracy_op))

        # Build the summary operation from the last tower summaries.
        summary_op = tf.summary.merge(summaries)

        # Build an initialization operation to run below.
        init = tf.global_variables_initializer()

        # Start running operations on the Graph. allow_soft_placement must be set to
        # True to build towers on GPU, as some of the ops do not have GPU
        # implementations.
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                                log_device_placement=False))

        #run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        #run_metadata = tf.RunMetadata()

        sess.run(init)
        tf.train.start_queue_runners(sess=sess)

        if RESTORE:
            ckpt = tf.train.get_checkpoint_state(SAVE_POINT)
            saver.restore(sess, ckpt.model_checkpoint_path)

            # Assuming model_checkpoint_path looks something like:
            #   /my-favorite-path/imagenet_train/model.ckpt-0,
            # extract global_step from it.
            restored_step = ckpt.model_checkpoint_path.split('/')[-1].split(
                '-')[-1]
            print('Successfully loaded model from %s at step=%s.' %
                  (ckpt.model_checkpoint_path, restored_step))
            step = int(restored_step)
            range_step = range(step, helper.MAX_STEPS)
            tf.get_variable_scope().reuse_variables()
            global_step = tf.get_variable('global_step', trainable=False)
        else:
            range_step = range(helper.MAX_STEPS)

        summary_writer = tf.summary.FileWriter('summary', graph=sess.graph)
        num_params = helper.count_params() / 1e6
        print('Total number of params = %.2fM' % num_params)
        print("training")
        top1_error = [-1.0, -1.0]
        top1_step = 0
        top5_error = [-1.0, -1.0]
        top5_step = 0
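        # Despite the *_error names, these track the best (largest) top-1 and
        # top-5 values returned by evaluate(), presumably accuracies.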

        for step in range_step:

            start_time = time.time()
            _, loss_value, cross_entropy_value, accuracy_value = sess.run(
                [train_op, loss, cross_entropy_op, accuracy_op],
                feed_dict={
                    dropout: 0.8,
                    is_training: True
                }
            )  #, options=run_options, run_metadata=run_metadata)#, learning_rate: lr})
            duration = time.time() - start_time

            if step == step1 or step == step2:
                print('Decreasing Learning Rate')
                lr /= 10
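                # NOTE: lr is a plain Python float; with decayed_lr driving the
                # optimizer above, this manual decay only takes effect if the
                # commented-out learning_rate feed is restored.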

            if step % 10 == 0:
                num_examples_per_step = helper.BATCH_SIZE
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = duration

                format_str = (
                    'step %d, loss = %.2f, cross entropy = %.2f, accuracy = %.2f, %.3f sec/batch'
                )
                print(format_str % (step, loss_value, cross_entropy_value,
                                    accuracy_value, sec_per_batch))
                """
                # Create the Timeline object, and write it to a json
                tl = timeline.Timeline(run_metadata.step_stats)
                ctf = tl.generate_chrome_trace_format()
                with open('timeline.json', 'w') as f:
                    f.write(ctf)
                """

            if step % 100 == 0:
                summary_str = sess.run(summary_op,
                                       feed_dict={
                                           dropout: 0.8,
                                           is_training: False
                                       })  #, learning_rate: lr})
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 5000 == 0 or (step + 1) == helper.MAX_STEPS:
                if step != 0:
                    checkpoint_path = SAVE_POINT + 'model.ckpt'
                    saver.save(sess, checkpoint_path, global_step=step)
                    print('Model saved')

                    #evaluate(distorted_images, distorted_labels, sess, dropout=dropout, is_training=is_training, train=True)
                    top1, top5 = evaluate(test_images,
                                          test_labels,
                                          sess,
                                          dropout=dropout,
                                          is_training=is_training,
                                          train=False)
                    if top1 > top1_error[0]:
                        top1_error[0] = top1
                        top1_error[1] = top5
                        top1_step = step
                    if top5 > top5_error[1]:
                        top5_error[0] = top1
                        top5_error[1] = top5
                        top5_step = step
                    print(
                        "Best top1 model achieved top1: %.4f, top5: %.4f at step %d"
                        % (top1_error[0], top1_error[1], top1_step))
                    print(
                        "Best top5 model achieved top1: %.4f, top5: %.4f at step %d"
                        % (top5_error[0], top5_error[1], top5_step))
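Neither tower_loss nor average_gradients is defined in this snippet. A minimal sketch of the gradient-averaging helper, in the style of the TensorFlow CIFAR-10 multi-GPU tutorial and assuming every tower returns its (gradient, variable) pairs in the same order:

import tensorflow as tf

def average_gradients(tower_grads):
    """Average (gradient, variable) pairs across all towers.

    tower_grads: list over towers, each a list of (grad, var) tuples as
    returned by opt.compute_gradients(); the variable order must match.
    """
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # grad_and_vars is ((grad_gpu0, var), ..., (grad_gpuN, var)).
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
        grad = tf.reduce_mean(tf.concat(grads, axis=0), 0)
        # Variables are shared across towers, so the first tower's
        # reference is sufficient.
        average_grads.append((grad, grad_and_vars[0][1]))
    return average_grads

Averaging on the host keeps a single set of shared variables; each tower contributes only gradients, which is why this call is the cross-tower synchronization point.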
def train(target, dataset, cluster_spec):
  """Train Inception on a dataset for a number of steps."""
  # Number of workers and parameter servers are inferred from the workers and ps
  # hosts string.
  num_workers = len(cluster_spec.as_dict()['worker'])
  num_parameter_servers = len(cluster_spec.as_dict()['ps'])
  # If no value is given, num_replicas_to_aggregate defaults to be the number of
  # workers.
  if FLAGS.num_replicas_to_aggregate == -1:
    num_replicas_to_aggregate = num_workers
  else:
    num_replicas_to_aggregate = FLAGS.num_replicas_to_aggregate

  # Both should be greater than 0 in a distributed training.
  assert num_workers > 0 and num_parameter_servers > 0, (' num_workers and '
                                                         'num_parameter_servers'
                                                         ' must be > 0.')

  # Choose worker 0 as the chief. Note that any worker could be the chief
  # but there should be only one chief.
  is_chief = (FLAGS.task_id == 0)

  #batchSizeManager = BatchSizeManager(32, 4)

  # Ops are assigned to worker by default.
  tf.logging.info('num_parameter_servers: ' + str(num_parameter_servers))
  partitioner = tf.fixed_size_partitioner(num_parameter_servers, 0)

  device_setter = tf.train.replica_device_setter(ps_tasks=num_parameter_servers)
  slim = tf.contrib.slim
  with tf.device('/job:worker/task:%d' % FLAGS.task_id):
   with tf.variable_scope('root', partitioner=partitioner):
    # Variables and its related init/assign ops are assigned to ps.
#    with slim.arg_scope(
#        [slim.variables.variable, slim.variables.global_step],
#        device=slim.variables.VariableDeviceChooser(num_parameter_servers)):
    with tf.device(device_setter):
#	partitioner=partitioner):
      # Create a variable to count the number of train() calls. This equals the
      # number of updates applied to the variables.
#      global_step = slim.variables.global_step()
      global_step = tf.Variable(0, trainable=False)

      # Calculate the learning rate schedule.

      batch_size = tf.placeholder(dtype=tf.int32, shape=(), name='batch_size')
      num_batches_per_epoch = (dataset.num_examples_per_epoch() /
                               FLAGS.batch_size)
      # Decay steps need to be divided by the number of replicas to aggregate.
      decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay /
                        num_replicas_to_aggregate)

      # Decay the learning rate exponentially based on the number of steps.
      lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                      global_step,
                                      decay_steps,
                                      FLAGS.learning_rate_decay_factor,
                                      staircase=True)
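      # Worked example with assumed values: 1,281,167 examples per epoch and
      # batch_size=32 give ~40,036 batches per epoch; with
      # num_epochs_per_decay=30 and 4 replicas to aggregate,
      # decay_steps = int(40036.47 * 30 / 4) = 300273.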
      # Add a summary to track the learning rate.
#      tf.summary.scalar('learning_rate', lr)

      # (The RMSProp optimizer is created further below, once the total loss
      # has been assembled.)

      images, labels = image_processing.distorted_inputs(
          dataset,
          batch_size,
          num_preprocess_threads=FLAGS.num_preprocess_threads)
      print(images.get_shape())
      print(labels.get_shape())

      # Number of classes in the Dataset label set plus 1.
      # Label 0 is reserved for an (unused) background class.
#      num_classes = dataset.num_classes() + 1
      num_classes = dataset.num_classes()
      print(num_classes)
#      logits = inception.inference(images, num_classes, for_training=True)
      network_fn = nets_factory.get_network_fn('inception_v3',
                                               num_classes=num_classes)
      logits, _ = network_fn(images)
      print(logits.get_shape())
      # Add classification loss.
#      inception.loss(logits, labels, batch_size)

      # Gather all of the losses including regularization losses.
      labels = tf.one_hot(labels, num_classes, 1, 0)
      cross_entropy = tf.losses.softmax_cross_entropy(
          logits=logits, 
          onehot_labels=labels)
#      losses = tf.get_collection(slim.losses.LOSSES_COLLECTION)
#      losses += tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
      losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
      total_loss = cross_entropy + _WEIGHT_DECAY * tf.add_n(
          [tf.nn.l2_loss(v) for v in tf.trainable_variables()])

#      total_loss = tf.add_n(losses, name='total_loss')

      loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
      loss_averages_op = loss_averages.apply(losses + [total_loss])

      with tf.control_dependencies([loss_averages_op]):
        opt = tf.train.RMSPropOptimizer(lr,
                                      RMSPROP_DECAY,
                                      momentum=RMSPROP_MOMENTUM,
                                      epsilon=RMSPROP_EPSILON)
        grads0 = opt.compute_gradients(total_loss)
        grads = [(tf.scalar_mul(
            tf.cast(batch_size / FLAGS.batch_size, tf.float32), grad), var)
                 for grad, var in grads0]
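        # Rescale each gradient by (fed batch size / FLAGS.batch_size) so the
        # update magnitude tracks the dynamically chosen batch size.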
        total_loss = tf.identity(total_loss)

      exp_moving_averager = tf.train.ExponentialMovingAverage(
          MOVING_AVERAGE_DECAY, global_step)
      variables_averages_op = exp_moving_averager.apply(tf.trainable_variables())


      apply_gradients_op = opt.apply_gradients(grads, global_step=global_step)

      with tf.control_dependencies([apply_gradients_op, variables_averages_op]):
        train_op = tf.identity(total_loss, name='train_op')

      # Get chief queue_runners and init_tokens, which is used to synchronize
      # replicas. More details can be found in SyncReplicasOptimizer.
#      chief_queue_runners = [opt.get_chief_queue_runner()]
#      init_tokens_op = opt.get_init_tokens_op()

      # Create a saver.
      saver = tf.train.Saver()

      # Build the summary operation based on the TF collection of Summaries.
#      summary_op = tf.summary.merge_all()

      # Build an initialization operation to run below.
      init_op = tf.global_variables_initializer()

      # We run the summaries in the same thread as the training operations by
      # passing in None for summary_op to avoid a summary_thread being started.
      # Running summaries and training operations in parallel could run out of
      # GPU memory.
      sv = tf.train.Supervisor(is_chief=is_chief,
                               logdir=FLAGS.train_dir,
                               init_op=init_op,
                               summary_op=None,
                               global_step=global_step,
                               recovery_wait_secs=1,
                               saver=None,
                               save_model_secs=FLAGS.save_interval_secs)

      tf.logging.info('%s Supervisor' % datetime.now())

      sess_config = tf.ConfigProto(
          allow_soft_placement=True,
          log_device_placement=FLAGS.log_device_placement)

      # Get a session.
      sess = sv.prepare_or_wait_for_session(target, config=sess_config)

      # Start the queue runners.
      queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
      sv.start_queue_runners(sess, queue_runners)
      tf.logging.info('Started %d queues for processing input data.',
                      len(queue_runners))

#      if is_chief:
#        sv.start_queue_runners(sess, chief_queue_runners)
#        sess.run(init_tokens_op)

      # Train, checking for Nans. Concurrently run the summary operation at a
      # specified interval. Note that the summary_op and train_op never run
      # simultaneously in order to prevent running out of GPU memory.
#      next_summary_time = time.time() + FLAGS.save_summaries_secs
      step = 0
      time0 = time.time()
      batch_size_num = 1
      while not sv.should_stop():
        try:
          start_time = time.time()

          batch_size_num = 32
#	   batch_size_num = int((int(step)/3*10)) % 100000 + 1
#          if step < 5:
#            batch_size_num = 32 
#          batch_size_num = (batch_size_num ) % 64 + 1
#          else:
#            batch_size_num = 80

          run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
          run_metadata = tf.RunMetadata()

          my_images, loss_value, step = sess.run(
              [images, train_op, global_step],
              feed_dict={batch_size: batch_size_num},
              options=run_options,
              run_metadata=run_metadata)
          b = time.time()
#          assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
          if step > FLAGS.max_steps:
            break
          duration = time.time() - start_time
#	  thread = threading2.Thread(target=get_computation_time, name="get_computation_time",args=(run_metadata.step_stats,step,))
#	  thread.start()
#          tl = timeline.Timeline(run_metadata.step_stats)
#          last_batch_time = tl.get_local_step_duration('sync_token_q_Dequeue')
          c0 = time.time()
#          batch_size_num = batchSizeManager.dictate_new_batch_size(FLAGS.task_id, last_batch_time)
#          batch_size_num = rpcClient.update_batch_size(FLAGS.task_id, last_batch_time, available_cpu, available_memory, step, batch_size_num) 
#          ctf = tl.generate_chrome_trace_format()
#          with open("timeline.json", 'a') as f:
#            f.write(ctf)

          if step % 1 == 0:
            examples_per_sec = batch_size_num / float(duration)
            c = time.time()
            tf.logging.info("time statistics" + " - train_time: " + str(b-start_time) + " - get_batch_time: " + str(c0-b) + " - get_bs_time:  " + str(c-c0) + " - accum_time: " + str(c-time0) + " - batch_size: " + str(batch_size_num))
            format_str = ('Worker %d: %s: step %d, loss = %.2f'
                          '(%.1f examples/sec; %.3f  sec/batch)')
            tf.logging.info(format_str %
                            (FLAGS.task_id, datetime.now(), step, loss_value,
                             examples_per_sec, duration))

          # Determine if the summary_op should be run on the chief worker.
#          if is_chief and next_summary_time < time.time():
#            tf.logging.info('Running Summary operation on the chief.')
#            summary_str = sess.run(summary_op)
#            sv.summary_computed(sess, summary_str)
#            tf.logging.info('Finished running Summary operation.')

            # Determine the next time for running the summary.
#            next_summary_time += FLAGS.save_summaries_secs
        except:
          if is_chief:
            tf.logging.info('Chief got exception while running!')
          raise

      # Stop the supervisor.  This also waits for service threads to finish.
      sv.stop()
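The commented-out tracing lines above hint at how the run_metadata collected with FULL_TRACE can be inspected; a minimal sketch of that pattern (the file name timeline.json is just an example):

import tensorflow as tf
from tensorflow.python.client import timeline

# run_metadata comes from a sess.run(..., options=run_options,
# run_metadata=run_metadata) call such as the one in the training loop above.
def dump_chrome_trace(run_metadata, path='timeline.json'):
    tl = timeline.Timeline(run_metadata.step_stats)
    with open(path, 'w') as f:
        f.write(tl.generate_chrome_trace_format())

The resulting JSON can be loaded at chrome://tracing to see per-op device timelines.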
def train(target, dataset, cluster_spec):
    """Train Inception on a dataset for a number of steps."""
    # Number of workers and parameter servers are inferred from the workers and ps
    # hosts string.
    num_workers = len(cluster_spec.as_dict()['worker'])
    num_parameter_servers = len(cluster_spec.as_dict()['ps'])
    # If no value is given, num_replicas_to_aggregate defaults to be the number of
    # workers.
    if FLAGS.num_replicas_to_aggregate == -1:
        num_replicas_to_aggregate = num_workers
    else:
        num_replicas_to_aggregate = FLAGS.num_replicas_to_aggregate

    # Both should be greater than 0 in a distributed training.
    assert num_workers > 0 and num_parameter_servers > 0, (' num_workers and '
                                                           'num_parameter_servers'
                                                           ' must be > 0.')

    # Choose worker 0 as the chief. Note that any worker could be the chief
    # but there should be only one chief.
    is_chief = (FLAGS.task_id == 0)

    # Ops are assigned to worker by default.
    with tf.device('/job:worker/task:%d' % FLAGS.task_id):
        # Variables and its related init/assign ops are assigned to ps.
        with slim.scopes.arg_scope(
                [slim.variables.variable, slim.variables.global_step],
                device=slim.variables.VariableDeviceChooser(num_parameter_servers)):
            # Create a variable to count the number of train() calls. This equals the
            # number of updates applied to the variables.
            global_step = slim.variables.global_step()

            # Calculate the learning rate schedule.
            num_batches_per_epoch = (dataset.num_examples_per_epoch() /
                                     FLAGS.batch_size)
            # Decay steps need to be divided by the number of replicas to aggregate.
            decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay /
                              num_replicas_to_aggregate)

            # Decay the learning rate exponentially based on the number of steps.
            lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                            global_step,
                                            decay_steps,
                                            FLAGS.learning_rate_decay_factor,
                                            staircase=True)
            # Add a summary to track the learning rate.
            tf.summary.scalar('learning_rate', lr)

            # Create an optimizer that performs gradient descent.
            opt = tf.train.RMSPropOptimizer(lr,
                                            RMSPROP_DECAY,
                                            momentum=RMSPROP_MOMENTUM,
                                            epsilon=RMSPROP_EPSILON)

            images, labels = image_processing.distorted_inputs(
                dataset,
                batch_size=FLAGS.batch_size,
                num_preprocess_threads=FLAGS.num_preprocess_threads)

            # Number of classes in the Dataset label set plus 1.
            # Label 0 is reserved for an (unused) background class.
            num_classes = dataset.num_classes() + 1
            logits = inception.inference(images, num_classes, for_training=True)
            # Add classification loss.
            inception.loss(logits, labels)

            # Gather all of the losses including regularization losses.
            losses = tf.get_collection(slim.losses.LOSSES_COLLECTION)
            losses += tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)

            total_loss = tf.add_n(losses, name='total_loss')

            if is_chief:
                # Compute the moving average of all individual losses and the
                # total loss.
                loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
                loss_averages_op = loss_averages.apply(losses + [total_loss])

                # Attach a scalar summary to all individual losses and the total loss;
                # do the same for the averaged version of the losses.
                for l in losses + [total_loss]:
                    loss_name = l.op.name
                    # Name each loss as '(raw)' and name the moving average version of the
                    # loss as the original loss name.
                    tf.summary.scalar(loss_name + ' (raw)', l)
                    tf.summary.scalar(loss_name, loss_averages.average(l))

                # Add dependency to compute loss_averages.
                with tf.control_dependencies([loss_averages_op]):
                    total_loss = tf.identity(total_loss)

            # Track the moving averages of all trainable variables.
            # Note that we maintain a 'double-average' of the BatchNormalization
            # global statistics.
            # This is not needed when the number of replicas are small but important
            # for synchronous distributed training with tens of workers/replicas.
            exp_moving_averager = tf.train.ExponentialMovingAverage(
                inception.MOVING_AVERAGE_DECAY, global_step)

            variables_to_average = (
                tf.trainable_variables() + tf.moving_average_variables())

            # Add histograms for model variables.
            for var in variables_to_average:
                tf.summary.histogram(var.op.name, var)

            # Create synchronous replica optimizer.
            opt = tf.train.SyncReplicasOptimizer(
                opt,
                replicas_to_aggregate=num_replicas_to_aggregate,
                replica_id=FLAGS.task_id,
                total_num_replicas=num_workers,
                variable_averages=exp_moving_averager,
                variables_to_average=variables_to_average)

            batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION)
            assert batchnorm_updates, 'Batchnorm updates are missing'
            batchnorm_updates_op = tf.group(*batchnorm_updates)
            # Add dependency to compute batchnorm_updates.
            with tf.control_dependencies([batchnorm_updates_op]):
                total_loss = tf.identity(total_loss)

            # Compute gradients with respect to the loss.
            grads = opt.compute_gradients(total_loss)

            # Add histograms for gradients.
            for grad, var in grads:
                if grad is not None:
                    tf.summary.histogram(var.op.name + '/gradients', grad)

            apply_gradients_op = opt.apply_gradients(grads, global_step=global_step)

            with tf.control_dependencies([apply_gradients_op]):
                train_op = tf.identity(total_loss, name='train_op')

            # Get chief queue_runners, init_tokens and clean_up_op, which is used to
            # synchronize replicas.
            # More details can be found in sync_replicas_optimizer.
            chief_queue_runners = [opt.get_chief_queue_runner()]
            init_tokens_op = opt.get_init_tokens_op()
            clean_up_op = opt.get_clean_up_op()

            # Create a saver.
            saver = tf.train.Saver()

            # Build the summary operation based on the TF collection of Summaries.
            summary_op = tf.summary.merge_all()

            # Build an initialization operation to run below.
            init_op = tf.global_variables_initializer()

            # We run the summaries in the same thread as the training operations by
            # passing in None for summary_op to avoid a summary_thread being started.
            # Running summaries and training operations in parallel could run out of
            # GPU memory.
            sv = tf.train.Supervisor(is_chief=is_chief,
                                     logdir=FLAGS.train_dir,
                                     init_op=init_op,
                                     summary_op=None,
                                     global_step=global_step,
                                     saver=saver,
                                     save_model_secs=FLAGS.save_interval_secs)

            tf.logging.info('%s Supervisor' % datetime.now())

            sess_config = tf.ConfigProto(
                allow_soft_placement=True,
                log_device_placement=FLAGS.log_device_placement)

            # Get a session.
            sess = sv.prepare_or_wait_for_session(target, config=sess_config)

            # Start the queue runners.
            queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
            sv.start_queue_runners(sess, queue_runners)
            tf.logging.info('Started %d queues for processing input data.',
                            len(queue_runners))

            if is_chief:
                sv.start_queue_runners(sess, chief_queue_runners)
                sess.run(init_tokens_op)

            # Train, checking for Nans. Concurrently run the summary operation at a
            # specified interval. Note that the summary_op and train_op never run
            # simultaneously in order to prevent running out of GPU memory.
            next_summary_time = time.time() + FLAGS.save_summaries_secs
            while not sv.should_stop():
                try:
                    start_time = time.time()
                    loss_value, step = sess.run([train_op, global_step])
                    assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
                    if step > FLAGS.max_steps:
                        break
                    duration = time.time() - start_time

                    if step % 30 == 0:
                        examples_per_sec = FLAGS.batch_size / float(duration)
                        format_str = ('Worker %d: %s: step %d, loss = %.2f'
                                      '(%.1f examples/sec; %.3f  sec/batch)')
                        tf.logging.info(format_str %
                                        (FLAGS.task_id, datetime.now(), step, loss_value,
                                         examples_per_sec, duration))

                    # Determine if the summary_op should be run on the chief worker.
                    if is_chief and next_summary_time < time.time():
                        tf.logging.info('Running Summary operation on the chief.')
                        summary_str = sess.run(summary_op)
                        sv.summary_computed(sess, summary_str)
                        tf.logging.info('Finished running Summary operation.')

                        # Determine the next time for running the summary.
                        next_summary_time += FLAGS.save_summaries_secs
                except:
                    if is_chief:
                        tf.logging.info('About to execute sync_clean_up_op!')
                        sess.run(clean_up_op)
                    raise

            # Stop the supervisor.  This also waits for service threads to finish.
            sv.stop()

            # Save after the training ends.
            if is_chief:
                saver.save(sess,
                           os.path.join(FLAGS.train_dir, 'model.ckpt'),
                           global_step=global_step)
def train():
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Create a variable to count the number of train() calls. This equals the
        # number of batches processed * FLAGS.num_gpus.
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        # Learning rate
        lr = 0.001

        # Create an optimizer that performs gradient descent.
        opt = tf.train.AdamOptimizer(lr)

        # Get images and labels for ImageNet.
        dataset = ImagenetData(subset='train')
        assert dataset.data_files()

        assert helper.BATCH_SIZE % helper.N_GPUS == 0, (
            'Batch size must be divisible by number of GPUs')
        split_batch_size = int(helper.BATCH_SIZE / helper.N_GPUS)

        # Override the number of preprocessing threads to account for the increased
        # number of GPU towers.
        num_preprocess_threads = helper.NUM_THREADS * helper.N_GPUS
        images, labels = image_processing.distorted_inputs(
            dataset,
            batch_size=helper.BATCH_SIZE,
            num_preprocess_threads=num_preprocess_threads)

        # Split the batch of images and labels for towers.
        images_splits = tf.split(axis=0,
                                 num_or_size_splits=helper.N_GPUS,
                                 value=images)
        labels_splits = tf.split(axis=0,
                                 num_or_size_splits=helper.N_GPUS,
                                 value=labels)

        # Calculate the gradients for each model tower.
        tower_grads = []
        with tf.variable_scope(tf.get_variable_scope()):
            for i in range(helper.N_GPUS):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('%s_%d' %
                                       (helper.TOWER_NAME, i)) as scope:
                        # Calculate the loss for one tower of the model. This
                        # function constructs the entire model but shares the
                        # variables across all towers.
                        loss = tower_loss(scope, images_splits[i],
                                          labels_splits[i])

                        tf.get_variable_scope().reuse_variables()

                        grads = opt.compute_gradients(loss)

                        tower_grads.append(grads)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = average_gradients(tower_grads)

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # Track the moving averages of all trainable variables.
        variable_averages = tf.train.ExponentialMovingAverage(
            helper.MOVING_AVERAGE_DECAY, global_step)
        variables_averages_op = variable_averages.apply(
            tf.trainable_variables())

        # Group all updates to into a single train op.
        train_op = tf.group(apply_gradient_op, variables_averages_op)

        # Build an initialization operation to run below.
        init = tf.global_variables_initializer()

        # Start running operations on the Graph. allow_soft_placement must be set to
        # True to build towers on GPU, as some of the ops do not have GPU
        # implementations.
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                                log_device_placement=False))
        sess.run(init)
        tf.train.start_queue_runners(sess=sess)
        print("training")

        #for epoch in range(helper.MAX_EPOCH):
        for step in range(helper.MAX_STEPS):

            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            # BATCH_SIZE is the global batch, already split across the towers.
            num_examples_per_step = helper.BATCH_SIZE
            examples_per_sec = num_examples_per_step / duration
            sec_per_batch = duration / helper.N_GPUS

            format_str = (
                '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)')
            print(format_str % (datetime.now(), step, loss_value,
                                examples_per_sec, sec_per_batch))
Example #27
0
def train_dis_(dataset):
    ps_hosts = arg_parsing.PS_HOSTS.split(",")
    worker_hosts = arg_parsing.WORKER_HOSTS.split(",")
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_index)
    if FLAGS.job_name == "ps":
        server.join()
    if FLAGS.job_name == "worker":
        print('START')
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % FLAGS.task_index,
                    cluster=cluster)):
            global_step = tf.Variable(0,
                                      dtype=tf.int32,
                                      name='global_step',
                                      trainable=False)

            lr = _lr(global_step)
            with tf.name_scope("train_process"):
                with tf.device('/cpu:0'):
                    images, labels = image_processing.distorted_inputs(
                        dataset,
                        num_preprocess_threads=FLAGS.num_preprocess_threads)
                logits = _logits(images)
                loss = _loss(logits, labels)
                train_op = _optimization(loss, global_step, lr, FLAGS.issync,
                                         len(worker_hosts))
#            with tf.name_scope("global_step"):
#                tf.summary.scalar('global_step', global_step)

            val_acc_sum = val(loss, dataset)

            class _LoggerHook(tf.train.SessionRunHook):
                def begin(self):
                    self._local_step = 0
                    self._start_time = time.time()
                    self._total_loss = 0

                def before_run(self, run_context):
                    self._local_step += 1
                    return tf.train.SessionRunArgs(loss)

                def after_run(self, run_context, run_values):
                    self._step = run_context.session.run(global_step)
                    loss_value = run_values.results

                    self._total_loss += loss_value
                    if self._step % FLAGS.log_frequency == 0:
                        current_time = time.time()
                        duration = current_time - self._start_time
                        self._start_time = current_time
                        avg_loss = self._total_loss / self._local_step
                        eg_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
                        sec_per_batch = float(duration / FLAGS.log_frequency)
                        print(
                            '%s: training step %d cur loss = %.4f avg loss = %.4f (%.1f images/sec %.3f sec/batch)'
                            % (datetime.now(), self._step, loss_value,
                               avg_loss, eg_per_sec, sec_per_batch))

            class _ValHook(tf.train.SessionRunHook):
                def begin(self):
                    #                    self._step = 0
                    self._val_step = int(
                        math.ceil(arg_parsing.NUM_EXAMPLES_PER_EPOCH_FOR_EVAL /
                                  FLAGS.batch_size))

                def before_run(self, run_context):
                    #                    self._step += 1
                    self._total_val_accu = 0
                    self._start_time = time.time()

                def after_run(self, run_context, run_values):
                    #                    if FLAGS.issync:
                    self._step = run_context.session.run(global_step)
                    if self._step % FLAGS.steps_to_val == 0:
                        if (FLAGS.task_index == 0
                                and FLAGS.issync) or not FLAGS.issync:
                            for j in range(self._val_step):
                                self._total_val_accu += run_context.session.run(
                                    val_acc_sum)
                            print(
                                '%s: step %d validation accuracy = %.4f (%.3f sec %d batches)'
                                %
                                (datetime.now(), self._step,
                                 self._total_val_accu / float(self._val_step),
                                 float(time.time() - self._start_time),
                                 self._val_step))

            class _ExitHook(tf.train.SessionRunHook):  # same as StopAtStepHook
                def begin(self):
                    self._val_step = int(
                        math.ceil(arg_parsing.NUM_EXAMPLES_PER_EPOCH_FOR_EVAL /
                                  FLAGS.batch_size))

                def before_run(self, run_context):
                    self._total_val_accu = 0
                    self._start_time = time.time()

                def after_run(self, run_context, run_values):
                    self._step = run_context.session.run(global_step)
                    if self._step >= FLAGS.max_steps:
                        if FLAGS.task_index == 0 and not FLAGS.issync:
                            for j in range(self._val_step * 2):
                                self._total_val_accu += run_context.session.run(
                                    val_acc_sum)
                            print(
                                '%s: last step %d validation final accuracy = %.4f (%.3f sec(2 times) %d batches)'
                                % (datetime.now(), self._step,
                                   self._total_val_accu /
                                   float(self._val_step * 2),
                                   float(time.time() - self._start_time),
                                   self._val_step))
                        run_context.request_stop()

#            all_hooks=[tf.train.NanTensorHook(loss), tf.train.StopAtStepHook(last_step=FLAGS.max_steps), _LoggerHook(), _ValHook()]

            all_hooks = [
                tf.train.NanTensorHook(loss),
                _LoggerHook(),
                _ValHook(),
                _ExitHook()
            ]
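            # sync_replicas_hook is assumed to be created inside
            # _optimization() (e.g. via SyncReplicasOptimizer.
            # make_session_run_hook) and exposed to this scope; it is not
            # defined in this snippet.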
            if FLAGS.issync:
                all_hooks.append(sync_replicas_hook)
            if FLAGS.debug:
                all_hooks.append(tfdbg.LocalCLIDebugHook(ui_type='curses'))
            if FLAGS.finetune:
                print('Finetune from %s' % FLAGS.finetune)
                saver = tf.train.Saver()
            config = tf.ConfigProto(
                log_device_placement=FLAGS.log_device_placement)
            config.gpu_options.allow_growth = True
            with tf.train.MonitoredTrainingSession(
                    master=server.target,
                    is_chief=(FLAGS.task_index == 0),
                    checkpoint_dir=FLAGS.model_dir,
                    hooks=all_hooks,
                    config=config,
                    save_summaries_steps=100,
                    save_summaries_secs=None,
                    log_step_count_steps=None) as sess:
                if FLAGS.finetune:
                    print('Load Pretrained model')
                    ckpt = tf.train.get_checkpoint_state(FLAGS.finetune)
                    if ckpt and ckpt.model_checkpoint_path:
                        saver.restore(sess, ckpt.model_checkpoint_path)
                    print('-------------------------')
                while not sess.should_stop():
                    sess.run(train_op)
                print('DONE')
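_optimization and the sync_replicas_hook it is expected to publish are not part of this snippet; a minimal sketch under those assumptions (the optimizer choice and the module-level hook are illustrative, not the author's actual helper):

import tensorflow as tf

sync_replicas_hook = None  # published for train_dis_ when issync is set

def _optimization(loss, global_step, lr, issync, num_workers):
    global sync_replicas_hook
    opt = tf.train.GradientDescentOptimizer(lr)  # assumed base optimizer
    if issync:
        # Aggregate gradients from all workers before applying an update.
        opt = tf.train.SyncReplicasOptimizer(
            opt,
            replicas_to_aggregate=num_workers,
            total_num_replicas=num_workers)
        sync_replicas_hook = opt.make_session_run_hook(FLAGS.task_index == 0)
    return opt.minimize(loss, global_step=global_step)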
def train(dataset):
  """Train on dataset for a number of steps."""
  # with tf.Graph().as_default(), tf.device('/cpu:0'):
  with tf.Graph().as_default():

    global_step = tf.Variable(0, trainable=False)

    num_preprocess_threads = FLAGS.num_preprocess_threads * FLAGS.num_gpus
    with tf.device('/cpu:0'):
      images, pitchs, yaws, rolls, names = image_processing.distorted_inputs(
        dataset,
        num_preprocess_threads=num_preprocess_threads)
    
    p = tf.expand_dims(pitchs, 1)
    y = tf.expand_dims(yaws, 1)
    r = tf.expand_dims(rolls, 1)
    labels = tf.concat([p, y, r], 1)
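    # labels: [batch_size, 3] tensor of (pitch, yaw, roll) regression targets.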

    train_output = model.inference(images)
    train_loss = model.losses(train_output, labels) 

    add_global = global_step.assign_add(1)  
       
    train_op = model.trainning(train_loss, FLAGS.learning_rate, global_step)
   
    summary_op = tf.summary.merge_all()
    sess = tf.Session()
    train_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)
    saver = tf.train.Saver()
    
    sess.run(tf.global_variables_initializer())
    
    """
    These lines inspect the regularization losses and the conv1 variables:

    print(sess.run(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)))
    w = tf.contrib.framework.get_variables('conv1')
    t = tf.nn.l2_loss(w[0])
    print(sess.run(t))
    """

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    
    try:
        for step in np.arange(FLAGS.max_steps):
            if coord.should_stop():
                break
            _, _, tra_loss = sess.run([add_global, train_op, train_loss])
               
            if step % 50 == 0:
                gs = sess.run(global_step)
                print('Step %d, train loss = %.2f, global_step = %d' % (step, tra_loss, gs))
                summary_str = sess.run(summary_op)
                train_writer.add_summary(summary_str, step)
            
            if step % 2000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
                
    except tf.errors.OutOfRangeError:
        print('Done training -- epoch limit reached')
    finally:
        coord.request_stop()
        
    coord.join(threads)
    sess.close()

    # coord = tf.train.Coordinator()
    # threads = tf.train.start_queue_runners(sess=sess,coord=coord)
    # try:
    #   print(sess.run(pitchs))
    # except Exception as e:
    #   coord.request_stop(e)
    # coord.request_stop()
    # coord.join(threads)
    # sess.close()
    

    # sv = tf.train.Supervisor()
    # with sv.managed_session() as sess:
    #   print(sess.sun(pitchs))
    
Example #29
0
def main(_):
    print(FLAGS.num_preprocess_threads)
    trainset = GoodsData('train')
    # assert trainset.data_files()
    validationset = GoodsData('validation')
    assert validationset.data_files()

    # get_tuned_variables()
    # get_trainable_variables()

    # num_batches_per_epoch = (trainset.num_examples_per_epoch() /
    #                          FLAGS.batch_size)
    num_preprocess_threads = FLAGS.num_preprocess_threads * FLAGS.num_gpus
    images_train, labels_train = image_processing.distorted_inputs(
        trainset, num_preprocess_threads=num_preprocess_threads)
    images_validation, labels_validation = image_processing.distorted_inputs(
        validationset,
        batch_size=64,
        num_preprocess_threads=num_preprocess_threads)
    # images_splits = tf.split(axis=0, num_or_size_splits=FLAGS.num_gpus, value=images)
    # labels_splits = tf.split(axis=0, num_or_size_splits=FLAGS.num_gpus, value=labels)

    input_summaries = copy.copy(tf.get_collection(tf.GraphKeys.SUMMARIES))

    # Number of classes in the Dataset label set plus 1.
    # Label 0 is reserved for an (unused) background class.
    num_classes = trainset.num_classes() + 1
    # print(images_train.shape)
    # print(labels_train.shape)
    images = tf.placeholder(tf.float32, [None, images_train.shape[1], images_train.shape[2], 3], name="input_images")
    labels = tf.placeholder(tf.int64, [None], name="labels")
    with slim.arg_scope(inception_v3.inception_v3_arg_scope()):
        logits, _ = inception_v3.inception_v3(images, num_classes=num_classes)


    tuned_variables = get_all_variables()
    trainable_variables = get_all_variables()
    checkpoint_path = FLAGS.pretrained_model_checkpoint_path

    # Compute the accuracy.
    with tf.name_scope("evaluation"):
        prediction = tf.argmax(logits, 1)
        correct_prediction = tf.equal(prediction, labels)
        evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    # Load the pretrained weights.
    checkpoint_path = tf.train.latest_checkpoint(checkpoint_path)
    load_fn = slim.assign_from_checkpoint_fn(checkpoint_path, tuned_variables, ignore_missing_vars=True)
    # Used to store the fine-tuned weights.
    # print(get_tuned_variables())
    # saver = tf.train.Saver()


    config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=FLAGS.log_device_placement
    )
    config.gpu_options.allow_growth = True

    sess = tf.Session(config=config)

    # with tf.Session(config=config) as sess:
    # sess.as_default()
    init = tf.global_variables_initializer()
    sess.run(init)

    print("loading tuned variables from %s" % checkpoint_path)
    load_fn(sess)
    # sess.run(load_fn)
    # coord = tf.train.Coordinator()
    # threads = tf.train.start_queue_runners(coord=coord)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    # tf.train.batch
    # start = 0
    # end = FLAGS.batch_size

    # if tf.gfile.Exists(FLAGS.train_dir):
    #     tf.gfile.DeleteRecursively(FLAGS.train_dir)
    # tf.gfile.MakeDirs(FLAGS.train_dir)

    for step in range(FLAGS.max_steps):
        # print(0)
        start_time = time.time()

        image_batch, label_batch = sess.run([images_validation, labels_validation])
        validation_accuracy = sess.run(evaluation_step,
                                       feed_dict={images: image_batch,
                                                  labels: label_batch})
        label_prediction = sess.run(prediction,
                                    feed_dict={images: image_batch,
                                               labels: label_batch})
        print(label_prediction)
        print('Step %d: Validation accuracy = %.1f%%' % (step, validation_accuracy * 100.0))
        duration = time.time() - start_time