Example #1
def evaluate(hps):
  """Eval loop."""
  images, labels = hwdb_input.build_input(
      FLAGS.dataset, FLAGS.eval_data_path, hps.batch_size, FLAGS.mode)
  model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
  model.build_graph()
  saver = tf.train.Saver()
  summary_writer = tf.summary.FileWriter(FLAGS.eval_dir)

  config = tf.ConfigProto(allow_soft_placement=True)
  config.gpu_options.allow_growth = True
  sess = tf.Session(config=config)
  tf.train.start_queue_runners(sess)

  best_precision = 0.0
  while True:
    try:
      ckpt_state = tf.train.get_checkpoint_state(FLAGS.log_root)
    except tf.errors.OutOfRangeError as e:
      tf.logging.error('Cannot restore checkpoint: %s', e)
      continue
    if not (ckpt_state and ckpt_state.model_checkpoint_path):
      tf.logging.info('No model to eval yet at %s', FLAGS.log_root)
      time.sleep(60)
      continue
    tf.logging.info('Loading checkpoint %s', ckpt_state.model_checkpoint_path)
    saver.restore(sess, ckpt_state.model_checkpoint_path)

    total_prediction, correct_prediction = 0, 0
    for i in six.moves.range(FLAGS.eval_batch_count):
      (summaries, loss, predictions, truth, train_step) = sess.run(
          [model.summaries, model.cost, model.predictions,
           model.labels, model.global_step])

      truth = np.argmax(truth, axis=1)
      predictions = np.argmax(predictions, axis=1)
      correct_prediction += np.sum(truth == predictions)
      total_prediction += predictions.shape[0]
      # tf.logging.info('iter: %d' % i)

    precision = 1.0 * correct_prediction / total_prediction
    best_precision = max(precision, best_precision)

    precision_summ = tf.Summary()
    precision_summ.value.add(
        tag='Precision', simple_value=precision)
    summary_writer.add_summary(precision_summ, train_step)
    best_precision_summ = tf.Summary()
    best_precision_summ.value.add(
        tag='Best Precision', simple_value=best_precision)
    summary_writer.add_summary(best_precision_summ, train_step)
    summary_writer.add_summary(summaries, train_step)
    tf.logging.info('loss: %.3f, precision: %.3f, best precision: %.3f' %
                    (loss, precision, best_precision))
    summary_writer.flush()

    if FLAGS.eval_once:
      break

    time.sleep(60)
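Note: the eval loop above assumes a driver that defines the flags and builds the
hyperparameters before calling evaluate(hps). A minimal sketch, with flag names
and HParams fields inferred from the snippets on this page rather than from a
canonical source:

import tensorflow as tf

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('dataset', 'cifar10', 'Dataset name.')
tf.app.flags.DEFINE_string('eval_data_path', '', 'Path to the eval data.')
tf.app.flags.DEFINE_string('eval_dir', '/tmp/resnet/eval', 'Eval event-file dir.')
tf.app.flags.DEFINE_string('log_root', '/tmp/resnet', 'Checkpoint directory.')
tf.app.flags.DEFINE_string('mode', 'eval', 'train or eval.')
tf.app.flags.DEFINE_integer('eval_batch_count', 50, 'Batches per eval pass.')
tf.app.flags.DEFINE_bool('eval_once', False, 'Evaluate once and exit.')


def main(_):
  hps = resnet_model.HParams(batch_size=100,
                             num_classes=10,
                             min_lrn_rate=0.0001,
                             lrn_rate=0.1,
                             num_residual_units=5,
                             use_bottleneck=False,
                             weight_decay_rate=0.0002,
                             relu_leakiness=0.1,
                             optimizer='mom')
  evaluate(hps)


if __name__ == '__main__':
  tf.app.run()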
Example #2
def evaluate(hps, num_iterations, dataset):
    total_acc = 0.0
    print('Loading trained network, please wait......')

    # input data
    images, labels = resnet_input.input(dataset, hps.batch_size, 'eval')

    # resnet model
    model = resnet_model.ResNet(hps, images, labels, 'eval')
    model.build_graph()

    # run session
    coord = tf.train.Coordinator()
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.per_process_gpu_memory_fraction = 0.8
    config.gpu_options.allow_growth = True
    saver = tf.train.Saver()
    with tf.Session(config=config) as sess:
        queue_threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        saver.restore(sess, './model/model.ckpt')
        for i in range(num_iterations):
            acc = sess.run(model.accuracy)
            total_acc += acc
        total_acc /= num_iterations
        print('Total accuracy on test set is %.2f' % total_acc)
        coord.request_stop()
        coord.join(queue_threads)
Example #3
    def __init__(self, data, eval_batch_count):
        hps = resnet_model.HParams(batch_size=100,
                                   num_classes=10,
                                   min_lrn_rate=0.0001,
                                   lrn_rate=0.1,
                                   num_residual_units=5,
                                   use_bottleneck=False,
                                   weight_decay_rate=0.0002,
                                   relu_leakiness=0.1,
                                   optimizer='mom',
                                   num_gpus=0)
        data = ray.get(data)
        total_images = np.concatenate([data[0], data[1], data[2]])
        with tf.Graph().as_default():
            with tf.device('/cpu:0'):
                images, labels = cifar_input.build_input(
                    [total_images, data[3]], hps.batch_size, False)
                self.model = resnet_model.ResNet(hps, images, labels, 'eval')
                self.model.build_graph()
                config = tf.ConfigProto(allow_soft_placement=True)
                sess = tf.Session(config=config)
                self.model.variables.set_session(sess)
                self.coord = tf.train.Coordinator()
                tf.train.start_queue_runners(sess, coord=self.coord)
                init = tf.global_variables_initializer()
                sess.run(init)
                self.best_precision = 0.0
                self.eval_batch_count = eval_batch_count
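Note: this constructor (like those in Examples #4 and #5) reads as a method of a
Ray actor class. A hedged sketch of how such an actor might be declared and
created; the class name and the placeholder data are illustrative, not from the
original source:

import numpy as np
import ray

ray.init()

@ray.remote
class ResNetEvalActor(object):
    def __init__(self, data, eval_batch_count):
        # Body as in Example #3: build HParams, the input pipeline, and the
        # eval-mode ResNet under a fresh tf.Graph.
        pass

data_id = ray.put([np.zeros((8, 32, 32, 3))] * 4)  # placeholder dataset
actor = ResNetEvalActor.remote(data_id, 50)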
Example #4
    def __init__(self, data, dataset, eval_batch_count, eval_dir):
        os.environ["CUDA_VISIBLE_DEVICES"] = ""
        hps = resnet_model.HParams(
            batch_size=100,
            num_classes=100 if dataset == "cifar100" else 10,
            min_lrn_rate=0.0001,
            lrn_rate=0.1,
            num_residual_units=5,
            use_bottleneck=False,
            weight_decay_rate=0.0002,
            relu_leakiness=0.1,
            optimizer="mom",
            num_gpus=0)
        with tf.device("/cpu:0"):
            # Builds the testing network.
            images, labels = cifar_input.build_input(data,
                                                     hps.batch_size, dataset,
                                                     False)
            self.model = resnet_model.ResNet(hps, images, labels, "eval")
            self.model.build_graph()
            config = tf.ConfigProto(allow_soft_placement=True)
            config.gpu_options.allow_growth = True
            sess = tf.Session(config=config)
            self.model.variables.set_session(sess)
            init = tf.global_variables_initializer()
            sess.run(init)

            # Initializing parameters for tensorboard.
            self.best_precision = 0.0
            self.eval_batch_count = eval_batch_count
            self.summary_writer = tf.summary.FileWriter(eval_dir, sess.graph)
        # The IP address where the TensorBoard logs will be served.
        self.ip_addr = ray.services.get_node_ip_address()
Example #5
    def __init__(self, data, num_gpus):
        if num_gpus > 0:
            os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(
                [str(i) for i in ray.get_gpu_ids()])
        hps = resnet_model.HParams(batch_size=128,
                                   num_classes=10,
                                   min_lrn_rate=0.0001,
                                   lrn_rate=0.1,
                                   num_residual_units=5,
                                   use_bottleneck=False,
                                   weight_decay_rate=0.0002,
                                   relu_leakiness=0.1,
                                   optimizer='mom',
                                   num_gpus=num_gpus)
        data = ray.get(data)
        total_images = np.concatenate([data[0], data[1], data[2]])
        with tf.Graph().as_default():
            if num_gpus > 0:
                tf.set_random_seed(ray.get_gpu_ids()[0] + 1)
            else:
                tf.set_random_seed(1)

            with tf.device('/gpu:0' if num_gpus > 0 else '/cpu:0'):
                images, labels = cifar_input.build_input(
                    [total_images, data[3]], hps.batch_size, True)
                self.model = resnet_model.ResNet(hps, images, labels, 'train')
                self.model.build_graph()
                config = tf.ConfigProto(allow_soft_placement=True)
                sess = tf.Session(config=config)
                self.model.variables.set_session(sess)
                self.coord = tf.train.Coordinator()
                tf.train.start_queue_runners(sess, coord=self.coord)
                init = tf.global_variables_initializer()
                sess.run(init)
Example #6
def evaluate(hps):
    """Eval loop."""
    images, labels = cifar_input.build_input('cifar10', FLAGS.eval_data_path,
                                             hps.batch_size, 'eval')
    model = resnet_model.ResNet(hps, images, labels, 'eval')
    model.build_graph()

    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))

    saver = tf.train.Saver()

    ##################################
    ## FIXME: Make a summary writer ##
    ##################################
    summary_writer = tf.summary.FileWriter(FLAGS.eval_dir)

    ckpt_state = None
    try:
        ckpt_state = tf.train.get_checkpoint_state(FLAGS.ckpt_dir)
    except tf.errors.OutOfRangeError as e:
        tf.logging.error('Cannot restore checkpoint: %s', e)
    if not ckpt_state:
        tf.logging.info('No model to eval yet at %s', FLAGS.ckpt_dir)
        return

    best_precision = 0.
    for i in range(len(ckpt_state.all_model_checkpoint_paths)):
        tf.logging.info('Loading checkpoint %s',
                        ckpt_state.all_model_checkpoint_paths[i])
        saver.restore(sess, ckpt_state.all_model_checkpoint_paths[i])
        total_prediction, correct_prediction = 0, 0

        for _ in six.moves.range(FLAGS.eval_batch_count):
            (summaries, loss, predictions, truth, train_step) = sess.run([
                model.summaries, model.cost, model.predictions, model.labels,
                model.global_step
            ])

            truth = np.argmax(truth, axis=1)
            predictions = np.argmax(predictions, axis=1)
            correct_prediction += np.sum(truth == predictions)
            total_prediction += predictions.shape[0]

        precision = 1.0 * correct_prediction / total_prediction
        best_precision = max(precision, best_precision)

        ########################################################
        ## FIXME: Add summary of precision and best precision ##
        ########################################################
        summ_precision = tf.Summary()
        summ_precision.value.add(tag='precision', simple_value=precision)
        summary_writer.add_summary(summ_precision, train_step)

        summ_best_precision = tf.Summary()
        summ_best_precision.value.add(tag='best_precision',
                                      simple_value=best_precision)
        summary_writer.add_summary(summ_best_precision, train_step)

        tf.logging.info('loss: %.3f, precision: %.3f, best precision: %.3f' %
                        (loss, precision, best_precision))
        summary_writer.flush()
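Note: the filled-in FIXME blocks above rely on writing scalars without any graph
op, by constructing the Summary proto directly. A self-contained sketch of just
that pattern:

import tensorflow as tf

writer = tf.summary.FileWriter('/tmp/eval_demo')
summ = tf.Summary()
summ.value.add(tag='precision', simple_value=0.91)
writer.add_summary(summ, global_step=100)
writer.flush()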
Example #7
def train(hps):
  """Training loop."""
  images, labels = cifar_input.build_input(
      FLAGS.dataset, FLAGS.train_data_path, hps.batch_size, FLAGS.mode)
  model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
  model.build_graph()
  summary_writer = tf.train.SummaryWriter(FLAGS.train_dir)

  sv = tf.train.Supervisor(logdir=FLAGS.log_root,
                           is_chief=True,
                           summary_op=None,
                           save_summaries_secs=60,
                           save_model_secs=300,
                           global_step=model.global_step)
  sess = sv.prepare_or_wait_for_session()

  step = 0
  total_prediction = 0
  correct_prediction = 0
  precision = 0.0
  lrn_rate = 0.1

  while not sv.should_stop():
    (_, summaries, loss, predictions, truth, train_step) = sess.run(
        [model.train_op, model.summaries, model.cost, model.predictions,
         model.labels, model.global_step],
        feed_dict={model.lrn_rate: lrn_rate})

    if train_step < 40000:
      lrn_rate = 0.1
    elif train_step < 60000:
      lrn_rate = 0.01
    elif train_step < 80000:
      lrn_rate = 0.001
    else:
      lrn_rate = 0.0001

    predictions = np.argmax(predictions, axis=1)
    truth = np.argmax(truth, axis=1)
    for (t, p) in zip(truth, predictions):
      if t == p:
        correct_prediction += 1
      total_prediction += 1
    precision = float(correct_prediction) / total_prediction
    correct_prediction = total_prediction = 0

    step += 1
    if step % 100 == 0:
      precision_summ = tf.Summary()
      precision_summ.value.add(
          tag='Precision', simple_value=precision)
      summary_writer.add_summary(precision_summ, train_step)
      summary_writer.add_summary(summaries, train_step)
      tf.logging.info('loss: %.3f, precision: %.3f\n' % (loss, precision))
      summary_writer.flush()

  sv.stop()
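Note: tf.train.Supervisor is the pre-1.0 API; later snippets (Example #8 below)
use tf.train.MonitoredTrainingSession instead. A minimal sketch of the
equivalent session setup for this loop, assuming the same model object and
flags (learning-rate schedule omitted):

with tf.train.MonitoredTrainingSession(
        checkpoint_dir=FLAGS.log_root,
        save_checkpoint_secs=300,
        save_summaries_steps=None,
        config=tf.ConfigProto(allow_soft_placement=True)) as mon_sess:
    while not mon_sess.should_stop():
        mon_sess.run(model.train_op, feed_dict={model.lrn_rate: lrn_rate})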
Example #8
def train(hps):
    trainset = input.ImageSet(FLAGS.train_data)
    images, labels, _ = trainset.next_batch(FLAGS.batch_size)
    # images, labels = cifar_input.build_input(
    # FLAGS.dataset, FLAGS.train_data_path, hps.batch_size, FLAGS.mode)
    model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
    model.build_graph()

    truth = model.labels
    predictions = tf.argmax(model.predictions, axis=1)
    precision = tf.reduce_mean(tf.to_float(tf.equal(predictions, truth)))

    summary_hook = tf.train.SummarySaverHook(
        save_steps=10,
        output_dir=FLAGS.train_dir,
        summary_op=tf.summary.merge(
            [model.summaries,
             tf.summary.scalar('Precision', precision)]))
    logging_hook = tf.train.LoggingTensorHook(
        tensors={'step': model.global_step,
                 'loss': model.cost,
                 'precision': precision},
        every_n_iter=10)

    class _LearningRateSetterHook(tf.train.SessionRunHook):

        def begin(self):
            self._lrn_rate = 0.1

        def before_run(self, run_context):
            return tf.train.SessionRunArgs(
                model.global_step,
                feed_dict={model.lrn_rate: self._lrn_rate})

        def after_run(self, run_context, run_values):

            train_step = run_values.results
            if train_step < 40000:
                self._lrn_rate = 0.1
            elif train_step < 60000:
                self._lrn_rate = 0.01
            elif train_step < 80000:
                self._lrn_rate = 0.001
            else:
                self._lrn_rate = 0.0001

    with tf.train.MonitoredTrainingSession(
            checkpoint_dir=FLAGS.log_root,
            hooks=[logging_hook, _LearningRateSetterHook()],
            chief_only_hooks=[summary_hook],
            save_summaries_steps=0,
            config=tf.ConfigProto(allow_soft_placement=True)) as mon_sess:
        while not mon_sess.should_stop():
            mon_sess.run(model.train_op)
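Note: _LearningRateSetterHook feeds a fresh learning rate into every run call.
An alternative keeps the same step schedule inside the graph; a sketch, assuming
the model consumed lrn_rate as an ordinary tensor instead of a fed placeholder:

boundaries = [40000, 60000, 80000]
values = [0.1, 0.01, 0.001, 0.0001]
lrn_rate = tf.train.piecewise_constant(model.global_step, boundaries, values)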
Example #9
def getQfeature(filepath):

    image = read(filepath)
    labels = [3]
    hps = resnet_model.HParams(batch_size=1,
                               num_classes=4,
                               min_lrn_rate=0.0001,
                               lrn_rate=0.1,
                               num_residual_units=5,
                               use_bottleneck=False,
                               weight_decay_rate=0.0002,
                               relu_leakiness=0.1,
                               optimizer='mom')
    model = resnet_model.ResNet(hps, image, labels, FLAGS.mode)
    model.build_graph()

    logits = tf.get_default_graph().get_tensor_by_name("logit/xw_plus_b:0")
    print(logits)

    logits_norm = tf.nn.l2_normalize(logits, 1)

    # Run our model.
    steps = 1  # There may be duplicate image features; a later dict op removes them.
    # Restoring the moving-average version of the learned variables can give better results.
    # for name in variables_to_restore:
    #     print(name)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        # Restore model from checkpoint.
        # Note: a checkpoint is not a single file, so do not pass a path like
        # '/path/to/model.ckpt-1000.index' to saver.restore().
        # Don't forget to launch the queue runners; using a coordinator avoids a
        # harmless 'Enqueue operation was cancelled' error (a plain start also works).
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        # ckpt correspond to 'checkpoint' file.
        ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        # model_checkpoint_path looks something like: /path/to/model.ckpt-1000
        print(ckpt.model_checkpoint_path)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)

            # fc1_list=fc2_list=fc3_list=[] # the same object!
        logits_list = []
        _logits = sess.run([logits_norm])  # return nd-array
        print('................')
        print(_logits)
        print('................')
        put_2darray(_logits, logits_list)

        return logits_list
Example #10
def train(hps, num_iterations, dataset):
    with tf.Graph().as_default():
        # input data
        images, labels = resnet_input.input(dataset, hps.batch_size, 'train')

        # resnet model
        model = resnet_model.ResNet(hps, images, labels, 'train')
        model.build_graph()

        # summary hook
        merged_summary_op = tf.summary.merge_all()

        # run session
        coord = tf.train.Coordinator()
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
        config.gpu_options.per_process_gpu_memory_fraction = 0.8
        config.gpu_options.allow_growth = True
        saver = tf.train.Saver()
        start_time = time.time()
        with tf.Session(config=config) as sess:
            # summaries
            train_writer = tf.summary.FileWriter('./train', graph=sess.graph)
            sess.run(tf.global_variables_initializer())
            queue_runner = tf.train.start_queue_runners(sess=sess, coord=coord)
            # train
            for i in range(num_iterations):
                # learning rate decay
                if i < 32000:
                    model.learning_rate = model.hps.init_lr
                elif i == 32000:
                    model.learning_rate /= 10
                elif i == 48000:
                    model.learning_rate /= 10

                # Fetch the summary in the same run call so it describes the
                # same batch as the training step.
                _, acc, loss, summary = sess.run(
                    [model.train_op, model.accuracy, model.cross_entropy,
                     merged_summary_op])
                if i % 100 == 0:
                    train_writer.add_summary(summary, i)
                    print(
                        'iter %d, the loss is %.3f, accuracy on train set is %.2f'
                        % (i, loss, acc))
                if i % 1000 == 0:
                    saver.save(sess, 'model/model.ckpt')
                    print('learning rate -> %f' % model.learning_rate)
            coord.request_stop()
            coord.join(queue_runner)
            train_writer.close()
            stop_time = time.time()
        print('%d iterations takes %.2f seconds' %
              (num_iterations, stop_time - start_time))
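Note: an illustrative call for the loop above. This resnet_model variant exposes
init_lr and learning_rate, so the HParams fields shown here are assumptions:

hps = resnet_model.HParams(batch_size=128,
                           num_classes=10,
                           init_lr=0.1,
                           num_residual_units=5)
train(hps, num_iterations=64000, dataset='cifar10')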
Example #11
def train(hps):
    """Training loop."""
    images, labels = synthetic_data(hps.batch_size)
    model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
    model.build_graph()
    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir)

    sv = tf.train.Supervisor(logdir=FLAGS.log_root,
                             is_chief=True,
                             summary_op=None,
                             save_summaries_secs=60,
                             save_model_secs=300,
                             global_step=model.global_step)
    sess = sv.prepare_or_wait_for_session(config=tf.ConfigProto(
        allow_soft_placement=True))

    step = 0
    lrn_rate = 0.1

    while not sv.should_stop():
        (_, summaries, loss, predictions, truth,
         train_step) = sess.run([
             model.train_op, model.summaries, model.cost, model.predictions,
             model.labels, model.global_step
         ],
                                feed_dict={model.lrn_rate: lrn_rate})

        if train_step < 40000:
            lrn_rate = 0.1
        elif train_step < 60000:
            lrn_rate = 0.01
        elif train_step < 80000:
            lrn_rate = 0.001
        else:
            lrn_rate = 0.0001

        truth = np.argmax(truth, axis=1)
        predictions = np.argmax(predictions, axis=1)
        precision = np.mean(truth == predictions)

        step += 1
        if step % 100 == 0:
            precision_summ = tf.Summary()
            precision_summ.value.add(tag='Precision', simple_value=precision)
            summary_writer.add_summary(precision_summ, train_step)
            summary_writer.add_summary(summaries, train_step)
            tf.logging.info('loss: %.3f, precision: %.3f\n' %
                            (loss, precision))
            summary_writer.flush()

    sv.stop()
Example #12
def gene_prob(hps):
  """Generating loop."""
  images, labels, prob = cifar_input_v2.build_input(
      FLAGS.dataset, FLAGS.train_data_path, hps.batch_size, 'eval')
  model = resnet_model.ResNet(hps, images, labels, prob, 'eval')
  model.build_graph()
  saver = tf.train.Saver()

  sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
  tf.train.start_queue_runners(sess)

  while True:
    try:
      ckpt_state = tf.train.get_checkpoint_state(FLAGS.log_root_expert)
    except tf.errors.OutOfRangeError as e:
      tf.logging.error('Cannot restore checkpoint: %s', e)
      continue
    if not (ckpt_state and ckpt_state.model_checkpoint_path):
      tf.logging.info('No model to eval yet at %s', FLAGS.log_root_expert)
      continue
    tf.logging.info('Loading checkpoint %s', ckpt_state.model_checkpoint_path)
    saver.restore(sess, ckpt_state.model_checkpoint_path)

    obj_ = []
    i = 1
    count = 0
    f = open('prob.pkl', 'wb')
    for _ in six.moves.range(40000 // hps.batch_size):
      (predictions, truth) = sess.run(
          [model.predictions,
           model.labels])

      pred_probability = np.sum(truth * predictions, axis=1)  # with shape [128]
      if pred_probability[0] >= 0.90:
        count = count + 1
      obj_.append(pred_probability)

      if i % 100 == 0:
        print(i, '----->', pred_probability)
      i = i + 1
    print('the ratio is:', 1.0 * count / 40000)
    cPickle.dump(obj=obj_, file=f, protocol=0)
    f.close()
    #create bin file
    """
    file = open('prob.pkl','rb')
    data = cPickle.load(file)
    arr = np.array(data)
    arr.tofile("prob_pkl.bin")
    """
    break
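Note: the commented block at the end shows reading the dump back with cPickle on
Python 2. A Python 3 sketch of the same round trip (encoding='latin1' is the
usual workaround for NumPy arrays pickled under Python 2):

import pickle

import numpy as np

with open('prob.pkl', 'rb') as f:
    batches = pickle.load(f, encoding='latin1')  # list of per-batch arrays
probs = np.concatenate(batches)
print('mean confidence: %.4f' % probs.mean())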
Example #13
def train(hps):
    """Training loop."""

    single_gpu_graph = tf.Graph()
    with single_gpu_graph.as_default():

        images, labels = cifar_input.build_input('cifar10',
                                                 FLAGS.train_data_path,
                                                 hps.batch_size, 'train')
        model = resnet_model.ResNet(hps, images, labels, 'train')
        model.build_graph()

        truth = tf.argmax(model.labels, axis=1)
        predictions = tf.argmax(model.predictions, axis=1)
        precision = tf.reduce_mean(tf.to_float(tf.equal(predictions, truth)))

    ########################################################################
    #### FIXME: Get session for distributed environments using Parallax ####
    #### Pass parallax_config as an argument                            ####
    ########################################################################

    parallax_sess, num_workers, worker_id, num_replicas_per_worker = \
          parallax.parallel_run(single_gpu_graph,
                                FLAGS.resource_info_file,
                                sync=FLAGS.sync,
                                parallax_config=parallax_config.build_config())

    for i in range(350000):

        _, global_step, cost, precision_ = \
            parallax_sess.run([model.train_op, model.global_step, model.cost, precision])

        if i % 10 == 0:
            print('step: %d, loss: %.3f, precision: %.3f' %
                  (global_step[0], cost[0], precision_[0]))

            # Tuning learning rate
            train_step = global_step[0]
            if train_step < 10000:
                lrn_rate = 0.1
            elif train_step < 15000:
                lrn_rate = 0.01
            elif train_step < 20000:
                lrn_rate = 0.001
            else:
                lrn_rate = 0.0001
            feed_dict = {model.lrn_rate: []}
            for worker in range(num_replicas_per_worker):
                feed_dict[model.lrn_rate].append(lrn_rate)
            parallax_sess.run(model.global_step, feed_dict=feed_dict)
Example #14
def test(hps):
    images, labels = resnet_model.inputs(FLAGS.eval_data_path,
                                         FLAGS.eval_batch_size,
                                         eval_data=True)
    model = resnet_model.ResNet(hps, images, labels, 'eval')
    model.build_graph()
    saver = tf.train.Saver(tf.global_variables())
    summary_writer = tf.summary.FileWriter(FLAGS.eval_dir)
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    # while True:
    #     try:
    #         ckpt_state = tf.train.get_checkpoint_state(FLAGS.log_root)
    #     except tf.errors.OutOfRangeError as e:
    #         tf.logging.error('Cannot restore checkpoint: %s', e)
    #     continue
    #     if not (ckpt_state and ckpt_state.model_checkpoint_path):
    #         tf.logging.info('No model to eval yet at %s', FLAGS.log_root)
    #     continue
    ckpt = tf.train.get_checkpoint_state(FLAGS.log_root)
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
        tf.logging.info('Loading checkpoint %s', ckpt.model_checkpoint_path)
    else:
        tf.logging.info('No model to eval yet at %s', FLAGS.log_root)
        return

    tf.train.start_queue_runners(sess)
    test_acc, test_loss = 0, 0
    print('start')
    for _ in range(FLAGS.eval_batch_count):
        (summaries, loss, _, acc, train_step) = sess.run([
            model.summaries, model.costs, model.labels, model.acc,
            model.global_step
        ])
        test_acc += acc
        test_loss += loss
        print(acc)

    precision = 1.0 * test_acc / FLAGS.eval_batch_count
    total_loss = 1.0 * test_loss / FLAGS.eval_batch_count

    precision_summ = tf.Summary()
    precision_summ.value.add(tag='Precision', simple_value=precision)
    summary_writer.add_summary(precision_summ, train_step)
    summary_writer.add_summary(summaries, train_step)
    tf.logging.info('loss: %.3f, precision: %.3f' % (total_loss, precision))
    summary_writer.flush()
Example #15
def train(hps):
    # This function calls tf.contrib.framework.get_or_create_global_step() and
    # _build_model(), which builds the resnet graph.
    model = resnet_model.ResNet(hps, train_features, train_labels_1, '')
    model._build_model()

    trainable_variables = tf.trainable_variables()
    grads = tf.gradients(model.cost, trainable_variables)

    apply_op = optimizer.apply_gradients(zip(grads, trainable_variables),
                                         global_step=global_step,
                                         name='train_step')

    train_ops = [apply_op]
    model.train_op = tf.group(*train_ops)
    model.summaries = tf.summary.merge_all()

    param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
        tf.get_default_graph(),
        tfprof_options=tf.contrib.tfprof.model_analyzer.FLOAT_OPS_OPTIONS)

    truth = tf.argmax(model.labels, axis=1)
    predictions = tf.argmax(model.predictions, axis=1)
    precision = tf.reduce_mean(tf.to_float(tf.equal(predictions, truth)))

    summary_hook = tf.train.SummarySaverHook(
        save_steps=100,
        output_dir=FLAGS.train_dir,
        summary_op=tf.summary.merge(
            [model.summaries,
             tf.summary.scalar('Precision', precision)]))

    logging_hook = tf.train.LoggingTensorHook(tensors={
        'step': global_step,
        'loss': model.cost,
        'precision': precision
    },
                                              every_n_iter=100)

    with tf.train.MonitoredTrainingSession(
            checkpoint_dir=FLAGS.log_root,
            hooks=[logging_hook],
            chief_only_hooks=[summary_hook],
            save_summaries_steps=0,  #disable the default summary
            config=tf.ConfigProto(allow_soft_placement=True)) as mon_sess:
        while not mon_sess.should_stop():
            mon_sess.run(model.train_op)
Example #16
    def __init__(self, data, dataset, num_gpus):
        if num_gpus > 0:
            os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
                [str(i) for i in ray.get_gpu_ids()])
        hps = resnet_model.HParams(
            batch_size=128,
            num_classes=100 if dataset == "cifar100" else 10,
            min_lrn_rate=0.0001,
            lrn_rate=0.1,
            num_residual_units=5,
            use_bottleneck=False,
            weight_decay_rate=0.0002,
            relu_leakiness=0.1,
            optimizer="mom",
            num_gpus=num_gpus)

        # We seed each actor differently so that each actor operates on a
        # different subset of data.
        if num_gpus > 0:
            tf.set_random_seed(ray.get_gpu_ids()[0] + 1)
        else:
            # Only a single actor in this case.
            tf.set_random_seed(1)

        input_images = data[0]
        input_labels = data[1]
        with tf.device("/gpu:0" if num_gpus > 0 else "/cpu:0"):
            # Build the model.
            images, labels = cifar_input.build_input([input_images,
                                                      input_labels],
                                                     hps.batch_size, dataset,
                                                     False)
            self.model = resnet_model.ResNet(hps, images, labels, "train")
            self.model.build_graph()
            config = tf.ConfigProto(allow_soft_placement=True)
            sess = tf.Session(config=config)
            self.model.variables.set_session(sess)
            self.coord = tf.train.Coordinator()
            tf.train.start_queue_runners(sess, coord=self.coord)
            init = tf.global_variables_initializer()
            sess.run(init)
            self.steps = 10
Example #17
def _my_model_fn(features, labels, mode, params):
    del params  # unused, but needed for TPU training

    #
    # Model - Here we use pre-built 'resnet_model'
    #
    model_params = resnet_model.HParams(
        batch_size=int(
            batch_size /
            FLAGS.num_replica),  # because batch is divided by TPU replicas
        num_classes=10,
        min_lrn_rate=0.0001,
        lrn_rate=0.1,
        num_residual_units=5,  # 5 x (3 x sub 2) + 2 = 32 layers
        use_bottleneck=False,
        weight_decay_rate=0.0002,
        relu_leakiness=0.1,
        optimizer='mom')
    train_model = resnet_model.ResNet(model_params, features, labels, 'train')
    train_model.build_graph(tpu_opt=True)

    # Create evaluation metrics.
    #truth = tf.argmax(train_model.labels, axis=1)
    #predictions = tf.argmax(train_model.predictions, axis=1)
    #precision = tf.reduce_mean(
    #    tf.to_float(tf.equal(predictions, truth)),
    #    name="precision")
    #accuracy = tf.metrics.accuracy(truth, predictions)
    #tf.summary.scalar('accuracy', accuracy[1]) # output to TensorBoard

    # Define operations (here we assume only the training operation).
    #prediction_outputs = {
    #    "precision": precision
    #}
    return tpu_estimator.TPUEstimatorSpec(
        mode=mode,
        loss=train_model.cost,
        train_op=train_model.train_op,
        #predictions=prediction_outputs,
        eval_metrics=(metric_fn, [train_model.labels,
                                  train_model.predictions]))
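Note: the TPUEstimatorSpec above references a metric_fn that is not shown. A
sketch of what it plausibly looks like, assuming labels and predictions arrive
as [batch, num_classes] tensors:

import tensorflow as tf

def metric_fn(labels, predictions):
    truth = tf.argmax(labels, axis=1)
    preds = tf.argmax(predictions, axis=1)
    return {'accuracy': tf.metrics.accuracy(truth, preds)}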
Example #18
def infer(hps, X_infer, y_infer):
    """Inference process.

    Args:
        hps: Hyperparameters.
        X_infer: Paths of images. A 1-D numpy array of shape (N_infer,).
        y_infer: Labels. A 1-D numpy array of shape (N_infer,).
                 Labels are never used during inference; 'y_infer' acts only as
                 a placeholder because labels must be provided to build the
                 model.
    """

    images, labels = cifar_input.build_infer_input(X_infer, y_infer, hps)

    model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
    model.build_graph()

    saver = tf.train.Saver()

    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    tf.train.start_queue_runners(sess)

    try:
        ckpt_state = tf.train.get_checkpoint_state(FLAGS.log_root)
    except tf.errors.OutOfRangeError as e:
        tf.logging.error('Cannot restore checkpoint: %s', e)
    if not (ckpt_state and ckpt_state.model_checkpoint_path):
        tf.logging.info('No model to eval yet at %s', FLAGS.log_root)
    tf.logging.info('Loading checkpoint %s', ckpt_state.model_checkpoint_path)
    saver.restore(sess, ckpt_state.model_checkpoint_path)

    for _ in six.moves.range(FLAGS.infer_batch_count):
        predictions = sess.run(model.predictions)
        predictions = np.argmax(predictions, axis=1)

        # Store the prediction into a .txt file
        with open('./predict.txt', 'a') as f:
            for item in predictions.tolist():
                f.write(str(item) + '\n')
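Note: an illustrative call; the paths are placeholders, and the dummy labels are
needed only because the model cannot be built without them, as the docstring
explains:

import numpy as np

X_infer = np.array(['/data/img_0001.png', '/data/img_0002.png'])
y_infer = np.zeros(len(X_infer), dtype=np.int64)  # dummy labels, never used
infer(hps, X_infer, y_infer)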
Example #19
    def __init__(self, loader, args=None, curv=False):
        class Args():
            num_classes = 10
            resnet_width = 1
            num_resunits = 3
            nohess = False
            randvec = False
            poison = False
            n_grads_spec = 1
            batch_size = 128
            specreg_bn = False
            normalizer = 'filtnorm'
            bin_path = '/root/bin'
            weight_decay = 0.0

        self.args = Args() if args is None else args
        if curv:
            self.args.randvec = True
        self.home = os.environ['HOME']

        # model and data loader
        self.model = resnet_model.ResNet(self.args,
                                         mode='eval' if not curv else 'curv')
        self.loader = loader

        # session
        self.sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                                     gpu_options=tf.GPUOptions(
                                                         allow_growth=True)))
        self.sess.run(tf.global_variables_initializer())

        # Build the assign ops once, to avoid the memory leak of creating new
        # nodes on every iteration.
        self.inputweights = [
            tf.zeros_like(t) for t in tf.trainable_variables()
        ]
        self.assignop = [
            tf.assign(t, w)
            for t, w in zip(tf.trainable_variables(), self.inputweights)
        ]
Example #20
def get_model_config(model):
    """Map model name to model network configuration."""
    if model == 'deep_mnist':
        mc = deepmnist_model.DeepMNISTModel()
    elif model == 'eng_acoustic_model':
        mc = engacoustic_model.EngAcousticModel()
    elif model == 'sensor_net':
        mc = sensornet_model.SensorNetModel()
    elif model == 'vgg11':
        mc = vgg_model.Vgg11Model()
    elif model == 'vgg13':
        mc = vgg_model.Vgg13Model()
    elif model == 'vgg16':
        mc = vgg_model.Vgg16Model()
    elif model == 'vgg19':
        mc = vgg_model.Vgg19Model()
    elif model == 'lenet':
        mc = lenet_model.Lenet5Model()
    elif model == 'googlenet':
        mc = googlenet_model.GooglenetModel()
    elif model == 'overfeat':
        mc = overfeat_model.OverfeatModel()
    elif model == 'alexnet':
        mc = alexnet_model.AlexnetModel()
    elif model == 'trivial':
        mc = trivial_model.TrivialModel()
    elif model == 'inception3':
        mc = inception_model.Inceptionv3Model()
    elif model == 'inception4':
        mc = inception_model.Inceptionv4Model()
    elif model in ('resnet18', 'resnet34', 'resnet50', 'resnet101',
                   'resnet152', 'resnet200', 'resnet269'):
        mc = resnet_model.ResNet(model)
    else:
        raise KeyError('Invalid model name \'%s\'' % model)
    return mc
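Note: usage is a plain dispatch on the model name; unknown names raise KeyError:

mc = get_model_config('resnet50')  # -> resnet_model.ResNet('resnet50')
try:
    get_model_config('resnet1001')
except KeyError as e:
    print(e)  # "Invalid model name 'resnet1001'"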
Example #21
def get_model(hps, dataset, train_data_path, mode='train'):
    images, labels = cifar_input.build_input(dataset, train_data_path,
                                             hps.batch_size, mode)
    model = resnet_model.ResNet(hps, images, labels, mode)
    model.build_graph()
    return model
Example #22
def train_resnet_mentornet(max_step_run):
  """Trains the mentornet with the student resnet model.

  Args:
    max_step_run: The maximum number of gradient steps.
  """
  if not os.path.exists(FLAGS.train_log_dir):
    os.makedirs(FLAGS.train_log_dir)
  g = tf.Graph()

  with g.as_default():
    with tf.device(tf.train.replica_device_setter(FLAGS.ps_tasks)):
      tf_global_step = tf.train.get_or_create_global_step()

      # pylint: disable=line-too-long
      images, one_hot_labels, clean_images, clean_one_hot_labels, num_samples_per_epoch, num_of_classes = cifar_data_provider.my_provide_resnet_data(
          FLAGS.dataset_name,
          'train',
          FLAGS.batch_size,
          dataset_dir=FLAGS.data_dir)

      hps = resnet_model.HParams(
          batch_size=FLAGS.batch_size,
          num_classes=num_of_classes,
          min_lrn_rate=0.0001,
          lrn_rate=FLAGS.learning_rate,
          num_residual_units=9,
          use_bottleneck=False,
          weight_decay_rate=0.0002,
          relu_leakiness=0.1,
          optimizer='mom')

      images.set_shape([FLAGS.batch_size, 32, 32, 3])
      tf.logging.info('num_of_example=%s', num_samples_per_epoch)

      # Define the model:
      resnet = resnet_model.ResNet(hps, images, one_hot_labels, mode='train')
      logits = resnet.build_model()

      # Specify the loss function:
      loss = tf.nn.softmax_cross_entropy_with_logits(
          labels=one_hot_labels, logits=logits)

      dropout_rates = utils.parse_dropout_rate_list(FLAGS.example_dropout_rates)
      example_dropout_rates = tf.convert_to_tensor(
          dropout_rates, np.float32, name='example_dropout_rates')

      loss_p_percentile = tf.convert_to_tensor(
          np.array([FLAGS.loss_p_percentile] * 100),
          np.float32,
          name='loss_p_percentile')

      loss = tf.reshape(loss, [-1, 1])

      epoch_step = tf.to_int32(
          tf.floor(tf.divide(tf_global_step, max_step_run) * 100))

      zero_labels = tf.zeros([tf.shape(loss)[0], 1], tf.float32)

      v = utils.mentornet(
          epoch_step,
          loss,
          zero_labels,
          loss_p_percentile,
          example_dropout_rates,
          burn_in_epoch=FLAGS.burn_in_epoch,
          fixed_epoch_after_burn_in=FLAGS.fixed_epoch_after_burn_in,
          loss_moving_average_decay=FLAGS.loss_moving_average_decay)

      v = tf.stop_gradient(v)

      # Split v into its clean-data and noisy-data parts.
      is_clean = tf.reshape(
          tf.reduce_all(
              tf.equal(one_hot_labels, clean_one_hot_labels), axis=1), [-1, 1])
      clean_v = tf.boolean_mask(v, is_clean)
      noise_v = tf.boolean_mask(v, ~is_clean)
      tf.add_to_collection('v', v)
      tf.add_to_collection('v', clean_v)
      tf.add_to_collection('v', noise_v)

      slim.summaries.add_histogram_summary(tf.boolean_mask(v, is_clean), 'clean_v')
      slim.summaries.add_histogram_summary(tf.boolean_mask(v, ~is_clean), 'noisy_v')

      # Log data utilization
      data_util = utils.summarize_data_utilization(v, tf_global_step,
                                                   FLAGS.batch_size)
      decay_loss = resnet.decay()
      weighted_loss_vector = tf.multiply(loss, v)

      weighted_loss = tf.reduce_mean(weighted_loss_vector)

      slim.summaries.add_scalar_summary(
          tf.reduce_mean(loss), 'mentornet/orig_loss')
      slim.summaries.add_scalar_summary(weighted_loss,
                                        'mentornet/weighted_loss')

      # Normalize the decay loss based on v
      weighted_decay_loss = decay_loss * (tf.reduce_sum(v) / FLAGS.batch_size)

      weighted_total_loss = weighted_loss + weighted_decay_loss

      slim.summaries.add_scalar_summary(weighted_total_loss,
                                        'mentornet/total_loss')

      slim.summaries.add_scalar_summary(weighted_total_loss, 'total_loss')
      tf.add_to_collection('total_loss', weighted_total_loss)

      boundaries = [19531, 25000, 30000]
      values = [FLAGS.learning_rate * t for t in [1, 0.1, 0.01, 0.001]]
      lr = tf.train.piecewise_constant(tf_global_step, boundaries, values)
      slim.summaries.add_scalar_summary(lr, 'learning_rate')

      # Specify the optimization scheme:
      with tf.control_dependencies([weighted_total_loss, data_util]):
        # Set up training.
        trainable_variables = tf.trainable_variables()
        trainable_variables = tf.contrib.framework.filter_variables(
            trainable_variables, exclude_patterns=['mentornet'])

        grads = tf.gradients(weighted_total_loss, trainable_variables)
        optimizer = tf.train.MomentumOptimizer(lr, momentum=0.9)

        apply_op = optimizer.apply_gradients(
            zip(grads, trainable_variables),
            global_step=tf_global_step,
            name='train_step')

        train_ops = [apply_op] + resnet.extra_train_ops
        train_op = tf.group(*train_ops)

      # Parameter restore setup
      if FLAGS.trained_mentornet_dir is not None:
        ckpt_model = FLAGS.trained_mentornet_dir
        if os.path.isdir(FLAGS.trained_mentornet_dir):
          ckpt_model = tf.train.latest_checkpoint(ckpt_model)

        # Fix the mentornet parameters
        variables_to_restore = slim.get_variables_to_restore(
            # TODO(lujiang): mentornet_inputs or mentor_inputs?
            include=['mentornet', 'mentornet_inputs'])
        iassign_op1, ifeed_dict1 = tf.contrib.framework.assign_from_checkpoint(
            ckpt_model, variables_to_restore)

        # Create an initial assignment function.
        def init_assign_fn(sess):
          tf.logging.info('Restore using customer initializer %s', '.' * 10)
          sess.run(iassign_op1, ifeed_dict1)
      else:
        init_assign_fn = None

      tf.logging.info('-' * 20 + 'MentorNet' + '-' * 20)
      tf.logging.info('loaded pretrained mentornet from %s',
                      FLAGS.trained_mentornet_dir)
      tf.logging.info('loss_p_percentile=%3f', FLAGS.loss_p_percentile)
      tf.logging.info('burn_in_epoch=%d', FLAGS.burn_in_epoch)
      tf.logging.info('fixed_epoch_after_burn_in=%s',
                      FLAGS.fixed_epoch_after_burn_in)
      tf.logging.info('loss_moving_average_decay=%3f',
                      FLAGS.loss_moving_average_decay)
      tf.logging.info('example_dropout_rates %s', ','.join(
          str(t) for t in dropout_rates))
      tf.logging.info('-' * 20)

      saver = tf.train.Saver(max_to_keep=10, keep_checkpoint_every_n_hours=24)

      # Run training.
      slim.learning.train(
          train_op=train_op,
          train_step_fn=resnet_train_step,
          logdir=FLAGS.train_log_dir,
          master=FLAGS.master,
          is_chief=FLAGS.task == 0,
          saver=saver,
          number_of_steps=max_step_run,
          init_fn=init_assign_fn,
          save_summaries_secs=FLAGS.save_summaries_secs,
          save_interval_secs=FLAGS.save_interval_secs)
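Note: the clean/noisy split above hinges on comparing the (possibly corrupted)
one-hot labels against the clean ones, row by row. A toy illustration of the
mask with invented values:

import tensorflow as tf

one_hot_labels = tf.constant([[1., 0.], [0., 1.]])        # possibly noisy
clean_one_hot_labels = tf.constant([[1., 0.], [1., 0.]])  # ground truth
is_clean = tf.reduce_all(
    tf.equal(one_hot_labels, clean_one_hot_labels), axis=1)
with tf.Session() as sess:
  print(sess.run(is_clean))  # [ True False]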
Example #23
def train(hps):
  # Build the input data (starts reader queue runners).
  images, labels = cifar_input.build_input(
      FLAGS.dataset, FLAGS.train_data_path, hps.batch_size, FLAGS.mode)
  # Build the residual network model.
  model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
  model.build_graph()

  # Compute prediction precision.
  truth = tf.argmax(model.labels, axis=1)
  predictions = tf.argmax(model.predictions, axis=1)
  precision = tf.reduce_mean(tf.to_float(tf.equal(predictions, truth)))

  # Summary saver hook: stores summaries every 100 steps.
  summary_hook = tf.train.SummarySaverHook(
      save_steps=100,
      output_dir=FLAGS.train_dir,
      summary_op=tf.summary.merge(
          [model.summaries,
           tf.summary.scalar('Precision', precision)]))
  # Logging hook: prints tensors every 100 steps.
  logging_hook = tf.train.LoggingTensorHook(
      tensors={'step': model.global_step,
               'loss': model.cost,
               'precision': precision},
      every_n_iter=100)

  # Learning rate updater, driven by the global step.
  class _LearningRateSetterHook(tf.train.SessionRunHook):

    def begin(self):
      # Initial learning rate.
      self._lrn_rate = 0.1

    def before_run(self, run_context):
      return tf.train.SessionRunArgs(
          # Ask for the global step...
          model.global_step,
          # ...and set the learning rate.
          feed_dict={model.lrn_rate: self._lrn_rate})

    def after_run(self, run_context, run_values):
      # Update the learning rate dynamically.
      train_step = run_values.results
      if train_step < 40000:
        self._lrn_rate = 0.1
      elif train_step < 60000:
        self._lrn_rate = 0.01
      elif train_step < 80000:
        self._lrn_rate = 0.001
      else:
        self._lrn_rate = 0.0001

  # Create a monitored training session.
  with tf.train.MonitoredTrainingSession(
      checkpoint_dir=FLAGS.log_root,
      hooks=[logging_hook, _LearningRateSetterHook()],
      chief_only_hooks=[summary_hook],
      # Disable the default SummarySaverHook by setting save_summaries_steps to 0.
      save_summaries_steps=0,
      config=tf.ConfigProto(allow_soft_placement=True)) as mon_sess:
    while not mon_sess.should_stop():
      # Run the training op.
      mon_sess.run(model.train_op)
Example #24
def evaluate(hps):
  # Build the input data (starts reader queue runners).
  images, labels = cifar_input.build_input(
      FLAGS.dataset, FLAGS.eval_data_path, hps.batch_size, FLAGS.mode)
  # Build the residual network model.
  model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
  model.build_graph()
  # Saver for the model variables.
  saver = tf.train.Saver()
  # Summary file writer.
  summary_writer = tf.summary.FileWriter(FLAGS.eval_dir)

  # Create the session.
  sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))

  # Start all queue runners.
  tf.train.start_queue_runners(sess)

  best_precision = 0.0
  while True:
    # Check for a checkpoint file.
    try:
      ckpt_state = tf.train.get_checkpoint_state(FLAGS.log_root)
    except tf.errors.OutOfRangeError as e:
      tf.logging.error('Cannot restore checkpoint: %s', e)
      continue
    if not (ckpt_state and ckpt_state.model_checkpoint_path):
      tf.logging.info('No model to eval yet at %s', FLAGS.log_root)
      continue

    # Load the model weights produced during training.
    tf.logging.info('Loading checkpoint %s', ckpt_state.model_checkpoint_path)
    saver.restore(sess, ckpt_state.model_checkpoint_path)

    # Evaluate batch by batch.
    total_prediction, correct_prediction = 0, 0
    for _ in six.moves.range(FLAGS.eval_batch_count):
      # Run predictions.
      (loss, predictions, truth, train_step) = sess.run(
          [model.cost, model.predictions,
           model.labels, model.global_step])
      # Tally the prediction results.
      truth = np.argmax(truth, axis=1)
      predictions = np.argmax(predictions, axis=1)
      correct_prediction += np.sum(truth == predictions)
      total_prediction += predictions.shape[0]

    # Compute precision.
    precision = 1.0 * correct_prediction / total_prediction
    best_precision = max(precision, best_precision)

    # Add a precision summary.
    precision_summ = tf.Summary()
    precision_summ.value.add(
        tag='Precision', simple_value=precision)
    summary_writer.add_summary(precision_summ, train_step)

    # Add a best-precision summary.
    best_precision_summ = tf.Summary()
    best_precision_summ.value.add(
        tag='Best Precision', simple_value=best_precision)
    summary_writer.add_summary(best_precision_summ, train_step)

    # Add the model's own summaries (not fetched above, hence commented out).
    #summary_writer.add_summary(summaries, train_step)

    # Print the log line.
    tf.logging.info('loss: %.3f, precision: %.3f, best precision: %.3f' %
                    (loss, precision, best_precision))

    # Flush to disk.
    summary_writer.flush()

    if FLAGS.eval_once:
      break

    time.sleep(60)
Example #25
def train(hps):
    """Training loop."""
    images, labels = cifar_input.build_input(FLAGS.dataset,
                                             FLAGS.train_data_path,
                                             hps.batch_size, FLAGS.mode,
                                             hps.data_format)
    model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
    model.build_graph()

    param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
        tf.get_default_graph(),
        tfprof_options=tf.contrib.tfprof.model_analyzer.
        TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
    sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)

    tf.contrib.tfprof.model_analyzer.print_model_analysis(
        tf.get_default_graph(),
        tfprof_options=tf.contrib.tfprof.model_analyzer.FLOAT_OPS_OPTIONS)

    truth = tf.argmax(model.labels, axis=1)
    predictions = tf.argmax(model.predictions, axis=1)
    precision = tf.reduce_mean(tf.to_float(tf.equal(predictions, truth)))

    summary_hook = tf.train.SummarySaverHook(
        save_steps=100,
        output_dir=FLAGS.train_dir,
        summary_op=tf.summary.merge(
            [model.summaries,
             tf.summary.scalar('Precision', precision)]))

    num_steps_per_epoch = 391  # TODO: Don't hardcode this.

    logging_hook = tf.train.LoggingTensorHook(tensors={
        'step': model.global_step,
        'loss': model.cost,
        'precision': precision
    },
                                              every_n_iter=100)

    class _LearningRateSetterHook(tf.train.SessionRunHook):
        """Sets learning_rate based on global step."""
        def begin(self):
            self._lrn_rate = 0.01

        def before_run(self, run_context):
            return tf.train.SessionRunArgs(
                model.global_step,  # Asks for global step value.
                feed_dict={model.lrn_rate:
                           self._lrn_rate})  # Sets learning rate

        def after_run(self, run_context, run_values):
            train_step = run_values.results
            if train_step < num_steps_per_epoch:
                self._lrn_rate = 0.01
            elif train_step < (91 * num_steps_per_epoch):
                self._lrn_rate = 0.1
            elif train_step < (136 * num_steps_per_epoch):
                self._lrn_rate = 0.01
            elif train_step < (181 * num_steps_per_epoch):
                self._lrn_rate = 0.001
            else:
                self._lrn_rate = 0.0001

    class _SaverHook(tf.train.SessionRunHook):
        """Sets learning_rate based on global step."""
        def begin(self):
            self.saver = tf.train.Saver(max_to_keep=10000)
            subprocess.call("rm -rf %s; mkdir -p %s" %
                            (FLAGS.checkpoint_dir, FLAGS.checkpoint_dir),
                            shell=True)
            self.f = open(os.path.join(FLAGS.checkpoint_dir, "times.log"), 'w')

        def after_create_session(self, sess, coord):
            self.sess = sess
            self.start_time = time.time()

        def before_run(self, run_context):
            return tf.train.SessionRunArgs(
                model.global_step  # Asks for global step value.
            )

        def after_run(self, run_context, run_values):
            train_step = run_values.results
            epoch = train_step // num_steps_per_epoch
            if train_step % num_steps_per_epoch == 0:
                end_time = time.time()
                directory = os.path.join(FLAGS.checkpoint_dir,
                                         ("%5d" % epoch).replace(' ', '0'))
                subprocess.call("mkdir -p %s" % directory, shell=True)
                ckpt_name = 'model.ckpt'
                self.saver.save(self.sess,
                                os.path.join(directory, ckpt_name),
                                global_step=train_step)
                self.f.write("Step: %d\tTime: %s\n" %
                             (train_step, end_time - self.start_time))
                print("Saved checkpoint after %d epoch(s) to %s..." %
                      (epoch, directory))
                sys.stdout.flush()
                self.start_time = time.time()

        def end(self, sess):
            self.f.close()

    with tf.train.MonitoredTrainingSession(
            checkpoint_dir=FLAGS.log_root,
            hooks=[logging_hook, _LearningRateSetterHook()],
            chief_only_hooks=[summary_hook, _SaverHook()],
            save_checkpoint_secs=None,
            # Since we provide a SummarySaverHook, we need to disable default
            # SummarySaverHook. To do that we set save_summaries_steps to 0.
            save_summaries_steps=None,
            save_summaries_secs=None,
            config=tf.ConfigProto(allow_soft_placement=True)) as mon_sess:
        for i in range(num_steps_per_epoch * 181):
            mon_sess.run(model.train_op)
Example #26
def evaluate(hps):
    """Eval loop."""
    images, labels = cifar_input.build_input(FLAGS.dataset,
                                             FLAGS.eval_data_path,
                                             hps.batch_size, FLAGS.mode,
                                             hps.data_format)
    model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
    model.build_graph()
    saver = tf.train.Saver()
    summary_writer = tf.summary.FileWriter(FLAGS.eval_dir)

    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    tf.train.start_queue_runners(sess)

    best_precision = 0.0
    while True:
        try:
            ckpt_state = tf.train.get_checkpoint_state(FLAGS.log_root)
        except tf.errors.OutOfRangeError as e:
            tf.logging.error('Cannot restore checkpoint: %s', e)
            continue
        if not (ckpt_state and ckpt_state.model_checkpoint_path):
            tf.logging.info('No model to eval yet at %s', FLAGS.log_root)
            break
        tf.logging.info('Loading checkpoint %s',
                        ckpt_state.model_checkpoint_path)
        saver.restore(sess, ckpt_state.model_checkpoint_path)

        global_step = ckpt_state.model_checkpoint_path.split('/')[-1].split(
            '-')[-1]
        if not global_step.isdigit():
            global_step = 0
        else:
            global_step = int(global_step)

        total_prediction, correct_prediction, correct_prediction_top5 = 0, 0, 0
        start_time = time.time()
        for _ in six.moves.range(FLAGS.eval_batch_count):
            (summaries, loss, predictions, truth, train_step) = sess.run([
                model.summaries, model.cost, model.predictions, model.labels,
                model.global_step
            ])

            if not FLAGS.time_inference:
                for (indiv_truth, indiv_prediction) in zip(truth, predictions):
                    indiv_truth = np.argmax(indiv_truth)
                    top5_prediction = np.argsort(indiv_prediction)[-5:]
                    top1_prediction = np.argsort(indiv_prediction)[-1]
                    correct_prediction += (indiv_truth == top1_prediction)
                    if indiv_truth in top5_prediction:
                        correct_prediction_top5 += 1
                    total_prediction += 1

        if FLAGS.time_inference:
            print("Time for inference: %.4f" % (time.time() - start_time))
        else:
            precision = 1.0 * correct_prediction / total_prediction
            precision_top5 = 1.0 * correct_prediction_top5 / total_prediction
            best_precision = max(precision, best_precision)

            precision_summ = tf.Summary()
            precision_summ.value.add(tag='Precision', simple_value=precision)
            summary_writer.add_summary(precision_summ, train_step)
            best_precision_summ = tf.Summary()
            best_precision_summ.value.add(tag='Best Precision',
                                          simple_value=best_precision)
            summary_writer.add_summary(best_precision_summ, train_step)
            summary_writer.add_summary(summaries, train_step)
            print('Precision @ 1 = %.4f, Precision @ 5 = %.4f, Global step = %d' %
                  (precision, precision_top5, global_step))
            summary_writer.flush()

        if FLAGS.eval_once:
            break

        time.sleep(60)
Example #27
def train_resnet_baseline(max_step_run):
    """Trains the resnet baseline model.

    Args:
        max_step_run: The maximum number of gradient steps.
    """
    if not os.path.exists(FLAGS.train_log_dir):
        os.makedirs(FLAGS.train_log_dir)
    g = tf.Graph()

    with g.as_default():
        with tf.device(tf.train.replica_device_setter(FLAGS.ps_tasks)):
            tf_global_step = tf.train.get_or_create_global_step()

            # pylint: disable=line-too-long
            images, one_hot_labels, num_samples_per_epoch, num_of_classes = cifar_data_provider.provide_resnet_data(
                FLAGS.dataset_name,
                'train',
                FLAGS.batch_size,
                dataset_dir=FLAGS.data_dir)

            hps = resnet_model.HParams(batch_size=FLAGS.batch_size,
                                       num_classes=num_of_classes,
                                       min_lrn_rate=0.0001,
                                       lrn_rate=FLAGS.learning_rate,
                                       num_residual_units=9,
                                       use_bottleneck=False,
                                       weight_decay_rate=0.0002,
                                       relu_leakiness=0.1,
                                       optimizer='mom')

            images.set_shape([FLAGS.batch_size, 32, 32, 3])
            tf.logging.info('num_of_examples={}'.format(num_samples_per_epoch))

            # Define the model:
            resnet = resnet_model.ResNet(hps,
                                         images,
                                         one_hot_labels,
                                         mode='train')
            logits = resnet.build_model()

            # Specify the loss function:
            total_loss = tf.nn.softmax_cross_entropy_with_logits(
                labels=one_hot_labels, logits=logits)
            total_loss = tf.reduce_mean(total_loss, name='xent')
            total_loss += resnet.decay()  # decay
            tf.add_to_collection('total_loss', total_loss)

            decay_steps = int(num_samples_per_epoch / FLAGS.batch_size *
                              FLAGS.num_epochs_per_decay)

            boundaries = [19531, 25000, 30000]
            values = [FLAGS.learning_rate * t for t in [1, 0.1, 0.01, 0.001]]
            lr = tf.train.piecewise_constant(tf_global_step, boundaries,
                                             values)
            slim.summaries.add_scalar_summary(lr,
                                              'learning_rate',
                                              print_summary=True)

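            # NOTE: the exponential-decay schedule below overwrites the
            # piecewise-constant `lr` defined above, so only the decayed
            # rate actually reaches the optimizer.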
            lr = tf.train.exponential_decay(FLAGS.learning_rate,
                                            tf_global_step,
                                            decay_steps,
                                            FLAGS.learning_rate_decay_factor,
                                            staircase=True)

            slim.summaries.add_scalar_summary(total_loss,
                                              'total_loss',
                                              print_summary=True)

            # Set up training.
            trainable_variables = tf.trainable_variables()

            grads = tf.gradients(total_loss, trainable_variables)

            optimizer = tf.train.MomentumOptimizer(lr, momentum=0.9)

            apply_op = optimizer.apply_gradients(zip(grads,
                                                     trainable_variables),
                                                 global_step=tf_global_step,
                                                 name='train_step')

            train_ops = [apply_op] + resnet.extra_train_ops
            train_op = tf.group(*train_ops)

            saver = tf.train.Saver(max_to_keep=10,
                                   keep_checkpoint_every_n_hours=24)

            # Run training.
            slim.learning.train(train_op=train_op,
                                train_step_fn=resnet_train_step,
                                logdir=FLAGS.train_log_dir,
                                master=FLAGS.master,
                                saver=saver,
                                is_chief=FLAGS.task == 0,
                                number_of_steps=max_step_run,
                                save_summaries_secs=FLAGS.save_summaries_secs,
                                save_interval_secs=FLAGS.save_interval_secs)
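

# Editor's note on the piecewise schedule above: tf.train.piecewise_constant
# applies values[i + 1] once the step exceeds boundaries[i]. A minimal check
# (the function name and base_lr default are assumptions):
def piecewise_lr_sketch(step_value, base_lr=0.1):
    step = tf.constant(step_value, tf.int32)
    boundaries = [19531, 25000, 30000]
    values = [base_lr * t for t in [1, 0.1, 0.01, 0.001]]
    with tf.Session() as sess:
        # e.g. step 26000 lies in (25000, 30000] and yields base_lr * 0.01
        return sess.run(tf.train.piecewise_constant(step, boundaries, values))

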
def train_resnet_mentormix(max_step_run):
    """Trains the mentornet with the student resnet model.

  Args:
    max_step_run: The maximum number of gradient steps.
  """
    if not os.path.exists(FLAGS.train_log_dir):
        os.makedirs(FLAGS.train_log_dir)
    g = tf.Graph()

    with g.as_default():
        with tf.device(tf.train.replica_device_setter(FLAGS.ps_tasks)):
            tf_global_step = tf.train.get_or_create_global_step()

            (images, one_hot_labels, num_samples_per_epoch,
             num_of_classes) = cifar_data_provider.provide_resnet_data(
                 FLAGS.dataset_name,
                 'train',
                 FLAGS.batch_size,
                 dataset_dir=FLAGS.data_dir)

            hps = resnet_model.HParams(batch_size=FLAGS.batch_size,
                                       num_classes=num_of_classes,
                                       min_lrn_rate=0.0001,
                                       lrn_rate=FLAGS.learning_rate,
                                       num_residual_units=5,
                                       use_bottleneck=False,
                                       weight_decay_rate=0.0002,
                                       relu_leakiness=0.1,
                                       optimizer='mom')

            images.set_shape([FLAGS.batch_size, 32, 32, 3])

            # Define the model:
            resnet = resnet_model.ResNet(hps,
                                         images,
                                         one_hot_labels,
                                         mode='train')
            with tf.variable_scope('ResNet32'):
                logits = resnet.build_model()

            # Specify the loss function:
            loss = tf.nn.softmax_cross_entropy_with_logits(
                labels=one_hot_labels, logits=logits)

            dropout_rates = utils.parse_dropout_rate_list(
                FLAGS.example_dropout_rates)
            example_dropout_rates = tf.convert_to_tensor(
                dropout_rates, np.float32, name='example_dropout_rates')

            loss_p_percentile = tf.convert_to_tensor(np.array(
                [FLAGS.loss_p_percentile] * 100),
                                                     np.float32,
                                                     name='loss_p_percentile')

            loss = tf.reshape(loss, [-1, 1])

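            # Express training progress as an integer percentage of
            # max_step_run; this is the "epoch" fed to MentorNet below.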
            epoch_step = tf.to_int32(
                tf.floor(tf.divide(tf_global_step, max_step_run) * 100))

            zero_labels = tf.zeros([tf.shape(loss)[0], 1], tf.float32)

            mentornet_net_hparams = utils.get_mentornet_network_hyperparameter(
                FLAGS.trained_mentornet_dir)

            # In the simplest case, this function can be replaced with a thresholding
            # function. See loss_thresholding_function in utils.py.
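            # A minimal thresholding sketch (editor's illustration, not the
            # actual loss_thresholding_function; the 75th-percentile cutoff
            # is an assumption):
            #     cutoff = tf.contrib.distributions.percentile(loss, q=75.)
            #     v = tf.cast(loss <= cutoff, tf.float32)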
            v = utils.mentornet(epoch_step,
                                loss,
                                zero_labels,
                                loss_p_percentile,
                                example_dropout_rates,
                                burn_in_epoch=FLAGS.burn_in_epoch,
                                mentornet_net_hparams=mentornet_net_hparams,
                                avg_name='individual')

            v = tf.stop_gradient(v)
            loss = tf.stop_gradient(tf.identity(loss))
            logits = tf.stop_gradient(tf.identity(logits))

            # Perform MentorMix
            images_mix, labels_mix = utils.mentor_mix_up(
                images, one_hot_labels, v, FLAGS.mixup_alpha)
            resnet = resnet_model.ResNet(hps,
                                         images_mix,
                                         labels_mix,
                                         mode='train')
            with tf.variable_scope('ResNet32', reuse=True):
                logits_mix = resnet.build_model()

            loss = tf.nn.softmax_cross_entropy_with_logits(labels=labels_mix,
                                                           logits=logits_mix)
            decay_loss = resnet.decay()

            # Second reweighting pass over the mixed-up examples.
            if FLAGS.second_reweight:
                loss = tf.reshape(loss, [-1, 1])
                v = utils.mentornet(
                    epoch_step,
                    loss,
                    zero_labels,
                    loss_p_percentile,
                    example_dropout_rates,
                    burn_in_epoch=FLAGS.burn_in_epoch,
                    mentornet_net_hparams=mentornet_net_hparams,
                    avg_name='mixed')
                v = tf.stop_gradient(v)
                weighted_loss_vector = tf.multiply(loss, v)
                loss = tf.reduce_mean(weighted_loss_vector)
                # Reproduced with the following decay loss, which should be 0.
                decay_loss = tf.losses.get_regularization_loss()
                decay_loss = decay_loss * (tf.reduce_sum(v) / FLAGS.batch_size)

            # Log data utilization
            data_util = utils.summarize_data_utilization(
                v, tf_global_step, FLAGS.batch_size)

            loss = tf.reduce_mean(loss)
            slim.summaries.add_scalar_summary(loss, 'mentormix/mix_loss')

            weighted_total_loss = loss + decay_loss

            slim.summaries.add_scalar_summary(weighted_total_loss,
                                              'total_loss')
            tf.add_to_collection('total_loss', weighted_total_loss)

            # Set up the moving averages:
            moving_average_variables = tf.trainable_variables()
            moving_average_variables = tf.contrib.framework.filter_variables(
                moving_average_variables, exclude_patterns=['mentornet'])

            variable_averages = tf.train.ExponentialMovingAverage(
                0.9999, tf_global_step)
            tf.add_to_collection(
                tf.GraphKeys.UPDATE_OPS,
                variable_averages.apply(moving_average_variables))

            decay_steps = FLAGS.num_epochs_per_decay * num_samples_per_epoch / FLAGS.batch_size
            lr = tf.train.exponential_decay(FLAGS.learning_rate,
                                            tf_global_step,
                                            decay_steps,
                                            FLAGS.learning_rate_decay_factor,
                                            staircase=True)
            lr = tf.squeeze(lr)
            slim.summaries.add_scalar_summary(lr, 'learning_rate')

            # Specify the optimization scheme:
            with tf.control_dependencies([weighted_total_loss, data_util]):
                # Set up training.
                trainable_variables = tf.trainable_variables()
                trainable_variables = tf.contrib.framework.filter_variables(
                    trainable_variables, exclude_patterns=['mentornet'])

                grads = tf.gradients(weighted_total_loss, trainable_variables)
                optimizer = tf.train.MomentumOptimizer(lr, momentum=0.9)

                apply_op = optimizer.apply_gradients(
                    zip(grads, trainable_variables),
                    global_step=tf_global_step,
                    name='train_step')

                train_ops = ([apply_op] + resnet.extra_train_ops +
                             tf.get_collection(tf.GraphKeys.UPDATE_OPS))
                train_op = tf.group(*train_ops)

            # Parameter restore setup
            if FLAGS.trained_mentornet_dir is not None:
                ckpt_model = FLAGS.trained_mentornet_dir
                if os.path.isdir(FLAGS.trained_mentornet_dir):
                    ckpt_model = tf.train.latest_checkpoint(ckpt_model)

                # Fix the mentornet parameters
                variables_to_restore = slim.get_variables_to_restore(
                    include=['mentornet', 'mentornet_inputs'])
                iassign_op1, ifeed_dict1 = tf.contrib.framework.assign_from_checkpoint(
                    ckpt_model, variables_to_restore)

                # Create an initial assignment function.
                def init_assign_fn(sess):
                    tf.logging.info('Restore using custom initializer %s',
                                    '.' * 10)
                    sess.run(iassign_op1, ifeed_dict1)
            else:
                init_assign_fn = None

            tf.logging.info('-' * 20 + 'MentorMix' + '-' * 20)
            tf.logging.info('loss_p_percentile=%3f', FLAGS.loss_p_percentile)
            tf.logging.info('mixup_alpha=%.2f', FLAGS.mixup_alpha)
            tf.logging.info('-' * 20)

            saver = tf.train.Saver(max_to_keep=10,
                                   keep_checkpoint_every_n_hours=24)

            # Run training.
            slim.learning.train(train_op=train_op,
                                train_step_fn=resnet_train_step,
                                logdir=FLAGS.train_log_dir,
                                master=FLAGS.master,
                                is_chief=FLAGS.task == 0,
                                saver=saver,
                                number_of_steps=max_step_run,
                                init_fn=init_assign_fn,
                                save_summaries_secs=FLAGS.save_summaries_secs,
                                save_interval_secs=FLAGS.save_interval_secs)
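

# Editor's sketch of a plain mixup step, for comparison with the
# utils.mentor_mix_up call above (which additionally uses the MentorNet
# weights v to bias mixing toward cleaner examples). The function name is
# hypothetical; `alpha` mirrors FLAGS.mixup_alpha.
def mixup_sketch(images, one_hot_labels, alpha):
    batch = tf.shape(images)[0]
    lam = tf.distributions.Beta(alpha, alpha).sample(batch)  # one factor per example
    idx = tf.random_shuffle(tf.range(batch))  # random mixing partners
    lam_x = tf.reshape(lam, [-1, 1, 1, 1])
    lam_y = tf.reshape(lam, [-1, 1])
    images_mix = lam_x * images + (1. - lam_x) * tf.gather(images, idx)
    labels_mix = (lam_y * one_hot_labels +
                  (1. - lam_y) * tf.gather(one_hot_labels, idx))
    return images_mix, labels_mix

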
def train(hps):
    """Training loop."""
    images, labels = cifar_input.build_input(FLAGS.dataset,
                                             FLAGS.train_data_path,
                                             hps.batch_size, FLAGS.mode)
    model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
    model.build_graph()

    param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
        tf.get_default_graph(),
        tfprof_options=tf.contrib.tfprof.model_analyzer.
        TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
    sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)

    tf.contrib.tfprof.model_analyzer.print_model_analysis(
        tf.get_default_graph(),
        tfprof_options=tf.contrib.tfprof.model_analyzer.FLOAT_OPS_OPTIONS)

    truth = tf.argmax(model.labels, axis=1)
    predictions = tf.argmax(model.predictions, axis=1)
    precision = tf.reduce_mean(tf.to_float(tf.equal(predictions, truth)))

    summary_hook = tf.train.SummarySaverHook(
        save_steps=100,
        output_dir=FLAGS.train_dir,
        summary_op=tf.summary.merge(
            [model.summaries,
             tf.summary.scalar('Precision', precision)]))

    logging_hook = tf.train.LoggingTensorHook(tensors={
        'step': model.global_step,
        'loss': model.cost,
        'precision': precision
    },
                                              every_n_iter=100)

    class _LearningRateSetterHook(tf.train.SessionRunHook):
        """Sets learning_rate based on global step."""
        def begin(self):
            self._lrn_rate = 0.1

        def before_run(self, run_context):
            return tf.train.SessionRunArgs(
                model.global_step,  # Asks for global step value.
                feed_dict={model.lrn_rate:
                           self._lrn_rate})  # Sets learning rate

        def after_run(self, run_context, run_values):
            train_step = run_values.results
            if train_step < 40000:
                self._lrn_rate = 0.1
            elif train_step < 60000:
                self._lrn_rate = 0.01
            elif train_step < 80000:
                self._lrn_rate = 0.001
            else:
                self._lrn_rate = 0.0001

    with tf.train.MonitoredTrainingSession(
            checkpoint_dir=FLAGS.log_root,
            hooks=[logging_hook, _LearningRateSetterHook()],
            chief_only_hooks=[summary_hook],
            # Since we provide a SummarySaverHook, we need to disable default
            # SummarySaverHook. To do that we set save_summaries_steps to 0.
            save_summaries_steps=0,
            config=tf.ConfigProto(allow_soft_placement=True)) as mon_sess:
        while not mon_sess.should_stop():
            mon_sess.run(model.train_op)
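

# Minimal invocation sketch for the training loop above (hyperparameter
# values are assumptions, mirroring the HParams fields used elsewhere in
# this file):
#     hps = resnet_model.HParams(batch_size=128, num_classes=10,
#                                min_lrn_rate=0.0001, lrn_rate=0.1,
#                                num_residual_units=5, use_bottleneck=False,
#                                weight_decay_rate=0.0002, relu_leakiness=0.1,
#                                optimizer='mom')
#     train(hps)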
Example n. 30
0
    def train(self, hps):
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.InteractiveSession(config=config)

        img = tf.placeholder(tf.float32, shape=[self.batch_num, 32, 32, 3])
        labels = tf.placeholder(tf.int32, shape=[
            self.batch_num,
        ])

        model = resnet_model.ResNet(hps, img, labels, 'train')
        model.build_graph()

        merged = model.summaries
        train_writer = tf.summary.FileWriter("/tmp/train_log", sess.graph)

        sess.run(tf.global_variables_initializer())
        print('Done initializing variables')
        print('Running model...')

        # Set default learning rate for scheduling
        lr = args.lr

        for j in range(self.num_epoch):
            print('Epoch {}'.format(j + 1))

            # Decay the learning rate by args.lr_factor
            # every args.lr_schedule epochs.
            if (j + 1) % args.lr_schedule == 0:
                lr *= args.lr_factor

            for i in range(self.num_iter):
                batch = self.next_batch(self.batch_num)
                feed_dict = {
                    img: batch[0],
                    labels: batch[1],
                    model.lrn_rate: lr
                }
                _, l, ac, summary, lr = sess.run([
                    model.train_op, model.cost, model.acc, merged,
                    model.lrn_rate
                ],
                                                 feed_dict=feed_dict)
                train_writer.add_summary(summary, i)
                # Print progress every 200 iterations.
                if i % 200 == 0:
                    print('step', i + 1)
                    print('Training loss', l)
                    print('Training accuracy', ac)
                    print('Learning rate', lr)

            print('Running evaluation...')

            test_loss, test_acc, n_batch = 0, 0, 0
            for batch in tl.iterate.minibatches(inputs=self.x_valid,
                                                targets=self.y_valid,
                                                batch_size=self.batch_num,
                                                shuffle=False):
                feed_dict_eval = {img: batch[0], labels: batch[1]}

                loss, ac = sess.run([model.cost, model.acc],
                                    feed_dict=feed_dict_eval)
                test_loss += loss
                test_acc += ac
                n_batch += 1

            tot_test_loss = test_loss / n_batch
            tot_test_acc = test_acc / n_batch

            print('   Test loss: {}'.format(tot_test_loss))
            print('   Test accuracy: {}'.format(tot_test_acc))

        print('Completed training and evaluation.')

        test_predicted = []

        for batch in tl.iterate.minibatches(inputs=self.x_test,
                                            targets=self.y_test,
                                            batch_size=50,
                                            shuffle=False):
            feed_dict_eval = {img: batch[0]}
            preds = sess.run(model.predict, feed_dict=feed_dict_eval)
            for pred in preds:
                test_predicted.append(pred)

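        # Emit 1-indexed (ID, Label) rows in the prediction CSV.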
        csv_content = [["ID", "Label"]]
        for ind, data in enumerate(test_predicted):
            csv_content.append([ind + 1, data + 1])
        with open("cifar_prediction.csv", "w") as f:
            writer = csv.writer(f)
            writer.writerows(csv_content)