Esempio n. 1
0
def _tower_fn(is_training, images, score_maps, geo_maps, training_masks, reuse_variables=None):
    """Build one tower: forward pass, losses, optional summaries and gradients.

    Args:
        is_training: Forwarded to ``model.model`` as its ``is_training`` flag.
        images: Input image batch handed to ``model.model``.
        score_maps: Target score maps for ``model.loss``.
        geo_maps: Target geometry maps for ``model.loss``.
        training_masks: Masks handed to ``model.loss``.
        reuse_variables: ``reuse`` value for the current variable scope.
            Summaries are only created when this is ``None``.

    Returns:
        A tuple ``(total_loss, grads_and_vars, summaries)``.
        ``grads_and_vars`` is a list of ``(gradient, variable)`` pairs over
        ``tf.trainable_variables()``; ``summaries`` is a list of summary ops,
        or ``None`` when ``reuse_variables`` is not ``None``.
    """
    # Build inference graph. The caller-supplied flag is honoured here; it
    # used to be silently overridden by a hard-coded True.
    with tf.variable_scope(tf.get_variable_scope(), reuse=reuse_variables):
        f_score, f_geometry = model.model(images, is_training=is_training)

    model_loss = model.loss(score_maps, f_score,
                            geo_maps, f_geometry,
                            training_masks)
    total_loss = tf.add_n([model_loss] + tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))

    # Summaries are only attached for the tower that creates the variables.
    summaries = None
    if reuse_variables is None:
        image_sum = tf.summary.image('input', images)
        score_sum = tf.summary.image('score_map', score_maps)
        f_score_sum = tf.summary.image('score_map_pred', f_score * 255)
        geo_sum = tf.summary.image('geo_map_0', geo_maps[:, :, :, 0:1])
        f_geo_sum = tf.summary.image('geo_map_0_pred', f_geometry[:, :, :, 0:1])
        mask_sum = tf.summary.image('training_masks', training_masks)
        loss1_sum = tf.summary.scalar('model_loss', model_loss)
        loss_sum = tf.summary.scalar('total_loss', total_loss)
        summaries = [image_sum, score_sum, f_score_sum, geo_sum, f_geo_sum, mask_sum, loss1_sum, loss_sum]

    model_params = tf.trainable_variables()
    tower_grad = tf.gradients(total_loss, model_params)

    # Materialise the pairs: a bare zip() is a one-shot iterator on Python 3
    # and would come back empty if a caller walked it twice.
    return total_loss, list(zip(tower_grad, model_params)), summaries
def tower_loss(images,
               score_maps,
               geo_maps,
               training_masks,
               reuse_variables=None):
    """Build the forward pass and losses for one tower.

    Args:
        images: Input image batch handed to ``model.model``.
        score_maps: Target score maps for ``model.loss``.
        geo_maps: Target geometry maps for ``model.loss``.
        training_masks: Masks handed to ``model.loss``.
        reuse_variables: ``reuse`` value for the current variable scope.
            Summaries are only registered when this is ``None``.

    Returns:
        ``(total_loss, model_loss)``: the model loss plus every entry of the
        ``REGULARIZATION_LOSSES`` collection, and the bare model loss.
    """
    # Forward pass inside the (possibly reused) current variable scope.
    current_scope = tf.get_variable_scope()
    with tf.variable_scope(current_scope, reuse=reuse_variables):
        f_score, f_geometry = model.model(images, is_training=True)

    model_loss = model.loss(score_maps, f_score, geo_maps, f_geometry,
                            training_masks)
    reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    total_loss = tf.add_n([model_loss] + reg_losses)

    if reuse_variables is None:
        # Image summaries first, then the two scalar losses.
        image_summaries = [
            ('input', images),
            ('score_map', score_maps),
            ('score_map_pred', f_score * 255),
            ('geo_map_0', geo_maps[:, :, :, 0:1]),
            ('geo_map_0_pred', f_geometry[:, :, :, 0:1]),
            ('training_masks', training_masks),
        ]
        for tag, tensor in image_summaries:
            tf.summary.image(tag, tensor)
        for tag, tensor in (('model_loss', model_loss),
                            ('total_loss', total_loss)):
            tf.summary.scalar(tag, tensor)

    return total_loss, model_loss
Esempio n. 3
0
def run_training(args):
    """Build the regression graph and run ``args.max_steps`` training steps.

    Reads ``n_images``, ``learning_rate``, ``gpu``, ``max_steps`` and
    ``batch_size`` from ``args``. Every fifth step the loss is printed, and
    on the step before each multiple of five the network output for the
    first training image is written to ``results/green-<step>.jpg``.
    """
    with tf.Graph().as_default():
        # Other loaders exist on `data` (load_data_random, load_data_smooth,
        # load_data_grid, load_Tgray_mat); this run uses the "Green" set.
        data_train, data_validation, im_size = data.load_Green_mat(
            args.n_images)

        X_tensor = tf.placeholder(
            tf.float32, shape=(None, data.INPUT_DIM), name="input")
        yt_tensor = tf.placeholder(
            tf.float32, shape=(None, data.OUTPUT_DIM), name="output")

        y_tensor = model.inference(
            X_tensor, n_units=15, output_dim=data.OUTPUT_DIM)
        loss_tensor = model.loss(y_tensor, yt_tensor)
        error_tensor = model.training_error(loss_tensor, yt_tensor)
        train_op = model.training(loss_tensor, args.learning_rate)

        # GPUs are hidden from the session unless explicitly requested.
        if args.gpu:
            config = tf.ConfigProto()
        else:
            config = tf.ConfigProto(device_count={'GPU': 0})

        init = tf.initialize_all_variables()
        saver = tf.train.Saver()
        sess = tf.Session(config=config)
        sess.run(init)

        # Show the target channel of the first training image.
        reference_image = data_train[0, ..., -1]
        show_image(reference_image, im_size)

        fetches = [train_op, loss_tensor, error_tensor]
        for step in range(args.max_steps):
            batch = data.next_batch_images(data_train, args.batch_size)
            X_data, yt_data = data.split_input_output(batch)
            _, loss_value, error = sess.run(
                fetches, feed_dict={X_tensor: X_data, yt_tensor: yt_data})

            if step % 5 == 0:
                epoch = step * args.batch_size / data_train.shape[0]
                print('Step %d (epoch %.2f): loss = %.2f (error = %.3f)' %
                      (step, epoch, loss_value, error))

            if step % 5 == 4:
                # Render the prediction for the first training image.
                first_image_inputs = data_train[0, ..., :-1]
                y_ = run_inference(sess, X_tensor, y_tensor,
                                   first_image_inputs)
                write_image(y_[:, 0], im_size, 'results/green-%i.jpg' % step)
Esempio n. 4
0
def train():
  """Train under a MonitoredTrainingSession, logging every 10th step."""
  with tf.Graph().as_default():
    global_step = tf.contrib.framework.get_or_create_global_step()
    print("Global step", global_step)

    images, labels = model.distorted_inputs()
    logits = model.inference(images)
    loss = model.loss(logits, labels)
    train_op = model.train(loss, global_step)

    class _LoggerHook(tf.train.SessionRunHook):
      """Times each run call and prints the loss every 10th step."""

      def begin(self):
        self._step = -1

      def before_run(self, run_context):
        self._step += 1
        self._start_time = time.time()
        # Ask the session to also fetch the loss on this run call.
        return tf.train.SessionRunArgs(loss)

      def after_run(self, run_context, run_values):
        duration = time.time() - self._start_time
        loss_value = run_values.results
        if self._step % 10 != 0:
          return
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)
        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print(format_str % (datetime.now(), self._step, loss_value,
                            examples_per_sec, sec_per_batch))

    hooks = [
        tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
        tf.train.NanTensorHook(loss),
        _LoggerHook(),
    ]
    session_config = tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement)
    with tf.train.MonitoredTrainingSession(
        checkpoint_dir=FLAGS.train_dir,
        hooks=hooks,
        config=session_config) as mon_sess:
      while not mon_sess.should_stop():
        mon_sess.run(train_op)
Esempio n. 5
0
def train():
    """Run one pass over ``train_data`` with clipped, hand-rolled SGD.

    Relies on names from the enclosing scope: ``model``, ``corpus``,
    ``train_data``, ``train_labels``, ``args``, ``lr``, ``epoch`` and
    ``get_batch``. Progress is printed every ``args.log_interval`` batches.
    """
    # Switch the model to training mode (the original notes this enables
    # dropout).
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)

    batch_offsets = range(0, train_data.size(0) - 1, args.bptt)
    for batch, i in enumerate(batch_offsets):
        data, labels = get_batch(train_data, train_labels, corpus.train_seq_lens, i)
        hidden = model.init_hidden(args.batch_size)
        model.zero_grad()
        output, _ = model(data, hidden)
        # Positions with a negative input id are masked out of the loss.
        mask = (data >= 0).float()
        loss, _ = model.loss(output, labels, mask)
        loss.backward()

        # Clip the gradient norm before the update; the original comment
        # cites exploding gradients in RNNs / LSTMs as the reason.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for param in model.parameters():
            param.data.add_(-lr, param.grad.data)

        total_loss += loss.item()

        if batch % args.log_interval != 0 or batch <= 0:
            continue

        cur_loss = total_loss / args.log_interval
        elapsed = time.time() - start_time
        report = ('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}')
        print(report.format(
            epoch, batch, len(train_data) // args.bptt, lr,
            elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
        total_loss = 0
        start_time = time.time()
Esempio n. 6
0
def train():
    """Train for ``input.FLAGS.max_steps`` steps with summaries and checkpoints.

    Logs every step, writes summaries every 10 steps and saves a checkpoint
    to ``input.FLAGS.train_dir`` every 25 steps.
    """
    with tf.Graph().as_default():
        flags = input.FLAGS
        global_step = tf.Variable(0, trainable=False)

        image, label = input.get_input(LABEL_PATH, LABEL_FORMAT, IMAGE_PATH, IMAGE_FORMAT)
        logits = model.inference(image)
        loss = model.loss(logits, label)
        train_op = model.train(loss, global_step)

        saver = tf.train.Saver(tf.all_variables())
        summary_op = tf.merge_all_summaries()
        init = tf.initialize_all_variables()

        session_config = tf.ConfigProto(log_device_placement=flags.log_device_placement)
        sess = tf.Session(config=session_config)
        sess.run(init)

        # The input pipeline is queue based, so its runners must be started.
        tf.train.start_queue_runners(sess=sess)
        summary_writer = tf.train.SummaryWriter(flags.train_dir, graph_def=sess.graph_def)

        for step in xrange(flags.max_steps):
            started = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - started

            assert not np.isnan(loss_value), "Model diverged with loss = NaN"

            # Interval of 1: the progress line is printed on every step.
            if step % 1 == 0:
                num_examples_per_step = flags.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)
                format_str = "%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)"
                print(format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch))

            if step % 10 == 0:
                summary_writer.add_summary(sess.run(summary_op), step)

            # Periodic checkpoint.
            if step % 25 == 0:
                checkpoint_path = os.path.join(flags.train_dir, "model.ckpt")
                saver.save(sess, checkpoint_path, global_step=step)
Esempio n. 7
0
def main(argv=None):
    """Train on the ``.tfrecords`` files found in ``FLAGS.datadir``.

    Prints the loss every step, writes summaries every 100 steps and saves
    a checkpoint to ``FLAGS.logdir`` every 250 steps and on the final step.

    Args:
        argv: Unused.
    """
    labels_data = labels_json()
    # Store the label JSON inside the graph as a non-trainable variable.
    tf.Variable(labels_data, trainable=False, name='labels')

    batch_size = 128
    record_dir = os.path.join(FLAGS.datadir)
    files = []
    for entry in os.listdir(record_dir):
        if entry.endswith('.tfrecords'):
            files.append(os.path.join(FLAGS.datadir, entry))

    images, labels = inputs(batch_size, files)
    num_outputs = len(json.loads(labels_data)) + 1
    logits = model.inference(images, num_outputs)
    losses = model.loss(logits, labels)
    train_op = model.train(losses)
    summary_op = tf.summary.merge_all()
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=21)

    with tf.Session() as sess:
        summary_writer = tf.summary.FileWriter(FLAGS.logdir, graph=sess.graph)
        restore_or_initialize(sess)

        tf.train.start_queue_runners(sess=sess)

        last_step = FLAGS.max_steps - 1
        for step in range(FLAGS.max_steps):
            started = time.time()
            _, loss_value = sess.run([train_op, losses])
            duration = time.time() - started

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            format_str = '%s: step %d, loss = %.5f (%.3f sec/batch)'
            print(format_str % (datetime.now(), step, loss_value, duration))

            if step % 100 == 0:
                summary_writer.add_summary(sess.run(summary_op), step)

            if step % 250 == 0 or step == last_step:
                checkpoint_path = os.path.join(FLAGS.logdir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step,
                           write_meta_graph=False, write_state=False)
Esempio n. 8
0
    def __init__(self, model_dir=None, gpu_fraction=0.7):
        """Create the session, build model, loss and optimizer, then restore.

        Args:
            model_dir: Directory searched for a checkpoint; falls back to
                ``FLAGS.model_dir`` when ``None``.
            gpu_fraction: Value for the session's
                ``per_process_gpu_memory_fraction`` GPU option.
        """
        session_config = tf.ConfigProto(allow_soft_placement=True)
        session_config.gpu_options.per_process_gpu_memory_fraction = gpu_fraction
        self.sess = tf.Session(config=session_config)

        (self.imgs_ph, self.bn, self.output_tensors,
         self.pred_labels, self.pred_locs) = model.model(self.sess)

        total_boxes = self.pred_labels.get_shape().as_list()[1]
        (self.positives_ph, self.negatives_ph,
         self.true_labels_ph, self.true_locs_ph,
         self.total_loss, self.class_loss, self.loc_loss) = model.loss(
            self.pred_labels, self.pred_locs, total_boxes)

        # Publish the output shapes and the derived default boxes on `c`.
        out_shapes = []
        for out in self.output_tensors:
            out_shapes.append(out.get_shape().as_list())
        c.out_shapes = out_shapes
        c.defaults = model.default_boxes(out_shapes)

        # Per the original note, the model's own variables are already
        # initialized, so only those created under "optimizer" are
        # initialized here.
        with tf.variable_scope("optimizer"):
            self.global_step = tf.Variable(0)
            self.lr_ph = tf.placeholder(tf.float32, shape=[])

            adam = tf.train.AdamOptimizer(1e-3)
            self.optimizer = adam.minimize(
                self.total_loss, global_step=self.global_step)
        new_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope="optimizer")
        self.sess.run(tf.initialize_variables(new_vars))

        model_dir = FLAGS.model_dir if model_dir is None else model_dir

        ckpt = tf.train.get_checkpoint_state(model_dir)
        self.saver = tf.train.Saver()

        # Restore only when a checkpoint is actually recorded.
        if ckpt and ckpt.model_checkpoint_path:
            self.saver.restore(self.sess, ckpt.model_checkpoint_path)
            print("restored %s" % ckpt.model_checkpoint_path)
def train():
    """Train on ``../mnist.pkl.gz`` for 10000 mini-batches of 50 examples.

    Every 100 iterations the accuracy on the current batch and on the
    validation split is printed; the test accuracy is printed at the end.
    Accuracy is evaluated with ``keep_prob`` 1.0 and training runs with
    ``keep_prob`` 0.5.
    """
    tr, va, te = read_dataset('../mnist.pkl.gz')
    binarizer = LabelBinarizer().fit(range(10))

    x = tf.placeholder(tf.float32, [None, 784])
    y = tf.placeholder(tf.float32, [None, 10])
    keep_prob = tf.placeholder(tf.float32)
    preds = model.inference(x, keep_prob)
    loss, total_loss = model.loss(preds, y)
    acc = model.evaluation(preds, y)
    # learning rate: 0.1
    train_op = model.training(total_loss, 0.1)

    init = tf.initialize_all_variables()
    sess = tf.Session()
    sess.run(init)
    for i in xrange(10000):
        batch_xs, batch_ys = tr.next_batch(50)
        if i % 100 == 0:
            train_acc = acc.eval(feed_dict={
                x: batch_xs, y: binarizer.transform(batch_ys),
                keep_prob: 1.0}, session=sess)
            # Single-argument print(...) behaves the same as a statement and
            # as a function call, unlike the bare print statement used before.
            print("step: {0}, training accuracy {1}".format(i, train_acc))
            validation_accuracy = getAccuracy(x, y, keep_prob, binarizer, acc, va, sess)
            print("Validation accuracy : {0}".format(validation_accuracy))
        train_op.run(feed_dict={
            x: batch_xs, y: binarizer.transform(batch_ys), keep_prob: 0.5},
                     session=sess)

    test_accuracy = getAccuracy(x, y, keep_prob, binarizer, acc, te, sess)
    # Formatted like the validation line: print("...", value) would print a
    # tuple wherever print is a statement.
    print("Test accuracy : {0}".format(test_accuracy))
Esempio n. 10
0
def tower_loss(images,
               score_maps,
               geo_maps,
               training_masks,
               labels,
               reuse_variables=None):
    """Build the forward pass and weighted losses for one tower.

    Args:
        images: Input image batch handed to ``model.model``.
        score_maps: Target score maps for ``model.loss``.
        geo_maps: Target geometry maps for ``model.loss``.
        training_masks: Masks handed to ``model.loss``.
        labels: Returned unchanged as the last element of the result.
        reuse_variables: ``reuse`` value for the current variable scope.
            Summaries are only registered when this is ``None``.

    Returns:
        ``(total_loss, model_loss, f_score, f_geometry, labels)``.
        ``total_loss`` is the model loss plus ``reg_constant`` (taken from
        the enclosing scope) times the summed regularization losses.
    """
    # Forward pass inside the (possibly reused) current variable scope.
    current_scope = tf.get_variable_scope()
    with tf.variable_scope(current_scope, reuse=reuse_variables):
        f_score, f_geometry = model.model(images, is_training=True)
        f_dat = labels

    model_loss = model.loss(score_maps, f_score, geo_maps, f_geometry,
                            training_masks)
    # Regularization is scaled by `reg_constant` rather than added as is.
    regularization = sum(
        tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
    total_loss = sum([model_loss]) + reg_constant * regularization

    if reuse_variables is None:
        # Image summaries first, then the two scalar losses.
        image_summaries = [
            ('input', images),
            ('score_map', score_maps),
            ('score_map_pred', f_score * 255),
            ('geo_map_0', geo_maps[:, :, :, 0:1]),
            ('geo_map_0_pred', f_geometry[:, :, :, 0:1]),
            ('training_masks', training_masks),
        ]
        for tag, tensor in image_summaries:
            tf.summary.image(tag, tensor)
        for tag, tensor in (('model_loss', model_loss),
                            ('total_loss', total_loss)):
            tf.summary.scalar(tag, tensor)

    return total_loss, model_loss, f_score, f_geometry, f_dat
Esempio n. 11
0
def model_fn(features, labels):
    """Build the ``tf.estimator.EstimatorSpec`` for the model.

    Args:
        features: Input passed to ``model.build``.
        labels: Targets passed to ``model.loss`` in EVAL and TRAIN modes.

    Returns:
        An ``EstimatorSpec`` carrying predictions and export outputs, plus
        the loss and eval metrics in EVAL/TRAIN and the train op in TRAIN.
    """
    # NOTE(review): `mode` and `params` are not parameters of this function,
    # so they are resolved from the enclosing scope. Confirm that is intended
    # and not a missing `mode, params` in the signature.
    tf.keras.backend.set_learning_phase(True)
    predictions = model.build(features)
    loss = None
    train_op = None
    eval_metric_ops = None

    if mode in [tf.estimator.ModeKeys.EVAL, tf.estimator.ModeKeys.TRAIN]:
        loss_dict, total_loss = model.loss(predictions, labels)
        # Was `toal_loss`, an undefined name that raised NameError here.
        loss = total_loss
        eval_metric_ops = eval_metric_operate(loss_dict)

        if mode == tf.estimator.ModeKeys.TRAIN:
            train_op = get_train_op(features, labels, total_loss, params)

    export_outputs = {
        tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
        tf.estimator.export.PredictOutput(predictions)
    }

    spec = tf.estimator.EstimatorSpec(mode=mode,
                                      predictions=predictions,
                                      loss=loss,
                                      train_op=train_op,
                                      eval_metric_ops=eval_metric_ops,
                                      export_outputs=export_outputs)
    return spec
Esempio n. 12
0
def run_training(data_dir='D:/WCsPy/data/train/', log_dir='saves'):
    """Train for up to 1000 steps, checkpointing every 100 steps.

    Args:
        data_dir: Directory handed to ``inputData.get_files``. Defaults to
            the path that used to be hard-coded.
        log_dir: Directory in which ``model.ckpt`` checkpoints are saved.
            Defaults to the value that used to be hard-coded.
    """
    image, label = inputData.get_files(data_dir)
    image_batches, label_batches = inputData.get_batches(
        image, label, 32, 32, 16, 20)
    print(image_batches.shape)
    p = model.mmodel(image_batches, 16)
    cost = model.loss(p, label_batches)
    train_op = model.training(cost, 0.001)
    acc = model.get_accuracy(p, label_batches)

    sess = tf.Session()
    init = tf.global_variables_initializer()
    sess.run(init)
    saver = tf.train.Saver()
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    try:
        for step in np.arange(1000):
            print(step)
            if coord.should_stop():
                break
            _, train_acc, train_loss = sess.run([train_op, acc, cost])
            print("loss:{} accuracy:{}".format(train_loss, train_acc))
            if step % 100 == 0:
                check = os.path.join(log_dir, "model.ckpt")
                saver.save(sess, check, global_step=step)
    except tf.errors.OutOfRangeError:
        # The input queue ran dry before the step budget was used up.
        print("Done!!!")
    finally:
        # Stop and join the queue threads and release the session on every
        # exit path; previously an unexpected exception skipped the join and
        # the close.
        coord.request_stop()
        try:
            coord.join(threads)
        finally:
            sess.close()
Esempio n. 13
0
def tower_loss(scope, images, labels):
    """Compute the summed loss of a single tower.

    Args:
        scope: Name-scope prefix used to filter the ``'losses'`` collection,
            e.g. ``'tower_0'``.
        images: Image batch handed to ``model.inference``.
        labels: Target batch handed to ``model.loss``.

    Returns:
        The ``tf.add_n`` sum (named ``'total_loss'``) of every entry in the
        ``'losses'`` collection that falls under ``scope``.
    """
    # Forward pass.
    resize_images = model.inference(images)

    # Called for its side effect only: the return value is discarded and the
    # loss terms are read back from the 'losses' collection below.
    model.loss(resize_images, labels)

    # Only the losses registered under this tower's scope are summed.
    tower_losses = tf.get_collection('losses', scope)
    total_loss = tf.add_n(tower_losses, name='total_loss')

    # Expose the tower total as a scalar summary.
    tf.summary.scalar('total_loss', total_loss)
    return total_loss
Esempio n. 14
0
	def __init__(self, model_dir=None):
		"""Create the session, build model, loss and optimizer, then restore.

		Args:
			model_dir: Directory searched for a checkpoint; falls back to
				``FLAGS.model_dir`` when ``None``.
		"""
		self.sess = tf.Session()

		(self.imgs_ph, self.bn, self.output_tensors,
			self.pred_labels, self.pred_locs) = model.model(self.sess)

		total_boxes = self.pred_labels.get_shape().as_list()[1]
		(self.positives_ph, self.negatives_ph,
			self.true_labels_ph, self.true_locs_ph,
			self.total_loss, self.class_loss, self.loc_loss) = model.loss(
				self.pred_labels, self.pred_locs, total_boxes)

		# Publish the output shapes and the derived default boxes on `c`.
		out_shapes = []
		for out in self.output_tensors:
			out_shapes.append(out.get_shape().as_list())
		c.out_shapes = out_shapes
		c.defaults = model.default_boxes(out_shapes)

		# Per the original note, the model's own variables are already
		# initialized, so only those created under "optimizer" are
		# initialized here.
		with tf.variable_scope("optimizer"):
			self.global_step = tf.Variable(0)
			self.lr_ph = tf.placeholder(tf.float32)
			adam = tf.train.AdamOptimizer(1e-3)
			self.optimizer = adam.minimize(self.total_loss, global_step=self.global_step)
		new_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="optimizer")
		self.sess.run(tf.variables_initializer(new_vars))

		model_dir = FLAGS.model_dir if model_dir is None else model_dir

		ckpt = tf.train.get_checkpoint_state(model_dir)
		self.saver = tf.train.Saver()

		# Restore only when a checkpoint is actually recorded.
		if ckpt and ckpt.model_checkpoint_path:
			self.saver.restore(self.sess, ckpt.model_checkpoint_path)
			print("restored %s" % ckpt.model_checkpoint_path)
Esempio n. 15
0
def train(tfrecord_file, train_dir, batch_size, num_epochs):
    """Train with Adam under a ``tf.train.Supervisor``.

    Args:
        tfrecord_file: Single TFRecord file handed to ``data_loader.inputs``.
        train_dir: Supervisor log directory.
        batch_size: Batch size; also sizes the input queue.
        num_epochs: Epoch limit handed to the input pipeline.
    """
    input_options = dict(
        batch_size=batch_size,
        num_threads=16,
        capacity=batch_size * 4,
        min_after_dequeue=batch_size * 2,
        num_epochs=num_epochs,
        is_training=True,
    )
    _, vectors, labels = data_loader.inputs([tfrecord_file], **input_options)

    loss = model.loss(vectors, labels)

    global_step = tf.Variable(0, name='global_step', trainable=False)

    # Everything in the UPDATE_OPS collection must run with each training
    # step (the original comment cites batch norm as the reason).
    pending_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(pending_updates):
        optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
        train_op = optimizer.minimize(loss, global_step=global_step)

    # The supervisor is configured with summary and checkpoint intervals.
    sv = tf.train.Supervisor(logdir=train_dir,
                             global_step=global_step,
                             save_summaries_secs=60,
                             save_model_secs=600)

    fetches = [train_op, loss, global_step]
    with sv.managed_session() as sess:
        while not sv.should_stop():
            _, loss_out, step_out = sess.run(fetches)
            if step_out % 100 != 0:
                continue
            print('Step {}: Loss {}'.format(step_out, loss_out))
Esempio n. 16
0
def train():
  """Train for ``FLAGS.max_steps`` steps with summaries and checkpoints.

  Logs every 10 steps, writes summaries every 100 steps and saves a
  checkpoint to ``FLAGS.train_dir`` every 1000 steps and on the final step.
  """
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

    # Get images and labels for model.
    images, labels = model.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = model.inference(images)

    # Calculate loss.
    loss = model.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = model.train(loss, global_step)

    # Create a saver.
    saver = tf.train.Saver(tf.global_variables())

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.summary.merge_all()

    # Build an initialization operation to run below.
    init = tf.global_variables_initializer()

    # Start running operations on the Graph. With log_device_placement=True
    # the device each operation runs on is printed to the screen.
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    # `graph_def=` is deprecated on FileWriter; `graph=` is the supported
    # way to attach the graph to the event file.
    summary_writer = tf.summary.FileWriter(FLAGS.train_dir, graph=sess.graph)

    for step in range(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)

        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)')
        print(format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch))

      if step % 100 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)

      # Save the model checkpoint periodically.
      if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
Esempio n. 17
0
def eval(module, test_iter, args, write_to_file=False):
    """Evaluate ``module`` on ``test_iter`` and return loss and accuracy.

    Args:
        module: Network to evaluate; its train/eval mode is restored on exit.
        test_iter: Iterable of batches exposing ``.text`` and ``.label``.
        args: Passed through to ``write_predictions``.
        write_to_file: When true, per-example predictions are collected and
            handed to ``write_predictions``.

    Returns:
        Dict with ``'loss'`` (summed batch loss divided by
        ``len(test_iter)``) and ``'accuracy'`` (correct / total examples).
    """
    mode = module.training
    module.eval()

    correct = 0
    total = 0
    loss_tot = 0
    eval_results = {}
    predictions = []

    try:
        for batch in tqdm(test_iter):
            scores = module.forward(batch.text)
            # NOTE(review): the loss comes from the outer `model`, not from
            # the `module` argument; confirm this is intended.
            loss = model.loss(scores, batch.label)
            loss_tot += loss.item()
            preds = scores.argmax(1).squeeze()
            # squeeze() collapses a single-example batch to a 0-d tensor,
            # which cannot be iterated when collecting predictions below.
            if preds.dim() == 0:
                preds = preds.unsqueeze(0)
            correct += (preds == batch.label).sum().item()
            total += batch.text.shape[0]

            if write_to_file:
                predictions += list(preds.cpu().numpy())

        eval_results['loss'] = loss_tot / len(test_iter)
        eval_results['accuracy'] = correct / total

        # Write predictions to file.
        if write_to_file:
            write_predictions(predictions, args, eval_results)
    finally:
        # Put the module back in its previous mode even if evaluation fails.
        module.train(mode)
    return eval_results
Esempio n. 18
0
def setup_refine_model(input_images, depth_maps, depth_maps_sigma, keep_conv,
                       keep_hidden):
    """Wire up the refinement network on top of a frozen coarse network.

    The implementation is picked by ``USE_ORIGINAL_MODEL``:
    ``original_model`` when set, ``maurice_model`` otherwise. Both branches
    make the same three calls.

    Args:
        input_images: Input image batch for both networks.
        depth_maps: Target depth maps for the loss.
        depth_maps_sigma: Passed to the loss together with ``depth_maps``.
        keep_conv: Passed to both the coarse and the refinement network.
        keep_hidden: Passed to the refinement network only.

    Returns:
        ``(logits, loss)`` of the refinement network.
    """
    print("refine train.")
    net = original_model if USE_ORIGINAL_MODEL else maurice_model

    # The coarse network is built with trainable=False.
    coarse = net.globalDepthMap(input_images, keep_conv, trainable=False)
    logits = net.localDepthMap(input_images, coarse, keep_conv, keep_hidden)
    loss = net.loss(logits, depth_maps, depth_maps_sigma)
    return logits, loss
Esempio n. 19
0
def train():
    """Train up to ``FLAGS.max_steps``, optionally resuming from a checkpoint.

    When ``FLAGS.resume_training`` is set and ``FLAGS.train_dir`` holds a
    checkpoint, variables are restored and the loop starts at the step
    encoded in the checkpoint file name; otherwise variables are initialized
    and the loop starts at step 0. Logs every 10 steps, writes summaries
    every 50 steps and checkpoints every 100 steps and on the final step.
    """
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        images, labels = model.distorted_inputs()

        logits = model.inference(images)

        loss = model.loss(logits, labels)

        train_op = model.train(loss, global_step)

        saver = tf.train.Saver(tf.all_variables())

        summary_op = tf.merge_all_summaries()

        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))

        ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
        if FLAGS.resume_training and ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            # The step is the trailing "-<step>" of the checkpoint file name.
            current_step = int(
                ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
        else:
            current_step = 0
            init = tf.initialize_all_variables()
            sess.run(init)

        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(SUMMARY_DIR,
                                                graph_def=sess.graph_def)

        for step in xrange(current_step, FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                # Each fragment ends in a space: the pieces used to be
                # joined without one, gluing the fields together.
                format_str = ('%s: step %d, loss = %.2f '
                              '(%.1f examples/sec; %.3f '
                              'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

            if step % 50 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            if step % 100 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
Esempio n. 20
0
def train():
    """Train the gray-to-color model for ``FLAGS.max_steps`` steps.

    Variables are initialized first; if ``FLAGS.checkpoint_dir`` holds a
    checkpoint, the trainable variables are then restored from it. Logs
    every 10 steps, writes summaries every 100 steps and saves a checkpoint
    to ``FLAGS.train_dir`` every 1000 steps and on the final step.
    """
    with tf.Graph().as_default(), tf.device("/gpu:0"):
        global_step = tf.Variable(0, trainable=False)

        dirs_gray, dirs_color = make_data_directory_list()
        gray_images, color_images = input.read_dirs(dirs_gray, dirs_color, is_train=True)

        inferenced = model.inference(gray_images)
        raw_loss, total_loss = model.loss(inferenced, color_images)

        train_op = get_train_op(raw_loss, total_loss, global_step)
        summary_op = tf.merge_all_summaries()

        # Only trainable variables are saved and restored (not
        # tf.all_variables()).
        saver = tf.train.Saver(tf.trainable_variables())

        init = tf.initialize_all_variables()

        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)

        if ckpt and ckpt.model_checkpoint_path:
            # Call form, matching the other print calls in this function;
            # the bare print statement does not parse on Python 3.
            print("restore from {}".format(ckpt.model_checkpoint_path))
            saver.restore(sess, ckpt.model_checkpoint_path)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            _, value_raw_loss, value_total_loss = sess.run([train_op, raw_loss, total_loss])
            duration = time.time() - start_time

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)
                format_str = ('%s: step %d, raw_loss = %.2f, total_loss = %.2f (%.1f examples/sec; %.3f '
                              'sec/batch)')
                print(format_str % (datetime.now(), step, value_raw_loss, value_total_loss,
                                    examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
Esempio n. 21
0
def train():
    """Run CIFAR-10 training in a monitored session until FLAGS.max_steps."""
    with tf.Graph().as_default():
        global_step = tf.train.get_or_create_global_step()
        print(global_step)

        # Training batches from the CIFAR-10 input pipeline.
        images, labels = cifar10_input.distorted_inputs()
        print(images)
        print(labels)

        # Forward pass, loss, and the op that applies one optimisation step.
        logits = model.inference(images)
        loss = model.loss(logits, labels)
        train_op = model.train(loss, global_step)
        print(logits)

        class _LoggerHook(tf.train.SessionRunHook):
            """Session hook that periodically prints loss and throughput."""

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                # Fetch the loss tensor alongside whatever the caller runs.
                return tf.train.SessionRunArgs(loss)

            def after_run(self, run_context, run_values):
                # Only report every FLAGS.log_frequency-th run.
                if self._step % FLAGS.log_frequency != 0:
                    return

                now = time.time()
                elapsed = now - self._start_time
                self._start_time = now

                examples_per_sec = (
                    FLAGS.log_frequency * FLAGS.batch_size / elapsed)
                sec_per_batch = float(elapsed / FLAGS.log_frequency)

                template = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(template % (datetime.now(), self._step,
                                  run_values.results, examples_per_sec,
                                  sec_per_batch))

        session_hooks = [
            tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
            _LoggerHook(),
        ]
        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=session_hooks) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
Esempio n. 22
0
def train():
    """Train the SUN3D model in a monitored session until NUM_ITER steps.

    The graph is built on '/gpu:1' (soft placement is enabled in the session
    config). Checkpoints are written to TRAIN_LOG by the monitored session.
    """
    with tf.Graph().as_default(), tf.device('/gpu:1'):
        global_step = tf.contrib.framework.get_or_create_global_step()

        # Get images and depth targets for SUN3D.
        images, depths = model.inputs()

        # Build a Graph that computes the predictions from the inference
        # model, in training mode.
        phase_train = True
        scores = model.inference(images, phase_train)

        # Calculate loss.
        loss = model.loss(scores, depths)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = model.train(loss, global_step)
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.96)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""
            def begin(self):
                self._step = -1

            def before_run(self, run_context):
                self._step += 1
                self._start_time = time.time()
                return tf.train.SessionRunArgs(loss)  # Asks for loss value.

            def after_run(self, run_context, run_values):
                duration = time.time() - self._start_time
                loss_value = run_values.results
                if self._step % 10 == 0:
                    num_examples_per_step = BATCH_SIZE
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = float(duration)

                    format_str = (
                        '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=TRAIN_LOG,
                hooks=[
                    tf.train.StopAtStepHook(last_step=NUM_ITER),
                    tf.train.NanTensorHook(loss),
                    _LoggerHook()
                ],
                config=tf.ConfigProto(
                    allow_soft_placement=True,
                    gpu_options=gpu_options,
                    log_device_placement=LOG_DEVICE_PLACEMENT)) as mon_sess:
            while not mon_sess.should_stop():
                # Fetch the loss in the same run as the update. A separate
                # run(loss) would evaluate the graph a second time per
                # iteration and fire every hook twice, so the logger's step
                # counter would advance two per training step.
                _, loss_value = mon_sess.run([train_op, loss])
                print(loss_value)
Esempio n. 23
0
def pgd_conv(x, y, images_pl, labels_pl, logits_pl, exp_config, sess, eps=None, step_alpha=None, epochs=None, sizes=None,
             weights=None):
    """Craft an adversarial example with PGD followed by smoothed PGD.

    Stage 1 runs ``epochs`` sign-gradient steps on the input, keeping the
    total perturbation clipped to [-eps, eps]. Stage 2 runs ``2 * epochs``
    further steps in which the perturbation is first averaged over
    ``len(sizes)`` convolutions with ``weights`` before the gradient is taken.
    The returned array is ``x`` plus the smoothed final perturbation.

    Args:
        x: NumPy array with the clean input (must support ``.copy()``).
        y: labels fed to ``labels_pl``.
        images_pl: placeholder the model input is fed through.
        labels_pl: placeholder the labels are fed through.
        logits_pl: logits tensor passed to ``model.loss``.
        exp_config: object providing ``nlabels``, ``loss_type`` and
            ``weight_decay``.
        sess: session used to evaluate gradients and smoothing.
        eps: bound on the absolute value of the perturbation.
        step_alpha: step size multiplied with the gradient sign.
        epochs: number of stage-1 iterations (stage 2 uses twice as many).
        sizes: only its length is used; it is the number of filters averaged.
        weights: sequence of conv2d filters; the first ``len(sizes)`` are used.

    Returns:
        NumPy array: ``x`` plus the smoothed perturbation.
    """
    # compute loss
    loss = model.loss(logits_pl,
                      labels_pl,
                      nlabels=exp_config.nlabels,
                      loss_type=exp_config.loss_type,
                      weight_decay=exp_config.weight_decay)

    # Build the gradient op once. Calling tf.gradients inside the loops adds
    # a fresh copy of the backward graph on every iteration.
    grad_pl, = tf.gradients(loss, images_pl)

    def _input_gradient(batch):
        """Evaluate d(loss)/d(images) with `batch` fed as the images."""
        grad = sess.run(grad_pl, feed_dict={images_pl: batch, labels_pl: y})
        assert grad is not None
        return grad

    # Stage 1: plain PGD on the input.
    crafting_output = x.copy()
    for _ in range(epochs):
        step_output = crafting_output + step_alpha * np.sign(
            _input_gradient(crafting_output))
        total_adv = np.clip(step_output - x, -eps, eps)
        crafting_output = x + total_adv

    added = crafting_output - x
    print('PGD DONE')

    # Build the smoothing op once, fed through a placeholder, instead of
    # baking each perturbation into the graph as a new constant plus a new
    # set of conv2d ops on every iteration.
    added_pl = tf.placeholder(tf.as_dtype(added.dtype), shape=added.shape)
    smoothed = tf.nn.conv2d(input=added_pl, filter=weights[0], padding='SAME', data_format='NHWC')
    for j in range(len(sizes) - 1):
        smoothed = smoothed + tf.nn.conv2d(input=added_pl, filter=weights[j + 1], padding='SAME',
                                           data_format='NHWC')
    smoothed = smoothed / float(len(sizes))  # average over multiple convolutions

    def _smooth(perturbation):
        """Return the filter-averaged version of `perturbation`."""
        return sess.run(smoothed, feed_dict={added_pl: perturbation})

    # Stage 2: PGD where the gradient is taken at the smoothed perturbation.
    # NOTE(review): only the smoothed perturbation is fed as the image here
    # (not x + perturbation), exactly as in the original code. Confirm this
    # is intended.
    for _ in range(epochs * 2):
        grad = _input_gradient(_smooth(added))
        added = added + step_alpha * np.sign(grad)
        added = np.clip(added, -eps, eps)

    print('SMOOTH PGD1 DONE')

    crafting_output = x + _smooth(added)

    print('SMOOTH PGD2 DONE')

    return crafting_output
def get_attack_batch(model_name, count):
    """Search for sign-gradient adversarial examples on a test batch.

    Restores the checkpoint for `model_name`, takes `count` test samples, and
    for every sample the model classifies correctly tries perturbations of
    increasing strength along the sign of the loss gradient. The attack picked
    by `get_first_successful` (if any) is kept together with its original.

    Returns:
        (original_images, successful_attacks): two parallel lists.
    """
    if not os.path.exists(FGSM_DIR):
        os.makedirs(FGSM_DIR)

    tf.reset_default_graph()

    # Graph: placeholders, network outputs and loss.
    img_batch = tf.placeholder(
        tf.float32, shape=[None, 28 * 28], name='img_batch')
    label_batch = tf.placeholder(
        tf.float32, shape=[None, 10], name='labels_batch')
    net = model.cnn(img_batch)
    logits = net.get('logits')
    probabilities = net.get('probabilities')
    loss = model.loss(label_batch, logits)

    test_images, test_labels = data.get_test_batch(count)
    true_classes = np.argmax(test_labels, axis=1)

    epsilons = np.arange(0., 100., 1)  # perturbation multipliers to try
    originals = []
    adversarials = []

    saver = tf.train.Saver()
    with tf.Session() as sess:
        checkpoint = pp + MODEL_DIR + os.sep + 'model_' + model_name + '.ckpt'
        saver.restore(sess, checkpoint)

        gradients = tf.gradients(loss, img_batch)
        feed = {img_batch: test_images, label_batch: test_labels}
        grad_vals, clean_probs = sess.run([gradients, probabilities],
                                          feed_dict=feed)

        # One unit step is 1/255 in the direction of the gradient sign.
        unit_steps = np.sign(grad_vals[0]) * 1.0 / 255.
        predicted = np.argmax(clean_probs, axis=1)

        for idx, unit_step in enumerate(unit_steps):
            # Skip samples that were misclassified before any perturbation.
            if predicted[idx] != true_classes[idx]:
                continue

            image = test_images[idx]
            candidates = [image + unit_step * 1 * scale for scale in epsilons]
            candidates = np.clip(candidates, 0, 1)  # keep pixels in [0, 1]

            # Classify every candidate attack in one batch.
            attack_probs = sess.run(probabilities,
                                    feed_dict={img_batch: candidates})

            winner = get_first_successful(attack_probs, candidates)
            if winner is not None:
                adversarials.append(winner)
                originals.append(image)

    log_attacks(originals, adversarials)
    return originals, adversarials
def train():
    """Train the model, optionally resuming from the latest checkpoint.

    When FLAGS.resume_training is set and a checkpoint exists in
    FLAGS.train_dir, variables are restored and the step counter continues
    from the number encoded in the checkpoint file name. Otherwise all
    variables are initialised and training starts at step 0.
    """
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        images, labels = model.distorted_inputs()

        logits = model.inference(images)

        loss = model.loss(logits, labels)

        train_op = model.train(loss, global_step)

        saver = tf.train.Saver(tf.all_variables())

        summary_op = tf.merge_all_summaries()

        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))

        ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
        if FLAGS.resume_training and ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            # Checkpoints are saved as 'model.ckpt-<step>'. Take the file
            # name with os.path so this also works with non-'/' separators.
            checkpoint_name = os.path.basename(ckpt.model_checkpoint_path)
            current_step = int(checkpoint_name.split('-')[-1])
        else:
            current_step = 0
            init = tf.initialize_all_variables()
            sess.run(init)

        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(SUMMARY_DIR,
                                                graph_def=sess.graph_def)

        for step in xrange(current_step, FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                # The adjacent literals are concatenated verbatim, so each
                # piece carries its own separating space.
                format_str = ('%s: step %d, loss = %.2f '
                              '(%.1f examples/sec; %.3f '
                              'sec/batch)')
                print (format_str % (datetime.now(), step, loss_value,
                                     examples_per_sec, sec_per_batch))

            if step % 50 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save a checkpoint periodically and at the very last step.
            if step % 100 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
def run_net(Xs, Ys, YsBoW=None):
    """Run `modelSp` on one batch and compute its loss(es).

    The NumPy inputs are moved to `args.device`, and a dimension is inserted
    at index 1 of `Xs` before the forward pass.

    Returns:
        With `args.mt`: (loss, lossBoW, predictions, BoW predictions,
        feature vector). Otherwise: (loss, predictions, feature vector).
        Predictions are returned as NumPy arrays.
    """
    device = args.device
    Xs, Ys = torch.from_numpy(Xs).to(device), torch.from_numpy(Ys).to(device)
    if args.mt:
        YsBoW = torch.from_numpy(YsBoW).to(args.device)

    dims = Xs.size()
    N, H, W = dims[0], dims[1], dims[2]
    Xs = Xs.unsqueeze(dim=1)  # (N, H, W) -> (N, 1, H, W)

    if not args.mt:
        fVectorSp, Ys_pred = modelSp(Xs)
        loss = model.loss(Ys_pred, Ys)
        return loss, Ys_pred.cpu().data.numpy(), fVectorSp

    # Multi-task mode: the network also produces a BoW prediction.
    fVectorSp, Ys_predBoW, Ys_pred = modelSp(Xs, modelType=args.mtType)
    loss = model.loss(Ys_pred, Ys)
    lossBoW = model.loss(Ys_predBoW, YsBoW)
    pred_np = Ys_pred.cpu().data.numpy()
    pred_bow_np = Ys_predBoW.cpu().data.numpy()
    return loss, lossBoW, pred_np, pred_bow_np, fVectorSp
Esempio n. 27
0
    def __init__(self,
                 datafold,
                 adam_rate=0.0001,
                 batch_size=256,
                 n_epochs=30,
                 penalty_intensity=0.05):
        """Set up paths, hyper-parameters and the 3D-CNN training graph.

        Args:
            datafold: identifier of the data fold; selects the
                'datafold_<datafold>/' directory under PATH_TO_DATA.
            adam_rate: learning rate passed to the Adam optimizer.
            batch_size: stored on the instance and used in the log dir name.
            n_epochs: stored on the instance.
            penalty_intensity: forwarded to `loss` and used in the log dir
                name.
        """
        fold_dir = PATH_TO_DATA + 'datafold_' + str(datafold) + '/'

        # Set sizes are the row counts of the CSV index files.
        self.training_set_size = len(pd.read_csv(fold_dir + "train_set.csv"))
        self.train_tf_records_path = fold_dir + 'train_256_3d.tfrecords'

        self.test_set_size = len(pd.read_csv(fold_dir + "test_set.csv"))
        self.test_tf_records_path = fold_dir + 'test_256_3d.tfrecords'

        # Store the hyper-parameters, then echo them.
        hyper_params = (
            ('adam_rate', adam_rate),
            ('batch_size', batch_size),
            ('n_epochs', n_epochs),
            ('penalty_intensity', penalty_intensity),
        )
        for attr_name, attr_value in hyper_params:
            setattr(self, attr_name, attr_value)
        for attr_name, attr_value in hyper_params:
            print(attr_name + ": " + str(attr_value))

        # Log / checkpoint locations are keyed by the hyper-parameters.
        run_tag = ('LR_' + str(adam_rate) + '_BS_' + str(batch_size) +
                   '_L2_' + str(penalty_intensity))
        self.logdir = fold_dir + '/logs_3D_CNN_' + run_tag + '/'
        self.tensorboard_n_checkpoint = (
            self.logdir + 'tensorboard_n_checkpoint/')
        self.chkpt = self.tensorboard_n_checkpoint + 'model.ckpt'

        with tf.variable_scope('3D_CNN'):
            input_shape = [
                None, MODIFIED_SIZE, MODIFIED_SIZE, MODIFIED_SIZE, NUM_CHANNEL
            ]
            self.X = tf.placeholder(tf.float32, input_shape, name='X')
            self.y = tf.placeholder(tf.float32, [None, OUTPUT_SIZE], name='y')
            self.keep_rate = tf.placeholder(tf.float32)

            score = inference(self.X, self.keep_rate, OUTPUT_SIZE)
            softmax = tf.nn.softmax(score)
            self.cost = loss(score, self.y, self.penalty_intensity)

            trainable = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
            adam = tf.train.AdamOptimizer(self.adam_rate)
            self.optimizer = adam.minimize(self.cost, var_list=trainable)

            # A prediction is correct when the arg-max class matches the
            # arg-max of the target row.
            predicted_class = tf.argmax(softmax, axis=1)
            target_class = tf.argmax(self.y, axis=1)
            self.preds = tf.equal(predicted_class, target_class)
            self.accuracy = tf.reduce_mean(tf.cast(self.preds, tf.float32))

        self.cost_summary = tf.summary.scalar(name='Cost', tensor=self.cost)
        self.accuracy_summary = tf.summary.scalar(name='Accuracy',
                                                  tensor=self.accuracy)
        self.summary = tf.summary.merge_all()
Esempio n. 28
0
def train():
    """Build the detection graph and train it for 10000 steps.

    Summaries are written every 50 steps and a checkpoint every 2000 steps
    (and at the final step), both under `logs_train_dir`.
    """
    # Dataset
    print("start")

    # Load image names and annotation parameters.
    imagesname, labels, w_h_s, number = reader.input_data()
    label_data = reader.data_normalizer(labels, w_h_s, number)
    label_32 = tf.cast(label_data, tf.float32)
    print(label_32)
    print("start2")
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    image_batch, label_batch = reader.get_batch(imagesname, label_data, number)
    print(image_batch)

    ps1, ps2, ps3 = model.model(image_batch, True)

    scale1, scale2, scale3 = model.scales(ps1, ps2, ps3, True)
    loss = model.loss(scale1, label_batch)
    # NOTE(review): the squeezed tensor is discarded, so `loss` itself is
    # unchanged. Kept as in the original; confirm whether the result was
    # meant to be assigned back to `loss`.
    tf.squeeze(loss, 2)
    print(loss)
    tf.summary.scalar('loss', loss)
    train_op = model.op(loss, 0.01)
    summary_op = tf.summary.merge_all()
    saver = tf.train.Saver()
    coord = tf.train.Coordinator()

    # A single session is used for everything. The original opened an extra
    # tf.Session() up front that was never closed.
    with tf.Session() as sess:
        train_writer = tf.summary.FileWriter(logs_train_dir, sess.graph)
        sess.run(tf.global_variables_initializer())
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        try:
            for step in range(10000):
                op, loss_result = sess.run([train_op, loss])

                if step % 50 == 0:
                    print(step)
                    print(loss_result)
                    summary_str = sess.run(summary_op)
                    train_writer.add_summary(summary_str, step)

                if step % 2000 == 0 or (step + 1) == 10000:
                    # Save the model every 2000 steps; checkpoints go to
                    # checkpoint_path.
                    checkpoint_path = os.path.join(logs_train_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=step)
        finally:
            # Stop the input threads before the session is closed and flush
            # pending summaries to disk.
            coord.request_stop()
            coord.join(threads)
            train_writer.close()

    print('finish')
Esempio n. 29
0
def train():
    """Train the classifier in a monitored session until `max_steps`.

    Loss and accuracy are printed every `log_frequency` runs, summaries are
    saved to `board_dir` every 5 steps, and checkpoints go to `train_dir`.
    """
    with tf.Graph().as_default():
        global_step = tf.train.get_or_create_global_step()
        images, labels = input.createBatch(tfrecords_name, batch_size)
        logits = model.inference(images, batch_size, n_classes)
        loss = model.loss(logits, labels)
        accuracy = model.evaluation(logits, labels)
        train_op = model.trainning(loss, learning_rate, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss, accuracy and runtime."""
            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                # Asks for the loss and accuracy values.
                return tf.train.SessionRunArgs([loss, accuracy])

            def after_run(self, run_context, run_values):
                if self._step % log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time
                    [loss_value, accuracy_value] = run_values.results
                    examples_per_sec = log_frequency * batch_size / duration
                    sec_per_batch = float(duration / log_frequency)
                    format_str = (
                        '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))
                    print('Accuracy = %.2f' % accuracy_value)

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=train_dir,
                hooks=[
                    tf.train.StopAtStepHook(last_step=max_steps),
                    tf.train.NanTensorHook(loss),
                    tf.train.SummarySaverHook(
                        save_steps=5,
                        output_dir=board_dir,
                        summary_op=tf.summary.merge_all()),
                    _LoggerHook()
                ]) as mon_sess:
            # The monitored session starts and stops the queue runners with
            # its own coordinator. Starting them again by hand is redundant,
            # and enqueue ops run through `mon_sess` would trigger the hooks.
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
Esempio n. 30
0
def run_net(Xs, Ys, YsBoW=None):
    """Run `network` on one batch and compute its loss(es).

    The NumPy inputs are moved to `args.device`, and a dimension is inserted
    at index 1 of `Xs` before the forward pass. The mode is chosen by
    `args.attn` first, then `args.mt`, matching the order of the original
    forward-pass and return branches.

    Returns:
        args.attn: (loss, predictions, attention weights)
        args.mt:   (loss, lossBoW, predictions, BoW predictions)
        otherwise: (loss, predictions)
        Predictions are returned as NumPy arrays.
    """
    Xs, Ys = torch.from_numpy(Xs).to(args.device), torch.from_numpy(Ys).to(
        args.device)
    Xs = Xs.unsqueeze(dim=1)  # (N, H, W) -> (N, 1, H, W)

    if args.attn:
        # The attention branch takes precedence over multi-task. The BoW
        # loss is not computed here because this branch yields no BoW
        # prediction; computing it used to raise NameError when both flags
        # were set.
        Ys_pred, attn_weights = network(Xs)
        loss = model.loss(Ys_pred, Ys)
        return loss, Ys_pred.cpu().data.numpy(), attn_weights

    if args.mt:
        YsBoW = torch.from_numpy(YsBoW).to(args.device)
        Ys_predBoW, Ys_pred = network(Xs)
        loss = model.loss(Ys_pred, Ys)
        lossBoW = model.loss(Ys_predBoW, YsBoW)
        return loss, lossBoW, Ys_pred.cpu().data.numpy(), Ys_predBoW.cpu(
        ).data.numpy()

    Ys_pred = network(Xs)
    loss = model.loss(Ys_pred, Ys)
    return loss, Ys_pred.cpu().data.numpy()
Esempio n. 31
0
def run_training():
    """Train the classifier for STEP steps from a queue-based input pipeline.

    Training loss/accuracy are printed and summaries written every 50 steps;
    a checkpoint is saved every 2000 steps and at the final step.
    """
    train_dir = "D:\新建文件夹\python foot/train/"
    log_train_dir = "D:\新建文件夹\python foot/train_savenet/"
    train, train_labels = pre_process.get_files(train_dir)
    train_batch, train_label_batch = pre_process.get_batch(
        train, train_labels, IMG_W, IMG_H, BATCH_SIZE, CAPACITY)
    train_logits = model.inference(train_batch, BATCH_SIZE, N_CLASSES)
    train_loss = model.loss(train_logits, train_label_batch)
    train_op = model.training(train_loss, LEARNING_RATE)
    train_acc = model.evalution(train_logits, train_label_batch)
    # merge_all collects every summary so they can be written to disk for
    # TensorBoard in one go.
    summary_op = tf.summary.merge_all()
    # NOTE: no validation graph is built here, so no validation metrics are
    # reported. The original printed `vadiation_loss` / `vadiation_acc`,
    # which were never defined, and crashed with NameError at step 0.
    sess = tf.Session()
    # Writer that stores the graph and the training summaries.
    train_writer = tf.summary.FileWriter(log_train_dir, sess.graph)
    saver = tf.train.Saver()

    sess.run(tf.global_variables_initializer())
    # Coordinator and start_queue_runners drive the input queues that feed
    # the batches.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    try:
        for step in np.arange(STEP):
            if coord.should_stop():
                break
            _, tra_loss, tra_acc = sess.run([train_op, train_loss, train_acc])

            # %.2f prints a float with two decimals; %% prints a literal %.
            if step % 50 == 0:
                print("step %d, train loss = %.2f, train accuracy  = %.2f%%" %
                      (step, tra_loss, tra_acc * 100.0))
                summary_str = sess.run(summary_op)
                train_writer.add_summary(summary_str, step)

            if step % 2000 == 0 or (step + 1) == STEP:
                # Save the model every 2000 steps; checkpoints go to
                # checkpoint_path.
                checkpoint_path = os.path.join(log_train_dir, "model.ckpt")
                saver.save(sess, checkpoint_path, global_step=step)
    except tf.errors.OutOfRangeError:
        print('Done training -- epoch limit reached')
    finally:
        # Always stop the input threads and release the session, even when
        # training aborts with an unexpected error.
        coord.request_stop()
        try:
            coord.join(threads)
        finally:
            train_writer.close()
            sess.close()
Esempio n. 32
0
def train():
    """Train the flow network with Adam and an exponentially decayed rate.

    Progress is logged and summaries written every 100 steps. Every 1000
    steps (and at the last step) a checkpoint is saved to `log_dir` and the
    validation set is evaluated with `run_val`.
    """
    train, validation = data.get_nameset()
    with tf.Graph().as_default():
        img1_placeholder, img2_placeholder, flo_placeholder = model.placeholder_inputs(
        )
        predict6, predict5, predict4, predict3, predict2 = model.inference(
            img1_placeholder, img2_placeholder)
        loss = model.loss(predict6, predict5, predict4, predict3, predict2,
                          flo_placeholder)

        # One global step drives both the decay schedule and the optimizer.
        # The original created a second variable for the optimizer, so the
        # counter used by the schedule never advanced and the learning rate
        # never decayed.
        global_step = tf.Variable(0, name='global_step', trainable=False)
        learning_rate = tf.train.exponential_decay(initial_learning_rate,
                                                   global_step,
                                                   decay_steps=200000,
                                                   decay_rate=0.1)
        optimizer = tf.train.AdamOptimizer(learning_rate)
        train_op = optimizer.minimize(loss, global_step=global_step)

        summary = tf.summary.merge_all()
        init = tf.initialize_all_variables()
        saver = tf.train.Saver()
        sess = tf.Session()
        # Create the writer once. Building it inside the loop opened a new
        # event file on every logging step.
        summary_writer = tf.summary.FileWriter(log_dir, sess.graph)

        train_timer = Timer()

        sess.run(init)
        for step in xrange(max_steps):
            train_timer.tic()
            feed_dict = model.fill_feed_dict(train, img1_placeholder,
                                             img2_placeholder, flo_placeholder)
            _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)
            train_timer.toc()

            if step % 100 == 0:
                log_str = (
                    '{} Epoch: {}, Step: {}, Learning rate: {},'
                    ' Loss: {:5.3f}\nSpeed: {:.3f}s/iter, Remain: {}'
                ).format(
                    datetime.datetime.now().strftime('%m/%d %H:%M:%S'),
                    train.epochs_completed, int(step),
                    learning_rate.eval(session=sess), loss_value,
                    train_timer.average_time,
                    train_timer.remain(step, max_steps))
                print(log_str)
                summary_str = sess.run(summary, feed_dict=feed_dict)
                summary_writer.add_summary(summary_str, step)
                summary_writer.flush()

            if (step + 1) % 1000 == 0 or (step + 1) == max_steps:
                checkpoint_file = os.path.join(log_dir, 'model.ckpt')
                saver.save(sess, checkpoint_file, global_step=step)
                print('Validation Data Eval:')
                run_val(sess, img1_placeholder, img2_placeholder,
                        flo_placeholder, loss, validation)
Esempio n. 33
0
def eval_h5(conf, ckpt):
    """
    Evaluate a model on the training and test streams and print the PSNR.

    Args:
      conf: configuration dictionary; "cw" and "iw" are read here and the
        whole dictionary is forwarded to the data/model helpers
      ckpt: checkpoint argument forwarded to tools.tf_boilerplate
    """
    cw = conf["cw"]
    iw = conf["iw"]

    # Prepare data
    tr_stream, te_stream = tools.prepare_data(conf)

    try:
        with tf.Graph().as_default(), tf.device("/cpu:0" if FLAGS.dev_assign else None):
            # Placeholders: one input/target pair per tower. Targets are
            # smaller than inputs by `cw` pixels on each border.
            Xs = [tf.placeholder(tf.float32, [None, iw, iw, 1], name="X_%02d" % i) for i in range(FLAGS.num_gpus)]
            Ys = [
                tf.placeholder(tf.float32, [None, iw - 2 * cw, iw - 2 * cw, 1], name="Y_%02d" % i)
                for i in range(FLAGS.num_gpus)
            ]

            # Build one inference tower per GPU
            y_splits = []
            for i in range(FLAGS.num_gpus):
                with tf.device(("/gpu:%d" % i) if FLAGS.dev_assign else None):
                    with tf.name_scope("%s_%02d" % (FLAGS.tower_name, i)) as scope:
                        # This constructs the entire model but shares the
                        # variables across all towers.
                        y_split = model.inference(Xs[i], conf)
                        y_splits.append(y_split)
                        # The loss value is not used for evaluation; the call
                        # is kept so the graph matches the original one.
                        model.loss(y_split, Ys[i], conf, scope)

                        # Reuse variables for the next tower.
                        tf.get_variable_scope().reuse_variables()

            y = tf.concat(0, y_splits, name="y")

            # Tensorflow boilerplate
            sess, saver, summ_writer, summ_op = tools.tf_boilerplate(None, conf, ckpt)

            # Evaluation
            psnr_tr = eval_epoch(Xs, Ys, y, sess, tr_stream, cw)
            psnr_te = eval_epoch(Xs, Ys, y, sess, te_stream, cw)
            print("approx psnr_tr=%.3f" % psnr_tr)
            print("approx psnr_te=%.3f" % psnr_te)
    finally:
        # Release the data streams even if graph building or evaluation fails.
        tr_stream.close()
        te_stream.close()
Esempio n. 34
0
def eval_h5(conf, ckpt):
    """
    Evaluate a model on the training and test streams and print the PSNR.

    Args:
      conf: configuration dictionary; 'cw' and 'iw' are read here and the
        whole dictionary is forwarded to the data/model helpers
      ckpt: checkpoint argument forwarded to tools.tf_boilerplate
    """
    cw = conf['cw']
    iw = conf['iw']

    # Prepare data
    tr_stream, te_stream = tools.prepare_data(conf)

    try:
        with tf.Graph().as_default(), tf.device('/cpu:0' if FLAGS.dev_assign else None):
            # Placeholders: one input/target pair per tower. Targets are
            # smaller than inputs by `cw` pixels on each border.
            Xs = [tf.placeholder(tf.float32, [None, iw, iw, 1], name='X_%02d' % i)
                  for i in range(FLAGS.num_gpus)]
            Ys = [tf.placeholder(tf.float32, [None, iw - 2*cw, iw - 2*cw, 1],
                                 name='Y_%02d' % i)
                  for i in range(FLAGS.num_gpus)]

            # Build one inference tower per GPU
            y_splits = []
            for i in range(FLAGS.num_gpus):
                with tf.device(('/gpu:%d' % i) if FLAGS.dev_assign else None):
                    with tf.name_scope('%s_%02d' % (FLAGS.tower_name, i)) as scope:
                        # This constructs the entire model but shares the
                        # variables across all towers.
                        y_split = model.inference(Xs[i], conf)
                        y_splits.append(y_split)
                        # The loss value is not used for evaluation; the call
                        # is kept so the graph matches the original one.
                        model.loss(y_split, Ys[i], conf, scope)

                        # Reuse variables for the next tower.
                        tf.get_variable_scope().reuse_variables()

            y = tf.concat(0, y_splits, name='y')

            # Tensorflow boilerplate
            sess, saver, summ_writer, summ_op = tools.tf_boilerplate(None, conf, ckpt)

            # Evaluation
            psnr_tr = eval_epoch(Xs, Ys, y, sess, tr_stream, cw)
            psnr_te = eval_epoch(Xs, Ys, y, sess, te_stream, cw)
            print('approx psnr_tr=%.3f' % psnr_tr)
            print('approx psnr_te=%.3f' % psnr_te)
    finally:
        # Release the data streams even if graph building or evaluation fails.
        tr_stream.close()
        te_stream.close()
Esempio n. 35
0
def _tower_loss(images, labels, num_classes, scope, reuse_variables=None):
    """Build one tower of the model and return its logits and total loss.

    Args:
        images: input batch handed to `inception.inference`; its first
            dimension is used as the per-tower batch size.
        labels: labels handed to `inception.loss`.
        num_classes: number of classes.
        scope: name-scope prefix identifying this tower, e.g. 'tower_0';
            used to select this tower's entries in the LOSSES collection.
        reuse_variables: forwarded as `reuse` to the variable scope wrapping
            the inference graph.

    Returns:
        (logits, total_loss). `total_loss` is the sum of this tower's losses
        and all regularization losses, wrapped so that evaluating it also
        updates the moving averages of the losses.
    """
    # Inference graph, built inside the current variable scope.
    current_scope = tf.get_variable_scope()
    with tf.variable_scope(current_scope, reuse=reuse_variables):
        logits = inception.inference(images,
                                     num_classes,
                                     for_training=True,
                                     scope=scope)

    # inception.loss registers its losses in the LOSSES collection.
    inception.loss(logits, labels, batch_size=tf.shape(images)[0])

    tower_losses = tf.get_collection(tf.GraphKeys.LOSSES, scope)
    reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    total_loss = tf.add_n(tower_losses + reg_losses, name='total_loss')

    # Moving averages of each individual loss and of the total.
    tracked = tower_losses + [total_loss]
    averager = tf.train.ExponentialMovingAverage(0.9, name='avg')
    update_averages = averager.apply(tracked)

    # Summaries for the raw and the averaged value of every tracked loss,
    # with the tower prefix removed from the name.
    tower_prefix = '%s_[0-9]*/' % inception.TOWER_NAME
    for tensor in tracked:
        base_name = re.sub(tower_prefix, '', tensor.op.name)
        tf.summary.scalar(base_name + '_raw', tensor)
        tf.summary.scalar(base_name, averager.average(tensor))

    # Evaluating the returned loss forces the averages to be updated first.
    with tf.control_dependencies([update_averages]):
        total_loss = tf.identity(total_loss)
    return logits, total_loss
Esempio n. 36
0
 def train(self, epoch_idx, batch_size, max_norm):
     """Run one pass over the training batches.

     Args:
         epoch_idx: epoch number, used only in the log message.
         batch_size: batch size passed to the data sampler.
         max_norm: gradient-norm clipping threshold; clipping is skipped
             unless it is greater than 0.
     """
     log = self.logger
     net = self.model
     dataset = self.data

     current_lr = self.optimizer.param_groups[0]['lr']
     log.info('At %d-th epoch with lr %f.', epoch_idx, current_lr)

     net.train()
     total_batches = ceil(dataset.nb_train / batch_size)
     progress = tqdm(dataset.train_batch_sample(batch_size),
                     total=total_batches)
     for src, src_mask, trg, _ in progress:
         prediction = net(src, src_mask, trg)
         # The loss is computed against the target with its first
         # position dropped.
         loss = net.loss(prediction, trg[1:])

         self.optimizer.zero_grad()
         loss.backward()
         if max_norm > 0:
             torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm)

         grad_total = util.grad_norm(net.parameters())
         log.debug('loss %f with total grad norm %f', loss, grad_total)
         self.optimizer.step()
Esempio n. 37
0
def tower_loss(images, score_maps, geo_maps, training_masks, reuse_variables=None):
    """Build the model on ``images`` and return its losses.

    Args:
        images: Input tensor fed to ``model.model``.
        score_maps: Target score maps passed to ``model.loss``.
        geo_maps: Target geometry maps passed to ``model.loss``.
        training_masks: Masks passed to ``model.loss``.
        reuse_variables: Forwarded as ``reuse`` to ``tf.variable_scope``.
            Summaries are only registered when this is ``None``.

    Returns:
        A ``(total_loss, model_loss)`` pair, where ``total_loss`` is
        ``model_loss`` plus everything in the REGULARIZATION_LOSSES collection.
    """
    # Forward pass inside the current variable scope.
    current_scope = tf.get_variable_scope()
    with tf.variable_scope(current_scope, reuse=reuse_variables):
        f_score, f_geometry = model.model(images, is_training=True)

    model_loss = model.loss(score_maps, f_score, geo_maps, f_geometry, training_masks)
    reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    total_loss = tf.add_n([model_loss] + reg_losses)

    if reuse_variables is not None:
        return total_loss, model_loss

    # Image summaries: (tag, tensor) pairs, registered in this order.
    image_summaries = (
        ('input', images),
        ('score_map', score_maps),
        ('score_map_pred', f_score * 255),
        ('geo_map_0', geo_maps[:, :, :, 0:1]),
        ('geo_map_0_pred', f_geometry[:, :, :, 0:1]),
        ('training_masks', training_masks),
    )
    for tag, tensor in image_summaries:
        tf.summary.image(tag, tensor)

    for tag, value in (('model_loss', model_loss), ('total_loss', total_loss)):
        tf.summary.scalar(tag, value)

    return total_loss, model_loss
Esempio n. 38
0
def train(conf, ckpt=False):
    """
    Train model for a number of steps.

    Builds one model tower per GPU (``FLAGS.num_gpus``), averages the tower
    gradients, applies them with Adam, and runs the training loop with
    periodic PSNR printouts, summaries and checkpoints.

    Args:
      conf: configuration dictionary. This function reads the keys "cw",
        "mb_size", "path_tmp", "n_epochs", "iw", "grad_norm_thresh" and
        "l2_reg"; the dictionary is also passed on to `tools` and `model`.
      ckpt: restore from ckpt (forwarded to `tools.reset_tmp` and
        `tools.tf_boilerplate`)
    """
    cw = conf["cw"]
    mb_size = conf["mb_size"]
    path_tmp = conf["path_tmp"]
    n_epochs = conf["n_epochs"]
    iw = conf["iw"]
    grad_norm_thresh = conf["grad_norm_thresh"]

    tools.reset_tmp(path_tmp, ckpt)

    # Prepare data
    tr_stream, te_stream = tools.prepare_data(conf)
    n_tr = tr_stream.dataset.num_examples
    n_te = te_stream.dataset.num_examples

    with tf.Graph().as_default(), tf.device("/cpu:0" if FLAGS.dev_assign else None):
        # Exponential decay learning rate
        global_step = tf.get_variable(
            "global_step", [], initializer=tf.constant_initializer(0), dtype=tf.int32, trainable=False
        )
        lr = tools.exp_decay_lr(global_step, n_tr, conf)

        # Create an optimizer that performs gradient descent
        opt = tf.train.AdamOptimizer(lr)

        # Placeholders: one input/target pair per GPU. The target is the input
        # size shrunk by `cw` pixels on every side.
        Xs = [tf.placeholder(tf.float32, [None, iw, iw, 1], name="X_%02d" % i) for i in range(FLAGS.num_gpus)]
        Ys = [
            tf.placeholder(tf.float32, [None, iw - 2 * cw, iw - 2 * cw, 1], name="Y_%02d" % i)
            for i in range(FLAGS.num_gpus)
        ]

        # Calculate the gradients for each model tower
        tower_grads = []
        y_splits = []
        for i in range(FLAGS.num_gpus):
            with tf.device(("/gpu:%d" % i) if FLAGS.dev_assign else None):
                with tf.name_scope("%s_%02d" % (FLAGS.tower_name, i)) as scope:
                    # Calculate the loss for one tower. This function constructs
                    # the entire model but shares the variables across all towers.
                    y_split, model_vars = model.inference(Xs[i], conf)
                    y_splits.append(y_split)
                    total_loss = model.loss(y_split, model_vars, Ys[i], conf["l2_reg"], scope)

                    # Calculate the gradients for the batch of data on this tower.
                    gvs = opt.compute_gradients(total_loss)

                    # Optionally clip gradients.
                    if grad_norm_thresh > 0:
                        gvs = tools.clip_by_norm(gvs, grad_norm_thresh)

                    # Keep track of the gradients across all towers.
                    tower_grads.append(gvs)

                    # Reuse variables for the next tower.
                    tf.get_variable_scope().reuse_variables()

                    # Retain the summaries from the final tower.
                    summs = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)

        # Full-batch prediction: per-tower outputs concatenated along axis 0.
        y = tf.concat(0, y_splits, name="y")

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        gvs = tools.average_gradients(tower_grads)

        # Apply the gradients to adjust the shared variables.
        apply_grad_op = opt.apply_gradients(gvs, global_step=global_step)

        # Add a summary to track the learning rate.
        summs.append(tf.scalar_summary("learning_rate", lr))

        # Add histograms for gradients.
        for g, v in gvs:
            # A Tensor must not be used as a Python bool (TensorFlow raises
            # TypeError); variables without a gradient are marked with None.
            if g is not None:
                v_name = re.sub("%s_[0-9]*/" % FLAGS.tower_name, "", v.op.name)
                summs.append(tf.histogram_summary(v_name + "/gradients", g))

        # Tensorflow boilerplate
        sess, saver, summ_writer, summ_op = tools.tf_boilerplate(summs, conf, ckpt)

        # Baseline error
        # bpsnr_tr = tools.baseline_psnr(tr_stream)
        # bpsnr_te = tools.baseline_psnr(te_stream)
        # print('approx baseline psnr_tr=%.3f' % bpsnr_tr)
        # print('approx baseline psnr_te=%.3f' % bpsnr_te)

        # Train
        format_str = "%s| %04d PSNR=%.3f (%.3f) (F+B: %.1fex/s; %.1fs/batch)" "(F: %.1fex/s; %.1fs/batch)"
        # Start counting from the stored global step (non-zero after a restore).
        step = sess.run(global_step)
        for epoch in range(n_epochs):
            print("--- Epoch %d ---" % epoch)
            # Training
            for X_c, y_c in tr_stream.get_epoch_iterator():
                # Skip chunks too small to give every GPU at least one example.
                if X_c.shape[0] < FLAGS.num_gpus:
                    continue
                # Crop the target to the model's output size.
                y_c = y_c[:, cw:-cw, cw:-cw]
                chunk_size = X_c.shape[0]
                gpu_chunk = chunk_size // FLAGS.num_gpus
                # Split the chunk evenly across GPUs; the last GPU also takes
                # the remainder.
                dict_input1 = [
                    (Xs[i], X_c[i * gpu_chunk : ((i + 1) * gpu_chunk) if (i != FLAGS.num_gpus - 1) else chunk_size])
                    for i in range(FLAGS.num_gpus)
                ]
                dict_input2 = [
                    (Ys[i], y_c[i * gpu_chunk : ((i + 1) * gpu_chunk) if (i != FLAGS.num_gpus - 1) else chunk_size])
                    for i in range(FLAGS.num_gpus)
                ]
                feed = dict(dict_input1 + dict_input2)

                start_time = time.time()
                sess.run(apply_grad_op, feed_dict=feed)
                duration_tr = time.time() - start_time

                if step % 40 == 0:
                    # Forward-only pass: targets are not needed to evaluate y.
                    feed2 = dict(dict_input1)

                    start_time = time.time()
                    y_eval = sess.run(y, feed_dict=feed2)
                    duration_eval = time.time() - start_time

                    psnr = tools.eval_psnr(y_c, y_eval)
                    # Baseline: PSNR of the (cropped) input against the target.
                    bl_psnr = tools.eval_psnr(y_c, X_c[:, cw:-cw, cw:-cw])
                    ex_per_step_tr = mb_size * FLAGS.num_gpus / duration_tr
                    ex_per_step_eval = mb_size * FLAGS.num_gpus / duration_eval
                    print(
                        format_str
                        % (
                            datetime.now().time(),
                            step,
                            psnr,
                            bl_psnr,
                            ex_per_step_tr,
                            float(duration_tr / FLAGS.num_gpus),
                            ex_per_step_eval,
                            float(duration_eval / FLAGS.num_gpus),
                        )
                    )

                if step % 50 == 0:
                    summ_str = sess.run(summ_op, feed_dict=feed)
                    summ_writer.add_summary(summ_str, step)

                if step % 150 == 0:
                    saver.save(sess, os.path.join(path_tmp, "ckpt"), global_step=step)

                step += 1

            # Evaluation
            # psnr_tr = eval_epoch(Xs, Ys, y, sess, tr_stream, cw)
            # psnr_te = eval_epoch(Xs, Ys, y, sess, te_stream, cw)
            # print('approx psnr_tr=%.3f' % psnr_tr)
            # print('approx psnr_te=%.3f' % psnr_te)
            saver.save(sess, os.path.join(path_tmp, "ckpt"), global_step=step)

        saver.save(sess, os.path.join(path_tmp, "ckpt"), global_step=step)
        tr_stream.close()
        te_stream.close()
Esempio n. 39
0
def run_training():
    """
    Train the Classy model for a number of steps

    Builds the graph (inputs, optional batch normalization, inference, loss,
    accuracy and train op), then runs FLAGS.num_epochs epochs of training.
    Progress is printed every 10 steps; summaries and the running loss and
    accuracy arrays are written every 100 steps; a checkpoint is written
    twice per epoch, at the last step, and when the module-level `_shutdown`
    flag is set (which also ends the loop).
    """
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        # Get images and labels for runway
        images, labels = rw.inputs(FLAGS.batch_size, NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN)

        # Batch normalization
        if FLAGS.batch_norm:
            phase_train = tf.Variable(True, trainable=False, dtype=tf.bool)
            images = batch_norm(images, 3, phase_train=phase_train)

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cl.inference(images, keep_prob=FLAGS.keep_prob, overlap_pool=FLAGS.overlap_pool)

        # Calculate loss.
        loss = cl.loss(logits, labels)

        # Calculate accuracy
        accuracy = cl.accuracy(logits, labels)
        cl.add_accuracy_summaries(accuracy)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = train(loss, global_step)

        # Create a saver. Store 2 files per epoch, plus 2 for the beginning and end of training
        saver = tf.train.Saver(tf.all_variables(), max_to_keep=FLAGS.num_epochs*2+2)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()

        # Start running operations on the Graph.
        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        # start the summary writer
        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        # start the training!
        accuracies = []
        losses = []
        steps_per_epoch = int(NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / FLAGS.batch_size)
        # Clamp to at least 1: with fewer than 2 steps per epoch the halved
        # value would be 0 and `step % steps_per_checkpoint` below would raise
        # ZeroDivisionError.
        steps_per_checkpoint = max(1, int(steps_per_epoch / 2))
        max_steps = FLAGS.num_epochs * steps_per_epoch
        for step in range(max_steps):
            start_time = time.time()
            _, loss_value, acc_value = sess.run([train_op, loss, accuracy])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            losses.append(loss_value)
            accuracies.append(acc_value)

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = ('%s: step %d, loss = %.2f, train_acc = %.2f, (%.1f examples/sec; %.3f '
                              'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value, acc_value,
                                    examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)
                # Persist the full per-step history so far (overwritten each time).
                np.save(os.path.join(FLAGS.train_dir, 'tr_losses'), np.array(losses))
                np.save(os.path.join(FLAGS.train_dir, 'tr_accuracies'), np.array(accuracies))

            # Save the model checkpoint periodically.
            if step % steps_per_checkpoint == 0 or (step + 1) == max_steps or _shutdown:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)

            # `_shutdown` is a module-level flag; a checkpoint has just been
            # written above when it is set.
            if _shutdown:
                break

        print('Classy training finished!')
Esempio n. 40
0
  # Specify the scope that is output to the TensorBoard graph
  with tf.Graph().as_default():
    # Tensor that holds the images: any number (None) of images, each with
    # IMAGE_PIXELS dimensions (28*28*3 according to the original comment)
    images_placeholder = tf.placeholder("float", shape=(None, IMAGE_PIXELS))

    # Tensor that holds the labels: any number (None) of labels, each with
    # NUM_CLASSES dimensions (3 according to the original comment)
    labels_placeholder = tf.placeholder("float", shape=(None, NUM_CLASSES))

    # Placeholder tensor for the dropout rate
    keep_prob = tf.placeholder("float")

    # Call inference() to build the model
    logits = model.inference(images_placeholder, keep_prob)

    # Call loss() to compute the loss
    loss_value = model.loss(logits, labels_placeholder)

    # Call training() to train and adjust the model parameters
    train_op = model.training(loss_value, FLAGS.learning_rate)

    # Accuracy computation
    acc = model.accuracy(logits, labels_placeholder)

    # Prepare for saving
    saver = tf.train.Saver()

    # Create the Session (TensorFlow computations must run inside a Session)
    sess = tf.Session()

    # Initialize the variables (do this first once the Session has started)
    sess.run(tf.global_variables_initializer())
Esempio n. 41
0
def run_training():
    """
    Build the Listnr graph and run its training loop.

    Every 100 steps the loss/accuracy are printed, summaries are written and
    the values are appended to the logged history. A checkpoint and the
    logged history are saved once per epoch, near the end of training, and
    when the module-level `_shutdown` flag is set (which also ends the loop).
    """
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        # Input pipeline producing batches of frames and labels.
        frames, labels = tm.inputs(FLAGS.batch_size, NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN)

        # Forward pass, loss and accuracy ops.
        logits = md.inference(frames)
        loss_op = md.loss(logits, labels)
        accuracy_op = md.accuracy(logits, labels)

        # Op that performs one optimization step.
        train_op = train(loss_op, global_step)

        saver = tf.train.Saver(tf.all_variables(), max_to_keep=FLAGS.num_epochs)
        summary_op = tf.merge_all_summaries()
        init = tf.initialize_all_variables()

        session_config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_config)
        sess.run(init)

        # The input pipeline is queue-based, so its runners must be started.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        steps_per_epoch = int(NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / FLAGS.batch_size)
        max_steps = FLAGS.num_epochs * steps_per_epoch

        logged_losses = []
        logged_accuracies = []
        log_format = ('%s: step %d, loss = %.2f, train_acc = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')

        for step in range(max_steps + 1):
            tic = time.time()
            _, loss_value, acc_value = sess.run([train_op, loss_op, accuracy_op])
            duration = time.time() - tic

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 100 == 0:
                examples_per_sec = FLAGS.batch_size / duration
                sec_per_batch = float(duration)
                print(log_format % (datetime.now(), step, loss_value, acc_value,
                                    examples_per_sec, sec_per_batch))

                summary_writer.add_summary(sess.run(summary_op), step)

                logged_losses.append(loss_value)
                logged_accuracies.append(acc_value)

            # Checkpoint on the first step after each epoch boundary, on the
            # step before `max_steps`, or when a shutdown was requested.
            should_save = (
                (step - 1) % steps_per_epoch == 0
                or step + 1 == max_steps
                or _shutdown
            )
            if should_save:
                saver.save(sess, os.path.join(FLAGS.train_dir, 'model.ckpt'),
                           global_step=step)

                # Persist the logged loss/accuracy history next to the checkpoint.
                np.save(os.path.join(FLAGS.train_dir, 'tr_loss'), np.array(logged_losses))
                np.save(os.path.join(FLAGS.train_dir, 'tr_accuracy'), np.array(logged_accuracies))
                print('Saving model: ', (step-1) / steps_per_epoch)

            if _shutdown:
                break

        print('Listnr training finished!')
def run_training():
  """Build the MNIST graph and train it for FLAGS.max_steps steps.

  Every 100 steps the loss is printed and summaries are written. Every 1000
  steps, and on the last step, a checkpoint is saved and the model is
  evaluated on the training, validation and test sets.
  """
  # Training, validation and test splits.
  data_sets = tf_data.read_data_sets(FLAGS.train_dir, FLAGS.fake_data)

  # Everything below is built into a fresh default Graph.
  with tf.Graph().as_default():
    images_placeholder, labels_placeholder = placeholder_inputs(FLAGS.batch_size)

    # Model outputs, loss, training op and evaluation op.
    logits = model.inference(images_placeholder, FLAGS.hidden1, FLAGS.hidden2)
    loss = model.loss(logits, labels_placeholder)
    train_op = model.training(loss, FLAGS.learning_rate)
    eval_correct = model.evaluation(logits, labels_placeholder)

    # Merged summaries and a saver for checkpoints.
    summary_op = tf.merge_all_summaries()
    saver = tf.train.Saver()

    # Session creation and variable initialization.
    sess = tf.Session()
    init = tf.initialize_all_variables()
    sess.run(init)

    # Writer for summaries and the graph definition.
    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir,
                                            graph_def=sess.graph_def)

    for step in xrange(FLAGS.max_steps):
      tic = time.time()

      # Feed the next batch of training images and labels.
      feed_dict = fill_feed_dict(data_sets.train,
                                 images_placeholder,
                                 labels_placeholder)

      # One optimization step; the value returned for `train_op` is discarded.
      _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)

      duration = time.time() - tic

      if step % 100 == 0:
        print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value, duration))
        summary_str = sess.run(summary_op, feed_dict=feed_dict)
        summary_writer.add_summary(summary_str, step)

      completed = step + 1
      if completed % 1000 == 0 or completed == FLAGS.max_steps:
        saver.save(sess, FLAGS.train_dir, global_step=step)
        # Evaluate on each split in turn, printing a banner before each.
        evaluations = (
            ('Training Data Eval:', data_sets.train),
            ('Validation Data Eval:', data_sets.validation),
            ('Test Data Eval:', data_sets.test),
        )
        for banner, data_set in evaluations:
          print(banner)
          do_eval(sess,
                  eval_correct,
                  images_placeholder,
                  labels_placeholder,
                  data_set)
def _collect_coarse_refine_params(variables):
    '''
    Split variables into coarse and refine dicts keyed by variable name.

    Only variables whose name contains exactly one "/" are considered; the
    rest are reported as ignored. A name containing 'coarse' goes into the
    coarse dict and a name containing 'fine' goes into the refine dict.

    Returns:
        (coarse_params, refine_params)
    '''
    coarse_params = {}
    refine_params = {}
    for variable in variables:
        variable_name = variable.name
        print("parameter: %s" % (variable_name))
        if variable_name.count("/") != 1:
            print("ignore.")
            continue
        if variable_name.find('coarse') >= 0:
            print("coarse parameter: %s" % (variable_name))
            coarse_params[variable_name] = variable
        if variable_name.find('fine') >= 0:
            print("refine parameter: %s" % (variable_name))
            refine_params[variable_name] = variable
    return coarse_params, refine_params


def train():
    '''
    Train the coarse or the refine depth network.

    FLAGS.refine_train selects which network is built and which saver writes
    checkpoints. When FLAGS.fine_tune is set, coarse and refine parameters
    are restored from COARSE_DIR / REFINE_DIR if checkpoints exist there.
    The outer loop runs MAX_STEPS passes over the batches; a checkpoint is
    written every 5 passes and after the last one.
    '''
    with tf.Graph().as_default():
        # Global step counter.
        global_step = tf.Variable(0, trainable=False)

        # NYU Dataset V2 original size(480 x 640 x 3) -> crop -> (460 x 620 x 3)
        image_input = ImageInput('./data/nyu_depth_v2_labeled.mat')
        print("the number of train data: %d" % (len(image_input.images)))

        images = tf.placeholder(tf.float32, [None, FLAGS.crop_size_height, FLAGS.crop_size_width, FLAGS.image_depth])
        depths = tf.placeholder(tf.float32, [None, 1, 55, 74])
        invalid_depths = tf.placeholder(tf.float32, [None, 1, 55, 74])
        keep_conv = tf.placeholder(tf.float32)
        keep_hidden = tf.placeholder(tf.float32)

        # Graph output.
        if FLAGS.refine_train:
            print("refine train.")
            logits = model.inference_refine(images, keep_conv, keep_hidden)
        else:
            print("coarse train.")
            logits = model.inference(images, keep_conv, keep_hidden)

        # Loss computed from the graph output and the labels.
        loss = model.loss(logits, depths, invalid_depths)
        # Training operation.
        train_op = op.train(loss, global_step)

        # Summaries (built, but not written by this function).
        summary_op = tf.merge_all_summaries()

        # Initialization operation.
        init_op = tf.initialize_all_variables()

        # Session
        sess = tf.Session(config=tf.ConfigProto(log_device_placement=LOG_DEVICE_PLACEMENT))

        sess.run(init_op)

        # Coarse and refine parameters are saved separately. Refine training
        # looks at all variables, coarse training only at the trainable ones.
        if FLAGS.refine_train:
            candidate_variables = tf.all_variables()
        else:
            candidate_variables = tf.trainable_variables()
        coarse_params, refine_params = _collect_coarse_refine_params(candidate_variables)

        # define saver
        saver_coarse = tf.train.Saver(coarse_params)
        saver_refine = tf.train.Saver(refine_params)

        # fine tune
        if FLAGS.fine_tune:
            # load coarse parameters
            coarse_ckpt = tf.train.get_checkpoint_state(COARSE_DIR)
            if coarse_ckpt and coarse_ckpt.model_checkpoint_path:
                print("Pretrained coarse Model Loading.")
                saver_coarse.restore(sess, coarse_ckpt.model_checkpoint_path)
                print("Pretrained coarse Model Restored.")
            else:
                print("No Pretrained coarse Model.")

            # load refine parameters
            refine_ckpt = tf.train.get_checkpoint_state(REFINE_DIR)
            if refine_ckpt and refine_ckpt.model_checkpoint_path:
                print("Pretrained refine Model Loading.")
                saver_refine.restore(sess, refine_ckpt.model_checkpoint_path)
                print("Pretrained refine Model Restored.")
            else:
                print("No Pretrained refine Model.")

        # TODO train coarse or refine (change trainable)

        # train refine
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        # Repeat training up to MAX_STEPS; each `step` is one pass over the
        # batches (printed as [epoch]).
        for step in xrange(MAX_STEPS):
            start_time = time.time()
            previous_time = start_time
            index = 0

            batches = image_input.get_batches(FLAGS.batch_size)
            vals = image_input.get_validation()
            for batch in batches:
                batch_images = batch[0]
                depth = batch[1]
                ignore_depth = batch[2]
                _, loss_value = sess.run([train_op, loss], feed_dict={images: batch_images, depths: depth, invalid_depths: ignore_depth, keep_conv: 0.8, keep_hidden: 0.5})
                if index % 10 == 0:
                    end_time = time.time()
                    # Time elapsed since the previous log line.
                    duration = end_time - previous_time
                    previous_time = end_time
                    num_examples_per_step = BATCH_SIZE * 10
                    examples_per_sec = num_examples_per_step / duration
                    print("%s: %d[epoch]: %d[iteration]: train loss %f: %d[examples/iteration]: %f[examples/sec]: %f[sec/iteration]" % (datetime.now(), step, index, loss_value, num_examples_per_step, examples_per_sec, duration))
                    assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

                if index % 50 == 0:
                    # Validation pass with dropout disabled (keep probabilities 1.0).
                    output_vec, cost_value = sess.run([logits, loss], feed_dict={images: vals[0], depths: vals[1], invalid_depths: vals[2], keep_conv: 1.0, keep_hidden: 1.0})
                    print("%s: %d[epoch]: %d[iteration]: validation loss: %f" % (datetime.now(), step, index, cost_value))
                    if index % 100 == 0:
                        output_dir = "predicts_%05d_%08d" % (step, index)
                        print("predicts output: %s" % output_dir)
                        data_feed_inputs_nyu.output_predict(output_vec, output_dir)

                index += 1

            # Checkpoint every 5 passes and after the final pass. The original
            # compared `step * 1` with MAX_STEPS, which is never true inside
            # xrange(MAX_STEPS), so the final state was not reliably saved.
            if step % 5 == 0 or (step + 1) == MAX_STEPS:
                if FLAGS.refine_train:
                    refine_checkpoint_path = REFINE_DIR + '/model.ckpt'
                    saver_refine.save(sess, refine_checkpoint_path, global_step=step)
                else:
                    coarse_checkpoint_path = COARSE_DIR + '/model.ckpt'
                    saver_coarse.save(sess, coarse_checkpoint_path, global_step=step)

        coord.request_stop()
        coord.join(threads)
        sess.close()
Esempio n. 44
0
def train():
    """Build the CIFAR-10 training graph and run it in a monitored session.

    The loss from ``cifar10.loss`` is augmented with an L2 penalty (weight
    5e-4) on the four fully connected parameters returned by
    ``cifar10.inference``. Training stops at FLAGS.max_steps, aborts on a
    NaN loss, and logs every FLAGS.log_frequency session runs.
    """
    with tf.Graph().as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()

        # Inputs and forward pass.
        images, labels = cifar10.get_inputs()
        logits, fc1_w, fc2_w, fc1_b, fc2_b = cifar10.inference(images)

        loss = cifar10.loss(logits, labels)

        # Accumulate the L2 penalty term by term, in a fixed order.
        l2_penalty = tf.nn.l2_loss(fc1_w)
        for param in (fc1_b, fc2_w, fc2_b):
            l2_penalty = l2_penalty + tf.nn.l2_loss(param)
        loss = loss + 5e-4 * l2_penalty

        # Op that performs one optimization step.
        train_op = cifar10.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Prints the loss and throughput at a fixed run interval."""

            def begin(self):
                # Starts at -1 so the first run is counted as step 0.
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                # Request the loss value alongside whatever is being run.
                return tf.train.SessionRunArgs(loss)

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency != 0:
                    return

                now = time.time()
                elapsed = now - self._start_time
                self._start_time = now

                loss_value = run_values.results
                examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / elapsed
                sec_per_batch = float(elapsed / FLAGS.log_frequency)

                template = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                            'sec/batch)')
                print(template % (datetime.now(), self._step, loss_value,
                                  examples_per_sec, sec_per_batch))

        session_hooks = [
            tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
            tf.train.NanTensorHook(loss),
            _LoggerHook(),
        ]
        session_config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement,
            allow_soft_placement=True)

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=session_hooks,
                config=session_config) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
Esempio n. 45
0
def train():
    """Build the char-rnn training graph and run the full training loop.

    All hyper-parameters and paths are read from ``FLAGS``. For each epoch:

    * the learning rate is ``FLAGS.init_lr`` scaled by ``FLAGS.lr_decay``
      raised to ``max(epoch - FLAGS.decay_after + 1, 0)``;
    * the recurrent state starts from the zero state and is carried from one
      batch to the next;
    * the mean training loss and the mean test loss are written as scalar
      summaries to ``FLAGS.train_dir`` and printed;
    * a checkpoint is saved whenever ``epoch % FLAGS.save_epochs == 0``.

    The session and the summary writer are closed when training finishes or
    raises, so buffered summary events are not lost.
    """
    print("Building training graph ...")
    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-FLAGS.init_scale, FLAGS.init_scale)
        with tf.variable_scope("char-rnn", initializer=initializer):
            keep_prob = tf.placeholder(dtype=tf.float32, shape=[], name='keep_prob')
            cell = model.build_cell(keep_prob)

            inputs = tf.placeholder(dtype=tf.int32, shape=[FLAGS.batch_size, FLAGS.num_steps], name='inputs')
            targets = tf.placeholder(dtype=tf.int32, shape=[FLAGS.batch_size, FLAGS.num_steps], name='targets')
            lr = tf.placeholder(dtype=tf.float32, shape=[], name='learning_rate')
            initial_state = tf.placeholder(dtype=tf.float32, shape=[FLAGS.batch_size, cell.state_size], name='initial_state')

            logits, final_state = model.predict(inputs, cell, initial_state, keep_prob)
            loss = model.loss(logits, targets)
            train_op = model.train_batch(loss, lr)

        # create saver and summary
        saver = tf.train.Saver(tf.all_variables())
        # merge_all_summaries() returns None when the graph defines no summaries.
        summary_op = tf.merge_all_summaries()

        sess = tf.Session()
        summary_writer = None
        try:
            sess.run(tf.initialize_all_variables())
            summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, graph_def=sess.graph_def)

            # load data
            print("Loading data ...")
            reader = text_input.TextReader(os.path.join(FLAGS.data_dir, FLAGS.data_file))
            reader.prepare_data()
            train_loader = text_input.DataLoader(os.path.join(FLAGS.data_dir, 'train.cPickle'), FLAGS.batch_size, FLAGS.num_steps)
            test_loader = text_input.DataLoader(os.path.join(FLAGS.data_dir, 'test.cPickle'), FLAGS.batch_size, FLAGS.num_steps)

            total_steps = FLAGS.num_epochs * train_loader.num_batch
            save_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
            zero_state = cell.zero_state(FLAGS.batch_size, dtype=tf.float32).eval(session=sess)
            global_step = 0

            def evaluate(sess, loader, state):
                """Return the mean loss over one pass of `loader` with dropout disabled."""
                test_loss = 0.
                for _ in xrange(loader.num_batch):
                    x_batch, y_batch = loader.next_batch()
                    feed = {inputs: x_batch, targets: y_batch, keep_prob: 1., initial_state: state}
                    # Carry the recurrent state across consecutive batches.
                    state, loss_value = sess.run([final_state, loss], feed_dict=feed)
                    test_loss += loss_value
                return test_loss / loader.num_batch

            # training
            for epoch in xrange(FLAGS.num_epochs):
                # Decay starts once epoch reaches FLAGS.decay_after - 1.
                current_lr = FLAGS.init_lr * (FLAGS.lr_decay ** (max(epoch - FLAGS.decay_after + 1, 0)))
                state = zero_state
                training_loss = 0.
                for _ in xrange(train_loader.num_batch):
                    global_step += 1
                    start_time = time.time()
                    x_batch, y_batch = train_loader.next_batch()
                    feed = {inputs: x_batch, targets: y_batch, keep_prob: (1.-FLAGS.dropout), lr: current_lr, initial_state: state}
                    state, loss_value, _ = sess.run([final_state, loss, train_op], feed_dict=feed)
                    duration = time.time() - start_time
                    training_loss += loss_value

                    if global_step % FLAGS.log_steps == 0:
                        format_str = ('%s: step %d/%d (epoch %d/%d), loss = %.2f (%.3f sec/batch), lr: %.5f')
                        print(format_str % (datetime.now(), global_step, total_steps, epoch+1, FLAGS.num_epochs, loss_value,
                            duration, current_lr))

                    if summary_op is not None and global_step % FLAGS.summary_steps == 0:
                        # Every model tensor is derived from the placeholders,
                        # so summaries of them need the same feed as the
                        # training step; feeding unused placeholders is harmless.
                        summary_str = sess.run(summary_op, feed_dict=feed)
                        summary_writer.add_summary(summary_str, global_step)

                if epoch % FLAGS.save_epochs == 0:
                    saver.save(sess, save_path, global_step)
                train_loader.reset_pointer()

                # epoch summary
                training_loss /= train_loader.num_batch
                summary_writer.add_summary(_summary_for_scalar('training_loss', training_loss), global_step)
                test_loss = evaluate(sess, test_loader, zero_state)
                test_loader.reset_pointer()
                summary_writer.add_summary(_summary_for_scalar('test_loss', test_loss), global_step)
                print("Epoch %d: training_loss = %.2f, test_loss = %.2f" % (epoch+1, training_loss, test_loss))
        finally:
            # Flush pending events and release the session even on failure.
            if summary_writer is not None:
                summary_writer.close()
            sess.close()
def train():
    """Train the classifier, periodically evaluating it on a fixed test batch.

    Configuration is read from ``FLAGS``. The function builds a training and
    a test inference graph that share variables (the test graph is built with
    ``reuse=True``), optionally restores pre-trained weights from
    ``FLAGS.pretrained_model_checkpoint_path``, and then runs
    ``FLAGS.max_steps`` training steps:

    * every step: the training debug images are written out and the loss is
      checked for NaN;
    * every 10 steps: loss and throughput are printed;
    * every 100 steps: predictions are printed, summaries are written and the
      test batch is evaluated;
    * every 5000 steps and after the final step: a checkpoint is saved to
      ``FLAGS.train_dir``.

    Queue-runner threads, the summary writer and the session are shut down
    when the loop finishes or raises.

    Raises:
        AssertionError: if the training loss becomes NaN, or if the
            pre-trained checkpoint path is set but does not exist.
    """
    with tf.Graph().as_default():
        # Global step counter.
        global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)
        dataset = DataSet()

        # get trainsets
        print("The number of train images: %d" % dataset.cnt_samples(FLAGS.tfcsv))
        images, labels = dataset.csv_inputs(FLAGS.tfcsv, FLAGS.batch_size, distorted=True)

        # NOTE(review): `datasets` (plural) is not defined in this function;
        # presumably a module imported at file level -- confirm.
        images_debug = datasets.debug(images)

        # get testsets
        #test_cnt = dataset.cnt_samples(FLAGS.testcsv)
        test_cnt = 100
        #test_cnt = 5
        print("The number of test images: %d" % test_cnt)
        images_test, labels_test = dataset.test_inputs(FLAGS.testcsv, test_cnt)

        images_test_debug = datasets.debug(images_test)

        input_summaries = copy.copy(tf.get_collection(tf.GraphKeys.SUMMARIES))

        num_classes = FLAGS.num_classes
        restore_logits = not FLAGS.fine_tune

        # inference
        # logits is tuple (logits, aux_liary_logits, predictions)
        # logits: output of final layer, auxliary_logits: output of hidden layer, softmax: predictions
        logits = model.inference(images, num_classes, for_training=True, restore_logits=restore_logits)
        logits_test = model.inference(images_test, num_classes, for_training=False, restore_logits=restore_logits, reuse=True, dropout_keep_prob=1.0)

        # loss
        model.loss(logits, labels, batch_size=FLAGS.batch_size)
        model.loss_test(logits_test, labels_test, batch_size=test_cnt)
        losses = tf.get_collection(slim.losses.LOSSES_COLLECTION)
        losses_test = tf.get_collection(slim.losses.LOSSES_COLLECTION_TEST)

        # Total training loss = model losses + regularization losses.
        regularization_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        total_loss = tf.add_n(losses + regularization_losses, name='total_loss')
        #total_loss = tf.add_n(losses, name='total_loss')
        total_loss_test = tf.add_n(losses_test, name='total_loss_test')

        # Compute the moving average of all individual losses and the total loss.
        loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
        loss_averages_op = loss_averages.apply(losses + [total_loss])
        loss_averages_test = tf.train.ExponentialMovingAverage(0.9, name='avg_test')
        loss_averages_op_test = loss_averages_test.apply(losses_test + [total_loss_test])

        print("="*10)
        print("loss length:")
        print(len(losses))
        print(len(losses_test))
        print("="*10)

        # for l in losses + [total_loss]:
        #     # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
        #     # session. This helps the clarity of presentation on TensorBoard.
        #     loss_name = re.sub('%s_[0-9]*/' % model.TOWER_NAME, '', l.op.name)
        #     # Name each loss as '(raw)' and name the moving average version of the loss
        #     # as the original loss name.
        #     tf.scalar_summary(loss_name + ' (raw)', l)
        #     tf.scalar_summary(loss_name, loss_averages.average(l))

        # Make evaluating the loss also update its moving averages.
        with tf.control_dependencies([loss_averages_op]):
            total_loss = tf.identity(total_loss)
        tf.scalar_summary("loss", total_loss)

        with tf.control_dependencies([loss_averages_op_test]):
            total_loss_test = tf.identity(total_loss_test)
        tf.scalar_summary("loss_eval", total_loss_test)

        # Reuse variables for the next tower.
        #tf.get_variable_scope().reuse_variables()

        # Snapshot of the summaries collected so far; handed to the train op builder.
        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES)

        # Batch normalization update ops collected by slim.
        batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION)

        # add input summaries
        # summaries.extend(input_summaries)

        # train_operation and operation summaries
        train_op = train_operation.train(total_loss, global_step, summaries, batchnorm_updates)

        # trainable variables's summary
        #for var in tf.trainable_variables():
        #    summaries.append(tf.histogram_summary(var.op.name, var))

        # saver
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation from every summary in the graph.
        #summary_op = tf.merge_summary(summaries)
        summary_op = tf.merge_all_summaries()

        # initialization
        init = tf.initialize_all_variables()

        # session
        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        if FLAGS.pretrained_model_checkpoint_path:
            assert tf.gfile.Exists(FLAGS.pretrained_model_checkpoint_path)
            variables_to_restore = tf.get_collection(
                slim.variables.VARIABLES_TO_RESTORE)
            restorer = tf.train.Saver(variables_to_restore)
            restorer.restore(sess, FLAGS.pretrained_model_checkpoint_path)
            print('%s: Pre-trained model restored from %s' %
                  (datetime.now(), FLAGS.pretrained_model_checkpoint_path))

        summary_writer = tf.train.SummaryWriter(
            FLAGS.train_dir,
            graph_def=sess.graph.as_graph_def(add_shapes=True))

        try:
            for step in xrange(FLAGS.max_steps):
                start_time = time.time()
                _, logits_eval, loss_value, labels_eval, images_debug_eval = sess.run([train_op, logits[0], total_loss, labels, images_debug])
                duration = time.time() - start_time

                dataset.output_images(images_debug_eval, "debug", "train")

                assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

                if step % 10 == 0:
                    examples_per_sec = FLAGS.batch_size / float(duration)
                    format_str = ('train %s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)')
                    print(format_str % (datetime.now(), step, loss_value, examples_per_sec, duration))

                if step % 100 == 0:
                    print("predict:")
                    print(type(logits_eval))
                    print(logits_eval.shape)
                    print(logits_eval.argmax(1))
                    print("target:")
                    print(labels_eval)
                    summary_str = sess.run(summary_op)
                    summary_writer.add_summary(summary_str, step)

                    test_start_time = time.time()
                    logits_test_eval, total_loss_test_val, labels_test_eval, images_test_debug_eval = sess.run([logits_test[0], total_loss_test, labels_test, images_test_debug])
                    test_duration = time.time() - test_start_time

                    dataset.output_images(images_test_debug_eval, "debug_test", "test")

                    print("test predict:")
                    print(type(logits_test_eval))
                    print(logits_test_eval.shape)
                    print(logits_test_eval.argmax(1))
                    print("test target:")
                    print(labels_test_eval)
                    test_examples_per_sec = test_cnt / float(test_duration)
                    format_str_test = ('test %s: step %d, loss = %.2f, (%.1f examples/sec; %.3f sec/batch)')
                    print(format_str_test % (datetime.now(), step, total_loss_test_val, test_examples_per_sec, test_duration))

                # Save the model checkpoint periodically and after the last
                # step. This check sits at loop level so the final-step
                # condition is reachable and the save only happens when the
                # condition holds.
                if step % 5000 == 0 or (step + 1) == FLAGS.max_steps:
                    checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=step)
        finally:
            # Stop the input queue threads and release resources even when
            # the loop raises (e.g. on a NaN loss).
            coord.request_stop()
            coord.join(threads)
            summary_writer.close()
            sess.close()