Exemple #1
0
def train(mnist):
    """Build the training graph for the MNIST CNN and run the training loop.

    Args:
        mnist: dataset object. This function reads ``mnist.train.num_examples``
            and ``mnist.train.next_batch`` and passes the whole object on to
            ``mnist_eval.evaluate``.

    Every 100 iterations the loss of the current batch is printed, a
    checkpoint is written to MODEL_SAVE_PATH/MODEL_NAME and
    ``mnist_eval.evaluate`` is called.
    """
    # The image placeholder has a fixed batch dimension, so every batch that
    # is fed must hold exactly BATCH_SIZE images.
    x = tf.placeholder(tf.float32, [BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS], name='x-input')
    y_ = tf.placeholder(tf.float32, [None, OUTPUT_NODE], name='y-input')
    regularizer = tf.contrib.layers.l2_regularizer(REGULARAZTION_RATE)
    y = mnist_inference.inference(x, False, regularizer)
    global_step = tf.Variable(0, trainable=False)

    # Moving-average op over all trainable variables.
    variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
    variable_averages_op = variable_averages.apply(tf.trainable_variables())

    # Loss: the label rows are reduced to class indices with argmax
    # (presumably they are one-hot -- confirm against the caller), then the
    # mean cross entropy is added to everything in the 'losses' collection.
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y, labels=tf.argmax(y_, 1))
    cross_entropy_mean = tf.reduce_mean(cross_entropy)
    loss = cross_entropy_mean + tf.add_n(tf.get_collection('losses'))
    learning_rate = tf.train.exponential_decay(LEARNING_RATE_BASE, global_step, mnist.train.num_examples/BATCH_SIZE, LEARNING_RATE_DECAY, staircase=True)

    # Training op: one gradient-descent step plus the moving-average update,
    # tied together so a single run of `train_op` performs both.
    train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
    with tf.control_dependencies([train_step, variable_averages_op]):
        train_op = tf.no_op(name='train')

    # Saver used for the periodic checkpoints.
    saver = tf.train.Saver()
    checkpoint_prefix = os.path.join(MODEL_SAVE_PATH, MODEL_NAME)
    with tf.Session() as sess:
        # tf.initialize_all_variables() is deprecated; this is its replacement.
        tf.global_variables_initializer().run()

        for i in range(TRAINING_STEPS):
            xs, ys = mnist.train.next_batch(BATCH_SIZE)
            # The batch is reshaped to the 4-D layout of the `x` placeholder.
            reshaped_xs = np.reshape(xs, (BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS))
            _, loss_value, step = sess.run([train_op, loss, global_step], feed_dict={x: reshaped_xs, y_: ys})
            if i % 100 == 0:
                print("After %d training step(s), loss on training "
                      "batch is %g." % (step, loss_value))
                saver.save(sess, checkpoint_prefix, global_step=global_step)
                mnist_eval.evaluate(mnist)
def train(mnist):
    """Train the model built by ``mnist_inference.inference`` on flat inputs.

    Args:
        mnist: dataset object. This function reads ``mnist.train.num_examples``
            and ``mnist.train.next_batch`` and passes the whole object on to
            ``mnist_eval.evaluate``.

    Every 1000 iterations ``mnist_eval.evaluate`` is called, the loss of the
    current batch is printed and a checkpoint is written, in that order.
    """
    x = tf.placeholder(tf.float32, [None, mnist_inference.INPUT_NODE], name='x-input')
    y_ = tf.placeholder(tf.float32, [None, mnist_inference.OUTPUT_NODE], name='y-input')

    l2_penalty = tf.contrib.layers.l2_regularizer(REGULARIZATION_RATE)
    y = mnist_inference.inference(x, l2_penalty)

    # Once a tf.train.ExponentialMovingAverage object has been applied, an
    # ordinary Saver also stores the shadow variables; the shadow of a
    # variable "v" is named "v/ExponentialMovingAverage".
    #
    # Moving-average setup. The step counter is explicitly marked as
    # non-trainable.
    global_step = tf.Variable(0, trainable=False)
    ema = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
    # Moving averages are kept for every trainable variable.
    ema_update = ema.apply(tf.trainable_variables())

    per_example_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=y, labels=tf.argmax(y_, 1))
    mean_xent = tf.reduce_mean(per_example_xent)
    loss = mean_xent + tf.add_n(tf.get_collection('losses'))

    decay_steps = mnist.train.num_examples / BATCH_SIZE
    learning_rate = tf.train.exponential_decay(
        LEARNING_RATE_BASE, global_step, decay_steps, LEARNING_RATE_DECAY,
        staircase=True)

    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    gradient_update = optimizer.minimize(loss, global_step=global_step)

    # tf.group() and tf.control_dependencies() both let one run trigger
    # several ops; here a no-op is made to depend on both updates.
    with tf.control_dependencies([gradient_update, ema_update]):
        train_op = tf.no_op(name='train')

    saver = tf.train.Saver()
    checkpoint_prefix = os.path.join(MODEL_SAVE_PATH, MODEL_NAME)
    fetches = [train_op, loss, global_step]
    with tf.Session() as sess:
        tf.global_variables_initializer().run()

        for iteration in range(TRAINING_STEPS):
            batch_images, batch_labels = mnist.train.next_batch(BATCH_SIZE)
            feed = {x: batch_images, y_: batch_labels}
            _, loss_value, step = sess.run(fetches, feed_dict=feed)

            # Only every 1000th iteration evaluates, reports and checkpoints.
            if iteration % 1000 != 0:
                continue

            mnist_eval.evaluate(mnist)
            print("After %d training step(s), loss on training batch is %g." % (step, loss_value))
            saver.save(sess, checkpoint_prefix, global_step=global_step)
def train(config):
    """Train ``small_cnn.Model`` with optional adversarial / semi-supervised steps.

    Reads its settings from ``config.training`` (seeds, step counts, schedule,
    batch size, ``adversarial_training``, ``unsupervised``, ``partial``,
    ``unlabel``, ``data_augmentation``, ...), ``config.attack`` (``use_kl`` and
    whatever ``SpatialAttack`` consumes), ``config.data.data_path`` and
    ``config.model`` (including ``output_dir``).

    Each iteration draws a labeled batch (plus an unlabeled batch in 'semi'
    mode), obtains model outputs on the clean inputs, optionally perturbs the
    inputs with ``SpatialAttack``, builds a natural and an adversarial feed
    dict, periodically prints accuracy / writes summaries / checkpoints /
    evaluates, and finally runs one optimizer step on the adversarial dict
    (or the natural dict when adversarial training is off).

    Args:
        config: nested configuration object with the attributes listed above.
    """
    # seeding randomness
    tf.set_random_seed(config.training.tf_random_seed)
    np.random.seed(config.training.np_random_seed)

    # Setting up training parameters
    max_num_training_steps = config.training.max_num_training_steps
    step_size_schedule = config.training.step_size_schedule
    weight_decay = config.training.weight_decay
    momentum = config.training.momentum
    batch_size = config.training.batch_size
    adversarial_training = config.training.adversarial_training
    eval_during_training = config.training.eval_during_training
    # Weight given to the adversarial / unlabeled part of the loss (used in
    # `kl_weights` and `weights` of the adversarial feed dict below).
    LAMBDA = float(config.training.unsupervised_lambda)
    if eval_during_training:
        num_eval_steps = config.training.num_eval_steps

    use_kl = config.attack.use_kl
    # Setting up output parameters
    num_output_steps = config.training.num_output_steps
    num_summary_steps = config.training.num_summary_steps
    num_checkpoint_steps = config.training.num_checkpoint_steps

    # Setting up the data and the model
    # NOTE: despite the `cifar` in the variable names, the data object is
    # built by mnist_input.MNISTData.
    data_path = config.data.data_path
    raw_cifar = mnist_input.MNISTData(data_path, config.training.partial,
                                      config.training.unlabel)
    global_step = tf.train.get_or_create_global_step()
    model = small_cnn.Model(config.model)

    # Setting up the optimizer
    # step_size_schedule is read as a sequence of (step, learning_rate) pairs.
    # The first step is dropped because piecewise_constant takes one fewer
    # boundary than values.
    boundaries = [int(sss[0]) for sss in step_size_schedule]
    boundaries = boundaries[1:]
    values = [sss[1] for sss in step_size_schedule]
    learning_rate = tf.train.piecewise_constant(tf.cast(global_step, tf.int32),
                                                boundaries, values)
    total_loss = model.mean_xent + weight_decay * model.weight_decay_loss

    if use_kl:
        total_loss += model.mean_kl

    optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
    train_step = optimizer.minimize(total_loss, global_step=global_step)

    # Set up adversary
    attack = SpatialAttack(model, config.attack)

    # Setting up the Tensorboard and checkpoint outputs
    model_dir = config.model.output_dir
    if eval_during_training:
        eval_dir = os.path.join(model_dir, 'eval')
        if not os.path.exists(eval_dir):
            os.makedirs(eval_dir)

    # We add accuracy and xent twice so we can easily make three types of
    # comparisons in Tensorboard:
    # - train vs eval (for a single run)
    # - train of different runs
    # - eval of different runs

    saver = tf.train.Saver(max_to_keep=30)

    # Summaries in the 'adv' collection are evaluated with the adversarial
    # feed dict, those in the 'nat' collection with the natural feed dict.
    tf.summary.scalar('accuracy_adv_train',
                      model.accuracy,
                      collections=['adv'])
    tf.summary.scalar('accuracy_adv', model.accuracy, collections=['adv'])
    tf.summary.scalar('xent_adv_train',
                      model.xent / batch_size,
                      collections=['adv'])
    tf.summary.scalar('xent_adv', model.xent / batch_size, collections=['adv'])
    tf.summary.image('images_adv_train', model.x_image, collections=['adv'])
    adv_summaries = tf.summary.merge_all('adv')

    tf.summary.scalar('accuracy_nat_train',
                      model.accuracy,
                      collections=['nat'])
    tf.summary.scalar('accuracy_nat', model.accuracy, collections=['nat'])
    tf.summary.scalar('xent_nat_train',
                      model.xent / batch_size,
                      collections=['nat'])
    tf.summary.scalar('xent_nat', model.xent / batch_size, collections=['nat'])
    tf.summary.image('images_nat_train', model.x_image, collections=['nat'])
    tf.summary.scalar('learning_rate', learning_rate, collections=['nat'])
    nat_summaries = tf.summary.merge_all('nat')

    with tf.Session() as sess:

        # initialize data augmentation
        if config.training.data_augmentation:
            cifar = mnist_input.AugmentedMNISTData(raw_cifar, sess)
        else:
            cifar = raw_cifar

        # Initialize the summary writer, global variables, and our time counter.
        # NOTE(review): neither summary writer is closed in this function.
        summary_writer = tf.summary.FileWriter(model_dir, sess.graph)
        if eval_during_training:
            eval_summary_writer = tf.summary.FileWriter(eval_dir)

        sess.run(tf.global_variables_initializer())
        training_time = 0.0

        len_label, len_unlabel = float(raw_cifar.train_data.n), float(
            raw_cifar.unlabeled_data.n)

        # Fraction of labeled examples; each batch is split between labeled
        # and unlabeled data in this proportion.
        p = len_label / (len_label + len_unlabel)

        # Main training loop
        for ii in range(max_num_training_steps + 1):
            x_batch, y_batch = cifar.train_data.get_next_batch(
                int(batch_size * p), multiple_passes=True)
            # NOTE(review): if `unsupervised` is neither 'semi' nor 'nosemi',
            # x_mix and y_prediction stay unbound; the use_kl feed dict below
            # would then raise NameError.
            if config.training.unsupervised == 'semi':

                x_unlabel, _ = cifar.unlabeled_data.get_next_batch(
                    int(batch_size * (1 - p)), multiple_passes=True)
                x_mix = np.concatenate((x_batch, x_unlabel), axis=0)

                # Forward pass on the clean labeled + unlabeled inputs with a
                # zero transform and is_training False. The outputs (softmax
                # when use_kl, otherwise predictions) are later used as
                # targets for the attack and the adversarial feed dict.
                # y_input is fed y_batch concatenated with itself --
                # presumably only a filler; its length matches x_mix only when
                # the two sub-batches have equal size (TODO confirm).
                y_prediction = sess.run(model.softmax if use_kl else model.predictions, feed_dict={model.x_input: x_mix,\
                                                                    model.y_input: np.concatenate((y_batch, y_batch), axis=0),\
                                                                    model.transform: np.zeros([len(x_mix), 3]),\
                                                                    model.weights: [1. for i in range(len(x_mix))],\
                                                                    model.is_training: False})
            elif config.training.unsupervised == 'nosemi':

                # Same forward pass, restricted to the labeled batch.
                x_mix = x_batch
                y_prediction = sess.run(model.softmax if use_kl else model.predictions, feed_dict={model.x_input: x_mix,\
                                                                    model.y_input: y_batch,\
                                                                    model.transform: np.zeros([len(x_mix), 3]),\
                                                                    model.weights: [1. for i in range(len(x_mix))],\
                                                                    model.is_training: False})
            # All-zero transform rows, fed wherever no perturbation applies.
            noop_trans = np.zeros([len(x_batch), 3])
            # Compute Adversarial Perturbations
            if adversarial_training:
                start = timer()
                # NOTE(review): this is a substring test, so it is also true
                # for 'nosemi'; the same holds for every `'semi' in ...` test
                # below. Confirm that this is intended.
                if 'semi' in config.training.unsupervised:
                    x_batch_adv, adv_trans = attack.perturb(
                        x_mix, y_prediction, sess)
                else:
                    x_batch_adv, adv_trans = attack.perturb(
                        x_batch, y_batch, sess)
                end = timer()
                training_time += end - start
            else:
                x_batch_adv, adv_trans = x_batch, noop_trans

            # Feed dict for the clean labeled batch.
            nat_dict = {
                model.x_input: x_batch,
                model.y_input: y_batch,
                model.transform: noop_trans,
                model.weights: [1. for i in range(len(x_batch))],
                model.is_training: False
            }

            if use_kl:
                # Clean labeled batch followed by the perturbed batch. The
                # first len(x_batch) rows get weight 1 and kl_weight 0; the
                # remaining rows get weight 0 and kl_weight LAMBDA, with
                # y_prediction supplied through y_pred_input.
                adv_dict = {
                    model.x_input:
                    np.concatenate((x_batch, x_batch_adv), axis=0),
                    model.y_input:
                    np.concatenate((y_batch, np.zeros(len(x_batch_adv))),
                                   axis=0),
                    model.y_pred_input:
                    np.concatenate((np.zeros(
                        (len(x_batch), 10)), y_prediction),
                                   axis=0),
                    model.transform:
                    np.concatenate((np.zeros([len(x_batch), 3]), adv_trans)),
                    model.weights: [
                        1. if i < len(x_batch) else 0
                        for i in range((len(x_batch) + len(x_batch_adv)))
                    ],
                    model.kl_weights: [
                        0. if i < len(x_batch) else LAMBDA
                        for i in range((len(x_batch) + len(x_batch_adv)))
                    ],
                    model.is_training:
                    False
                }
            else:
                # Without KL: when 'semi' is a substring of the mode, the
                # clean batch is followed by the perturbed batch, labeled with
                # y_prediction and weighted by LAMBDA; otherwise only the
                # perturbed batch with the true labels is fed.
                adv_dict = {
                    model.x_input:
                    np.concatenate((x_batch, x_batch_adv), axis=0)
                    if 'semi' in config.training.unsupervised else x_batch_adv,
                    model.y_input:
                    np.concatenate((y_batch, y_prediction), axis=0)
                    if 'semi' in config.training.unsupervised else y_batch,
                    model.transform:
                    np.concatenate((np.zeros([len(x_batch), 3]), adv_trans))
                    if 'semi' in config.training.unsupervised else adv_trans,
                    model.weights: [
                        1. if i < len(x_batch) else LAMBDA
                        for i in range((
                            len(x_batch) +
                            len(x_batch_adv)) if 'semi' in config.training.
                                       unsupervised else len(x_batch_adv))
                    ],
                    model.is_training:
                    False
                }

            # Output to stdout
            if ii % num_output_steps == 0:
                nat_acc = sess.run(model.accuracy, feed_dict=nat_dict)
                adv_acc = sess.run(model.accuracy, feed_dict=adv_dict)
                print('Step {}:    ({})'.format(ii, datetime.now()))
                print('    training nat accuracy {:.4}%'.format(nat_acc * 100))
                print('    training adv accuracy {:.4}%'.format(adv_acc * 100))
                if ii != 0:
                    # Throughput over the time accumulated since the last
                    # report (attack time plus train-step time).
                    print('    {} examples per second'.format(
                        num_output_steps * batch_size / training_time))
                    training_time = 0.0

            # Tensorboard summaries
            if ii % num_summary_steps == 0:
                summary = sess.run(adv_summaries, feed_dict=adv_dict)
                summary_writer.add_summary(summary, global_step.eval(sess))
                summary = sess.run(nat_summaries, feed_dict=nat_dict)
                summary_writer.add_summary(summary, global_step.eval(sess))

            # Write a checkpoint
            if ii % num_checkpoint_steps == 0:
                saver.save(sess,
                           os.path.join(model_dir, 'checkpoint'),
                           global_step=global_step)

            if eval_during_training and ii % num_eval_steps == 0:
                # The attack's KL mode is switched off for evaluation and
                # restored afterwards.
                attack.use_kl = False
                evaluate(model, attack, sess, config, eval_summary_writer)
                attack.use_kl = use_kl

            # Actual training step
            # The dicts were built with is_training False for the logging
            # above; it is switched to True only for the optimizer step.
            start = timer()
            if adversarial_training:
                adv_dict[model.is_training] = True
                sess.run(train_step, feed_dict=adv_dict)
            else:
                nat_dict[model.is_training] = True
                sess.run(train_step, feed_dict=nat_dict)
            end = timer()
            training_time += end - start
def train():
    """Train the MNIST model on FLAGS.num_gpus towers and log precision.

    Builds one loss tower per GPU, averages the tower gradients and applies
    them with Adam under an exponentially decayed learning rate. Training
    resumes from the latest checkpoint in FLAGS.train_dir unless
    FLAGS.new_run is set.

    Inside the loop:
      * every 400 steps: print throughput and write TensorBoard summaries;
      * every 1000 steps and on the last step: save a checkpoint;
      * every 4000 steps after step 10000: call ``mnist.draw_weights`` and
        ``mnist_eval.evaluate`` seven times, copy ``weights/*`` to
        ``best_weights/`` whenever a new best precision is seen, and append
        the average precision to ``summary.txt`` in FLAGS.train_dir.

    Raises:
        AssertionError: if the loss becomes NaN.
    """
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Step counter; incremented by apply_gradients once per training step.
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        # Calculate the learning rate schedule.
        num_batches_per_epoch = (mnist.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
                                 (FLAGS.batch_size * FLAGS.num_gpus))
        decay_steps = int(num_batches_per_epoch * FLAGS.lr_decay_epochs)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(FLAGS.learning_rate,
                                        global_step,
                                        decay_steps,
                                        mnist.LEARNING_RATE_DECAY_FACTOR,
                                        staircase=True)

        # Adam optimizer driven by the decayed learning rate.
        opt = tf.train.AdamOptimizer(lr)

        # MNIST data is fed through placeholders; every tower receives the
        # same `images` / `labels` tensors.
        mnist_dataset = input_data.read_data_sets(FLAGS.data_dir)
        images = tf.placeholder(tf.float32, [FLAGS.batch_size, 784])
        labels = tf.placeholder(tf.int16, [FLAGS.batch_size])

        # Calculate the gradients for each model tower.
        tower_grads = []
        with tf.variable_scope(tf.get_variable_scope()):
            for i in xrange(FLAGS.num_gpus):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('%s_%d' %
                                       (mnist.TOWER_NAME, i)) as scope:
                        # Loss for one tower of the model; variables are
                        # shared across towers through reuse_variables().
                        loss = tower_loss(scope, images, labels)

                        # Reuse variables for the next tower.
                        tf.get_variable_scope().reuse_variables()

                        # Retain the summaries from the final tower.
                        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES,
                                                      scope)

                        # Run the UPDATE_OPS collection (added for batch
                        # norm) before the gradients are computed.
                        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                        with tf.control_dependencies(update_ops):
                            grads = opt.compute_gradients(loss)
                            tower_grads.append(grads)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = average_gradients(tower_grads)

        # Add a summary to track the learning rate.
        summaries.append(tf.summary.scalar('learning_rate', lr))

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                summaries.append(
                    tf.summary.histogram(var.op.name + '/gradients', grad))

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            summaries.append(tf.summary.histogram(var.op.name, var))

        # Track the moving averages of all trainable variables.
        variable_averages = tf.train.ExponentialMovingAverage(
            mnist.MOVING_AVERAGE_DECAY, global_step)
        variables_averages_op = variable_averages.apply(
            tf.trainable_variables())

        # Group all updates into a single train op.
        train_op = tf.group(apply_gradient_op, variables_averages_op)

        # Create a saver.
        saver = tf.train.Saver(tf.global_variables())

        # Build the summary operation from the last tower summaries.
        summary_op = tf.summary.merge(summaries)

        # Build an initialization operation to run below.
        init = tf.global_variables_initializer()

        # Start running operations on the Graph. allow_soft_placement must be set to
        # True to build towers on GPU, as some of the ops do not have GPU
        # implementations.
        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

        step_reached = -1
        max_steps = int(FLAGS.epochs * mnist.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
                        (FLAGS.batch_size * FLAGS.num_gpus))
        best_precision = 0

        # Resume from the latest checkpoint unless a fresh run was requested.
        # The step number is the suffix after the last '-' in the checkpoint
        # file name.
        if not FLAGS.new_run:
            ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)
                step_reached = int(
                    ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])

        num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')

        # The context manager closes the precision log even when training
        # aborts, e.g. on the NaN assertion below.
        with open(FLAGS.train_dir + '/summary.txt', 'a') as f:
            for step in xrange(step_reached + 1, max_steps):
                start_time = time.time()
                image_batch, label_batch = mnist_dataset.train.next_batch(
                    FLAGS.batch_size)
                batch_feed = {images: image_batch, labels: label_batch}
                _, loss_value = sess.run([train_op, loss],
                                         feed_dict=batch_feed)
                duration = time.time() - start_time

                assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

                if step % 400 == 0:
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = duration / FLAGS.num_gpus

                    format_str = (
                        '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
                    print(format_str % (datetime.now(), step, loss_value,
                                        examples_per_sec, sec_per_batch))

                    summary_str = sess.run(summary_op, feed_dict=batch_feed)
                    summary_writer.add_summary(summary_str, step)

                # Save the model checkpoint periodically.
                if step % 1000 == 0 or (step + 1) == max_steps:
                    saver.save(sess, checkpoint_path, global_step=step)

                if (step % 4000 == 0) and (step > 10000):
                    precision = []
                    for _ in xrange(7):
                        mnist.draw_weights(sess)
                        current = mnist_eval.evaluate()
                        precision.append(current)
                        if current > best_precision:
                            best_precision = current
                            # NOTE(review): shell command assembled from
                            # FLAGS.train_dir; a path containing spaces or
                            # shell metacharacters will break it.
                            os.system('cp ' + FLAGS.train_dir + '/weights/* ' +
                                      FLAGS.train_dir + '/best_weights/')
                    average_precision = str(round(np.mean(precision), 3))
                    print('Average precision: ' + average_precision)
                    f.write('step: ' + str(step) + ', average precision = ' +
                            average_precision + '\n')

            f.write('best precision = ' + str(round(best_precision, 3)) + '\n')

        # Flush the buffered TensorBoard events before returning.
        summary_writer.close()
def train():
    """Train the MNIST model on FLAGS.num_gpus towers for FLAGS.max_steps steps.

    Builds one loss tower per GPU, averages the tower gradients and applies
    them with Adam under an exponentially decayed learning rate. Training
    resumes from the latest checkpoint in FLAGS.train_dir unless
    FLAGS.new_run is set.

    Inside the loop:
      * every 400 steps: print throughput and write TensorBoard summaries;
      * every ``int(10 * num_batches_per_epoch)`` steps and on the last step:
        save a checkpoint;
      * on the same interval, except at step 0: call ``mnist_eval.evaluate``.

    After the loop ``mnist.save_weights(sess)`` is called.

    Raises:
        AssertionError: if the loss becomes NaN.
    """
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Step counter; incremented by apply_gradients once per training step.
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        # Calculate the learning rate schedule.
        num_batches_per_epoch = (mnist.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
                                 (FLAGS.batch_size * FLAGS.num_gpus))
        decay_steps = int(num_batches_per_epoch * mnist.NUM_EPOCHS_PER_DECAY)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(mnist.INITIAL_LEARNING_RATE,
                                        global_step,
                                        decay_steps,
                                        mnist.LEARNING_RATE_DECAY_FACTOR,
                                        staircase=True)

        # Adam optimizer driven by the decayed learning rate.
        opt = tf.train.AdamOptimizer(lr)

        # MNIST data is fed through placeholders; every tower receives the
        # same `images` / `labels` tensors.
        mnist_dataset = input_data.read_data_sets(FLAGS.data_dir)
        images = tf.placeholder(tf.float32, [FLAGS.batch_size, 784])
        labels = tf.placeholder(tf.int16, [FLAGS.batch_size])

        # Calculate the gradients for each model tower.
        tower_grads = []
        with tf.variable_scope(tf.get_variable_scope()):
            for i in xrange(FLAGS.num_gpus):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('%s_%d' %
                                       (mnist.TOWER_NAME, i)) as scope:
                        # Loss for one tower of the model; variables are
                        # shared across towers through reuse_variables().
                        loss = tower_loss(scope, images, labels)

                        # Reuse variables for the next tower.
                        tf.get_variable_scope().reuse_variables()

                        # Retain the summaries from the final tower.
                        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES,
                                                      scope)

                        # Run the UPDATE_OPS collection (added for batch
                        # norm) before the gradients are computed.
                        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                        with tf.control_dependencies(update_ops):
                            grads = opt.compute_gradients(loss)
                            tower_grads.append(grads)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = average_gradients(tower_grads)

        # Add a summary to track the learning rate.
        summaries.append(tf.summary.scalar('learning_rate', lr))

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                summaries.append(
                    tf.summary.histogram(var.op.name + '/gradients', grad))

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            summaries.append(tf.summary.histogram(var.op.name, var))

        # Track the moving averages of all trainable variables.
        variable_averages = tf.train.ExponentialMovingAverage(
            mnist.MOVING_AVERAGE_DECAY, global_step)
        variables_averages_op = variable_averages.apply(
            tf.trainable_variables())

        # Group all updates into a single train op.
        train_op = tf.group(apply_gradient_op, variables_averages_op)

        # Create a saver.
        saver = tf.train.Saver(tf.global_variables())

        # Build the summary operation from the last tower summaries.
        summary_op = tf.summary.merge(summaries)

        # Build an initialization operation to run below.
        init = tf.global_variables_initializer()

        # Start running operations on the Graph. allow_soft_placement must be set to
        # True to build towers on GPU, as some of the ops do not have GPU
        # implementations.
        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

        step_reached = -1
        # Resume from the latest checkpoint unless a fresh run was requested.
        # The step number is the suffix after the last '-' in the checkpoint
        # file name.
        if not FLAGS.new_run:
            ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)
                step_reached = int(
                    ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])

        num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        # Checkpoint / evaluation period: every ten epochs' worth of steps.
        eval_interval = int(10 * num_batches_per_epoch)

        for step in xrange(step_reached + 1, FLAGS.max_steps):
            start_time = time.time()
            image_batch, label_batch = mnist_dataset.train.next_batch(
                FLAGS.batch_size)
            batch_feed = {images: image_batch, labels: label_batch}
            _, loss_value = sess.run([train_op, loss], feed_dict=batch_feed)
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 400 == 0:
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = duration / FLAGS.num_gpus

                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

                summary_str = sess.run(summary_op, feed_dict=batch_feed)
                summary_writer.add_summary(summary_str, step)

            on_interval = step % eval_interval == 0

            # Save the model checkpoint periodically.
            if on_interval or (step + 1) == FLAGS.max_steps:
                saver.save(sess, checkpoint_path, global_step=step)

            # Evaluate on the same interval, but not before any training.
            if on_interval and step > 0:
                mnist_eval.evaluate()

        mnist.save_weights(sess)

        # Flush the buffered TensorBoard events before returning.
        summary_writer.close()
Exemple #6
0
def runTesting():
    """Load the data set from ./Resources/MINSTData and evaluate on it.

    Prints the data directory, reads the data with one-hot labels through
    ``input_data.read_data_sets`` and passes the result to
    ``mnist_eval.evaluate``.
    """
    data_dir = "./Resources/MINSTData"
    print("path = ", data_dir)
    mnist_eval.evaluate(input_data.read_data_sets(data_dir, one_hot=True))