Esempio n. 1
0
def run_training():
    with tf.Graph().as_default():
        images, labels = inputs(train=True,
                                batch_size=FLAGS.batch_size,
                                num_epochs=FLAGS.num_epochs)
        logits = mnist.inference(images, FLAGS.hidden1, FLAGS.hidden2)
        loss = mnist.loss(logits, labels)
        train_op = mnist.training(loss, FLAGS.learning_rate)
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess = tf.Session()
        sess.run(init_op)
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        try:
            step = 0
            while not coord.should_stop():
                start_time = time.time()
                _, loss_value = sess.run([train_op, loss])
                duration = time.time() - start_time
                if step % 100 == 0:
                    print('Step %d: loss = %.2f (%.3f sec)' %
                          (step, loss_value, duration))
                    step += 1
        except tf.errors.OutOfRangeError:
            print('Done training for %d epochs, %d steps.' %
                  (FLAGS.num_epochs, step))
        finally:
            coord.request_stop()
            coord.join(threads)
            sess.close()
Esempio n. 2
0
def train():
    filenames = tf.placeholder(tf.string, [None])
    dataset = tf.data.TFRecordDataset(filenames)
    dataset = dataset.map(mnist.parse_data)
    dataset = dataset.shuffle(buffer_size=50000)
    dataset = dataset.batch(FLAGS.batch_size)
    dataset = dataset.repeat()

    iterator = dataset.make_initializable_iterator()

    global_step = tf.train.get_or_create_global_step()
    images, labels = iterator.get_next()
    logits, pred = mnist.inference(images, training=True)
    loss = mnist.loss(logits, labels)
    train_op = mnist.train(loss, global_step)

    with tf.train.MonitoredTrainingSession(
        checkpoint_dir=FLAGS.train_dir,
        hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_step), tf.train.NanTensorHook(loss)],
        save_checkpoint_steps=100
    ) as mon_sess:
        mon_sess.run(iterator.initializer, feed_dict={filenames: ['train_img.tfrecords']})
        while not mon_sess.should_stop():
            _, train_loss, train_step, label = mon_sess.run([train_op, loss, global_step, labels])
            if train_step % 100 == 0:
                print('step: {}, loss: {}'.format(train_step, train_loss))
def main():
    fraction = 0.4
    min_after_dequeue = int(mnist.NUM_EXAMPLES_PER_EPOCH * fraction)
    images, labels = mnist_inputs(min_after_dequeue)
    validation_images, validation_labels = mnist_inputs(2000,
                                                        train=False,
                                                        num_epochs=None)
    with tf.variable_scope("inference") as scope:
        logits = mnist.inference(images)
        scope.reuse_variables()
        validation_logits = mnist.inference(validation_images)
    loss = mnist.loss(logits, labels)
    tf.scalar_summary("cross_entropy", loss)
    accuracy = mnist.accuracy(validation_logits, validation_labels)
    tf.scalar_summary("validation_accuracy", accuracy)
    train_op = mnist.train(loss)
    sess = tf.Session()
    sess.run(tf.initialize_local_variables())
    sess.run(tf.initialize_all_variables())
    tf.train.start_queue_runners(sess=sess)
    merge = tf.merge_all_summaries()
    writer = tf.train.SummaryWriter(
        "/home/windows98/PycharmProjects/mnist/Summary/")
    for index in range(NUM_STEPS):
        _, loss_value, summary = sess.run([train_op, loss, merge])
        writer.add_summary(summary, index + 1)
        # accuracy_score, summary = sess.run([accuracy, summary])
        # writer.add_summary(summary, index+1)
        print("step:" + str(index + 1) + " loss: " + str(loss_value))
Esempio n. 4
0
def evaluate(dataset):
    """Evaluate model on Dataset for a number of steps."""
    with tf.Graph().as_default():
        # Graph creation
        batch_size = dataset.num_examples
        images_placeholder, labels_placeholder = mnist.placeholder_inputs(
            batch_size)
        logits = mnist.inference(images_placeholder, train=False)
        validation_accuracy = tf.reduce_sum(
            mnist.evaluation(logits,
                             labels_placeholder)) / tf.constant(batch_size)
        validation_loss = mnist.loss(logits, labels_placeholder)

        # Reference to sess and saver
        sess = tf.Session()
        saver = tf.train.Saver()

        # Create summary writer
        graph_def = tf.get_default_graph().as_graph_def()
        summary_writer = tf.summary.FileWriter(FLAGS.eval_dir,
                                               graph_def=graph_def)
        step = -1
        while True:
            step = do_eval(saver,
                           summary_writer,
                           validation_accuracy,
                           validation_loss,
                           images_placeholder,
                           labels_placeholder,
                           dataset,
                           prev_global_step=step)
            if FLAGS.run_once:
                break
            time.sleep(FLAGS.eval_interval_secs)
Esempio n. 5
0
def train():
    images, labels = mnist.inputs(['train_img.tfrecords'], mnist.TRAIN_EXAMPLES_NUM,
                                  FLAGS.batch_size, shuffle=True)
    global_step = tf.train.get_or_create_global_step()

    logits, pred = mnist.inference(images, training=True)
    loss = mnist.loss(logits, labels)
    train_op = mnist.train(loss, global_step)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        init_op = tf.group(
            tf.local_variables_initializer(),
            tf.global_variables_initializer())
        sess.run(init_op)
        ckpt = os.path.join(FLAGS.train_dir, 'model.ckpt')

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess, coord=coord)

        for i in range(1, FLAGS.max_step + 1):
            _, train_loss, predict, label = sess.run([train_op, loss, pred, labels])
            # print(predict, '\n', label)
            if i % 100 == 0:
                print('step: {}, loss: {}'.format(i, train_loss))
                # print(predict, '\n', label)
                saver.save(sess, ckpt, global_step=i)

        coord.request_stop()
        coord.join(threads)
def run_training():
  data_sets = data_mnist.read_data_sets()
  with tf.Graph().as_default():
    images_placeholder, labels_placeholder = placeholder_inputs(FLAGS.batch_size)
    logits = mnist.inference(images_placeholder, FLAGS.hidden1, FLAGS.hidden2)
    loss = mnist.loss(logits, labels_placeholder)
    train_op = mnist.training(loss, FLAGS.learning_rate)
    eval_correct = mnist.evaluation(logits, labels_placeholder)

    summary_op = tf.merge_all_summaries()
    saver = tf.train.Saver()
    
    sess = tf.Session()
    sess.run(tf.initialize_all_variables())
    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

    # Start the training loop.
    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      feed_dict = fill_feed_dict(data_sets.train, images_placeholder, labels_placeholder)
      _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)
      duration = time.time() - start_time

      if step % 100 == 0:
        print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value, duration))
        summary_str = sess.run(summary_op, feed_dict=feed_dict)
        summary_writer.add_summary(summary_str, step)
        summary_writer.flush()

      if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_file = os.path.join(FLAGS.train_dir, 'checkpoint')
        saver.save(sess, checkpoint_file, global_step=step)
        do_eval(sess,eval_correct, images_placeholder, labels_placeholder, data_sets.train)
        do_eval(sess, eval_correct, images_placeholder, labels_placeholder, data_sets.validation)
        do_eval(sess, eval_correct, images_placeholder, labels_placeholder, data_sets.test)
def main():
    images, labels = inputs()
    reshaped_images = tf.reshape(images, [
        mnist.BATCH_SIZE, mnist.IMAGE_HEIGHT, mnist.IMAGE_WIDTH,
        mnist.IMAGE_DEPTH
    ])
    logits = mnist.inference(reshaped_images)
    loss = mnist.loss(logits, labels)
    accuracy = mnist.accuracy(logits, labels)
    train_op = mnist.train(loss)
    init = tf.initialize_all_variables()
    with tf.Session() as sess:
        sess.run(init)
        for index in range(NUM_STEPS):
            batch_x, batch_y = mnist_data.train.next_batch(mnist.BATCH_SIZE)
            _, loss_value = sess.run([train_op, loss],
                                     feed_dict={
                                         images: batch_x,
                                         labels: batch_y
                                     })
            print("step:" + str(index + 1) + " loss: " + str(loss_value))
            if (index + 1) % 10 == 0:
                validation_x, validation_y = mnist_data.validation.next_batch(
                    mnist.BATCH_SIZE)
                accuracy_score = sess.run(accuracy,
                                          feed_dict={
                                              images: validation_x,
                                              labels: validation_y
                                          })
                print("accuracy : " + str(accuracy_score))
def run_training():
    data_sets = input_data.read_data_sets(FLAGS.input_data_dir,
                                          FLAGS.fake_data)

    with tf.Graph().as_default():
        images_placeholder, labels_placeholder = placeholder_inputs(
            FLAGS.batch_size)
        logits = mnist.inference(images_placeholder, FLAGS.hidden1,
                                 FLAGS.hidden2)
        loss = mnist.loss(logits, labels_placeholder)
        train_op = mnist.training(loss, FLAGS.learning_rate)
        eval_correct = mnist.evaluation(logits, labels_placeholder)

        # Build the summary tensor based on the TF collection of Summaries
        summary = tf.summary.merge_all()

        init = tf.global_variables_initializer()

        # Create a saver for writing training checkpoints
        saver = tf.train.Saver()

        sess = tf.Session()

        # Instantiate a SummaryWriter to output summries and the Graph
        summary_writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)

        sess.run(init)

        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            feed_dict = fill_feed_dict(data_sets.train, images_placeholder,
                                       labels_placeholder)
            _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)
            duration = time.time() - start_time

            if step % 100 == 0:
                print('Step %d: loss = %.2f (%.3f sec)' %
                      (step, loss_value, duration))
                # Update the events file
                summary_str = sess.run(summary, feed_dict=feed_dict)
                summary_writer.add_summary(summary_str, step)
                summary_writer.flush()

            # Save a checkpoint and evaluate the model periodically.
            if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_file = os.path.join(FLAGS.log_dir, 'model.ckpt')
                saver.save(sess, checkpoint_file, global_step=step)
                # Evaluate against the training set.
                print('Training Data Eval:')
                do_eval(sess, eval_correct, images_placeholder,
                        labels_placeholder, data_sets.train)
                # Evaluate against the validation set.
                print('Validation Data Eval:')
                do_eval(sess, eval_correct, images_placeholder,
                        labels_placeholder, data_sets.validation)
                # Evaluate aginst the test set.
                print('Test Data Eval:')
                do_eval(sess, eval_correct, images_placeholder,
                        labels_placeholder, data_sets.test)
Esempio n. 9
0
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.train.get_or_create_global_step()

        # Get images and labels for CIFAR-10.
        # Force input pipeline to CPU:0 to avoid operations sometimes ending up on
        # GPU and resulting in a slow down.
        with tf.device('/cpu:0'):
            images, labels = mnist.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = mnist.inference(images)

        # Calculate loss.
        loss = mnist.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = mnist.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""
            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(loss)  # Asks for loss value.

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time

                    loss_value = run_values.results
                    examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
                    sec_per_batch = float(duration / FLAGS.log_frequency)

                    format_str = (
                        '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[
                    tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                    tf.train.NanTensorHook(loss),
                    _LoggerHook()
                ],
                config=tf.ConfigProto(log_device_placement=FLAGS.
                                      log_device_placement)) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
def run_training():
    data_sets = input_data.read_data_sets(fake_data)

    with tf.Graph().as_default():
        image_placeholder, label_placeholder = placeholder_inputs(batch_size)

        logits = mnist.inference(image_placeholder, hidden1_unit, hidden2_unit)
        #计算损失
        loss = mnist.loss(logits, label_placeholder)
        #训练
        train_op = mnist.training(loss, 0.01)
        #计算正确分类数
        eval_correct = mnist.evaluation(logits, label_placeholder)
        #合并默认图中的所有summaries操作
        summary_op = tf.merge_all_summaries()
        #用于保存网络中的变量
        saver = tf.train.Saver()

        sess = tf.Session()
        #初始化所有变量
        sess.run(tf.initialize_all_variables())

        summary_writter = tf.train.SummaryWriter(train_dir, sess.graph)

        for step in xrange(max_steps):
            start_time = time.time()
            #获取feed_dict字典
            feed_dict = fill_placeholder(data_sets.train, image_placeholder,
                                         label_placeholder)

            _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)

            duration = time.time() - start_time

            if step % 100 == 0:
                print(
                    'Step %d:loss=%.2f (%.3f sec) %(step,loss_value,duration)')
                summary_str = sess.run(summary_op, feed_dict=feed_dict)
                summary_writter.add_summary(summary_str, step)
                summary_writter.flush()

            if step % 1000 == 0:
                saver.save(sess, train_dir, global_step=step)

                print('Training Data Eval:')
                do_eval(sess, eval_correct, image_placeholder,
                        label_placeholder, data_sets.train)

                print("Validation Data Eval")
                do_eval(sess, eval_correct, image_placeholder,
                        label_placeholder, data_sets.validation)

                print("Test Data Eval:")
                do_eval(sess, eval_correct, image_placeholder,
                        label_placeholder, data_sets.test)
Esempio n. 11
0
def train():

    with tf.Graph().as_default():
        global_step = tf.train.get_or_create_global_step()

        with tf.device('/cpu:0'):
            images, labels = mnist.distorted_inputs()
        print(global_step)

        #with tf.device('/gpu:0'):
        logits = mnist.inference(images)
        #with tf.device('/gpu:0'):
        loss = mnist.loss(logits, labels)
        #with tf.device('/gpu:0'):
        train_op = mnist.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(loss)

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time

                    loss_value = run_values.results
                    examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
                    sec_per_batch = float(duration / FLAGS.log_frequency)

                    format_str = (
                        '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[
                    tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                    tf.train.NanTensorHook(loss),
                    _LoggerHook()
                ],
                config=tf.ConfigProto(log_device_placement=FLAGS.
                                      log_device_placement)) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
Esempio n. 12
0
def run_training():
    data_sets = input_data.read_data_sets('MNIST_data', FLAGS.fake_data)

    with tf.Graph().as_default():
        image_placeholder, label_placeholder = placeholder_inputs(FLAGS.batch_size)

        logits = mnist.inference(image_placeholder, FLAGS.hidden1_unit, FLAGS.hidden2_unit)

        loss = mnist.loss(logits, label_placeholder)

        train_op = mnist.training(loss, FLAGS.learning_rate)

        eval_correct = mnist.evaluation(logits, label_placeholder)

        summary_op = tf.summary.merge_all()

        saver = tf.train.Saver()

        sess = tf.Session()

        sess.run(tf.global_variables_initializer())

        summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

        for step in range(FLAGS.max_steps):
            start_time = time.time()

            feed_dict = fill_placeholder(data_sets.train, image_placeholder, label_placeholder)

            _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)

            duration = time.time() - start_time

            if step % 100 == 0 :
                print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value, duration))

                summary_str = sess.run(summary_op, feed_dict = feed_dict)
                summary_writer.add_summary(summary_str, step)
                summary_writer.flush()

            if step % 1000 == 0:
                saver.save(sess, FLAGS.train_dir, global_step=step)

                print('Training Data Eval: ')
                do_eval(sess, eval_correct, image_placeholder, label_placeholder, data_sets.train)

                print('Validation Data Eval: ')
                do_eval(sess, eval_correct, image_placeholder, label_placeholder, data_sets.validation)

                print('Test Data Eval:')
                do_eval(sess, eval_correct, image_placeholder, label_placeholder, data_sets.test)
Esempio n. 13
0
def run_training():
    data_sets = input_data.read_data_sets(FLAGS.input_data_dir, FLAGS.fake_data)
    with tf.Graph().as_default():
        images_placeholder, labels_placeholder = placeholder_inputs(FLAGS.batch_size)
        logits = mnist.inference(images_placeholder, FLAGS.hidden1, FLAGS.hidden2)  # 模型的部分

        loss = mnist.loss(logits, labels_placeholder)

        train_op = mnist.training(loss, FLAGS.learning_rate)

        eval_correct = mnist.evaluation(logits, labels_placeholder)

        summary = tf.summary.merge_all()

        init = tf.global_variables_initializer()

        saver = tf.train.Saver()

        sess = tf.Session()

        summary_writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)

        sess.run(init)

        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            feed_dict = fill_feed_dict(data_sets.train, images_placeholder, labels_placeholder)
            _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)

            duration = time.time() - start_time

            if step % 100 == 0:
                print 'Setp %d:loss =%.2f (%.3f sec)' % (step, loss_value, duration)
                summary_str = sess.run(summary, feed_dict=feed_dict)

                summary_writer.add_summary(summary_str, step)
                summary_writer.flush()

            if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_file = os.path.join(FLAGS.log_dir, 'model.ckpt')
                saver.save(sess, checkpoint_file, global_step=step)

                print('Training Data Eval:')
                do_eval(sess, eval_correct, images_placeholder, labels_placeholder, data_sets.train)

                print "Validation Data Eval:"
                do_eval(sess, eval_correct, images_placeholder, labels_placeholder, data_sets.validation)

                print "Test Data Eval:"
                do_eval(sess, eval_correct, images_placeholder, labels_placeholder, data_sets.test)
Esempio n. 14
0
def run_training():
    data_sets = input_data.read_data_sets(FLAGS.input_data_dir,
                                          FLAGS.fake_data)
    with tf.Graph().as_default():
        # 初始化输入占位符
        images_placeholder, labels_placeholder = placeholder_inputs(
            FLAGS.batch_size)
        # 构建神经网络
        logits = mnist.inference(images_placeholder, FLAGS.hidden1,
                                 FLAGS.hidden2)
        # 添加损失函数
        loss = mnist.loss(logits, labels_placeholder)
        # 训练
        train_op = mnist.training(loss, FLAGS.learning_rate)
        # 正确率计算
        eval_correct = mnist.evaluation(logits, labels_placeholder)

        init = tf.global_variables_initializer()
        sess = tf.Session()
        sess.run(init)

        start_time = time.time()

        for step in range(FLAGS.max_steps):

            feed_dict = fill_feed_dict(data_sets.train, images_placeholder,
                                       labels_placeholder)
            _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)
            if step % 100 == 0:
                duration = time.time() - start_time
                print('Step %d: loss = %.2f (%.3f sec)' %
                      (step, loss_value, duration))
                start_time = time.time()

            if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                print('Training Data Eval:')
                do_eval(sess, eval_correct, images_placeholder,
                        labels_placeholder, data_sets.train)
                print('Validation Data Eval:')
                do_eval(sess, eval_correct, images_placeholder,
                        labels_placeholder, data_sets.validation)
                print('Test Data Eval:')
                do_eval(sess, eval_correct, images_placeholder,
                        labels_placeholder, data_sets.test)

    print(FLAGS)
Esempio n. 15
0
def train_and_validation():
    training_dataset = tf.data.TFRecordDataset(['./train_img.tfrecords'])
    validation_dataset = tf.data.TFRecordDataset(['./validation_img.tfrecords'])
    test_dataset = tf.data.TFRecordDataset(['./test_img.tfrecords'])

    training_dataset = training_dataset.map(mnist.parse_data)
    training_dataset = training_dataset.shuffle(50000).batch(FLAGS.batch_size).repeat()
    validation_dataset = validation_dataset.map(mnist.parse_data).batch(FLAGS.batch_size)
    test_dataset = test_dataset.map(mnist.parse_data).batch(FLAGS.batch_size)

    iterator = tf.data.Iterator.from_structure(output_types=training_dataset.output_types,
                                               output_shapes=training_dataset.output_shapes)

    training_init_op = iterator.make_initializer(training_dataset)
    validation_init_op = iterator.make_initializer(validation_dataset)
    test_init_op = iterator.make_initializer(test_dataset)
    images, labels = iterator.get_next()

    training = tf.placeholder(dtype=tf.bool)
    logits, pred = mnist.inference(images, training=training)
    loss = mnist.loss(logits, labels)
    top_k_op = tf.nn.in_top_k(logits, labels, 1)
    global_step = tf.train.get_or_create_global_step()
    train_op = mnist.train(loss, global_step)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(training_init_op)
        print('begin to train!')
        ckpt = os.path.join(FLAGS.train_dir, 'model.ckpt')
        train_step = 0
        while train_step < FLAGS.max_step:
            _, train_loss, step, label = sess.run([train_op, loss, global_step, labels], feed_dict={training: True})
            train_step += 1
            if train_step % 100 == 0:
                saver.save(sess, ckpt, train_step)
                if train_step % 1000 == 0:
                    precision = evaluate(sess, top_k_op, training, mnist.TRAIN_EXAMPLES_NUM)
                    print('step: {}, loss: {}, training precision: {}'.format(train_step, train_loss, precision))
                sess.run(validation_init_op)
                precision = evaluate(sess, top_k_op, training, mnist.VALIDATION_EXAMPLES_NUM)
                print('step: {}, loss: {}, validation precision: {}'.format(train_step, train_loss, precision))
                sess.run(training_init_op)
        sess.run(test_init_op)
        precision = evaluate(sess, top_k_op, training, mnist.TEST_EXAMPLES_NUM)
        print('finally test precision: {}'.format(precision))
def run_training():
    with tf.Graph().as_default():
        # 输入images和labels
        images, labels = inputs(train=True,
                                batch_size=FLAGS.batch_size,
                                num_epochs=FLAGS.num_epochs)
        # 构建一个从推理模型来预测数据的图
        logits = mnist.inference(images, FLAGS.hidden1, FLAGS.hidden2)
        loss = mnist.loss(logits, labels)  # 定义损失函数

        # Add to the Graph operations that train the model.
        train_op = mnist.training(loss, FLAGS.learning_rate)

        # 初始化参数,string_input_producer内部创建了一个epoch计数变量,
        # 归入tf.GraphKeys.LOCAL_VARIABLES集合中,必须单独用initialize_local_variable()初始化
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

        sess = tf.Session()
        sess.run(init_op)

        # Start input enqueue threads
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        try:
            step = 0
            while not coord.should_stop():  # 进入死循环
                start_time = time.time()
                _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            # 每100次训练输出一次结果
            if step % 100 == 0:
                print('Step %d: loss = %.2f (%.3f sec)' %
                      (step, loss_value, duration))
            step += 1
        except tf.errors.OutOfRangeError:
            print('Done training for %d epochs, %d steps.' %
                  (FLAGS.num_epochs, step))
        finally:
            coord.request_stop()  # 通知其他线程关闭

        coord.join(threads)
        sess.close()
Esempio n. 17
0
def run_training():
    data_sets = input_data.read_data_sets(data_dir)
    with tf.Graph().as_default():
        input_holder, label_holder = generate_placeholder(50, input_size)
        logits = mnist.inference(input_holder, input_size, 128, 32,
                                 label_classes)
        loss = mnist.loss(logits, label_holder)
        train_op = mnist.training(loss, 0.01)
        eval_correct = mnist.evaluation(logits, label_holder)
        summary = tf.summary.merge_all()

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        sess = tf.Session()
        summary_writer = tf.summary.FileWriter(log_dir, sess.graph)

        sess.run(init)
        for step in range(2000):
            start_time = time.time()
            feed_dict = fill_feed_dict(data_sets.train, input_holder,
                                       label_holder, 50)
            _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)
            duration = time.time() - start_time

            if step % 100 == 0:
                print('Step %d: loss = %.2f (%.3f sec)' %
                      (step, loss_value, duration))
                summary_str = sess.run(summary, feed_dict=feed_dict)
                summary_writer.add_summary(summary_str, step)
                summary_writer.flush()

            if (step + 1) % 1000 == 0 or (step + 1) == 2000:
                checkpoint_file = os.path.join(log_dir, 'model.ckpt')
                saver.save(sess, checkpoint_file, global_step=step)
                print('Training Data Eval:')
                do_eval(sess, eval_correct, input_holder, label_holder,
                        data_sets.train, 50)
                print('Validation Data Eval:')
                do_eval(sess, eval_correct, input_holder, label_holder,
                        data_sets.validation, 50)
                print('Test Data Eval:')
                do_eval(sess, eval_correct, input_holder, label_holder,
                        data_sets.test, 50)
def run_training():
    data_sets = data_mnist.read_data_sets()
    with tf.Graph().as_default():
        images_placeholder, labels_placeholder = placeholder_inputs(
            FLAGS.batch_size)
        logits = mnist.inference(images_placeholder, FLAGS.hidden1,
                                 FLAGS.hidden2)
        loss = mnist.loss(logits, labels_placeholder)
        train_op = mnist.training(loss, FLAGS.learning_rate)
        eval_correct = mnist.evaluation(logits, labels_placeholder)

        summary_op = tf.merge_all_summaries()
        saver = tf.train.Saver()

        sess = tf.Session()
        sess.run(tf.initialize_all_variables())
        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        # Start the training loop.
        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            feed_dict = fill_feed_dict(data_sets.train, images_placeholder,
                                       labels_placeholder)
            _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)
            duration = time.time() - start_time

            if step % 100 == 0:
                print('Step %d: loss = %.2f (%.3f sec)' %
                      (step, loss_value, duration))
                summary_str = sess.run(summary_op, feed_dict=feed_dict)
                summary_writer.add_summary(summary_str, step)
                summary_writer.flush()

            if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_file = os.path.join(FLAGS.train_dir, 'checkpoint')
                saver.save(sess, checkpoint_file, global_step=step)
                do_eval(sess, eval_correct, images_placeholder,
                        labels_placeholder, data_sets.train)
                do_eval(sess, eval_correct, images_placeholder,
                        labels_placeholder, data_sets.validation)
                do_eval(sess, eval_correct, images_placeholder,
                        labels_placeholder, data_sets.test)
Esempio n. 19
0
def run_training():
    data_sets = read_data_sets('/tmp/tensorflow/mnist/input_data', False)
    with tf.Graph().as_default():

        images_placeholder, labels_placeholder = placeholder_inputs(
            100)  # 每100张送进去计算一次所以这里的向量维度100

        logits = mnist.inference(images_placeholder, 128, 32)

        loss = mnist.loss(logits, labels_placeholder)

        train_op = mnist.training(loss, 0.01)

        summary = tf.summary.merge_all()

        saver = tf.train.Saver()

        init = tf.global_variables_initializer()  #前面都是定义变量,这里对该图中的变量进行初始化

        sess = tf.Session()

        summary_writer = tf.summary.FileWriter("logs/", sess.graph)

        sess.run(init)

        for step in xrange(2000):  # 每一步取100张,一共2000部,取20万张

            feed_dict = fill_feed_dict(data_sets.train, images_placeholder,
                                       labels_placeholder)

            _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)
            #sess.run()函数对loss进行计算,得到lossValue用于查看拟合度是否上身,主要的计算在于执行train_op,返回值_表示匿名,既不关心(应为train_op只是一个计算过程,所以没有返回实体,所以打印并没有区别)
            if step % 100 == 0:
                print(loss_value)
                summary_str = sess.run(summary, feed_dict=feed_dict)
                summary_writer.add_summary(summary_str, step)
                summary_writer.flush()

            #    print(train_op)
        save_path = saver.save(sess, "myMnistNet/save_net.ckpt")
        print(save_path)
Esempio n. 20
0
def mnist_training():
    x = tf.placeholder(tf.float32,
                       shape=[None, IMG_SIZE * IMG_SIZE],
                       name='x_ph')
    y_ = tf.placeholder(tf.float32, shape=[None, NUM_CLASSES], name='y_ph')

    logits = mnist.inference(x)
    loss = mnist.loss(logits, y_)
    train_op = mnist.training(loss, LR)
    eval_correct = mnist.evaluation(logits, y_)
    summary = tf.summary.merge_all()

    init = tf.global_variables_initializer()
    saver = tf.train.Saver(max_to_keep=cfg.MNIST.RUN.models_to_save)
    sess = tf.Session()

    # Instantiate a SummaryWriter to output summaries and the Graph.
    #summary_writer = tf.summary.FileWriter(, sess.graph)

    sess.run(init)
    print "*****TRAINING STARTED*******"
    for i in range(MAX_ITER):
        if i % 100 == 0 and i > 0:
            print('Step %d: loss = %.2f' % (i, loss_val))
            #targets=sess.run(tf.cast(mnist_db.test.labels,tf.int32))
            #prediction=sess.run(eval_correct,feed_dict={x:mnist_db.test.images,y_:targets})
            #print('Step %d: loss = %.2f, accuracy %.4f' % (i, loss_val,prediction))
            saver.save(sess,
                       os.path.join(cfg.MNIST.RUN.models_dir, 'model'),
                       global_step=i)
        batch = mnist_db.train.next_batch(BATCH_SIZE)
        _, loss_val = sess.run([train_op, loss],
                               feed_dict={
                                   x: batch[0],
                                   y_: batch[1]
                               })
    saver.save(
        sess,
        os.path.join(cfg.MNIST.RUN.models_dir, cfg.MNIST.RUN.last_model_name))
Esempio n. 21
0
def tower_loss(scope, images, labels):
    # Build inference Graph.
    logits = mnist.inference(images, None, None, None, train=True)

    # Build the portion of the Graph calculating the losses. Note that we will
    # assemble the total_loss using a custom function below.
    _ = mnist.loss(logits, labels)

    # Assemble all of the losses for the current tower only.
    losses = tf.get_collection('losses', scope)

    # Calculate the total loss for the current tower.
    total_loss = tf.add_n(losses, name='total_loss')

    # Attach a scalar summary to all individual losses and the total loss; do the
    # same for the averaged version of the losses.
    for l in losses + [total_loss]:
        # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
        # session. This helps the clarity of presentation on tensorboard.
        loss_name = re.sub('%s_[0-9]*/' % mnist.TOWER_NAME, '', l.op.name)
        tf.summary.scalar(loss_name, l)

    return total_loss
def run_training():
  """Train MNIST for a number of steps."""
  # Get the sets of images and labels for training, validation, and
  # test on MNIST.
  data_sets = input_data.read_data_sets(FLAGS.train_dir, FLAGS.fake_data)
  # Tell TensorFlow that the model will be built into the default Graph.
  with tf.Graph().as_default():
    # Generate placeholders for the images and labels.
    images_placeholder, labels_placeholder = placeholder_inputs(
        FLAGS.batch_size)
    # Build a Graph that computes predictions from the inference model.
    logits = mnist.inference(images_placeholder,
                             FLAGS.hidden1,
                             FLAGS.hidden2)
    # Add to the Graph the Ops for loss calculation.
    loss = mnist.loss(logits, labels_placeholder)
    # Add to the Graph the Ops that calculate and apply gradients.
    train_op = mnist.training(loss, FLAGS.learning_rate)
    # Add the Op to compare the logits to the labels during evaluation.
    eval_correct = mnist.evaluation(logits, labels_placeholder)
    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()
    # Create a saver for writing training checkpoints.
    saver = tf.train.Saver()
    # Create a session for running Ops on the Graph.
    sess = tf.Session()
    # Run the Op to initialize the variables.
    init = tf.initialize_all_variables()
    sess.run(init)
    # Instantiate a SummaryWriter to output summaries and the Graph.
    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir,
                                            graph_def=sess.graph_def)
    # And then after everything is built, start the training loop.
    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      # Fill a feed dictionary with the actual set of images and labels
      # for this particular training step.
      feed_dict = fill_feed_dict(data_sets.train,
                                 images_placeholder,
                                 labels_placeholder)
      # Run one step of the model.  The return values are the activations
      # from the `train_op` (which is discarded) and the `loss` Op.  To
      # inspect the values of your Ops or variables, you may include them
      # in the list passed to sess.run() and the value tensors will be
      # returned in the tuple from the call.
      _, loss_value = sess.run([train_op, loss],
                               feed_dict=feed_dict)
      duration = time.time() - start_time
      # Write the summaries and print an overview fairly often.
      if step % 100 == 0:
        # Print status to stdout.
        print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value, duration))
        # Update the events file.
        summary_str = sess.run(summary_op, feed_dict=feed_dict)
        summary_writer.add_summary(summary_str, step)
      # Save a checkpoint and evaluate the model periodically.
      if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
        saver.save(sess, FLAGS.train_dir, global_step=step)
        # Evaluate against the training set.
        print('Training Data Eval:')
        do_eval(sess,
                eval_correct,
                images_placeholder,
                labels_placeholder,
                data_sets.train)
        # Evaluate against the validation set.
        print('Validation Data Eval:')
        do_eval(sess,
                eval_correct,
                images_placeholder,
                labels_placeholder,
                data_sets.validation)
        # Evaluate against the test set.
        print('Test Data Eval:')
        do_eval(sess,
                eval_correct,
                images_placeholder,
                labels_placeholder,
                data_sets.test)
def run_training():
    # 获取数据
    data_sets = input_data.read_data_sets(FLAGS.input_data_dir, FLAGS.fake_data)
    # 在默认Graph下运行.
    with tf.Graph().as_default():
        # 配置graph
        images_placeholder, labels_placeholder = placeholder_inputs(FLAGS.batch_size)
        # logits是shape为(batch_size,NUM_CLASSES),表示每条数据预测后未归一化的概率分布
        logits = mnist.inference(images_placeholder,
                                    FLAGS.hidden1,
                                    FLAGS.hidden2)
        # 损失函数
        loss = mnist.loss(logits, labels_placeholder)
        train_op = mnist.training(loss, FLAGS.learning_rate)
        eval_correct = mnist.evaluation(logits, labels_placeholder)

        # 汇聚tensor
        summary = tf.summary.merge_all()
        # 建立初始化机制
        init = tf.global_variables_initializer()
        # 建立保存机制
        saver = tf.train.Saver()
        # 建立Session
        sess = tf.Session()

        # 建立一个SummaryWriter输出汇聚的tensor
        summary_writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)

        # 开始执行

        # 执行变量
        sess.run(init)

        # 开始训练,2000次循环
        for step in xrange(FLAGS.max_steps):
            start_time = time.time()

            #获取当次循环的数据
            feed_dict = fill_feed_dict(data_sets.train,
                                        images_placeholder,
                                        labels_placeholder)

            # 丢弃了train数据, 记录loss数据, sess.run(train_op)是最简单的训练代码
            # run(self, fetches, feed_dict=None, options=None, run_metadata=None)
            # fetches格式很自由,详情见help(tf.Session.run)
            _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)
            duration = time.time() - start_time

            # 每训练100次输出当前损失,并记录数据
            if step % 100 == 0:
                print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value, duration))
                summary_str = sess.run(summary, feed_dict=feed_dict)
                summary_writer.add_summary(summary_str, step)
                summary_writer.flush()

            # 每1000次测试模型
            if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_file = os.path.join(FLAGS.log_dir, 'model.ckpt')
                saver.save(sess, checkpoint_file, global_step=step)
                # Evaluate against the training set.
                print('Training Data Eval:')
                do_eval(sess,
                        eval_correct,
                        images_placeholder,
                        labels_placeholder,
                        data_sets.train)
                # Evaluate against the validation set.
                print('Validation Data Eval:')
                do_eval(sess,
                        eval_correct,
                        images_placeholder,
                        labels_placeholder,
                        data_sets.validation)
                # Evaluate against the test set.
                print('Test Data Eval:')
                do_eval(sess,
                        eval_correct,
                        images_placeholder,
                        labels_placeholder,
                        data_sets.test)
Esempio n. 24
0
def run_training():
    """
    Train MNIST for a number of steps
    """

    data_sets = input_data.read_data_sets(FLAGS.train_dir, FLAGS.fake_data)

    with tf.Graph().as_default():
        images_placeholder, labels_placeholder = placeholder_inputs(FLAGS.batch_size)
        logits = mnist.inference(
            images_placeholder,
            FLAGS.hidden1,
            FLAGS.hidden2
        )
        loss = mnist.loss(logits, labels_placeholder)
        train_op = mnist.training(loss, FLAGS.learning_rate)
        eval_correct = mnist.evaluation(logits, labels_placeholder)
        
        summary_op = tf.merge_all_summaries()

        saver = tf.train.Saver()

        sess = tf.Session()

        init = tf.initialize_all_variables()
        sess.run(init)

        summary_writer = tf.training.summary_io.SummaryWriter(
            FLAGS.train_dir,
            graph_def = sess.graph_def
        )

        for step in xrange(FLAGS.max_steps):
            start_time = time.time()

            feed_dict = fill_feed_dict(
                data_sets.train,
                images_placeholder,
                labels_placeholder
            )

            _, loss_value = sess.run([train_op, loss], feed_dict = feed_dict)

            duration = time.time() - start_time

            if step % 100 == 0:
                print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value, duration))
                summary_str = sess.run(summary_op, feed_dict=feed_dict)
                summary_writer.add_summary(summary_str, step)

            if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                saver.save(sess, FLAGS.train_dir, global_step = step)

                print('Evaluating on training data...')
                do_eval(
                    sess,
                    eval_correct,
                    images_placeholder,
                    labels_placeholder,
                    data_sets.train
                )

                print('Evaluating on validation data...')
                do_eval(
                    sess,
                    eval_correct,
                    images_placeholder,
                    labels_placeholder,
                    data_sets.validation
                )

                print('Evaluating on test data...')
                do_eval(
                    sess,
                    eval_correct,
                    images_placeholder,
                    labels_placeholder,
                    data_sets.test
                )
Esempio n. 25
0
def tower_loss(scope):
    images, labels = mnist_inputs(min_after_dequeue=MIN_AFTER_DEQUEUE)
    logits = mnist.inference(images)
    _ = mnist.loss(logits, labels)
    loss = tf.get_collection("loss", scope=scope)[0]
    return loss
Esempio n. 26
0
def run_training():
    """
    TRAIN MNIST for a number of steps
    """
    # Get the sets of images and labels for training, validation, and
    # test on MNIST
    data_sets = input_data.read_data_sets(FLAGS.input_data_dir,
                                          FLAGS.fake_data)

    # Tell TensorFlow that the model will be built into the default Graph
    with tf.Graph().as_default():
        # Generate placeholders for the images and labels
        images_placeholder, labels_placeholder = placeholder_inputs(
            FLAGS.batch_size)

        # Build a Graph that computes predictions from the inference model
        logits = mnist.inference(images_placeholder, FLAGS.hidden1,
                                 FLAGS.hidden2)

        # Add to the Graph the Ops for loss calculation
        loss = mnist.loss(logits, labels_placeholder)

        # Add to the Graph the Ops that calculate and apply gradients
        train_op = mnist.training(loss, FLAGS.learning_rate)

        # Add the Op to compare the logits to the labels during evaluation
        eval_correct = mnist.evaluation(logits, labels_placeholder)

        # Build the summary Tensor based on the TF collection of Summaries
        summary = tf.summary.merge_all()

        # Add the variable initializer Op
        init = tf.global_variables_initializer()

        # Create a saver for writing training checkpoints
        saver = tf.train.Saver()
        # Create a session for running Ops on the Graph
        sess = tf.Session()
        # Instantiate a SummaryWriter to output summaries and the Graph
        summary_writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)

        # And then after everything is built:

        # Run the Op to initialize the variables
        sess.run(init)

        # Start the training loop
        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            """
            TODO: Add progress bar

            """
            # Fill a feed dictionary with the actual set of images and labels
            # for t his particular training step
            feed_dict = fill_feed_dict(data_sets.train, images_placeholder,
                                       labels_placeholder)
            # Run one step of the model. The return values are the activations
            # from the 'train_op' (which is discarded) and the 'loss' Op.
            # To inspect the values of your Ops or variables, you m ay include
            # them in the list passed to sess.run() and the value tensors
            # will be returned in the tuple from the calculate
            _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)

            duration = time.time() - start_time

            # Write the summaries and print an overview fairly often
            if step % 100 == 0:
                # Print status to stdout
                print('Step %d: loss = %.2f (%.3f)' %
                      (step, loss_value, duration))
                #Update the events file
                summary_str = sess.run(summary, feed_dict=feed_dict)
                summary_writer.add_summary(summary_str, step)
                summary_writer.flush()

            # Save a checkpoint and evaluate the model periodically
            if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_file = os.path.join(FLAGS.log_dir, 'model.ckpt')
                saver.save(sess, checkpoint_file, global_step=step)
                # Evaluate against the training set
                print('Training Data Eval:')
                do_eval(sess, eval_correct, images_placeholder,
                        labels_placeholder, data_sets.train)
                # Evaluate against the validationset
                print('Validation Data Eval:')
                do_eval(sess, eval_correct, images_placeholder,
                        labels_placeholder, data_sets.validation)
                # Evaluate against the test set
                print('Test Data Eval:')
                do_eval(sess, eval_correct, images_placeholder,
                        labels_placeholder, data_sets.test)
Esempio n. 27
0
def run_training():
    """Train MNIST for a number of steps."""
    # Get the sets of images and labels for training, validation, and test on MNIST.
    data_sets = aymericdamien.input_data.read_data_sets(FLAGS.train_dir, FLAGS.fake_data)

    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Graph().as_default():
        # Generate placeholders for the images and labels
        images_placeholder, labels_placeholder = placeholder_inputs(FLAGS.batch_size)

        # Build a graph the Ops for loss calculation
        logits = mnist.inference(
            images_placeholder,
            FLAGS.hidden1,
            FLAGS.hidden2
        )

        # Add to the graph ops for loss calculation
        loss = mnist.loss(logits, labels_placeholder)

        # Add to the Graph the Ops that calculate and apply gradients.
        train_op = mnist.training(loss, FLAGS.learning_rate)

        # Add the Op to compare the logits to the labels during evaluation
        eval_correct = mnist.evaluation(logits, labels_placeholder)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        # Create a saver for writing training checkpoints
        saver = tf.train.Saver()

        # create a session for running ops on the graph
        session = tf.Session()

        # Run the Op to initialize the variables
        init = tf.initialize_all_variables()
        session.run(init)

        # Instantiate a SummaryWriter to output summaries and the Graph
        summary_writer = tf.train.SummaryWriter(
            FLAGS.train_dir,
            graph_def=session.graph_def
        )

        # After everything is built start the training loop
        for step in xrange(FLAGS.max_steps):
            start_time = time()

            # Fill the feed dictionary with the actual set of images and labels
            # for this training step
            feed_dict = fill_feed_dict(
                data_sets.train,
                images_placeholder,
                labels_placeholder
            )

            # run one step of the model, return values are the activations from
            # the 'train_op' (which is discarded) and the 'loss' Op.  To inspect
            # the values of your Ops or Variables, you may include them in the list
            # passed to the session.Run() and the value tensors will be returned in
            # the tuple from the call
            _, loss_value = session.run(
                [train_op, loss],
                feed_dict=feed_dict
            )

            duration = time() - start_time

            if step % 100 == 0:
                # print status update
                print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value, duration))
                # Update the events file.
                summary_str = session.run(summary_op, feed_dict=feed_dict)
                summary_writer.add_summary(summary_str, step)

            # Save a checkpoint and evaluate the model periodically
            if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                saver.save(session, FLAGS.train_dir, global_step=step)
                # Evaluate against the training set.
                print('Training Data Eval:')
                do_evaluation(
                    session,
                    eval_correct,
                    images_placeholder,
                    labels_placeholder,
                    data_sets.train)

                # Evaluate against the validation set.
                print('Validation Data Eval:')
                do_evaluation(
                    session,
                    eval_correct,
                    images_placeholder,
                    labels_placeholder,
                    data_sets.validation)
                # Evaluate against the test set.
                print('Test Data Eval:')
                do_evaluation(
                    session,
                    eval_correct,
                    images_placeholder,
                    labels_placeholder,
                    data_sets.test)
def run_training():
    # Get the sets of images and labels for training, validation, and test on MNIST.
    data_sets = input_data.read_data_sets(FLAGS.input_data_dir,
                                          FLAGS.fake_data)

    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Graph().as_default():
        image_placeholder, labels_placeholder = placeholder_inputs(
            FLAGS.batch_size, mnist.IMAGE_PIEXLS)

        logits = mnist.inference(image_placeholder, FLAGS.hidden_unt1,
                                 FLAGS.hidden_unt2)

        loss = mnist.loss(logits, labels_placeholder)

        train_op = mnist.training(loss, learning_rate=FLAGS.lr)

        eval_correct = mnist.evalution(logits, labels_placeholder)

        summary = tf.summary.merge_all()

        # Add the variable initializer Op.
        init = tf.global_variables_initializer()

        # Create a saver for writing training checkpoints.
        saver = tf.train.Saver()

        with tf.Session() as sess:
            # Instantiate a SummaryWriter to output summaries and the Graph.
            summary_writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)

            # Run the Op to initialize the variables.
            sess.run(init)

            feed_dict = fill_feed_dict(data_sets.train, image_placeholder,
                                       labels_placeholder)

            for step in xrange(FLAGS.max_steps):
                start_time = time.time()

                _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)

                duration = time.time() - start_time

                if step % 100 == 0:
                    # Print status to stdout.
                    print('Step %d: loss = %.2f (%.3f sec)' %
                          (step, loss_value, duration))
                    # Update the events file.
                    summary_str = sess.run(summary, feed_dict=feed_dict)
                    summary_writer.add_summary(summary_str, step)
                    summary_writer.flush()

                # Save a checkpoint and evaluate the model periodically.
                if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                    checkpoint_file = os.path.join(FLAGS.log_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_file, global_step=step)
                    # Evaluate against the training set.
                    print('Training Data Eval:')
                    do_eval(sess, eval_correct, image_placeholder,
                            labels_placeholder, data_sets.train)
                    # Evaluate against the validation set.
                    print('Validation Data Eval:')
                    do_eval(sess, eval_correct, image_placeholder,
                            labels_placeholder, data_sets.validation)
                    # Evaluate against the test set.
                    print('Test Data Eval:')
                    do_eval(sess, eval_correct, image_placeholder,
                            labels_placeholder, data_sets.test)
Esempio n. 29
0
def run_training(learning_rate=FLAGS.learning_rate,
        momentum=FLAGS.momentum,
        max_norm=FLAGS.max_norm,
        weight_decay=FLAGS.weight_decay,
        keep_prob=FLAGS.keep_prob,
        keep_input=FLAGS.keep_input,
        beta2=FLAGS.beta2,
        num_layers=FLAGS.num_layers):
  """Train MNIST for a number of steps."""
  # Get the sets of images and labels for training, validation, and
  # test on MNIST.
  data_sets = input_data.read_data_sets(FLAGS.train_dir, FLAGS.fake_data)

  # Tell TensorFlow that the model will be built into the default Graph.
  with tf.Graph().as_default():
    # Generate placeholders for the images and labels.
    images_placeholder = tf.placeholder(tf.float32, shape=(None,
                                                         mnist.IMAGE_PIXELS), name='images')
    labels_placeholder = tf.placeholder(tf.int32, shape=[None], name='labels')

    keep_prob_pl = tf.placeholder(tf.float32, name='keep_prob_pl')
    keep_input_pl = tf.placeholder(tf.float32, name='keep_input_pl')
    learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate_pl')
    
    def fill_feed_dict(data_set, batch_size=FLAGS.batch_size):
      # Create the feed_dict for the placeholders filled with the next
      # `batch size ` examples.
      images_feed, labels_feed = data_set.next_batch(batch_size,
                                                     FLAGS.fake_data)
      feed_dict = {
          images_placeholder: images_feed,
          labels_placeholder: labels_feed,
          keep_prob_pl: keep_prob,
          keep_input_pl: keep_input,
          learning_rate_pl: learning_rate
      }
      return feed_dict
    
    def fill_feed_dict_eval(data_set):
      return {
        images_placeholder: data_set._images,
        labels_placeholder: data_set._labels,
        keep_prob_pl: 1.0,
        keep_input_pl: 1.0,
      }

    # Build a Graph that computes predictions from the inference model.
    with tf.variable_scope('feed_forward_model') as scope:
      logits, bn = mnist.inference(images_placeholder,
                         FLAGS.hidden1,
                         num_layers,
                         weight_decay,
                         keep_prob_pl,
                         keep_input_pl,
                         max_norm)
                     
    # Add to the Graph the Ops for loss calculation.
    loss = mnist.loss(logits, labels_placeholder)
    #loss_eval = mnist.loss( logits_eval, labels_placeholder)
    # Add to the Graph the Ops that calculate and apply gradients.
    train_op = mnist.training(loss, learning_rate_pl, momentum, beta2)
    
    with tf.control_dependencies([train_op]):
      train_op = tf.group(*[b.get_assigner() for b in bn])           
    # Add the Op to compare the logits to the labels during evaluation.
    eval_correct = mnist.evaluation(logits, labels_placeholder)
    results = tf.placeholder( tf.float32, [4])

    summarize_evaluation = tf.scalar_summary(['correct_train', 'loss_train', 'correct_test', 'loss_test'], results)
    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()
  
    # Create a saver for writing training checkpoints.
    saver = tf.train.Saver(max_to_keep=2)

    train_loss = test_loss = 0
    train_cor = test_cor = 0.97
    previous_test_loss = None

    first_step = 0
    # Create a session for running Ops on the Graph.
    sess = tf.Session()

    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph_def)
    restore_path = tf.train.latest_checkpoint("/Users/mikowals/projects/mnist")
    if restore_path:
      saver.restore(sess, restore_path)
      first_step = int(restore_path.split('/')[-1].split('-')[-1])
      print('retored variables from ',  restore_path)
    else:
      # Run the Op to initialize the variables.
      print('initializing variables')
      init = tf.initialize_all_variables()
      sess.run(init)

    # And then after everything is built, start the training loop.
    for step in range(first_step,FLAGS.max_steps):
      start_time = time.time()

      # Fill a feed dictionary with the actual set of images and labels
      # for this particular training step.
      feed_dict = fill_feed_dict(data_sets.train)
      
      # Run one step of the model.  The return values are the activations
      # from the `train_op` (which is discarded) and the `loss` Op.  To
      # inspect the values of your Ops or variables, you may include them
      # in the list passed to sess.run() and the value tensors will be
      # returned in the tuple from the call.
      _, loss_value = sess.run([train_op, loss],
                               feed_dict=feed_dict)
      
      duration = time.time() - start_time
      # Write the summaries and print an overview fairly often.
      
      # Save a checkpoint and evaluate the model periodically.
      if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
        saver.save(sess, FLAGS.train_dir, global_step=step)
        # Evaluate against the training set.
        
        # Evaluate against the validation set.
        print('training Data Eval:')
        feed_dict = fill_feed_dict_eval(data_sets.train)
        train_cor, train_loss = sess.run([eval_correct, loss], feed_dict=feed_dict)
        train_cor = train_cor / data_sets.train.num_examples
        print(train_cor, train_loss)
  
        print('Validation Data Eval:')
        feed_dict = fill_feed_dict_eval(data_sets.validation)
        test_cor, test_loss = sess.run([eval_correct, loss], feed_dict=feed_dict)
        test_cor = test_cor / data_sets.validation.num_examples
        print (test_cor, test_loss )
        #if previous_test_loss and test_loss > previous_test_loss:
        #  learning_rate = learning_rate * 0.6
        #if previous_test_loss and test_loss < previous_test_loss:
        #  learning_rate = learning_rate * 1.02
        #previous_test_loss = test_loss
        

      if step > 1000 and step % 100 == 0:
        # Print status to stdout.
        print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value, duration))
        # Update the events file.
        feed_dict[results] = [
          train_cor, 
          train_loss, 
          test_cor, 
          test_loss]
        summary_str = sess.run(summary_op, feed_dict=feed_dict)
        summary_writer.add_summary(summary_str, step)

  return -test_cor 
Esempio n. 30
0
def train_model():
    """
	This method deals with training the model in mnist-model.py by running optimization over a number of steps.
	"""

    #read input da(ta first
    #read_data_sets() function will ensure that the correct data has been downloaded to your local training folder
    #and then unpack that data to return a dictionary of DataSet instances.
    #FLAGS.fake_data can be ignored as it is used-for unit testing purposes
    print("Training Started! ")
    data = input_data.read_data_sets(FLAGS.input_data_dir, FLAGS.fake_data)

    #We need to specify that our model will be used with the Default global Tensor Flow graph.
    #A default global TF graph tf.Graph() is a collection of ops that may be executed as a group
    with tf.Graph().as_default():
        # Generate placeholders for the images and labels.
        images_ph, labels_ph = input_placeholders(FLAGS.batch_size)

        # Build a Graph that computes predictions from the inference model.
        logits = mnist.inference(images_ph, FLAGS.num_hidden1_nodes,
                                 FLAGS.num_hidden1_nodes)

        # Add the loss calculation op to the default Graph
        loss = mnist.loss(logits, labels_ph)

        # Add the minimization op the Graph
        train_op = mnist.training(loss, FLAGS.eta)

        # Add the Evaluation to test predictions to the Graph
        eval_correct = mnist.evaluation(logits, labels_ph)

        # Build the summary Tensor based on the TF collection of Summaries.
        summary = tf.summary.merge_all()

        # Add the variable initializer Op. to the Graph
        init = tf.global_variables_initializer()

        #Create a Tensor Flow Saver for writing Training Checkpoints
        saver = tf.train.Saver()

        #Now once all the build preparation is completed and all the ops are added to the Graph,
        #we need to create a Session in order to run the computaional Graph
        sess = tf.Session()

        #We also need to create a TF Summary Writer in order to record all the summaries and the Graph
        summary_writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)

        #Now all the required Ops are attached to the Default Graph and all is built,
        #Start the session by initialing all the TF variables.
        sess.run(init)

        #Start the Training
        for step in xrange(FLAGS.max_steps):
            start_time = time.time()

            #Update the feed_dict with the next batch of samples to train  with.
            feed_dict = fill_feed_dictionary(data.train, images_ph, labels_ph)

            #Run one step of the training by running Ops train_op and loss
            #No need to store the activations returned by the train_op minimization step
            _, loss_val = sess.run([train_op, loss], feed_dict=feed_dict)

            duration = time.time() - start_time

            #Record all the training summaries generates and print the training progress/statistics
            #after every 100th iteration
            if step % 100 == 0:
                # Print status to stdout.
                print('Step %d: loss = %.2f (%.3f sec)' %
                      (step, loss_val, duration))

                #Now we need to update the events file with summaries
                #Run the summary Op attached to the Graph
                #Everytime the summary is evaluated, new summaries are written into the events files
                summary_str = sess.run(summary, feed_dict=feed_dict)
                summary_writer.add_summary(summary_str, step)
                summary_writer.flush()

#Save the Model at every 1000th iteration and perform evaluation on complete data
            if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_file = os.path.join(FLAGS.log_dir, 'model.ckpt')
                saver.save(sess, checkpoint_file, global_step=step)

#Evaluate against the training set
            print('Training Data Evaluation:')
            do_eval(sess, eval_correct, images_ph, labels_ph, data.train)
            print('Validation Data Evaluation:')
            do_eval(sess, eval_correct, images_ph, labels_ph, data.validation)
            # Evaluate against the test set.
            print('Test Data Evaluation:')
            do_eval(sess, eval_correct, images_ph, labels_ph, data.test)
Esempio n. 31
0
def run_training():
    """Train MNIST for a number of steps."""
    # Get the sets of images and labels for training, validation, and
    # test on MNIST.
    data_sets = input_data.read_data_sets(FLAGS.train_dir, FLAGS.fake_data)

    # labels and images are properties of the DataSet class, which is
    # defined at tensorflow/contrib/learn/python/learn/datasets/mnist.py
    # numpy.savetxt("/tmp/xx.csv", data_sets.train.labels, delimiter=",")
    # numpy.savetxt("/tmp/yy.csv", data_sets.test.images, delimiter=",")
    # sys.exit(1)

    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Graph().as_default():
        # Generate placeholders for the images and labels.
        images_placeholder, labels_placeholder = placeholder_inputs(FLAGS.batch_size)

        # Build a Graph that computes predictions from the inference model.
        logits = mnist.inference(images_placeholder, FLAGS.hidden1, FLAGS.hidden2)

        # Add to the Graph the Ops for loss calculation.
        loss = mnist.loss(logits, labels_placeholder)

        # Add to the Graph the Ops that calculate and apply gradients.
        train_op = mnist.training(loss, FLAGS.learning_rate)

        # Add the Op to compare the logits to the labels during evaluation.
        eval_correct = mnist.evaluation(logits, labels_placeholder)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        # Add the variable initializer Op.
        init = tf.initialize_all_variables()

        # Create a saver for writing training checkpoints.
        saver = tf.train.Saver()

        # Create a session for running Ops on the Graph.
        sess = tf.Session()

        # Instantiate a SummaryWriter to output summaries and the Graph.
        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        # And then after everything is built:

        # Run the Op to initialize the variables.
        sess.run(init)

        # Start the training loop.
        for step in xrange(FLAGS.max_steps):
            start_time = time.time()

            # Fill a feed dictionary with the actual set of images and labels
            # for this particular training step.
            feed_dict = fill_feed_dict(data_sets.train, images_placeholder, labels_placeholder)

            # Run one step of the model.  The return values are the activations
            # from the `train_op` (which is discarded) and the `loss` Op.  To
            # inspect the values of your Ops or variables, you may include them
            # in the list passed to sess.run() and the value tensors will be
            # returned in the tuple from the call.
            _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)

            duration = time.time() - start_time

            # Write the summaries and print an overview fairly often.
            if step % 100 == 0:
                # Print status to stdout.
                print("Step %d: loss = %.2f (%.3f sec)" % (step, loss_value, duration))
                # Update the events file.
                summary_str = sess.run(summary_op, feed_dict=feed_dict)
                summary_writer.add_summary(summary_str, step)
                summary_writer.flush()

            # Save a checkpoint and evaluate the model periodically.
            if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_file = os.path.join(FLAGS.train_dir, "checkpoint")
                saver.save(sess, checkpoint_file, global_step=step)
                # Evaluate against the training set.
                print("Training Data Eval:")
                do_eval(sess, eval_correct, images_placeholder, labels_placeholder, data_sets.train)
                # Evaluate against the validation set.
                print("Validation Data Eval:")
                do_eval(sess, eval_correct, images_placeholder, labels_placeholder, data_sets.validation)
                # Evaluate against the test set.
                print("Test Data Eval:")
                do_eval(sess, eval_correct, images_placeholder, labels_placeholder, data_sets.test)

            if (step + 1) == FLAGS.max_steps:
                float_formatter = lambda x: "%.2f" % x
                numpy.set_printoptions(formatter={"float_kind": float_formatter})
                feed_dict = fill_feed_dict(data_sets.test, images_placeholder, labels_placeholder)
                #  output with softmax
                #  output = sess.run(tf.nn.softmax(logits), feed_dict=feed_dict)
                #  output without softmax
                output = sess.run(tf.argmax(logits, dimension=1), feed_dict=feed_dict)
                numpy.savetxt("/tmp/outputX.csv", output, delimiter=",")
def run_training():
    """Train MNIST for a number of steps."""
    # Get the sets of images and labels for training, validation, and
    # test on MNIST.
    data_sets=reader(patchlength=0,\
              maxlength=300,\
              embedding_size=100,\
              num_verbs=10,\
              allinclude=False,\
              shorten=False,\
              shorten_front=False,\
              testflag=False,\
              passnum=0,\
              dpflag=False)

    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Graph().as_default():
        # Generate placeholders for the images and labels.
        images_placeholder, labels_placeholder = placeholder_inputs(
            FLAGS.batch_size)

        # Build a Graph that computes predictions from the inference model.
        logits, keep_prob = mnist.inference(images_placeholder, FLAGS.hidden1,
                                            FLAGS.hidden2)

        # Add to the Graph the Ops for loss calculation.
        loss = mnist.loss(logits, labels_placeholder)

        # Add to the Graph the Ops that calculate and apply gradients.
        train_op = mnist.training(loss, FLAGS.learning_rate)

        # Add the Op to compare the logits to the labels during evaluation.
        eval_correct = mnist.evaluation(logits, labels_placeholder)

        # Build the summary Tensor based on the TF collection of Summaries.
        summary = tf.summary.merge_all()

        # Add the variable initializer Op.
        init = tf.global_variables_initializer()

        # Create a saver for writing training checkpoints.
        saver = tf.train.Saver()

        # Create a session for running Ops on the Graph.
        sess = tf.Session()

        # Instantiate a SummaryWriter to output summaries and the Graph.
        summary_writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)

        # And then after everything is built:

        # Run the Op to initialize the variables.
        with tf.Session() as session:
            sess.run(init)
            if True:
                model_file = tf.train.latest_checkpoint(FLAGS.log_dir)
                saver.restore(sess, model_file)

            # Start the training loop.
            start_time = time.time()
            for step in xrange(FLAGS.max_steps):

                # Fill a feed dictionary with the actual set of images and labels
                # for this particular training step.

                inputs, answers = data_sets.list_tags(FLAGS.batch_size,
                                                      test=False)
                #      print(len(inputs),len(inputs[0]),inputs[0])
                #      input()
                inputs2 = []
                for i in range(len(inputs)):
                    inputs2.append(inputs[i] / 255)
#      print(len(inputs2),len(inputs2[0]),inputs2[0])
#      input()
                feed_dict = {
                    images_placeholder: inputs2,
                    labels_placeholder: answers,
                    keep_prob: 0.5
                }
                # Run one step of the model.  The return values are the activations
                # from the `train_op` (which is discarded) and the `loss` Op.  To
                # inspect the values of your Ops or variables, you may include them
                # in the list passed to sess.run() and the value tensors will be
                # returned in the tuple from the call.
                _, loss_value, logi = sess.run([train_op, loss, logits],
                                               feed_dict=feed_dict)

                duration = time.time() - start_time

                # Write the summaries and print an overview fairly often.
                if step % 100 == 0:
                    # Print status to stdout.
                    print('Step %d: loss = %.2f (%.3f sec)' %
                          (step, loss_value, duration))
                    #            print(logi)
                    #            print(answers)
                    for i0 in range(FLAGS.batch_size):
                        lgans = np.argmax(logi[i0])
                        if (lgans != answers[i0] and False):
                            for tt in range(784):
                                if (tt % 28 == 0): print(' ')
                                if (inputs[i0][tt] != 0):
                                    print('1', end=' ')
                                else:
                                    print('0', end=' ')


#                      print('np',np.argmax(i),answers,answers[i0],'np')
                            print(lgans, answers[i0])
                    # Update the events file.
                    summary_str = sess.run(summary, feed_dict=feed_dict)
                    summary_writer.add_summary(summary_str, step)
                    summary_writer.flush()
                if (step + 1) % 500 == 0 or (step + 1) == FLAGS.max_steps:
                    #print('Training Data Eval:')
                    do_eval(sess, eval_correct, data_sets, FLAGS.batch_size,
                            images_placeholder, labels_placeholder, keep_prob)
                    do_evalfake(sess, eval_correct, data_sets,
                                FLAGS.batch_size, images_placeholder,
                                labels_placeholder, logits, keep_prob)
                    # Save a checkpoint and evaluate the model periodically.
                    #if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                    checkpoint_file = os.path.join(FLAGS.log_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_file, global_step=step)
                    print('saved to', checkpoint_file)
                '''
def train(target, dataset, cluster_spec):
    """Train Inception on a dataset for a number of steps."""
    # Number of workers and parameter servers are infered from the workers and ps
    # hosts string.
    num_workers = len(cluster_spec.as_dict()['worker'])
    num_parameter_servers = len(cluster_spec.as_dict()['ps'])
    # If no value is given, num_replicas_to_aggregate defaults to be the number of
    # workers.
    if FLAGS.num_replicas_to_aggregate == -1:
        num_replicas_to_aggregate = num_workers
    else:
        num_replicas_to_aggregate = FLAGS.num_replicas_to_aggregate

    # Both should be greater than 0 in a distributed training.
    assert num_workers > 0 and num_parameter_servers > 0, (
        ' num_workers and '
        'num_parameter_servers'
        ' must be > 0.')

    # Choose worker 0 as the chief. Note that any worker could be the chief
    # but there should be only one chief.
    is_chief = (FLAGS.task_id == 0)

    # Ops are assigned to worker by default.
    with tf.device(
            tf.train.replica_device_setter(
                worker_device='/job:worker/task:%d' % FLAGS.task_id,
                cluster=cluster_spec)):

        # Create a variable to count the number of train() calls. This equals the
        # number of updates applied to the variables. The PS holds the global step.
        global_step = tf.Variable(0, name="global_step", trainable=False)

        # Calculate the learning rate schedule.
        num_batches_per_epoch = (dataset.num_examples / FLAGS.batch_size)

        # Decay steps need to be divided by the number of replicas to aggregate.
        # This was the old decay schedule. Don't want this since it decays too fast with a fixed learning rate.
        decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay /
                          num_replicas_to_aggregate)
        # New decay schedule. Decay every few steps.
        #decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay / num_workers)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                        global_step,
                                        decay_steps,
                                        FLAGS.learning_rate_decay_factor,
                                        staircase=True)

        images, labels = mnist.placeholder_inputs(FLAGS.batch_size)

        # Number of classes in the Dataset label set plus 1.
        # Label 0 is reserved for an (unused) background class.
        logits = mnist.inference(images)

        # Add classification loss.
        total_loss = mnist.loss(logits, labels)

        # Create an optimizer that performs gradient descent.
        opt = tf.train.AdamOptimizer(lr)

        # Use V2 optimizer
        if FLAGS.interval_method or FLAGS.worker_times_cdf_method:
            opt = TimeoutReplicasOptimizer(
                opt,
                global_step,
                replicas_to_aggregate=num_replicas_to_aggregate,
                total_num_replicas=num_workers)
        else:
            opt = WeightedGradsOptimizer(
                #      opt = tf.train.SyncReplicasOptimizer(
                opt,
                replicas_to_aggregate=num_replicas_to_aggregate,
                total_num_replicas=num_workers)

        # Compute gradients with respect to the loss.
        grads = opt.compute_gradients(total_loss)

        #===============================================================================================
        #    batch_idx_placeholder = tf.placeholder(dtype=tf.int32, shape=(int(num_workers),))
        #    worker_kill_placeholder = tf.placeholder(dtype=tf.int32, shape=(FLAGS.num_worker_kill,))
        matrix_placeholder = tf.placeholder(dtype=tf.float32,
                                            shape=((int(num_batches_per_epoch),
                                                    int(num_workers))))
        '''
    weight_vec_placeholder = tf.placeholder(dtype=tf.float32,
                                            shape=(num_workers,))
    grad_list = [x[0] for x in grads]
    new_grad_list = []
    for g_idx in range(len(grad_list)):
        grad_on_worker = grad_list[g_idx]
        weight = tf.slice(weight_vec_placeholder, [i], [1])
        new_grad_list.append(tf.mul(grad_on_worker, weight))
    for x_idx in range(len(grads)):
        x = grads[x_idx]
        x[0] = new_grad_list[x_idx]
    '''
        #===============================================================================================

        if FLAGS.interval_method or FLAGS.worker_times_cdf_method:
            #      apply_gradients_op = opt.apply_gradients(grads, FLAGS.task_id, global_step=global_step, collect_cdfs=FLAGS.worker_times_cdf_method,
            #                            batch_idx_list=batch_idx_placeholder, worker_kill_list=worker_kill_placeholder,
            #                            num_workers=int(num_workers), num_batches_per_epoch=int(num_batches_per_epoch))
            apply_gradients_op = opt.apply_gradients(
                grads,
                FLAGS.task_id,
                global_step=global_step,
                collect_cdfs=FLAGS.worker_times_cdf_method,
                matrix_to_solve=matrix_placeholder,
                num_batches_per_epoch=int(num_batches_per_epoch))
        else:
            apply_gradients_op = opt.apply_gradients(grads,
                                                     global_step=global_step)

        with tf.control_dependencies([apply_gradients_op]):
            train_op = tf.identity(total_loss, name='train_op')

        # Get chief queue_runners, init_tokens and clean_up_op, which is used to
        # synchronize replicas.
        # More details can be found in sync_replicas_optimizer.
        chief_queue_runners = [opt.get_chief_queue_runner()]
        init_tokens_op = opt.get_init_tokens_op()
        #clean_up_op = opt.get_clean_up_op()

        # Create a saver.
        saver = tf.train.Saver()

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.summary.merge_all()

        # Build an initialization operation to run below.
        init_op = tf.global_variables_initializer()

        test_print_op = logging_ops.Print(0, [0], message="Test print success")

        # We run the summaries in the same thread as the training operations by
        # passing in None for summary_op to avoid a summary_thread being started.
        # Running summaries and training operations in parallel could run out of
        # GPU memory.
        if is_chief:
            local_init_op = opt.chief_init_op
        else:
            local_init_op = opt.local_step_init_op

        local_init_opt = [local_init_op]
        ready_for_local_init_op = opt.ready_for_local_init_op

        sv = tf.train.Supervisor(
            is_chief=is_chief,
            local_init_op=local_init_op,
            ready_for_local_init_op=ready_for_local_init_op,
            logdir=FLAGS.train_dir,
            init_op=init_op,
            summary_op=None,
            global_step=global_step,
            saver=saver,
            save_model_secs=FLAGS.save_interval_secs)

        tf.logging.info('%s Supervisor' % datetime.now())

        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement)

        # Get a session.
        sess = sv.prepare_or_wait_for_session(target, config=sess_config)

        # Start the queue runners.
        queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
        sv.start_queue_runners(sess, queue_runners)
        tf.logging.info('Started %d queues for processing input data.',
                        len(queue_runners))

        if is_chief:
            if not FLAGS.interval_method or FLAGS.worker_times_cdf_method:
                sv.start_queue_runners(sess, chief_queue_runners)
            sess.run(init_tokens_op)

        # TIMEOUT client overseer.
        # Even if not using timeout, we want to wait until all machines are ready.
        timeout_client, timeout_server = launch_manager(sess, FLAGS)

        # Train, checking for Nans. Concurrently run the summary operation at a
        # specified interval. Note that the summary_op and train_op never run
        # simultaneously in order to prevent running out of GPU memory.
        next_summary_time = time.time() + FLAGS.save_summaries_secs
        begin_time = time.time()
        cur_iteration = -1
        iterations_finished = set()

        if FLAGS.task_id == 0 and FLAGS.interval_method:
            opt.start_interval_updates(sess, timeout_client)

        #the result of normal eqiation waited to be solved like min||Ax - b||^2


#    b = np.ones(int(num_batches_per_epoch))
        interval = np.arange(0, int(num_batches_per_epoch))
        idx_list = np.random.choice(interval, int(num_workers), replace=False)
        while not sv.should_stop():
            sys.stdout.flush()
            tf.logging.info("A new iteration...")

            cur_iteration += 1

            #sess.run([opt._wait_op], options=tf.RunOptions(timeout_in_ms=10000))
            #sess.run([opt._wait_op])
            #sess.run([test_print_op])

            if FLAGS.worker_times_cdf_method:
                sess.run([opt._wait_op])
                timeout_client.broadcast_worker_dequeued_token(cur_iteration)

            start_time = time.time()
            feed_dict = mnist.fill_feed_dict(dataset, images, labels,
                                             FLAGS.batch_size)

            run_options = tf.RunOptions()
            run_metadata = tf.RunMetadata()

            #===============================================================================================
            #      interval_2 = np.arange(0, int(num_workers))
            #      workers_to_kill = np.random.choice(interval_2, FLAGS.num_worker_kill, replace=False)

            LS_start_time = time.time()
            interval_2 = np.arange(0, int(num_workers))
            workers_to_kill = np.random.choice(interval_2,
                                               FLAGS.num_worker_kill,
                                               replace=False)
            #interval_2 = np.arange(0, WORKER_NUM)
            #workers_to_kill = np.random.choice(interval_2, NUM_WORKER_KILL, replace=False)
            A = np.zeros((int(num_workers), int(num_batches_per_epoch)))
            for i in range(A.shape[0]):
                if i == A.shape[0] - 1:
                    A[i][idx_list[i]] = 1
                    A[i][idx_list[0]] = 1
                else:
                    A[i][idx_list[i]] = 1
                    A[i][idx_list[i + 1]] = 1

            for i in range(len(idx_list)):
                element = idx_list[i]
                if element == A.shape[1] - 1:
                    idx_list[i] = 0
                else:
                    idx_list[i] += 1

            for k in workers_to_kill:
                A[k] = 0

            A_for_calc = np.transpose(A)
            #      x = np.dot(np.linalg.pinv(A_for_calc), b)
            #      tf.logging.info("workers killed this iteration:")
            #      tf.logging.info(str(workers_to_kill))
            #  tf.logging.info("The matrix to solve:")
            #  for item in A_for_calc:
            #    tf.logging.info(str(item))
            #      tf.logging.info("Solution of LS:")
            #      tf.logging.info(str(x))
            #      LS_duration = time.time() - LS_start_time
            #      tf.logging.info("LS run time: %s" % str(LS_duration))

            #===============================================================================================

            if FLAGS.timeline_logging:
                run_options.trace_level = tf.RunOptions.FULL_TRACE
                run_options.output_partition_graphs = True

            #timeout_ms = random.randint(300, 1200)
            #tf.logging.info("SETTING TIMEOUT FOR %d ms" % timeout_ms)
            #run_options.timeout_in_ms = 1000 * 60 * 1

            # Increment current iteration
            # Two more tiem in placeholder feed_dict
            feed_dict[matrix_placeholder] = A_for_calc

            tf.logging.info("RUNNING SESSION... %f" % time.time())
            loss_value, step = sess.run([train_op, global_step],
                                        feed_dict=feed_dict,
                                        run_metadata=run_metadata,
                                        options=run_options)
            tf.logging.info("DONE RUNNING SESSION...")

            if FLAGS.worker_times_cdf_method:
                timeout_client.broadcast_worker_finished_computing_gradients(
                    cur_iteration)

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            # Log the elapsed time per iteration
            finish_time = time.time()

            # Create the Timeline object, and write it to a json
            if FLAGS.timeline_logging:
                tl = timeline.Timeline(run_metadata.step_stats)
                ctf = tl.generate_chrome_trace_format()
                with open(
                        '%s/worker=%d_timeline_iter=%d.json' %
                    (FLAGS.train_dir, FLAGS.task_id, step), 'w') as f:
                    f.write(ctf)

            if step > FLAGS.max_steps:
                break

            duration = time.time() - start_time
            examples_per_sec = FLAGS.batch_size / float(duration)
            format_str = ('Worker %d: %s: step %d, loss = %f'
                          '(%.1f examples/sec; %.3f  sec/batch)')
            tf.logging.info(format_str %
                            (FLAGS.task_id, datetime.now(), step, loss_value,
                             examples_per_sec, duration))

            # Determine if the summary_op should be run on the chief worker.
            if is_chief and next_summary_time < time.time(
            ) and FLAGS.should_summarize:

                tf.logging.info('Running Summary operation on the chief.')
                summary_str = sess.run(summary_op)
                sv.summary_computed(sess, summary_str)
                tf.logging.info('Finished running Summary operation.')

                # Determine the next time for running the summary.
                next_summary_time += FLAGS.save_summaries_secs

        if is_chief:
            tf.logging.info('Elapsed Time: %f' % (time.time() - begin_time))

        # Stop the supervisor.  This also waits for service threads to finish.
        sv.stop()

        # Save after the training ends.
        if is_chief:
            saver.save(sess,
                       os.path.join(FLAGS.train_dir, 'model.ckpt'),
                       global_step=global_step)
Esempio n. 34
0
def main(_):
    # build a model
    data = input_data.read_data_sets(FLAGS.data_dir, one_hot=False,
                                     fake_data=False)
    # make placeholders
    images = tf.placeholder(tf.float32, [FLAGS.batch_size, mnist.IMAGE_PIXELS],
                            name='inputs')
    labels = tf.placeholder(tf.int32, [FLAGS.batch_size], name='labels')

    # build model up to inference
    logits = mnist.inference(images, FLAGS.hidden_size, FLAGS.num_layers,
                             do_weightnorm=FLAGS.weightnorm,
                             do_batchnorm=FLAGS.batchnorm,
                             train=True)
    if not FLAGS.batchnorm:
        eval_logits = logits
    else:
        eval_logits = mnist.inference(images, FLAGS.hidden_size, FLAGS.num_layers,
                                      do_weightnorm=FLAGS.weightnorm,
                                      do_batchnorm=FLAGS.batchnorm,
                                      train=False)

    # get a loss function
    loss = mnist.loss(logits, labels, 'train_xent')
    eval_loss = mnist.loss(eval_logits, labels, 'eval_xent')
    # add a sumary of this to track the training loss
   
    # get training ops
    train_op, gstep = mnist.training(loss, FLAGS.learning_rate, FLAGS.momentum)

    # get an op to return precision on a batch
    eval_op = mnist.evaluation(eval_logits, labels)

    valid_var = tf.Variable(0, 'validation performance')
    valid_summ = tf.scalar_summary('validation accuracy', valid_var)

    # get summary op
    summarise = tf.merge_all_summaries()
    with tf.Session() as sess:
        writer = tf.train.SummaryWriter(FLAGS.logdir, sess.graph_def)
        tf.initialize_all_variables().run()
        # do some training
        print('nb: {} steps per epoch'.format(
            data.train.num_examples // FLAGS.batch_size))
        print('Step 0/{}.'.format(FLAGS.max_steps), end='')
        for i in range(FLAGS.max_steps):
            if (i+1) % 5 == 0:
                # write summaries, check on validation set
                if (i+1) % 100 == 0:
                    valid_perf = evaluate(sess,
                                          data.validation,
                                          logits,
                                          [eval_op, eval_loss],
                                          FLAGS.batch_size,
                                          images,
                                          labels,
                                          gstep,
                                          writer)
                    print()
                summ_str, _, _ = sess.run([summarise, loss, train_op],
                                          fill_feed(data.train, images, labels,
                                                    FLAGS.batch_size))
                writer.add_summary(summ_str, gstep.eval(session=sess))
            else:
                # do a step of training
                loss_val, _ = sess.run([loss, train_op],
                                   fill_feed(data.train, images, labels,
                                             FLAGS.batch_size))
                print('\rStep {} (loss {})'.format(i+1, loss_val), end='', flush=True)
        print('\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
        print('Test evaluation:')
        evaluate(sess, data.test, logits, [eval_op, eval_loss], FLAGS.batch_size, images, labels, gstep, writer)
        print('\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
Esempio n. 35
0
def train(dataset, testset):
    """Train on dataset for a number of steps."""
    with tf.Graph().as_default(), tf.device('/gpu:0'):
        # Create a variable to count the number of train() calls. This equals the
        # number of batches processed * FLAGS.num_gpus.
        global_step = tf.Variable(0, name="global_step", trainable=False)

        # Calculate the learning rate schedule.
        num_batches_per_epoch = (dataset.num_examples / FLAGS.batch_size)

        decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                        global_step,
                                        decay_steps,
                                        FLAGS.learning_rate_decay_factor,
                                        staircase=True)

        # Create an optimizer that performs gradient descent.
        opt = tf.train.AdamOptimizer(lr)

        #fetch the data batch from training set
        images, labels = mnist.placeholder_inputs(FLAGS.batch_size)
        logits = mnist.inference(images)

        #calc the loss and gradients
        total_loss = mnist.loss(logits, labels)
        grads = opt.compute_gradients(total_loss)

        # Apply the gradients to adjust the shared variables.
        apply_gradients_op = opt.apply_gradients(grads,
                                                 global_step=global_step)

        with tf.control_dependencies([apply_gradients_op]):
            train_op = tf.identity(total_loss, name='train_op')

        # Create a saver.
        saver = tf.train.Saver(tf.global_variables())

        # Build the summary operation from the last tower summaries.
        summary_op = tf.summary.merge_all()

        # For testing trained model
        test_size = testset.num_examples
        test_images_placeholder, test_labels_placeholder = mnist.placeholder_inputs(
            FLAGS.batch_size)
        #    logits_test = mnist.inference(test_images_placeholder, train=False)
        #pred = mnist.predictions(logits_test)
        validation_accuracy = tf.reduce_sum(mnist.evaluation(
            logits, labels)) / tf.constant(FLAGS.batch_size)
        # Build an initialization operation to run below.
        init = tf.global_variables_initializer()

        # Start running operations on the Graph. allow_soft_placement must be set to
        # True to build towers on GPU, as some of the ops do not have GPU
        # implementations.
        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)
        for step in range(FLAGS.max_steps):
            feed_dict = mnist.fill_feed_dict(dataset, images, labels,
                                             FLAGS.batch_size)
            start_time = time.time()
            _, loss_value, acc = sess.run(
                [train_op, total_loss, validation_accuracy],
                feed_dict=feed_dict)
            #  acc = sess.run(validation_accuracy, feed_dict=feed_dict_test)
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            examples_per_sec = FLAGS.batch_size / float(duration)
            format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                          'sec/batch); acc=%.4f')
            tf.logging.info(format_str % (datetime.now(), step, loss_value,
                                          examples_per_sec, duration, acc))

            if step % 500 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
def run_training():
    """Train MNIST for a number of steps."""

    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Graph().as_default():
        # Input images and labels.
        images, labels = inputs(train=True,
                                batch_size=FLAGS.batch_size,
                                num_epochs=FLAGS.num_epochs)

        # Build a Graph that computes predictions from the inference model.
        logits = mnist.inference(images, FLAGS.hidden1, FLAGS.hidden2)

        # Add to the Graph the loss calculation.
        loss = mnist.loss(logits, labels)
        print('---------------------------- ' + str(logits))

        # Add to the Graph operations that train the model.
        train_op = mnist.training(loss, FLAGS.learning_rate)

        # The op for initializing the variables.
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

        # Create a session for running operations in the Graph.
        sess = tf.Session()

        # Initialize the variables (the trained variables and the
        # epoch counter).
        sess.run(init_op)

        # Start input enqueue threads.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        try:
            step = 0
            while step != 3:
                start_time = time.time()

                # Run one step of the model.  The return values are
                # the activations from the `train_op` (which is
                # discarded) and the `loss` op.  To inspect the values
                # of your ops or variables, you may include them in
                # the list passed to sess.run() and the value tensors
                # will be returned in the tuple from the call.
                _, loss_value = sess.run([train_op, loss])

                #duration = time.time() - start_time

                # Print an overview fairly often.
                #if step % 100 == 0:
                #  print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value,
                #                                             duration))
                step += 1
        except tf.errors.OutOfRangeError:
            print('Done training for %d epochs, %d steps.' %
                  (FLAGS.num_epochs, step))
        finally:
            # When done, ask the threads to stop.
            coord.request_stop()

        # Wait for threads to finish.
        coord.join(threads)
        sess.close()
Esempio n. 37
0
def run_training():
    """Train MNIST for a number of steps."""
    # Get the sets of images and labels for training, validation, and
    # test on MNIST.
    data_sets = input_data.read_data_sets(FLAGS.data_dir, FLAGS.fake_data)

    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Graph().as_default():
        # Generate placeholders for the images and labels.
        images_placeholder, labels_placeholder = placeholder_inputs(
            FLAGS.batch_size)

        # Build a Graph that computes predictions from the inference model.
        logits = mnist.inference(images_placeholder)

        # Add to the Graph the Ops for loss calculation.
        loss = mnist.loss(logits, labels_placeholder)

        # Add to the Graph the Ops that calculate and apply gradients.
        train_op = mnist.training(loss, FLAGS.batch_size)

        # Add the Op to compare the logits to the labels during evaluation.
        eval_correct = mnist.evaluation(logits, labels_placeholder)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.summary.merge_all()

        # Create a saver for writing training checkpoints.
        saver = tf.train.Saver()

        # Create a session for running Ops on the Graph.
        sess = tf.Session()

        # Run the Op to initialize the variables.
        init = tf.global_variables_initializer()
        sess.run(init)

        # Instantiate a SummaryWriter to output summaries and the Graph.
        summary_writer = tf.summary.FileWriter(FLAGS.train_dir,
                                               graph_def=sess.graph_def)

        # And then after everything is built, start the training loop.
        for step in range(FLAGS.max_steps):
            start_time = time.time()

            # Fill a feed dictionary with the actual set of images and labels
            # for this particular training step.
            feed_dict = fill_feed_dict(data_sets.train, images_placeholder,
                                       labels_placeholder)

            # Run one step of the model.  The return values are the activations
            # from the `train_op` (which is discarded) and the `loss` Op.  To
            # inspect the values of your Ops or variables, you may include them
            # in the list passed to sess.run() and the value tensors will be
            # returned in the tuple from the call.
            _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)

            duration = time.time() - start_time

            # Write the summaries and print an overview fairly often.
            if step % 100 == 0:
                # Print status to stdout.
                print('Step %d: loss = %.2f (%.3f sec)' %
                      (step, loss_value, duration))
                # Update the events file.
                summary_str = sess.run(summary_op, feed_dict=feed_dict)
                summary_writer.add_summary(summary_str, step)

            # Save a checkpoint and evaluate the model periodically.
            if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                saver.save(sess, FLAGS.train_dir, global_step=step)
                # Evaluate against the training set.
                print('Training Data Eval:')
                do_eval(sess, eval_correct, images_placeholder,
                        labels_placeholder, data_sets.train)
                # Evaluate against the validation set.
                print('Validation Data Eval:')
                do_eval(sess, eval_correct, images_placeholder,
                        labels_placeholder, data_sets.validation)
                # Evaluate against the test set.
                print('Test Data Eval:')
                do_eval(sess, eval_correct, images_placeholder,
                        labels_placeholder, data_sets.test)
Esempio n. 38
0
def run_training(settings: Settings) -> float:
    tf.gfile.MakeDirs(settings.log_dir)

    data_sets = copy.deepcopy(DATASETS)

    with tf.Graph().as_default():
        images_placeholder, labels_placeholder = placeholder_inputs(
            settings.batch_size, )

        logits = mnist.inference(
            images_placeholder,
            settings.hidden1,
            settings.hidden2,
        )

        loss = mnist.loss(logits, labels_placeholder)

        train_op = mnist.training(loss, settings.learning_rate)

        eval_correct = mnist.evaluation(logits, labels_placeholder)

        summary = tf.summary.merge_all()

        init = tf.global_variables_initializer()

        saver = tf.train.Saver()

        sess = tf.Session()

        summary_writer = tf.summary.FileWriter(settings.log_dir, sess.graph)

        sess.run(init)

        for step in range(settings.max_steps):

            feed_dict = fill_feed_dict(
                data_sets.train,
                images_placeholder,
                labels_placeholder,
                settings,
            )

            _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)

            # Write the summaries and print an overview fairly often.
            if step % 100 == 0:
                # Update the events file.
                summary_str = sess.run(summary, feed_dict=feed_dict)
                summary_writer.add_summary(summary_str, step)
                summary_writer.flush()

            # Save a checkpoint and evaluate the model periodically.
            if (step + 1) % 1000 == 0 or (step + 1) == settings.max_steps:
                checkpoint_file = os.path.join(settings.log_dir, 'model.ckpt')
                saver.save(sess, checkpoint_file, global_step=step)
                # Evaluate against the training set.
                # print('Training Data Eval:')
                do_eval(
                    sess,
                    eval_correct,
                    images_placeholder,
                    labels_placeholder,
                    data_sets.train,
                    settings,
                )
                # Evaluate against the validation set.
                # print('Validation Data Eval:')
                do_eval(
                    sess,
                    eval_correct,
                    images_placeholder,
                    labels_placeholder,
                    data_sets.validation,
                    settings,
                )
                # Evaluate against the test set.
                # print('Test Data Eval:')
                acc = do_eval(
                    sess,
                    eval_correct,
                    images_placeholder,
                    labels_placeholder,
                    data_sets.test,
                    settings,
                )
    return 1 - acc