Example #1
def loss_and_accuracy_per_gpu(phase_train, scope='gpu_i'):
    # train/test inputs
    train_image_batch, train_label_batch = m.make_train_batch(
        FLAGS.train_tf_path, FLAGS.train_batch_size)
    val_image_batch, val_label_batch = m.make_validation_batch(
        FLAGS.val_tf_path, FLAGS.val_batch_size)
    image_batch, label_batch = control_flow_ops.cond(
        phase_train,
        lambda: (train_image_batch, train_label_batch),
        lambda: (val_image_batch, val_label_batch))

    # model outputs
    logits = m.residual_net(image_batch, FLAGS.residual_net_n, 10, phase_train)

    # total loss
    m.loss(logits, label_batch)
    loss = tf.add_n(tf.get_collection('losses', scope), name='total_loss')
    accuracy = m.accuracy(logits, label_batch)
    tf.scalar_summary('train_loss/' + scope, loss)
    tf.scalar_summary('train_accuracy/' + scope, accuracy)

    return loss, accuracy, logits
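
The `scope` argument suggests this function is meant to be called once per GPU tower, with `tf.get_collection('losses', scope)` picking up only that tower's losses. A minimal usage sketch under that assumption (same legacy TF 0.x API as the example; `FLAGS.num_gpus` and the surrounding graph setup are hypothetical, and variable sharing between towers is omitted for brevity):

tower_losses = []
for i in range(FLAGS.num_gpus):
    with tf.device('/gpu:%d' % i):
        with tf.name_scope('gpu_%d' % i) as scope:
            # each tower builds its own loss; the scope string keys the
            # per-tower 'losses' collection lookup inside the function
            loss, accuracy, logits = loss_and_accuracy_per_gpu(
                phase_train, scope=scope)
            tower_losses.append(loss)

# average the per-tower losses into one training objective
total_loss = tf.reduce_mean(tf.pack(tower_losses), name='mean_tower_loss')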
Example #2
def loss_and_accuracy_per_gpu(phase_train, scope='gpu_i'):
    # train/test inputs
    train_image_batch, train_label_batch = m.make_train_batch(
        FLAGS.train_tf_path, FLAGS.train_batch_size)
    val_image_batch, val_label_batch = m.make_validation_batch(
        FLAGS.val_tf_path, FLAGS.val_batch_size)
    image_batch, label_batch = control_flow_ops.cond(
        phase_train,
        lambda: (train_image_batch, train_label_batch),
        lambda: (val_image_batch, val_label_batch))

    # model outputs
    logits = m.residual_net(
        image_batch, FLAGS.residual_net_n, 10, phase_train)

    # total loss
    m.loss(logits, label_batch)
    loss = tf.add_n(tf.get_collection('losses', scope), name='total_loss')
    accuracy = m.accuracy(logits, label_batch)
    tf.scalar_summary('train_loss/' + scope, loss)
    tf.scalar_summary('train_accuracy/' + scope, accuracy)

    return loss, accuracy, logits
Example #3
def train_and_val():
    with tf.Graph().as_default():
        # train/test phase indicator
        phase_train = tf.placeholder(tf.bool, name='phase_train')

        # learning rate is manually set
        learning_rate = tf.placeholder(tf.float32, name='learning_rate')

        # global step
        global_step = tf.Variable(0, trainable=False, name='global_step')

        # train/test inputs
        train_image_batch, train_label_batch = m.make_train_batch(FLAGS.train_tf_path, FLAGS.train_batch_size)
        val_image_batch, val_label_batch = m.make_validation_batch(FLAGS.val_tf_path, FLAGS.val_batch_size)
        image_batch, label_batch = control_flow_ops.cond(phase_train,
            lambda: (train_image_batch, train_label_batch),
            lambda: (val_image_batch, val_label_batch))

        # model outputs
        logits = m.residual_net(image_batch, FLAGS.residual_net_n, 10, phase_train)

        # total loss
        loss = m.loss(logits, label_batch)
        accuracy = m.accuracy(logits, label_batch)
        tf.scalar_summary('train_loss', loss)
        tf.scalar_summary('train_accuracy', accuracy)

        # train one step
        train_op = m.train_op(loss, global_step, learning_rate)

        # saver
        saver = tf.train.Saver(tf.all_variables())

        # start session
        sess = tf.Session(config=tf.ConfigProto(log_device_placement=False))

        # summary (histogram summaries must be created before
        # merge_all_summaries(), or they are silently left out of summary_op)
        for var in tf.trainable_variables():
            tf.histogram_summary('params/' + var.op.name, var)
        summary_op = tf.merge_all_summaries()
        summary_writer = tf.train.SummaryWriter(FLAGS.log_dir, graph_def=sess.graph_def)

        # initialization (TODO: or load)
        init_op = tf.initialize_all_variables()
        print('Initializing...')
        sess.run(init_op, {phase_train.name: True})

        # train loop
        tf.train.start_queue_runners(sess=sess)
        curr_lr = 0.0
        lr_scale = 1.0
        for step in xrange(FLAGS.max_steps):
            # set learning rate manually
            if step <= 32000:
                _lr = lr_scale * 1e-1
            elif step <= 48000:
                _lr = lr_scale * 1e-2
            else:
                _lr = lr_scale * 1e-3
            if curr_lr != _lr:
                curr_lr = _lr
                print('Learning rate set to %f' % curr_lr)

            fetches = [train_op, loss]
            if step % FLAGS.summary_interval == 0:
                fetches += [accuracy, summary_op]
            sess_outputs = sess.run(fetches, {phase_train.name: True, learning_rate.name: curr_lr})

            if step % FLAGS.summary_interval == 0:
                train_loss_value, train_acc_value, summary_str = sess_outputs[1:]
                print('[%s] Iteration %d, train loss = %f, train accuracy = %f' %
                    (datetime.now(), step, train_loss_value, train_acc_value))
                summary_writer.add_summary(summary_str, step)

            if step > 0 and step % FLAGS.val_interval == 0:
                print('Evaluating...')
                n_val_samples = 10000
                val_batch_size = FLAGS.val_batch_size
                n_val_batch = int(n_val_samples / val_batch_size)
                val_logits = np.zeros((n_val_samples, 10), dtype=np.float32)
                val_labels = np.zeros((n_val_samples), dtype=np.int64)
                val_losses = []
                for i in xrange(n_val_batch):
                    fetches = [logits, label_batch, loss]
                    session_outputs = sess.run(fetches, {phase_train.name: False})
                    val_logits[i*val_batch_size:(i+1)*val_batch_size,:] = session_outputs[0]
                    val_labels[i*val_batch_size:(i+1)*val_batch_size] = session_outputs[1]
                    val_losses.append(session_outputs[2])
                pred_labels = np.argmax(val_logits, axis=1)
                # float() avoids Python 2 integer division truncating to 0
                val_accuracy = np.count_nonzero(pred_labels == val_labels) / float(n_val_samples)
                val_loss = float(np.mean(np.asarray(val_losses)))
                print('Test accuracy = %f' % val_accuracy)
                val_summary = tf.Summary()
                val_summary.value.add(tag='val_accuracy', simple_value=val_accuracy)
                val_summary.value.add(tag='val_loss', simple_value=val_loss)
                summary_writer.add_summary(val_summary, step)

            if step > 0 and step % FLAGS.save_interval == 0:
                checkpoint_path = os.path.join(FLAGS.log_dir, 'checkpoint')
                saver.save(sess, checkpoint_path, global_step=step)
                print('Checkpoint saved at %s' % checkpoint_path)
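
The inline learning-rate schedule above is a piecewise-constant function of the step; it can be pulled out into a small helper so the loop body reads as `_lr = piecewise_lr(step, lr_scale)`. A sketch with the same breakpoints (the helper name is hypothetical):

def piecewise_lr(step, scale=1.0):
    # mirrors the inline schedule: 1e-1 until step 32000,
    # 1e-2 until step 48000, 1e-3 afterwards
    if step <= 32000:
        return scale * 1e-1
    elif step <= 48000:
        return scale * 1e-2
    return scale * 1e-3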
Example #4
def train_and_val():
    with tf.Graph().as_default():
        # train/test phase indicator
        phase_train = tf.placeholder(tf.bool, name='phase_train')

        # learning rate is manually set
        learning_rate = tf.placeholder(tf.float32, name='learning_rate')
        tf.scalar_summary('learning_rate', learning_rate)

        # global step
        global_step = tf.Variable(0, trainable=False, name='global_step')

        # train/test inputs
        train_image_batch, train_label_batch = m.make_train_batch(
            FLAGS.train_tf_path, FLAGS.train_batch_size)
        val_image_batch, val_label_batch = m.make_validation_batch(
            FLAGS.val_tf_path, FLAGS.val_batch_size)
        image_batch, label_batch = control_flow_ops.cond(
            phase_train, lambda: (train_image_batch, train_label_batch),
            lambda: (val_image_batch, val_label_batch))

        # model outputs
        logits = m.residual_net(image_batch, FLAGS.residual_net_n, 10,
                                phase_train)

        # total loss
        loss = m.loss(logits, label_batch)
        accuracy = m.accuracy(logits, label_batch)
        tf.scalar_summary('train_loss', loss)
        tf.scalar_summary('train_accuracy', accuracy)

        # train one step
        train_op = m.train_op(loss, global_step, learning_rate)

        # saver
        saver = tf.train.Saver(tf.all_variables())

        # start session
        sess = tf.Session(config=tf.ConfigProto(log_device_placement=False))

        # summary writer
        summary_op = tf.merge_all_summaries()
        summary_writer = tf.train.SummaryWriter(FLAGS.log_dir,
                                                graph=sess.graph)

        # initialize parameters or load from a checkpoint
        if FLAGS.load_dir != '':
            # load from checkpoint; check the checkpoint state before
            # touching model_checkpoint_path, which is None when absent
            checkpoint = tf.train.get_checkpoint_state(FLAGS.load_dir)
            if checkpoint and checkpoint.model_checkpoint_path:
                saver.restore(sess, checkpoint.model_checkpoint_path)
                print('Model restored from %s' % checkpoint.model_checkpoint_path)
            else:
                raise ValueError('Load directory provided but no checkpoint found')
        else:
            init_op = tf.initialize_all_variables()
            print('Initializing...')
            sess.run(init_op, {phase_train.name: True})

        print('Start training...')
        # train loop
        tf.train.start_queue_runners(sess=sess)
        curr_lr = 0.0
        for step in xrange(FLAGS.max_steps):
            # # set learning rate manually
            # if step <= 5000:
            #   _lr = 1e-2
            # elif step <= 32000:
            #   _lr = 1e-1
            # elif step <= 48000:
            #   _lr = 1e-2
            # else:
            #   _lr = 1e-3
            # set learning rate manually
            if step <= 48000:
                _lr = 1e-2
            else:
                _lr = 1e-3
            if curr_lr != _lr:
                curr_lr = _lr
                print('Learning rate set to %f' % curr_lr)

            # train
            fetches = [train_op, loss]
            if step > 0 and step % FLAGS.summary_interval == 0:
                fetches += [accuracy, summary_op]
            sess_outputs = sess.run(fetches, {
                phase_train.name: True,
                learning_rate.name: curr_lr
            })

            # summary
            if step > 0 and step % FLAGS.summary_interval == 0:
                train_loss_value, train_acc_value, summary_str = sess_outputs[1:]
                print(
                    '[%s] Iteration %d, train loss = %f, train accuracy = %f' %
                    (datetime.now(), step, train_loss_value, train_acc_value))
                summary_writer.add_summary(summary_str, step)

            # validation
            if step > 0 and step % FLAGS.val_interval == 0:
                print('Evaluating...')
                n_val_samples = 10000
                val_batch_size = FLAGS.val_batch_size
                n_val_batch = int(n_val_samples / val_batch_size)
                val_logits = np.zeros((n_val_samples, 10), dtype=np.float32)
                val_labels = np.zeros((n_val_samples), dtype=np.int64)
                val_losses = []
                for i in xrange(n_val_batch):
                    fetches = [logits, label_batch, loss]
                    session_outputs = sess.run(fetches,
                                               {phase_train.name: False})
                    start = i * val_batch_size
                    end = (i + 1) * val_batch_size
                    val_logits[start:end, :] = session_outputs[0]
                    val_labels[start:end] = session_outputs[1]
                    val_losses.append(session_outputs[2])
                pred_labels = np.argmax(val_logits, axis=1)
                # float() avoids Python 2 integer division truncating to 0
                val_accuracy = np.count_nonzero(
                    pred_labels == val_labels) / float(n_val_samples)
                val_loss = float(np.mean(np.asarray(val_losses)))
                print('Test accuracy = %f' % val_accuracy)
                val_summary = tf.Summary()
                val_summary.value.add(tag='val_accuracy',
                                      simple_value=val_accuracy)
                val_summary.value.add(tag='val_loss', simple_value=val_loss)
                summary_writer.add_summary(val_summary, step)

            # save variables
            if step > 0 and step % FLAGS.save_interval == 0:
                checkpoint_path = os.path.join(FLAGS.log_dir, 'checkpoint')
                saver.save(sess, checkpoint_path, global_step=step)
                print('Checkpoint saved at %s' % checkpoint_path)
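
One thing the restore branch above does not do is resume the step counter, so a resumed run restarts its learning-rate schedule at step 0. Because `saver.save(sess, checkpoint_path, global_step=step)` appends the step to the filename, a common sketch (helper name hypothetical) recovers it:

def step_from_checkpoint_path(model_checkpoint_path):
    # saver.save(..., global_step=step) yields paths like
    # '<log_dir>/checkpoint-48000'; take the trailing number
    return int(model_checkpoint_path.split('-')[-1])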
Example #5
def train_and_val():
    with tf.Graph().as_default():
        # train/test phase indicator
        phase_train = tf.placeholder(tf.bool, name='phase_train')

        # learning rate is manually set
        learning_rate = tf.placeholder(tf.float32, name='learning_rate')

        # global step
        global_step = tf.Variable(0, trainable=False, name='global_step')

        # train/test inputs
        train_image_batch, train_label_batch = m.make_train_batch(
            FLAGS.train_tf_path, FLAGS.train_batch_size)
        val_image_batch, val_label_batch = m.make_validation_batch(
            FLAGS.val_tf_path, FLAGS.val_batch_size)
        image_batch, label_batch = control_flow_ops.cond(
            phase_train, lambda: (train_image_batch, train_label_batch),
            lambda: (val_image_batch, val_label_batch))

        # model outputs
        logits = m.residual_net(image_batch, FLAGS.residual_net_n, 10,
                                phase_train)

        # total loss
        m.loss(logits, label_batch)
        loss = tf.add_n(tf.get_collection('losses'), name='total_loss')
        m.summary_losses()
        accuracy = m.accuracy(logits, label_batch)
        tf.scalar_summary('train_loss', loss)
        tf.scalar_summary('train_accuracy', accuracy)

        # saver
        saver = tf.train.Saver(tf.all_variables())

        # start session
        sess = tf.Session(config=tf.ConfigProto(log_device_placement=False))

        # summary
        for var in tf.trainable_variables():
            tf.histogram_summary('params/' + var.op.name, var)

        init_op = tf.initialize_all_variables()
        if FLAGS.restore_path is None:
            # initialization
            print('Initializing...')
            sess.run(init_op, {phase_train.name: True})
        else:
            # restore from previous checkpoint
            sess.run(init_op, {phase_train.name: True})
            print('Restore variable from %s' % FLAGS.restore_path)
            saver.restore(sess, FLAGS.restore_path)

        # evaluation loop
        tf.train.start_queue_runners(sess=sess)

        n_samples = 10000
        batch_size = FLAGS.val_batch_size
        n_iter = int(np.floor(n_samples / batch_size))
        accuracies = []
        losses = []
        for step in xrange(n_iter):
            fetches = [loss, accuracy]
            val_loss, val_acc = sess.run(fetches, {phase_train.name: False})
            losses.append(val_loss)
            accuracies.append(val_acc)
            print('[%s] Iteration %d, val loss = %f, val accuracy = %f' %
                  (datetime.now(), step, val_loss, val_acc))

        val_acc = np.mean(accuracies)
        val_loss = np.mean(losses)

        print('val loss is %f, accuracy is %f' % (val_loss, val_acc))
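
Averaging per-batch accuracies as above equals the pooled accuracy only because every batch has the same size, which holds here since exactly n_iter * batch_size samples are consumed. A quick numeric check of that identity (numbers are illustrative):

batch_acc = [0.90, 0.80]  # two batches of 100 samples each
pooled = (0.90 * 100 + 0.80 * 100) / 200.0  # accuracy over all 200 samples
assert abs(sum(batch_acc) / len(batch_acc) - pooled) < 1e-9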
Example #6
def train_and_val():
    with tf.Graph().as_default():
        # train/test phase indicator
        phase_train = tf.placeholder(tf.bool, name='phase_train')

        # learning rate is manually set
        learning_rate = tf.placeholder(tf.float32, name='learning_rate')

        # global step
        global_step = tf.Variable(0, trainable=False, name='global_step')

        # train/test inputs
        train_image_batch, train_label_batch = m.make_train_batch(
            FLAGS.train_tf_path, FLAGS.train_batch_size)
        val_image_batch, val_label_batch = m.make_validation_batch(
            FLAGS.val_tf_path, FLAGS.val_batch_size)
        image_batch, label_batch = control_flow_ops.cond(
            phase_train,
            lambda: (train_image_batch, train_label_batch),
            lambda: (val_image_batch, val_label_batch))

        # model outputs
        logits = m.residual_net(
            image_batch, FLAGS.residual_net_n, 10, phase_train)

        # total loss
        m.loss(logits, label_batch)
        loss = tf.add_n(tf.get_collection('losses'), name='total_loss')
        m.summary_losses()
        accuracy = m.accuracy(logits, label_batch)
        tf.scalar_summary('train_loss', loss)
        tf.scalar_summary('train_accuracy', accuracy)

        # saver
        saver = tf.train.Saver(tf.all_variables())

        # start session
        sess = tf.Session(config=tf.ConfigProto(log_device_placement=False))

        # summary
        for var in tf.trainable_variables():
            tf.histogram_summary('params/' + var.op.name, var)

        init_op = tf.initialize_all_variables()
        if FLAGS.restore_path is None:
            # initialization
            print('Initializing...')
            sess.run(init_op, {phase_train.name: True})
        else:
            # restore from previous checkpoint
            sess.run(init_op, {phase_train.name: True})
            print('Restore variable from %s' % FLAGS.restore_path)
            saver.restore(sess, FLAGS.restore_path)

        # evaluation loop
        tf.train.start_queue_runners(sess=sess)

        n_samples = 10000
        batch_size = FLAGS.val_batch_size
        n_iter = int(np.floor(n_samples / batch_size))
        accuracies = []
        losses = []
        for step in xrange(n_iter):
            fetches = [loss, accuracy]
            val_loss, val_acc = sess.run(
                fetches, {phase_train.name: False})
            losses.append(val_loss)
            accuracies.append(val_acc)
            print('[%s] Iteration %d, val loss = %f, val accuracy = %f' %
                  (datetime.now(), step, val_loss, val_acc))

        val_acc = np.mean(accuracies)
        val_loss = np.mean(losses)

        print('val loss is %f, accuracy is %f' % (val_loss, val_acc))