Example #1
def build_optimizer(global_step):
    """Build the CLI-specified optimizer, log the learning rate and enable
    learning rate decay if specified.
    Args:
        global_step: integer tensor, the current training step
    Returns:
        optimizer: tf.Optimizer object initialized
    """
    # Extract the initial learning rate
    initial_lr = float(ARGS.optimizer_args['learning_rate'])

    if ARGS.lr_decay:
        # Decay the learning rate exponentially based on the number of steps.
        steps_per_decay = STEPS_PER_EPOCH * ARGS.lr_decay_epochs
        learning_rate = tf.train.exponential_decay(initial_lr,
                                                   global_step,
                                                   steps_per_decay,
                                                   ARGS.lr_decay_factor,
                                                   staircase=True)
        # Update the learning rate parameter of the optimizer
        ARGS.optimizer_args['learning_rate'] = learning_rate
    else:
        learning_rate = tf.constant(initial_lr)

    # Log the learning rate
    tf_log(tf.summary.scalar('learning_rate', learning_rate))

    # Instantiate the optimizer
    optimizer = getattr(tf.train, ARGS.optimizer)(**ARGS.optimizer_args)
    return optimizer
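
A minimal usage sketch for build_optimizer. The values below are illustrative assumptions, not settings taken from the original project: the function reads the module-level ARGS and STEPS_PER_EPOCH names, and ARGS.optimizer must be the name of an optimizer class in tf.train.

from types import SimpleNamespace

# Hypothetical CLI arguments (assumed values, for illustration only).
ARGS = SimpleNamespace(
    optimizer='GradientDescentOptimizer',
    optimizer_args={'learning_rate': 0.01},
    lr_decay=True,
    lr_decay_epochs=25,
    lr_decay_factor=0.1)
STEPS_PER_EPOCH = 1000  # assumed; normally derived from the dataset size

global_step = tf.Variable(0, trainable=False, name='global_step')
optimizer = build_optimizer(global_step)
# optimizer is now a tf.train.GradientDescentOptimizer whose learning rate is
# multiplied by 0.1 every 25 epochs (25 * 1000 steps).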
Example #2
def log_io(inputs, outputs=None):
    """Log a batch of input (and optionally output) images.
    Args:
        inputs: tensor with shape [batch_size, height, width, depth]
        outputs: if present, must have the same dimensions as inputs
    """
    with tf.variable_scope('visualization'):
        grid_side = math.floor(math.sqrt(ARGS.batch_size))
        inputs = put_kernels_on_grid(
            tf.transpose(inputs, perm=(1, 2, 3, 0))[:, :, :, 0:grid_side**2],
            grid_side)

        if outputs is None:
            tf_log(tf.summary.image('inputs', inputs, max_outputs=1))
            return

        inputs = tf.pad(inputs, [[0, 0], [0, 0], [0, 10], [0, 0]])
        outputs = put_kernels_on_grid(
            tf.transpose(outputs, perm=(1, 2, 3, 0))[:, :, :, 0:grid_side**2],
            grid_side)
        tf_log(
            tf.summary.image('input_output',
                             tf.concat([inputs, outputs], axis=2),
                             max_outputs=1))
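
For reference, a worked example of the grid layout in log_io (the batch size below is an assumption, not a value from the source). put_kernels_on_grid receives the images with the batch dimension moved last, which is why the tensor is transposed from [batch_size, height, width, depth] to [height, width, depth, batch_size] and truncated to the first grid_side**2 images:

# Assumed batch size, for illustration only:
# ARGS.batch_size = 60 -> grid_side = math.floor(math.sqrt(60)) = 7,
# so the first 7**2 = 49 images of the batch are tiled into a 7x7 grid.
# The 10-pixel pad on axis 2 (width) leaves a visible gap between the input
# grid and the output grid in the combined 'input_output' summary image.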
def train():
    """Train model.

    Returns:
        the best validation error. Also saves the best model."""

    best_validation_error_value = float('inf')

    with tf.Graph().as_default(), tf.device(TRAIN_DEVICE):
        global_step = tf.Variable(0, trainable=False, name="global_step")

        # Get images and labels for CIFAR-10.
        images, _ = DATASET.distorted_inputs(BATCH_SIZE)

        # Build a Graph that computes the reconstructions predictions from the
        # inference model.
        is_training_, reconstructions = MODEL.get(images,
                                                  train_phase=True,
                                                  l2_penalty=L2_PENALTY)

        # display original images next to reconstructed images
        with tf.variable_scope("visualization"):
            grid_side = math.floor(math.sqrt(BATCH_SIZE))
            inputs = put_kernels_on_grid(
                tf.transpose(images, perm=(1, 2, 3, 0))[:, :, :,
                                                        0:grid_side**2],
                grid_side)

            outputs = put_kernels_on_grid(
                tf.transpose(reconstructions,
                             perm=(1, 2, 3, 0))[:, :, :, 0:grid_side**2],
                grid_side)
        tf_log(
            tf.summary.image('input_output',
                             tf.concat([inputs, outputs], axis=2),
                             max_outputs=1))

        # Calculate loss.
        loss = MODEL.loss(reconstructions, images)
        # reconstruction error
        error_ = tf.placeholder(tf.float32, shape=())
        error = tf.summary.scalar('error', error_)
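        # error_ is fed with a plain Python float (the training loss or the
        # validation error computed outside the graph), so the same scalar
        # summary op can be written to both the train and the validation log.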

        if LR_DECAY:
            # Decay the learning rate exponentially based on the number of steps.
            learning_rate = tf.train.exponential_decay(INITIAL_LR,
                                                       global_step,
                                                       STEPS_PER_DECAY,
                                                       LR_DECAY_FACTOR,
                                                       staircase=True)
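            # With staircase=True the decayed rate is
            #   INITIAL_LR * LR_DECAY_FACTOR ** floor(global_step / STEPS_PER_DECAY),
            # i.e. the rate drops by LR_DECAY_FACTOR once every STEPS_PER_DECAY
            # steps instead of decaying smoothly at every step.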
        else:
            learning_rate = tf.constant(INITIAL_LR)

        tf_log(tf.summary.scalar('learning_rate', learning_rate))
        train_op = OPTIMIZER.minimize(loss, global_step=global_step)

        # Create the train saver.
        variables = variables_to_save([global_step])
        train_saver = tf.train.Saver(variables, max_to_keep=2)
        # Create the best model saver
        best_saver = tf.train.Saver(variables, max_to_keep=1)

        # Read the collection only after every op has added its own
        # summaries to the train_summaries collection
        train_summaries = tf.summary.merge(
            tf.get_collection_ref(MODEL_SUMMARIES))

        # Build an initialization operation to run below.
        init = tf.variables_initializer(tf.global_variables() +
                                        tf.local_variables())

        # Start running operations on the Graph.
        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True)) as sess:
            sess.run(init)

            # Start the queue runners with a coordinator
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            if not RESTART:  # continue from the saved checkpoint
                # restore previous session if exists
                checkpoint = tf.train.latest_checkpoint(LOG_DIR)
                if checkpoint:
                    train_saver.restore(sess, checkpoint)
                else:
                    print("[I] Unable to restore from checkpoint")

            train_log = tf.summary.FileWriter(os.path.join(
                LOG_DIR, str(InputType.train)),
                                              graph=sess.graph)
            validation_log = tf.summary.FileWriter(os.path.join(
                LOG_DIR, str(InputType.validation)),
                                                   graph=sess.graph)

            # Extract previous global step value
            old_gs = sess.run(global_step)

            # Restart from where we were
            for step in range(old_gs, MAX_STEPS):
                start_time = time.time()
                _, loss_value = sess.run([train_op, loss],
                                         feed_dict={is_training_: True})
                duration = time.time() - start_time

                if np.isnan(loss_value):
                    print('Model diverged with loss = NaN')
                    break

                # update logs every 10 iterations
                if step % 10 == 0:
                    num_examples_per_step = BATCH_SIZE
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = float(duration)

                    format_str = ('{}: step {}, loss = {:.2f} '
                                  '({:.1f} examples/sec; {:.3f} sec/batch)')
                    print(
                        format_str.format(datetime.now(), step, loss_value,
                                          examples_per_sec, sec_per_batch))
                    # log train error and summaries
                    train_error_summary_line, train_summary_line = sess.run(
                        [error, train_summaries],
                        feed_dict={
                            error_: loss_value,
                            is_training_: True
                        })
                    train_log.add_summary(train_error_summary_line,
                                          global_step=step)
                    train_log.add_summary(train_summary_line, global_step=step)

                # Save the model checkpoint at the end of every epoch and
                # evaluate the train and validation performance
                if (step > 0 and step % STEPS_PER_EPOCH
                        == 0) or (step + 1) == MAX_STEPS:
                    checkpoint_path = os.path.join(LOG_DIR, 'model.ckpt')
                    train_saver.save(sess, checkpoint_path, global_step=step)

                    # validation error
                    validation_error_value = evaluate.error(
                        LOG_DIR,
                        MODEL,
                        DATASET,
                        InputType.validation,
                        device=EVAL_DEVICE)

                    summary_line = sess.run(
                        error, feed_dict={error_: validation_error_value})
                    validation_log.add_summary(summary_line, global_step=step)

                    print('{} ({}): train error = {} validation error = {}'.
                          format(datetime.now(), int(step / STEPS_PER_EPOCH),
                                 loss_value, validation_error_value))
                    if validation_error_value < best_validation_error_value:
                        best_validation_error_value = validation_error_value
                        best_saver.save(sess,
                                        os.path.join(BEST_MODEL_DIR,
                                                     'model.ckpt'),
                                        global_step=step)
            # end of for

            validation_log.close()
            train_log.close()

            # When done, ask the threads to stop.
            coord.request_stop()
            # Wait for threads to finish.
            coord.join(threads)
    return best_validation_error_value
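
A minimal way to run the training loop above, assuming the module-level constants (DATASET, MODEL, OPTIMIZER, LOG_DIR and the related hyperparameters) have already been configured elsewhere, e.g. from CLI flags:

if __name__ == '__main__':
    best_error = train()
    print('Best validation error: {:.6f}'.format(best_error))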
Example #4
def classifier():
    """Trains the classifier and saves the best model
    (the one with the highest validation accuracy).
    """

    best_va = 0.0

    with tf.Graph().as_default(), tf.device(ARGS.train_device):
        global_step = tf.Variable(0, trainable=False, name='global_step')

        # Get images and labels
        with tf.device('/cpu:0'):
            images, labels = DATASET.distorted_inputs(ARGS.batch_size)
        log_io(images)

        # Build a Graph that computes the logits predictions from the
        # inference model.
        is_training_, logits = MODEL.get(images,
                                         DATASET.num_classes,
                                         train_phase=True,
                                         l2_penalty=ARGS.l2_penalty)

        # Calculate loss.
        loss = MODEL.loss(logits, labels)
        tf_log(tf.summary.scalar('loss', loss))

        # Create optimizer and log learning rate
        optimizer = build_optimizer(global_step)
        train_op = optimizer.minimize(loss,
                                      global_step=global_step,
                                      var_list=variables_to_train(
                                          ARGS.trainable_scopes))

        train_accuracy = metrics.accuracy_op(logits, labels)
        # General validation summary
        accuracy_value_ = tf.placeholder(tf.float32, shape=())
        accuracy_summary = tf.summary.scalar('accuracy', accuracy_value_)
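        # accuracy_value_ is fed with a plain Python float, so the same
        # summary op can log the validation accuracy (to validation_log) and
        # the train accuracy (to train_log) at each epoch boundary.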

        # Read the collection only after every op has added its own
        # summaries to the train_summaries collection
        train_summaries = tf.summary.merge(
            tf.get_collection_ref(MODEL_SUMMARIES))

        # Build an initialization operation to run below.
        init = [
            tf.variables_initializer(tf.global_variables() +
                                     tf.local_variables()),
            tf.tables_initializer()
        ]

        # Start running operations on the Graph.
        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True)) as sess:
            sess.run(init)

            # Start the queue runners with a coordinator
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            # Create the savers.
            train_saver, best_saver = build_train_savers([global_step])
            restore_or_restart(sess, global_step)
            train_log, validation_log = build_loggers(sess.graph)

            # Extract previous global step value
            old_gs = sess.run(global_step)

            # Restart from where we were
            for step in range(old_gs, MAX_STEPS):
                start_time = time.time()
                _, loss_value = sess.run([train_op, loss],
                                         feed_dict={is_training_: True})

                duration = time.time() - start_time

                if np.isnan(loss_value):
                    print('Model diverged with loss = NaN')
                    break

                # update logs every STEPS_PER_LOG iterations
                if step % STEPS_PER_LOG == 0:
                    examples_per_sec = ARGS.batch_size / duration
                    sec_per_batch = float(duration)

                    format_str = ('{}: step {}, loss = {:.4f} '
                                  '({:.1f} examples/sec; {:.3f} sec/batch)')
                    print(
                        format_str.format(datetime.now(), step, loss_value,
                                          examples_per_sec, sec_per_batch))
                    # log train values
                    summary_lines = sess.run(train_summaries,
                                             feed_dict={is_training_: True})
                    train_log.add_summary(summary_lines, global_step=step)

                # Save the model checkpoint at the end of every epoch and
                # evaluate the train and validation performance
                if (step > 0 and step % STEPS_PER_EPOCH
                        == 0) or (step + 1) == MAX_STEPS:
                    checkpoint_path = os.path.join(LOG_DIR, 'model.ckpt')
                    train_saver.save(sess, checkpoint_path, global_step=step)

                    # validation accuracy
                    va_value = eval_model(LOG_DIR, InputType.validation)

                    summary_line = sess.run(
                        accuracy_summary,
                        feed_dict={accuracy_value_: va_value})
                    validation_log.add_summary(summary_line, global_step=step)

                    # train accuracy
                    ta_value = sess.run(train_accuracy,
                                        feed_dict={is_training_: False})
                    summary_line = sess.run(
                        accuracy_summary,
                        feed_dict={accuracy_value_: ta_value})
                    train_log.add_summary(summary_line, global_step=step)

                    print(
                        '{} ({}): train accuracy = {:.3f} validation accuracy = {:.3f}'
                        .format(datetime.now(), int(step / STEPS_PER_EPOCH),
                                ta_value, va_value))
                    # save best model
                    if va_value > best_va:
                        best_va = va_value
                        best_saver.save(sess,
                                        os.path.join(BEST_MODEL_DIR,
                                                     'model.ckpt'),
                                        global_step=step)
            # end of for

            validation_log.close()
            train_log.close()

            # When done, ask the threads to stop.
            coord.request_stop()
            # Wait for threads to finish.
            coord.join(threads)
Example #5
def detector():
    """Trains the detector and saves the best model
    (the one with the highest IoU).
    """

    best_iou = 0.0

    with tf.Graph().as_default(), tf.device(ARGS.train_device):
        global_step = tf.Variable(0, trainable=False, name='global_step')

        with tf.device('/cpu:0'):
            images, ground_truth = DATASET.distorted_inputs(ARGS.batch_size)

        # Build a Graph that computes the logits predictions from the
        # inference model.
        # predictions has shape: [batch_size, n, m, num_bboxes, 4 + num_classes]
        # 4 = bounding box coordinates
        # n and m are 1 during training, when the input has the shape the
        # network expects
        is_training_, predictions = MODEL.get(images,
                                              DATASET.num_classes,
                                              train_phase=True,
                                              l2_penalty=ARGS.l2_penalty)

        # Calculate loss.
        loss = MODEL.loss(predictions, ground_truth)
        tf_log(tf.summary.scalar('loss', loss))

        # reshape predictions in order to be useful in training
        predictions = tf.squeeze(predictions, axis=[1, 2])
        angle = predictions[:, :1]
        logits = predictions[:, 1:]

        # reshape ground truth in order to be useful in training
        ground_truth = tf.squeeze(ground_truth, axis=[1])
        real_angle = ground_truth[:, :1]
        labels = tf.cast(ground_truth[:, 1], tf.int32)

        # add dimension to real angle, in order to get a tensor with shape:
        # [batch_size, 1=num_bboxes, 1=angle]
        log_io(images)

        # Create optimizer and log learning rate
        optimizer = build_optimizer(global_step)
        train_op = optimizer.minimize(loss,
                                      global_step=global_step,
                                      var_list=variables_to_train(
                                          ARGS.trainable_scopes))

        #iou_value_ = tf.placeholder(tf.float32, shape=())
        #iou_summary = tf.summary.scalar('iou', iou_value_)

        # Train accuracy op
        train_accuracy = metrics.accuracy_op(logits, labels)

        # General validation summary
        accuracy_value_ = tf.placeholder(tf.float32, shape=())
        accuracy_summary = tf.summary.scalar('accuracy', accuracy_value_)

        with tf.variable_scope("angle_distance"):
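            # Wrap-around angular distance in [0, 180] degrees:
            #   180 - |(|real_angle - angle| mod 360) - 180|
            # e.g. real_angle = 350, angle = 10 -> |340 mod 360 - 180| = 160,
            # so the logged distance is 20 degrees, not 340.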
            angle_distance = tf.reduce_mean(
                180. - tf.abs(tf.mod(tf.abs(real_angle - angle), 360.) - 180.))
            tf_log(tf.summary.scalar('angle_distance', angle_distance))

        # Read the collection only after every op has added its own
        # summaries to the train_summaries collection
        train_summaries = tf.summary.merge(
            tf.get_collection_ref(MODEL_SUMMARIES))

        # Build an initialization operation to run below.
        init = [
            tf.variables_initializer(tf.global_variables() +
                                     tf.local_variables()),
            tf.tables_initializer()
        ]

        # Start running operations on the Graph.
        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True)) as sess:
            sess.run(init)

            # Start the queue runners with a coordinator
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            # Create the savers.
            restore_or_restart(sess, global_step)
            train_saver, best_saver = build_train_savers([global_step])
            train_log, validation_log = build_loggers(sess.graph)

            # Extract previous global step value
            old_gs = sess.run(global_step)

            # Restart from where we were
            for step in range(old_gs, MAX_STEPS):
                start_time = time.time()
                _, loss_value = sess.run([train_op, loss],
                                         feed_dict={is_training_: True})

                duration = time.time() - start_time

                if np.isnan(loss_value):
                    print('Model diverged with loss = NaN')
                    break

                # update logs every STEPS_PER_LOG iterations
                if step % STEPS_PER_LOG == 0:
                    examples_per_sec = ARGS.batch_size / duration
                    sec_per_batch = float(duration)

                    format_str = ('{}: step {}, loss = {:.4f} '
                                  '({:.1f} examples/sec; {:.3f} sec/batch)')
                    print(
                        format_str.format(datetime.now(), step, loss_value,
                                          examples_per_sec, sec_per_batch))
                    # log train values
                    summary_lines = sess.run(train_summaries,
                                             feed_dict={is_training_: True})
                    train_log.add_summary(summary_lines, global_step=step)

                # Save the model checkpoint at the end of every epoch and
                # evaluate the train and validation performance
                if (step > 0 and step % STEPS_PER_EPOCH
                        == 0) or (step + 1) == MAX_STEPS:
                    checkpoint_path = os.path.join(LOG_DIR, 'model.ckpt')
                    train_saver.save(sess, checkpoint_path, global_step=step)

                    # train metrics
                    ta_value = sess.run(train_accuracy,
                                        feed_dict={is_training_: False})

                    summary_line = sess.run(
                        accuracy_summary,
                        feed_dict={accuracy_value_: ta_value})
                    train_log.add_summary(summary_line, global_step=step)

                    # TODO: validation metrics

                    print('{} ({}): train acc: {:.3f}'.format(
                        datetime.now(), int(step / STEPS_PER_EPOCH), ta_value))

                    # TODO: save best model
                    #if validation_iou_value > best_iou:
                    #    best_iou = validation_iou_value
                    #    best_saver.save(
                    #        sess,
                    #        os.path.join(BEST_MODEL_DIR, 'model.ckpt'),
                    #        global_step=step)
            # end of for

            validation_log.close()
            train_log.close()

            # When done, ask the threads to stop.
            coord.request_stop()
            # Wait for threads to finish.
            coord.join(threads)