Example #1
def eval_one_epoch(args, sess, dataset, image_paths_placeholder,
                   labels_placeholder, is_training_placeholder, enqueue_op,
                   clones):
    batch_size = args.batch_size * args.num_gpus
    image_paths, num_per_class = all_val_entities(args, dataset)
    print('eval image paths', len(image_paths))
    nrof_origin_samples = len(image_paths)
    assert (sum(num_per_class) == nrof_origin_samples)
    #print(num_per_class)
    #print(image_paths[0:10])
    assert (args.batch_size % 3 == 0)
    triplet_size = args.batch_size // 3
    _a = int(math.ceil(len(image_paths) / batch_size))
    nrof_samples = _a * batch_size
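    # Pad the path list up to a multiple of the global batch size by repeating
    # the first path; the padded rows are trimmed off again before scoring.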
    while nrof_samples > len(image_paths):
        image_paths.append(image_paths[0])

    #for p in image_paths:
    #  print(p)

    #print('Running forward pass on sampled images: ', end='')
    start_time = time.time()
    nrof_examples = len(image_paths)
    assert (nrof_examples % batch_size == 0)
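    # Group paths and labels into rows of 3 so they match the (3,)-shaped slots
    # of the input FIFO queue; for evaluation these rows are not real triplets.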
    labels_array = np.reshape(np.arange(nrof_examples), (-1, 3))
    image_paths_array = np.reshape(np.expand_dims(np.array(image_paths), 1),
                                   (-1, 3))
    print(image_paths_array.shape)
    print(labels_array.shape)
    sess.run(
        enqueue_op, {
            image_paths_placeholder: image_paths_array,
            labels_placeholder: labels_array
        })
    emb_array = np.zeros((nrof_examples, args.embedding_size),
                         dtype=np.float32)
    nrof_batches = int(np.ceil(nrof_examples / batch_size))
    print('eval batches', nrof_batches)
    for i in xrange(nrof_batches):
        if i % 10 == 0:
            prt('running eval batch %d' % i)
        ops = []
        for clone in clones:
            with tf.device(clone.device):
                embeddings, labels, _ = clone.outputs
            ops += [embeddings, labels]
        ops_value = sess.run(ops, feed_dict={is_training_placeholder: False})
        for k in xrange(args.num_gpus):
            emb = ops_value[k * 2]
            #prt(emb.shape)
            lab = ops_value[k * 2 + 1]
            #prt(lab.shape)
            emb_array[lab, :] = emb
        sys.stdout.flush()
    print('%.3f' % (time.time() - start_time))
    emb_array = emb_array[0:nrof_origin_samples, :]
    score = top1_recall(emb_array, num_per_class)
    print('top1 recall: %f' % score)
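top1_recall is defined elsewhere in the repository and is not shown in this listing. A minimal NumPy sketch of the metric it presumably computes (top-1 nearest-neighbour retrieval with the query itself excluded), assuming the embeddings are laid out class by class as implied by num_per_class; the actual implementation may differ:

import numpy as np

def top1_recall_sketch(emb_array, num_per_class):
    # Hypothetical stand-in for top1_recall(); labels are reconstructed from
    # the consecutive per-class counts in num_per_class.
    labels = np.concatenate(
        [np.full(n, c, dtype=np.int64) for c, n in enumerate(num_per_class)])
    # Squared Euclidean distances between all pairs of embeddings.
    sq_norms = np.sum(np.square(emb_array), axis=1)
    dists = sq_norms[:, None] + sq_norms[None, :] - 2.0 * emb_array.dot(emb_array.T)
    np.fill_diagonal(dists, np.inf)     # never match a query with itself
    nearest = np.argmin(dists, axis=1)  # closest other sample for each query
    return np.mean(labels[nearest] == labels)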
Example #2
def do_training(train_op, init_fn=None, summary_op=None, lr=None):
    global savers
    graph = ops.get_default_graph()
    with graph.as_default():
        global_step = variables.get_or_create_global_step()
        saver = tf_saver.Saver(max_to_keep=0)

        with ops.name_scope('init_ops'):
            init_op = tf_variables.global_variables_initializer()

            ready_op = tf_variables.report_uninitialized_variables()

            local_init_op = control_flow_ops.group(
                tf_variables.local_variables_initializer(),
                data_flow_ops.tables_initializer())

        summary_writer = supervisor.Supervisor.USE_DEFAULT
        with ops.name_scope('train_step'):
            train_step_kwargs = {}

            if FLAGS.max_number_of_steps is not None:
                should_stop_op = math_ops.greater_equal(
                    global_step, FLAGS.max_number_of_steps)
            else:
                should_stop_op = constant_op.constant(False)
            train_step_kwargs['should_stop'] = should_stop_op
            if FLAGS.log_every_n_steps > 0:
                train_step_kwargs['should_log'] = math_ops.equal(
                    math_ops.mod(global_step, FLAGS.log_every_n_steps), 0)
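        # Build one saver per network branch, mapping InceptionV2 checkpoint
        # variable names onto the loc/net and stn<i>/net scopes (Logits excluded).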
        prefix = "loc/net"
        lp = len(prefix)
        vdic = {
            "InceptionV2" + v.op.name[lp:]: v
            for v in tf.trainable_variables()
            if v.name.startswith(prefix) and v.name.find("Logits/") < 0
        }
        _saver = tf_saver.Saver(vdic)
        savers.append(_saver)
        for i in xrange(NUM_STN):
            prefix = "stn%d/net" % i
            lp = len(prefix)
            vdic = {
                "InceptionV2" + v.op.name[lp:]: v
                for v in tf.trainable_variables()
                if v.name.startswith(prefix) and v.name.find("Logits/") < 0
            }
            # saver = tf.train.Saver(vdic)
            _saver = tf_saver.Saver(vdic)
            savers.append(_saver)
    prt("savers %d" % len(savers))

    is_chief = True
    logdir = FLAGS.train_dir

    sv = supervisor.Supervisor(graph=graph,
                               is_chief=is_chief,
                               logdir=logdir,
                               init_op=init_op,
                               init_feed_dict=None,
                               local_init_op=local_init_op,
                               ready_for_local_init_op=None,
                               ready_op=ready_op,
                               summary_op=summary_op,
                               summary_writer=summary_writer,
                               global_step=global_step,
                               saver=saver,
                               save_summaries_secs=FLAGS.save_summaries_secs,
                               save_model_secs=FLAGS.save_interval_secs,
                               init_fn=init_fn)

    if summary_writer is not None:
        train_step_kwargs['summary_writer'] = sv.summary_writer

    with sv.managed_session('', start_standard_services=False,
                            config=None) as sess:
        logging.info('Starting Session.')
        if is_chief:
            if logdir:
                sv.start_standard_services(sess)
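        # Note: is_chief is hardcoded to True above, so this branch never runs;
        # startup_delay_steps and number_of_steps appear to be carried over from
        # slim.learning.train and are not defined in this function.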
        elif startup_delay_steps > 0:
            _wait_for_step(
                sess, global_step,
                min(startup_delay_steps, number_of_steps or sys.maxint))
        sv.start_queue_runners(sess)
        logging.info('Starting Queues.')
        try:
            while not sv.should_stop():
                total_loss, global_step_value, should_stop = train_step(
                    sess, train_op, global_step, lr, train_step_kwargs)
                current_epoch = int(
                    math.ceil(float(global_step_value) / FLAGS.steps_in_epoch))
                if global_step_value > 0 and global_step_value % FLAGS.save_every_n_steps == 0:
                    sv.saver.save(sess,
                                  sv.save_path,
                                  global_step=sv.global_step)

                if should_stop:
                    logging.info('Stopping Training.')
                    break
        except errors.OutOfRangeError:
            # OutOfRangeError is thrown when the epoch limit set by
            # tf.train.limit_epochs is reached.
            logging.info('Caught OutOfRangeError. Stopping Training.')
        if logdir and sv.is_chief:
            logging.info('Finished training! Saving model to disk.')
            sv.saver.save(sess, sv.save_path, global_step=sv.global_step)
Example #3
def train_step(sess, train_op, global_step, lr, train_step_kwargs):
    """Function that takes a gradient step and specifies whether to stop.

    Args:
      sess: The current session.
      train_op: An `Operation` that evaluates the gradients and returns the
        total loss.
      global_step: A `Tensor` representing the global training step.
      lr: A `Tensor` holding the current learning rate; fetched only for logging.
      train_step_kwargs: A dictionary of keyword arguments.

    Returns:
      The total loss, the global step value, and a boolean indicating whether or
      not to stop training.

    Raises:
      ValueError: if 'should_trace' is in `train_step_kwargs` but `logdir` is not.
    """
    start_time = time.time()

    trace_run_options = None
    run_metadata = None
    if 'should_trace' in train_step_kwargs:
        if 'logdir' not in train_step_kwargs:
            raise ValueError(
                'logdir must be present in train_step_kwargs when '
                'should_trace is present')
        if sess.run(train_step_kwargs['should_trace']):
            trace_run_options = config_pb2.RunOptions(
                trace_level=config_pb2.RunOptions.FULL_TRACE)
            run_metadata = config_pb2.RunMetadata()

    total_loss, lr_value, np_global_step = sess.run(
        [train_op, lr, global_step],
        options=trace_run_options,
        run_metadata=run_metadata)
    time_elapsed = time.time() - start_time

    if run_metadata is not None:
        tl = timeline.Timeline(run_metadata.step_stats)
        trace = tl.generate_chrome_trace_format()
        trace_filename = os.path.join(train_step_kwargs['logdir'],
                                      'tf_trace-%d.json' % np_global_step)
        logging.info('Writing trace to %s', trace_filename)
        file_io.write_string_to_file(trace_filename, trace)
        if 'summary_writer' in train_step_kwargs:
            train_step_kwargs['summary_writer'].add_run_metadata(
                run_metadata, 'run_metadata-%d' % np_global_step)

    if 'should_log' in train_step_kwargs:
        if sess.run(train_step_kwargs['should_log']):
            logging.info('global step %d: loss = %.4f (%.3f sec/step)',
                         np_global_step, total_loss, time_elapsed)
            prt('global step %d with lr %.4f: loss = %.4f (%.3f sec/step)' %
                (np_global_step, lr_value, total_loss, time_elapsed))

    # TODO(nsilberman): figure out why we can't put this into sess.run. The
    # issue right now is that the stop check depends on the global step. The
    # increment of the global step often happens via the train op, which is
    # created using optimizer.apply_gradients.
    #
    # Since running `train_op` causes the global step to be incremented, one
    # would expect that using a control dependency would allow the
    # should_stop check to be run in the same session.run call:
    #
    #   with ops.control_dependencies([train_op]):
    #     should_stop_op = ...
    #
    # However, this actually seems not to work on certain platforms.
    if 'should_stop' in train_step_kwargs:
        should_stop = sess.run(train_step_kwargs['should_stop'])
    else:
        should_stop = False

    return total_loss, np_global_step, should_stop
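The tracing branch above is driven entirely by train_step_kwargs; do_training in the previous example only sets should_stop, should_log and summary_writer. A sketch of how a caller could enable it inside the 'train_step' name scope of do_training (trace_every_n_steps is a hypothetical value, not a flag from this repository):

            # Sketch only: dump a Chrome trace every trace_every_n_steps steps.
            trace_every_n_steps = 1000  # hypothetical value
            train_step_kwargs['should_trace'] = math_ops.equal(
                math_ops.mod(global_step, trace_every_n_steps), 0)
            train_step_kwargs['logdir'] = FLAGS.train_dir
            # train_step() then runs the step under RunOptions.FULL_TRACE and
            # writes tf_trace-<step>.json into logdir when should_trace is True.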
Example #4
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import data_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import variables as tf_variables
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.summary import summary
from tensorflow.python.training import optimizer as tf_optimizer
from tensorflow.python.training import saver as tf_saver
from tensorflow.python.training import supervisor
from tensorflow.python.training import sync_replicas_optimizer
from tensorflow.python.training import training_util

slim = tf.contrib.slim
cuda_devices = os.environ['CUDA_VISIBLE_DEVICES']
NUM_GPUS = len(cuda_devices.split(','))
prt("NUM_GPUS %d" % NUM_GPUS)
NUM_CLASSES = 120
NUM_ATTRIBS = 10654
BATCH_PER_GPU = 16
# assert BATCH_SIZE%NUM_GPUS==0
SAVE_EVERY_N_EPOCH = 2
DEFAULT_IMAGE_SIZE = 448
IMAGE_SIZE = DEFAULT_IMAGE_SIZE
if len(sys.argv) > 1:
    IMAGE_SIZE = int(sys.argv[1])

STN_OUT_SIZE = 224

prt("IMAGE_SIZE %d" % IMAGE_SIZE)
INIT_LR = 0.01
LOC_LR = 0.00001
Example #5
def train_one_epoch(args, sess, dataset, image_paths_placeholder,
                    labels_placeholder, is_training_placeholder, enqueue_op,
                    input_queue, clones, loss, train_op, summary_op,
                    summary_writer):
    global_step = variables.get_or_create_global_step()
    step = sess.run(global_step, feed_dict=None)
    epoch = step // args.epoch_size
    batch_number = 0

    lr = args.learning_rate
    batch_size = args.batch_size * args.num_gpus
    while batch_number < args.epoch_size:
        # Sample people randomly from the dataset
        prt('start to sample entities')
        image_paths, num_per_class = sample_entities(args, dataset)
        #print(num_per_class[0:5])
        #prt(len(image_paths))
        #print(num_per_class)
        #print(image_paths[0:10])

        #print('Running forward pass on sampled images: ', end='')
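        # Phase 1: run a forward pass over all sampled images to collect their
        # embeddings, which are used below to mine triplets for training.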
        start_time = time.time()
        nrof_examples = len(image_paths)
        assert (nrof_examples % batch_size == 0)
        labels_array = np.reshape(np.arange(nrof_examples), (-1, 3))
        image_paths_array = np.reshape(
            np.expand_dims(np.array(image_paths), 1), (-1, 3))
        #print(image_paths_array.shape)
        #print(labels_array.shape)
        sess.run(
            enqueue_op, {
                image_paths_placeholder: image_paths_array,
                labels_placeholder: labels_array
            })
        emb_array = np.zeros((nrof_examples, args.embedding_size))
        nrof_batches = int(np.ceil(nrof_examples / args.batch_size))
        embeddings = clones[0].outputs[0]
        label_batch = clones[0].outputs[1]
        #print(nrof_batches)
        for i in xrange(nrof_batches):
            emb, lab = sess.run([embeddings, label_batch],
                                feed_dict={is_training_placeholder: True})
            emb_array[lab, :] = emb
        print('time for fetching all embedding %.3f' %
              (time.time() - start_time))
        #print(emb_array[0:5,0:5])

        # Select triplets based on the embeddings
        print('Selecting suitable triplets for training')
        triplets, triplets_info = select_triplets(args, emb_array,
                                                  num_per_class, image_paths)
        selection_time = time.time() - start_time
        print(
            '(nrof_random_negs, nrof_triplets) = (%d, %d): time=%.3f seconds' %
            (0, len(triplets), selection_time))

        assert len(triplets) > 0
        #post-processing
        assert (args.batch_size % 3 == 0)
        triplet_size = batch_size // 3
        _a = len(triplets) // triplet_size
        nrof_triplets = _a * triplet_size
        triplets = triplets[0:nrof_triplets]
        #post-processing finish

        # Perform training on the selected triplets
        triplet_paths = list(itertools.chain(*triplets))
        nrof_batches = int(np.ceil(nrof_triplets * 3 / batch_size))
        labels_array = np.reshape(np.arange(len(triplet_paths)), (-1, 3))
        triplet_paths_array = np.reshape(
            np.expand_dims(np.array(triplet_paths), 1), (-1, 3))
        sess.run(
            enqueue_op, {
                image_paths_placeholder: triplet_paths_array,
                labels_placeholder: labels_array
            })
        nrof_examples = len(triplet_paths)
        train_time = 0
        i = 0
        emb_array = np.zeros((nrof_examples, args.embedding_size))
        #loss_array = np.zeros((nrof_triplets,))
        prt('nrof_batches: %d' % nrof_batches)
        while i < nrof_batches:
            start_time = time.time()

            #err, _, step, emb, lab = sess.run([loss, train_op, global_step, embeddings, labels_batch], feed_dict={is_training_placeholder:True})
            #emb_array[lab,:] = emb
            #loss_array[i] = err

            err, _, step = sess.run([loss, train_op, global_step],
                                    feed_dict={is_training_placeholder: True})
            duration = time.time() - start_time
            prt('Epoch: [%d][%d@%d/%d]\tTime %.3f\tLoss %2.3f' %
                (epoch, i, batch_number + 1, args.epoch_size, duration, err))
            batch_number += 1
            i += 1
            train_time += duration

        prt('finished one sampling round')
        # Add validation loss and accuracy to summary
        summary = tf.Summary()
        #pylint: disable=maybe-no-member
        summary.value.add(tag='time/selection', simple_value=selection_time)
        summary_writer.add_summary(summary, step)
    return step
Example #6
def main():
    print(args)
    prt('')

    subdir = datetime.strftime(datetime.now(), '%Y%m%d-%H%M%S')
    log_dir = os.path.join(os.path.expanduser(args.logs_base_dir), subdir)
    # Create the log directory if it doesn't exist
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir)
    model_dir = os.path.join(os.path.expanduser(args.models_base_dir), subdir)
    # Create the model directory if it doesn't exist
    if not os.path.isdir(model_dir):
        os.makedirs(model_dir)

    # Path of this source file's directory (originally used to store git
    # revision info in the log directory; that step is not performed here)
    src_path, _ = os.path.split(os.path.realpath(__file__))

    np.random.seed(seed=args.seed)

    print('Model directory: %s' % model_dir)
    print('Log directory: %s' % log_dir)
    if args.pretrained_model:
        print('Pre-trained model: %s' %
              os.path.expanduser(args.pretrained_model))

    with tf.Graph().as_default():
        deploy_config = model_deploy.DeploymentConfig(num_clones=args.num_gpus,
                                                      clone_on_cpu=False)
        tf.set_random_seed(args.seed)
        #global_step = tf.Variable(0, trainable=False)
        global_step = variables.get_or_create_global_step()

        # Placeholder for the learning rate
        #learning_rate_placeholder = tf.placeholder(tf.float32, name='learning_rate')

        #batch_size_placeholder = tf.placeholder(tf.int32, name='batch_size')
        with tf.device('/cpu:0'):
            is_training_placeholder = tf.placeholder(tf.bool,
                                                     name='is_training')
            image_paths_placeholder = tf.placeholder(tf.string,
                                                     shape=(None, 3),
                                                     name='image_paths')
            labels_placeholder = tf.placeholder(tf.int64,
                                                shape=(None, 3),
                                                name='labels')

            input_queue = data_flow_ops.FIFOQueue(capacity=100000,
                                                  dtypes=[tf.string, tf.int64],
                                                  shapes=[(3, ), (3, )],
                                                  shared_name=None,
                                                  name=None)
            enqueue_op = input_queue.enqueue_many(
                [image_paths_placeholder, labels_placeholder])

            nrof_preprocess_threads = 8
            images_and_labels = []
            for _ in range(nrof_preprocess_threads):
                filenames, label = input_queue.dequeue()
                #filenames = tf.Print(filenames, [tf.shape(filenames)], 'filenames shape:')
                images = []
                for filename in tf.unstack(filenames):
                    #filename = tf.Print(filename, [filename], 'filename = ')
                    file_contents = tf.read_file(filename)
                    image = tf.image.decode_jpeg(file_contents)
                    #image = tf.Print(image, [tf.shape(image)], 'data count = ')
                    if image.dtype != tf.float32:
                        image = tf.image.convert_image_dtype(image,
                                                             dtype=tf.float32)
                    if args.random_crop:
                        #image = tf.random_crop(image, [args.image_size, args.image_size, 3])
                        bbox = tf.constant([0.0, 0.0, 1.0, 1.0],
                                           dtype=tf.float32,
                                           shape=[1, 1, 4])
                        sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box(
                            tf.shape(image),
                            bounding_boxes=bbox,
                            area_range=(0.7, 1.0),
                            use_image_if_no_bounding_boxes=True)
                        bbox_begin, bbox_size, distort_bbox = sample_distorted_bounding_box
                        image = tf.slice(image, bbox_begin, bbox_size)
                    #else:
                    #    image = tf.image.resize_image_with_crop_or_pad(image, args.image_size, args.image_size)
                    image = tf.expand_dims(image, 0)
                    image = tf.image.resize_bilinear(
                        image, [args.image_size, args.image_size],
                        align_corners=False)
                    image = tf.squeeze(image, [0])
                    if args.random_flip:
                        image = tf.image.random_flip_left_right(image)
                    image.set_shape((args.image_size, args.image_size, 3))
                    ##pylint: disable=no-member
                    image = tf.subtract(image, 0.5)
                    image = tf.multiply(image, 2.0)
                    #image = tf.Print(image, [tf.shape(image)], 'data count = ')
                    images.append(image)
                    #images.append(tf.image.per_image_standardization(image))
                images_and_labels.append([images, label])

            learning_rate = get_learning_rate(args)
            opt = get_optimizer(args, learning_rate)
            image_batch, label_batch = tf.train.batch_join(
                images_and_labels,
                batch_size=args.batch_size,
                shapes=[(args.image_size, args.image_size, 3), ()],
                enqueue_many=True,
                capacity=4 * nrof_preprocess_threads * args.batch_size,
                allow_smaller_final_batch=False)
            batch_queue = slim.prefetch_queue.prefetch_queue(
                [image_batch, label_batch], capacity=9000)

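        # clone_fn is replicated once per GPU by model_deploy.create_clones below;
        # each clone dequeues its own batch and registers a triplet loss.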
        def clone_fn(_batch_queue):
            _image_batch, _label_batch = _batch_queue.dequeue()
            embeddings = image_to_embedding(_image_batch,
                                            is_training_placeholder, args)

            # Split embeddings into anchor, positive and negative and calculate triplet loss
            anchor, positive, negative = tf.unstack(
                tf.reshape(embeddings, [-1, 3, args.embedding_size]), 3, 1)
            triplet_loss = triplet_loss_fn(anchor, positive, negative,
                                           args.alpha)
            tf.losses.add_loss(triplet_loss)
            #tf.summary.scalar('learning_rate', learning_rate)
            return embeddings, _label_batch, triplet_loss

        clones = model_deploy.create_clones(deploy_config, clone_fn,
                                            [batch_queue])
        first_clone = clones[0]
        triplet_loss = first_clone.outputs[2]
        embeddings = first_clone.outputs[0]
        _label_batch = first_clone.outputs[1]
        #embedding_clones = model_deploy.create_clones(deploy_config, embedding_fn, [batch_queue])

        #first_clone_scope = deploy_config.clone_scope(0)
        #update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope)
        update_ops = []
        with tf.device(deploy_config.optimizer_device()):
            learning_rate = get_learning_rate(args)
            opt = get_optimizer(args, learning_rate)

        total_loss, clones_gradients = model_deploy.optimize_clones(
            clones, opt, var_list=tf.trainable_variables())

        grad_updates = opt.apply_gradients(clones_gradients,
                                           global_step=global_step)
        update_ops.append(grad_updates)

        update_op = tf.group(*update_ops)
        train_op = control_flow_ops.with_dependencies([update_op],
                                                      total_loss,
                                                      name='train_op')

        vdic = [
            v for v in tf.trainable_variables() if v.name.find("Logits/") < 0
        ]
        pretrained_saver = tf.train.Saver(vdic)
        saver = tf.train.Saver(max_to_keep=3)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.summary.merge_all()

        # Start running operations on the Graph.
        #gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_memory_fraction)
        #sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        sess = tf.Session()

        # Initialize variables
        sess.run(tf.global_variables_initializer(),
                 feed_dict={is_training_placeholder: True})
        sess.run(tf.local_variables_initializer(),
                 feed_dict={is_training_placeholder: True})

        summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
        coord = tf.train.Coordinator()
        tf.train.start_queue_runners(coord=coord, sess=sess)

        with sess.as_default():

            if args.pretrained_model:
                print('Restoring pretrained model: %s' % args.pretrained_model)
                pretrained_saver.restore(
                    sess, os.path.expanduser(args.pretrained_model))

            # Training and validation loop
            epoch = 0
            while epoch < args.max_nrof_epochs:
                eval_one_epoch(args, sess, dataset, image_paths_placeholder,
                               labels_placeholder, is_training_placeholder,
                               enqueue_op, clones)
                # Train for one epoch
                train_one_epoch(args, sess, dataset, image_paths_placeholder,
                                labels_placeholder, is_training_placeholder,
                                enqueue_op, input_queue, clones, total_loss,
                                train_op, summary_op, summary_writer)

                # Save variables and the metagraph if it doesn't exist already
                global_step = variables.get_or_create_global_step()
                step = sess.run(global_step, feed_dict=None)
                # Advance the epoch counter so the outer while loop can terminate.
                epoch = step // args.epoch_size
                print('one epoch finished, global step =', step)
                save_variables_and_metagraph(sess, saver, summary_writer,
                                             model_dir, subdir, step)
                print('checkpoint saved')

    sess.close()
    return model_dir
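triplet_loss_fn used inside clone_fn is not shown in this listing. A sketch of the standard FaceNet-style margin loss it presumably computes; the repository's actual implementation may differ:

import tensorflow as tf

def triplet_loss_sketch(anchor, positive, negative, alpha):
    # mean(max(||a - p||^2 - ||a - n||^2 + alpha, 0)) over the batch of triplets.
    pos_dist = tf.reduce_sum(tf.square(tf.subtract(anchor, positive)), 1)
    neg_dist = tf.reduce_sum(tf.square(tf.subtract(anchor, negative)), 1)
    basic_loss = tf.add(tf.subtract(pos_dist, neg_dist), alpha)
    return tf.reduce_mean(tf.maximum(basic_loss, 0.0), 0)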
Example #7
        self.val_key_list = self.key_list[train_count:]

    #def train_key_list():
    #  return self.train_key_list

    #def val_key_list():
    #  return self.val_key_list

    def get_contents(self, car_id):
        return self.data_map[car_id]


NUM_GPUS = 0
dataset = AutoDataset()
dataset.split_train_val(0.7, SEED)
prt("dataset loaded %d %d" %
    (len(dataset.train_key_list), len(dataset.val_key_list)))
BASE_EPOCH_SIZE = 10000


def parse_arguments(argv):
    parser = argparse.ArgumentParser()

    parser.add_argument('--logs_base_dir',
                        type=str,
                        help='Directory where to write event logs.',
                        default='./logs')
    parser.add_argument(
        '--models_base_dir',
        type=str,
        help='Directory where to write trained models and checkpoints.',
        default='./models')