コード例 #1
0
 def __init__(self):
     self.cache_path = os.path.join(config.DATA_DIR, "MSCOCO", "cache",
                                    "cache.pkl")
     self.caption_file_names = {
         "eval": "captions_val2014.json",
         "train": "captions_train2014.json"
     }
     self.image_feature_names = {
         "eval": "eval_img_feature.pkl",
         "test": "test_img_feature.pkl",
         "train": "train_img_feature.pkl"
     }
     DataLoader.__init__(self, MSCOCODataLoader.MSCOCO_text_processor,
                         self.cache_path)
コード例 #2
0
    def test_get_paths_training_set(self):
        loader_class = DataLoader(datafolder, True, resample_fs)

        file_paths = loader_class.__get_paths__()
        expected = [
            datafolder + '01/2_raw_data_13-13_22.03.16.txt',
            datafolder + '02/2_raw_data_14-21_22.03.16.txt',
            datafolder + '03/2_raw_data_09-34_11.04.16.txt'
        ]
        self.assertEqual(file_paths, expected,
                         'Path of text files with prefix 2_ should be loaded')
        self.assertEqual(len(file_paths), 3,
                         'It should return a total of 3 file paths')
        self.assertNotIn('mock_data/EMG_dataset/README.txt', file_paths,
                         'It should not load any extraneous path')
コード例 #3
0
def main():
    args = get_args()
    m_config = process_config(args.config)

    config = tf.ConfigProto(log_device_placement=True)
    config.gpu_options.allow_growth = True

    with tf.Session(config=config) as sess:
        # create_dirs([config.summary_dir, config.checkpoint_dir])
        data = DataLoader(config=m_config)
        model = VGG16(data_loader=data, config=m_config)
        logger = Logger(sess=sess, config=m_config)

        trainer = Trainer(
            sess=sess,
            model=model,
            data=data,
            config=m_config,
            logger=logger)

        trainer.train()
コード例 #4
0
def main(model_config, train_config, track_config):

    # GPU Config
    gpu_list = train_config['train_data_config'].get('gpu_ids', '0')
    num_gpus = len(gpu_list.split(','))
    if num_gpus > 1:
        os.environ['CUDA_VISIBLE_DEVICES'] = gpu_list
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = auto_select_gpu()

    # Create training directory which will be used to save: configurations, model files, TensorBoard logs
    train_dir = train_config['train_dir']
    if not osp.isdir(train_dir):
        logging.info('Creating training directory: %s', train_dir)
        mkdir_p(train_dir)

    g = tf.Graph()
    with g.as_default():
        # Set fixed seed for reproducible experiments
        random.seed(train_config['seed'])
        np.random.seed(train_config['seed'])
        tf.set_random_seed(train_config['seed'])

        #Build global step
        with tf.name_scope('train/'):
            global_step = tf.Variable(initial_value=0,
                                      name='global_step',
                                      trainable=False,
                                      collections=[
                                          tf.GraphKeys.GLOBAL_STEP,
                                          tf.GraphKeys.GLOBAL_VARIABLES
                                      ])

        Model = get_model(model_config['Model'])

        # build training dataloader and validation dataloader
        #---train
        train_dataloader = DataLoader(train_config['train_data_config'],
                                      is_training=True)
        train_dataloader.build()
        train_inputs = train_dataloader.get_one_batch()

        #---validation
        val_dataloader = DataLoader(train_config['validation_data_config'],
                                    is_training=False)
        val_dataloader.build()
        val_inputs = val_dataloader.get_one_batch()

        # Save configurations for future reference
        save_cfgs(train_dir, model_config, train_config, track_config)

        if train_config['lr_config'].get('lr_warmup', False):
            warmup_epoch_num = 10
            init_lr_ratio = 0.8
            warmup_steps = warmup_epoch_num * int(
                train_config['train_data_config']['num_examples_per_epoch']
            ) // train_config['train_data_config']['batch_size']
            inc_per_step = (
                1 - init_lr_ratio
            ) * train_config['lr_config']['initial_lr'] / warmup_steps
            warmup_lr = train_config['lr_config'][
                'initial_lr'] * init_lr_ratio + inc_per_step * tf.to_float(
                    global_step)
            learning_rate = tf.cond(
                tf.less(global_step,
                        warmup_steps), lambda: tf.identity(warmup_lr),
                lambda: _configure_learning_rate(train_config, global_step -
                                                 warmup_steps))
        else:
            learning_rate = _configure_learning_rate(train_config, global_step)

        optimizer = _configure_optimizer(train_config, learning_rate)
        tf.summary.scalar('learning_rate', learning_rate)

        # Set up the training ops
        examplars, instances, gt_examplar_boxes, gt_instance_boxes = tf.split(train_inputs[0],num_gpus), \
                                                                     tf.split(train_inputs[1],num_gpus), \
                                                                     tf.split(train_inputs[2],num_gpus), \
                                                                     tf.split(train_inputs[3],num_gpus)

        if train_config['train_data_config'].get('time_decay', False):
            time_intervals = tf.split(train_inputs[4], num_gpus)

        tower_grads = []
        with tf.variable_scope(tf.get_variable_scope()):
            for i in range(num_gpus):
                with tf.device('/gpu:%d' % i):
                    if train_config['train_data_config'].get(
                            'time_decay', False):
                        inputs = [
                            examplars[i], instances[i], gt_examplar_boxes[i],
                            gt_instance_boxes[i], time_intervals[i]
                        ]
                    else:
                        inputs = [
                            examplars[i], instances[i], gt_examplar_boxes[i],
                            gt_instance_boxes[i]
                        ]
                    model = tower_model(Model,
                                        inputs,
                                        model_config,
                                        train_config,
                                        mode='train')
                    # Reuse variables for the next tower.
                    tf.get_variable_scope().reuse_variables()
                    grads = optimizer.compute_gradients(model.total_loss)
                    tower_grads.append(grads)
        grads = average_gradients(tower_grads)

        #Clip gradient
        gradients, tvars = zip(*grads)
        clip_gradients, _ = tf.clip_by_global_norm(
            gradients, train_config['clip_gradients'])
        train_op = optimizer.apply_gradients(zip(clip_gradients, tvars),
                                             global_step=global_step)

        #Build validation model
        with tf.device('/gpu:0'):
            model_va = Model(model_config,
                             train_config,
                             mode='validation',
                             inputs=val_inputs)
            model_va.build(reuse=True)

        #Save Model setup
        saver = tf.train.Saver(
            tf.global_variables(),
            max_to_keep=train_config['max_checkpoints_to_keep'])

        summary_writer = tf.summary.FileWriter(train_dir, g)
        summary_op = tf.summary.merge_all()

        global_variables_init_op = tf.global_variables_initializer()
        local_variables_init_op = tf.local_variables_initializer()

        # Dynamically allocate GPU memory
        gpu_options = tf.GPUOptions(allow_growth=True)
        sess_config = tf.ConfigProto(gpu_options=gpu_options,
                                     allow_soft_placement=True)
        #inter_op_parallelism_threads = 16, intra_op_parallelism_threads = 16, log_device_placement=True)

        ######Debug timeline
        if Debug:
            from tensorflow.python.client import timeline
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
        ######Debug timeline

        sess = tf.Session(config=sess_config)
        model_path = tf.train.latest_checkpoint(train_config['train_dir'])

        if not model_path:
            sess.run(global_variables_init_op)
            sess.run(local_variables_init_op)
            start_step = 0
            if model_config['embed_config']['embedding_checkpoint_file']:
                model.init_fn(sess)
            elif model_config['finetuned_checkpoint_file']:
                finetuned_checkpoint_file = tf.train.latest_checkpoint(
                    model_config['finetuned_checkpoint_file'])
                logging.info('Restore from last checkpoint: {}'.format(
                    finetuned_checkpoint_file))
                sess.run(local_variables_init_op)
                sess.run(global_variables_init_op)
                restore_op = tf.contrib.slim.assign_from_checkpoint_fn(
                    finetuned_checkpoint_file,
                    tf.global_variables(),
                    ignore_missing_vars=True)
                restore_op(sess)
                #reset global step saved in checkpoint
                global_step_reset_op = global_step.assign(0)
                sess.run(global_step_reset_op)
        else:
            logging.info('Restore from last checkpoint: {}'.format(model_path))
            sess.run(local_variables_init_op)
            sess.run(global_variables_init_op)
            #saver.restore(sess, model_path)
            restore_op = tf.contrib.slim.assign_from_checkpoint_fn(
                model_path, tf.global_variables(), ignore_missing_vars=True)
            restore_op(sess)

        start_step = tf.train.global_step(sess, global_step.name) + 1
        print_trainable(sess)  #help function, can be disenable
        g.finalize()  # Finalize graph to avoid adding ops by mistake

        # Training loop
        data_config = train_config['train_data_config']
        total_steps = int(data_config['epoch'] *
                          data_config['num_examples_per_epoch'] /
                          data_config['batch_size'])
        logging.info('Train for {} steps'.format(total_steps))
        for step in range(start_step, total_steps):
            try:
                start_time = time.time()
                if Debug:
                    _, loss, batch_loss, current_lr = sess.run(
                        [
                            train_op, model.total_loss, model.batch_loss,
                            learning_rate
                        ],
                        run_metadata=run_metadata,
                        options=run_options)
                    t1 = timeline.Timeline(run_metadata.step_stats)
                    ctf = t1.generate_chrome_trace_format()
                    with open('timeline.json', 'w') as f:
                        f.write(ctf)
                else:
                    _, loss, batch_loss, current_lr = sess.run([
                        train_op, model.total_loss, model.batch_loss,
                        learning_rate
                    ])
                duration = time.time() - start_time

                if step % 10 == 0:
                    examples_per_sec = data_config['batch_size'] / float(
                        duration)
                    time_remain = data_config['batch_size'] * (
                        total_steps - step) / examples_per_sec
                    current_epoch = (
                        step * data_config['batch_size']
                    ) // data_config['num_examples_per_epoch'] + 1
                    m, s = divmod(time_remain, 60)
                    h, m = divmod(m, 60)
                    format_str = (
                        '%s: epoch %d-step %d,lr = %f, total loss = %.3f, batch loss = %.3f (%.1f examples/sec; %.3f '
                        'sec/batch; %dh:%02dm:%02ds remains)')
                    logging.info(
                        format_str %
                        (datetime.now(), current_epoch, step, current_lr, loss,
                         batch_loss, examples_per_sec, duration, h, m, s))

                if step % 200 == 0:
                    summary_str = sess.run(summary_op)
                    summary_writer.add_summary(summary_str, step)

                if step % train_config['save_model_every_n_step'] == 0 or (
                        step + 1) == total_steps:
                    checkpoint_path = osp.join(train_config['train_dir'],
                                               'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=step)
            except KeyboardInterrupt:
                checkpoint_path = osp.join(train_config['train_dir'],
                                           'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
                print("save model.ckpt-%d" % (step))
                break
            except:
                print(traceback.format_exc())
                print("Error found in current step, continue")
コード例 #5
0
 def __init__(self):
     self.cache_path = os.path.join(Filckr8kImageProcessor.data_dir,
                                    "cache", "cache.pkl")
     DataLoader.__init__(self, Flickr8KDataLoader.Flickr8k_text_processor,
                         self.cache_path)