Esempio n. 1
0
def train_vqvae(params, dataset, checkpoint_dir, recover=True):
    """Train a VQ-VAE on a toy dataset with a ``SimpleTrainer``.

    Args:
        params: Configuration object. It is indexed with ``'model'`` and
            ``'trainer'`` and is written to ``config.json`` inside the logger
            directory through its ``to_file`` method.
        dataset: Dataset identifier forwarded to ``load_toy_dataset``.
        checkpoint_dir: Directory that receives the logs, the checkpoints and
            the images produced by the ``Reconstruct`` callback.
        recover: Forwarded as ``always_resume`` to ``AutoResumeTrainConfig``.
            The name was previously referenced without being defined; it is
            now an explicit keyword with the same default the sibling
            training functions use.
    """
    logger.set_logger_dir(checkpoint_dir)

    model_params = params['model']
    trainer_params = params['trainer']

    train_ds, val_ds, sample_train, sample_test = load_toy_dataset(
        dataset, trainer_params['batch_size'],
        trainer_params['num_parallel'])

    # Keep a copy of the configuration next to the logs of this run.
    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    model = BaseVQVAE.from_params(model_params)

    images_dir = os.path.join(checkpoint_dir, 'images')
    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            Reconstruct(model, sample_train, sample_test, images_dir),
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            InferenceRunner(input=val_ds,
                            infs=ScalarStats(['loss', 'perplexity'])),
            # A loss is better when it is lower, so the "best" checkpoint is
            # the one with the minimum validation loss (MaxSaver would keep
            # the checkpoint with the highest loss instead).
            MinSaver(monitor_stat='validation_loss'),
            CompressResource(images_dir,
                             os.path.join(checkpoint_dir, 'images.zip')),
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs'],
    )
    launch_train_with_config(trainer_config, SimpleTrainer())
def get_config(model):
    """Build the ``TrainConfig`` used to train ``model``.

    The function reads the module-level ``args`` (``batch``, ``fake``,
    ``data`` and ``load``). With ``args.fake`` set, training runs on
    ``FakeData`` and only the throughput tracker is installed; otherwise the
    ImageNet dataflow is used and saving, learning-rate scheduling and
    validation callbacks are added.

    Args:
        model: Model object passed through to ``TrainConfig``.

    Returns:
        The assembled ``TrainConfig``.
    """
    num_towers = max(get_num_gpu(), 1)
    per_tower_batch = args.batch // num_towers

    logger.info("Running on {} towers. Batch size per tower: {}".format(
        num_towers, per_tower_batch))

    image_shape = [per_tower_batch, 224, 224, 3]
    label_shape = [per_tower_batch]

    callbacks = [ThroughputTracker(args.batch)]
    if args.fake:
        fake_flow = FakeData(
            [image_shape, label_shape], 1000, random=False, dtype='uint8')
        data = QueueInput(fake_flow)
    else:
        train_flow = get_imagenet_dataflow(args.data, 'train', per_tower_batch)
        # use a larger queue
        big_queue = tf.FIFOQueue(
            200, [tf.uint8, tf.int32], [image_shape, label_shape])
        data = QueueInput(train_flow, queue=big_queue)

        # The base rate is scaled linearly with the total batch size
        # relative to a batch of 256.
        base_lr = 30
        scaled_lr = base_lr * (args.batch / 256.0)
        lr_steps = [(0, scaled_lr)]
        lr_steps += [
            (epoch, scaled_lr * factor)
            for epoch, factor in ((60, 1e-1), (70, 1e-2),
                                  (80, 1e-3), (90, 1e-4))
        ]
        callbacks += [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter('learning_rate', lr_steps),
        ]

        val_flow = get_imagenet_dataflow(args.data, 'val', 64)
        error_stats = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5'),
        ]
        if nr_tower_is_single(num_towers) if False else num_towers == 1:
            val_runner = InferenceRunner(QueueInput(val_flow), error_stats)
        else:
            val_runner = DataParallelInferenceRunner(
                val_flow, error_stats, list(range(num_towers)))
        callbacks.append(val_runner)

    if not args.load.endswith(".npz"):
        # a pre-trained checkpoint
        init = SaverRestore(args.load, ignore=("learning_rate", "global_step"))
    else:
        # a released model in npz format
        init = SmartInit(args.load)

    if args.fake:
        steps_per_epoch = 100
    else:
        steps_per_epoch = 1281167 // args.batch

    return TrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        steps_per_epoch=steps_per_epoch,
        session_init=init,
        max_epoch=100,
    )
Esempio n. 3
0
def train_pixelcnn_prior(params, checkpoint_dir, recover=True, force=False):
    """Train a PixelCNN prior on top of an existing VQ-VAE checkpoint.

    Args:
        params: Configuration object indexed with ``'dataset'``, ``'model'``
            and ``'trainer'``; it is saved as ``config.json`` in the logger
            directory through ``to_file``.
        checkpoint_dir: Directory for logs, checkpoints and generated images.
        recover: Forwarded as ``always_resume`` to ``AutoResumeTrainConfig``.
        force: When true and ``checkpoint_dir`` exists, the directory is
            removed before training starts.
    """
    if force and os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    logger.set_logger_dir(checkpoint_dir)

    dataset_cfg = params['dataset']
    model_cfg = params['model']
    trainer_cfg = params['trainer']

    flows = get_dataflow(
        dataset_cfg['path'], False,
        dataset_cfg['train_val_split'], trainer_cfg['batch_size'],
        trainer_cfg['num_parallel'])
    (train_ds, val_ds, sample_train, sample_val,
     sample_train_label, sample_val_label) = flows

    # The VQ-VAE configuration is looked up next to its checkpoint file and
    # recorded in the model parameters before they are saved and used.
    vqvae_ckpt = trainer_cfg['vqvae_checkpoint_path']
    model_cfg['vqvae_model_params'] = os.path.join(
        os.path.dirname(vqvae_ckpt), 'config.json')

    latent_shape = model_cfg['latent_shape']
    num_labels = model_cfg['num_labels']

    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    model = BasePixelCNNPrior.from_params(model_cfg)

    trainer = SyncMultiGPUTrainerParameterServer(
        gpus=trainer_cfg['num_gpus'], ps_device=None)

    images_dir = os.path.join(checkpoint_dir, 'images')
    callbacks = [
        SequentialSampling(trainer_cfg['num_examples_to_generate'],
                           latent_shape, num_labels, model, images_dir),
        Reconstruct(model, sample_train, sample_val, images_dir,
                    sample_train_label, sample_val_label),
        ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
        InferenceRunner(input=val_ds, infs=ScalarStats(['loss'])),
        MinSaver(monitor_stat='validation_loss'),
        CompressResource(images_dir,
                         os.path.join(checkpoint_dir, 'images.zip')),
        RestoreWeights(vqvae_ckpt),
        Notification('Training status', 'Complete'),
    ]
    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=callbacks,
        steps_per_epoch=trainer_cfg['steps_per_epoch'],
        max_epoch=trainer_cfg['max_epochs'],
    )
    launch_train_with_config(trainer_config, trainer)
Esempio n. 4
0
def _inference_runner_train_cbs(args, ds_val, insrc_val, val_cbs):
    train_cbs = []
    if args.do_remote_child_inf_runner:
        if args.num_classes > 1:
            val_cbs.extend(
                generate_classification_callbacks(args.net_info.master))
        else:
            val_cbs.extend(generate_regression_callbacks(args.net_info.master))
        inf_runner = InferenceRunner(ds_val or insrc_val,
                                     [ScalarStats('cost')] + val_cbs)
        train_cbs.append(inf_runner)
    return train_cbs
def train(args, logdir):
    """Train ``Net1`` and run evaluation on the test dataflow.

    The module-level ``data_path`` and ``test_path`` are preprocessed and
    then wrapped in ``Net1DataFlow`` objects.

    Args:
        args: Options object; ``ckpt`` (checkpoint name inside ``logdir``)
            and ``gpu`` (comma-separated device list) are read.
        logdir: Directory used by the logger and the model saver, and the
            place where checkpoints are searched.
    """
    net = Net1()

    preprocessing(data_path)
    preprocessing(test_path)

    train_flow = Net1DataFlow(data_path, hp.train1.batch_size)
    test_flow = Net1DataFlow(test_path, hp.train1.batch_size)

    # set logger for event and model saver
    logger.set_logger_dir(logdir)

    # Reference:
    # https://github.com/tensorpack/tensorpack/blob/master/examples/boilerplate.py
    queued_train = QueueInput(
        train_flow(n_prefetch=hp.train1.batch_size * 10, n_thread=1))
    saver = ModelSaver(checkpoint_dir=logdir)
    evaluator = InferenceRunner(
        test_flow(n_prefetch=1),
        ScalarStats(['net1/eval/loss', 'net1/eval/acc'], prefix=''))
    train_conf = AutoResumeTrainConfig(
        model=net,
        data=queued_train,
        callbacks=[saver, evaluator],
        max_epoch=hp.train1.num_epochs,
        steps_per_epoch=hp.train1.steps_per_epoch,
    )

    # An explicitly named checkpoint wins over the latest one in logdir.
    if args.ckpt:
        ckpt = '{}/{}'.format(logdir, args.ckpt)
    else:
        ckpt = tf.train.latest_checkpoint(logdir)
    # Configured GPU count; it is replaced below whenever args.gpu is given
    # and is not otherwise consumed in this function.
    num_gpu = hp.train1.num_gpu

    if ckpt:
        train_conf.session_init = SaverRestore(ckpt)

    if not args.gpu:
        trainer = SimpleTrainer()
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        num_gpu = len(args.gpu.split(','))
        train_conf.nr_tower = num_gpu
        trainer = SyncMultiGPUTrainerReplicated(num_gpu)

    launch_train_with_config(train_conf, trainer=trainer)
Esempio n. 6
0
def train_image_embedding_softmax(params, checkpoint_dir, recover=True,
                                  force=False):
    """Train the image-embedding softmax model on top of a VQ-VAE checkpoint.

    Args:
        params: Configuration object indexed with ``'dataset'``, ``'model'``
            and ``'trainer'``; it is saved as ``config.json`` in the logger
            directory through ``to_file``.
        checkpoint_dir: Directory for logs and checkpoints.
        recover: Forwarded as ``always_resume`` to ``AutoResumeTrainConfig``.
        force: When true and ``checkpoint_dir`` exists, the directory is
            removed before training starts.
    """
    if force and os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    logger.set_logger_dir(checkpoint_dir)

    dataset_cfg = params['dataset']
    model_cfg = params['model']
    trainer_cfg = params['trainer']

    # Only the two dataflows are needed; the four sample outputs are dropped.
    train_ds, val_ds, _, _, _, _ = get_dataflow(
        dataset_cfg['path'], False, dataset_cfg['train_val_split'],
        trainer_cfg['batch_size'], trainer_cfg['num_parallel'])

    # The VQ-VAE configuration is looked up next to its checkpoint file.
    vqvae_ckpt = trainer_cfg['vqvae_checkpoint_path']
    model_cfg['vqvae_model_params'] = os.path.join(
        os.path.dirname(vqvae_ckpt), 'config.json')

    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    model = BaseImageEmbedding.from_params(model_cfg)

    trainer = SyncMultiGPUTrainerParameterServer(
        gpus=trainer_cfg['num_gpus'], ps_device=None)

    validation = InferenceRunner(input=val_ds, infs=[
        ScalarStats('loss'),
        ClassificationError('correct_prediction', 'val-correct_prediction'),
    ])
    callbacks = [
        validation,
        ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
        # NOTE(review): the monitored statistic is computed from a tensor
        # named 'correct_prediction'; if that tensor marks correct samples,
        # MinSaver keeps the lowest-accuracy checkpoint -- confirm intent.
        MinSaver(monitor_stat='val-correct_prediction'),
        RestoreWeights(vqvae_ckpt),
        SendStat('Training status',
                 ['loss', 'accuracy',
                  'validation_loss', 'val-correct_prediction'],
                 after_every=2),
        Notification('Training status', 'Complete'),
    ]
    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=callbacks,
        steps_per_epoch=trainer_cfg['steps_per_epoch'],
        max_epoch=trainer_cfg['max_epochs'],
    )
    launch_train_with_config(trainer_config, trainer)
Esempio n. 7
0
def train_vae(params, checkpoint_dir, recover=True, force=False):
    """Train a VAE with sampling and reconstruction callbacks.

    Args:
        params: Configuration object indexed with ``'dataset'``, ``'model'``
            and ``'trainer'``; it is saved as ``config.json`` in the logger
            directory through ``to_file``.
        checkpoint_dir: Directory for logs, checkpoints and generated images.
        recover: Forwarded as ``always_resume`` to ``AutoResumeTrainConfig``.
        force: When true and ``checkpoint_dir`` exists, the directory is
            removed before training starts.
    """
    if force and os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    logger.set_logger_dir(checkpoint_dir)

    dataset_cfg = params['dataset']
    model_cfg = params['model']
    trainer_cfg = params['trainer']

    # The two label outputs of get_dataflow are not used here.
    train_ds, val_ds, sample_train, sample_val, _, _ = get_dataflow(
        dataset_cfg['path'],
        dataset_cfg['binarizer'],
        dataset_cfg['train_val_split'],
        trainer_cfg['batch_size'],
        trainer_cfg['num_parallel'])

    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    latent_dim = model_cfg['latent_dim']
    model = BaseVAE.from_params(model_cfg)

    trainer = SyncMultiGPUTrainerParameterServer(
        gpus=trainer_cfg['num_gpus'], ps_device=None)

    images_dir = os.path.join(checkpoint_dir, 'images')
    callbacks = [
        Sampling(model, trainer_cfg['num_examples_to_generate'],
                 latent_dim, images_dir),
        Reconstruct(model, sample_train, sample_val, images_dir),
        ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
        InferenceRunner(input=val_ds,
                        infs=ScalarStats(['avg_logpx_z', 'neg_elbo'])),
        MinSaver(monitor_stat='validation_neg_elbo'),
        CompressResource(images_dir,
                         os.path.join(checkpoint_dir, 'images.zip')),
        Notification('Training status', 'Complete'),
    ]
    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=callbacks,
        steps_per_epoch=trainer_cfg['steps_per_epoch'],
        max_epoch=trainer_cfg['max_epochs'],
    )
    launch_train_with_config(trainer_config, trainer)
def ptb_training_cbs(model, args, ptb_data_dir, train_cbs):
    """Append the Penn Treebank training callbacks to ``train_cbs``.

    Args:
        model: Model whose ``reset_state()`` is wrapped in ``RunOp``
            callbacks.
        args: Options object; ``training_type``, ``batch_size`` and
            ``model_rnn_max_len`` are read.
        ptb_data_dir: Data directory handed to ``PennTreeBankDataFlow``.
        train_cbs: Callback list, extended IN PLACE.

    Returns:
        The same ``train_cbs`` list that was passed in.
    """
    train_cbs.append(RunOp(lambda: model.reset_state()))

    if args.training_type in ['tensorpack', 'petridish']:

        def decay_after_sixth_epoch(epoch, value):
            # The rate is left alone up to and including epoch 6.
            if epoch > 6:
                return value * 0.80
            return value

        train_cbs.append(
            HyperParamSetterWithFunc('learning_rate', decay_after_sixth_epoch))

    if args.training_type not in ['tensorpack', 'petridish', 'darts_final']:
        return train_cbs

    # TODO keep these for now for debugging;
    # remove for search
    eval_splits = ['valid', 'test']
    for split in eval_splits:
        flow = PennTreeBankDataFlow(split,
                                    ptb_data_dir,
                                    args.batch_size,
                                    args.model_rnn_max_len,
                                    var_size=False)
        stats = ScalarStats(['avg_batch_cost', 'seq_len'], prefix=split)
        inferencer = InferenceRunner(
            flow, [stats], tower_name='InferenceTower_{}'.format(split))
        reset_after_eval = RunOp(lambda: model.reset_state())
        train_cbs += [inferencer, reset_after_eval]

    def publish_perplexities(callback):
        # Perplexity per split is exp(latest batch cost / latest seq length).
        results = []
        for name in eval_splits:
            results.append(callback.trainer.monitors.put_scalar(
                '{}_perplexity'.format(name),
                np.exp(callback.trainer.monitors.get_latest(
                    '{}_avg_batch_cost'.format(name))
                    / callback.trainer.monitors.get_latest(
                        '{}_seq_len'.format(name)))))
        return results

    train_cbs.append(CallbackFactory(trigger=publish_perplexities))
    return train_cbs
Esempio n. 9
0
def train(args, logdir):
    """Train ``Net1`` on the preprocessed TIMIT ``.npz`` files.

    Args:
        args: Options object; ``case`` (sub-directory of the preprocessed
            data path) and ``ckpt`` (checkpoint name inside ``logdir``) are
            read.
        logdir: Directory used by the logger and the model saver, and the
            place where checkpoints are searched.
    """
    # model
    net = Net1()

    # dataflow
    train_pattern = 'TIMIT/TRAIN/*/*/*.npz'
    test_pattern = 'TIMIT/TEST/*/*/*.npz'
    train_glob = os.path.join(
        hp.train1.preproc_data_path, args.case, train_pattern)
    test_glob = os.path.join(
        hp.train1.preproc_data_path, args.case, test_pattern)

    print(train_glob)
    print(test_glob)

    train_flow = Net1DataFlow(train_glob, hp.train1.batch_size)
    test_flow = Net1DataFlow(test_glob, hp.train1.batch_size)

    # set logger for event and model saver
    logger.set_logger_dir(logdir)

    queued_train = QueueInput(train_flow(n_prefetch=1000, n_thread=8))
    saver = ModelSaver(checkpoint_dir=logdir)
    evaluator = InferenceRunner(
        test_flow(n_prefetch=1),
        ScalarStats(['net1/eval/loss', 'net1/eval/acc'], prefix=''))
    train_conf = AutoResumeTrainConfig(
        model=net,
        data=queued_train,
        callbacks=[saver, evaluator],
        max_epoch=hp.train1.num_epochs,
        steps_per_epoch=hp.train1.steps_per_epoch,
    )

    # An explicitly named checkpoint wins over the latest one in logdir.
    if args.ckpt:
        ckpt = '{}/{}'.format(logdir, args.ckpt)
    else:
        ckpt = tf.train.latest_checkpoint(logdir)

    if ckpt:
        train_conf.session_init = SaverRestore(ckpt)

    # The comparison with True is kept as written: only values equal to
    # True select the GPU path.
    if hp.default.use_gpu == True:
        gpu_list = hp.default.gpu_list
        os.environ['CUDA_VISIBLE_DEVICES'] = gpu_list
        num_gpu = len(gpu_list.split(','))
        train_conf.nr_tower = num_gpu
        trainer = SyncMultiGPUTrainerReplicated(num_gpu)
    else:
        # Hide every GPU from the process.
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
        trainer = SimpleTrainer()

    launch_train_with_config(train_conf, trainer=trainer)
Esempio n. 10
0
    def _default_callbacks(self):
        """Populate ``self.callbacks`` with the standard training callbacks.

        A model saver and a time estimator are always installed. A GPU
        utilization tracker is added when ``self.args.gpu`` is set and is not
        ``"-1"``. With a validation set configured, an ``InferenceRunner`` is
        added and ``MinSaver`` monitors ``self.validation_total_cost_var``;
        otherwise ``MinSaver`` monitors ``self.total_cost_var``. Finally the
        network-specific hook is invoked.
        """
        self.callbacks = [
            ModelSaver(max_to_keep=self.args.max_to_keep),
            EstimatedTimeLeft(),
        ]

        gpu_setting = self.args.gpu
        if gpu_setting and gpu_setting != "-1":
            self.callbacks.append(GPUUtilizationTracker())

        if self.args.validation is None:
            monitored_stat = self.total_cost_var
        else:
            validation_runner = InferenceRunner(
                self.dataflow(True), [ScalarStats(self.total_cost_var)])
            self.callbacks.append(validation_runner)
            monitored_stat = self.validation_total_cost_var

        self.callbacks.append(MinSaver(monitored_stat))

        self._network_specific_callbacks()
Esempio n. 11
0
    # NOTE(review): the enclosing function header and the creation of `M`,
    # `dataset_train` and `dataset_test` are not visible in this excerpt.
    # Convolution -> max-pooling -> convolution, then flatten for the
    # fully connected layers.
    M.add(KL.Conv2D(32, 3, activation='relu', padding='same'))
    M.add(KL.MaxPooling2D())
    M.add(KL.Conv2D(32, 3, padding='same', activation='relu'))
    M.add(KL.Flatten())
    # 512-unit hidden layer with an L2 kernel regularizer, followed by dropout.
    M.add(
        KL.Dense(512,
                 activation='relu',
                 kernel_regularizer=regularizers.l2(1e-5)))
    M.add(KL.Dropout(0.5))
    # 10-unit output layer without activation; the softmax is applied by a
    # separate Activation layer.
    M.add(
        KL.Dense(10, activation=None,
                 kernel_regularizer=regularizers.l2(1e-5)))
    M.add(KL.Activation('softmax'))

    trainer = SimpleTrainer()

    # Attach the Keras model, the queued training input, the Adam optimizer
    # (learning rate 1e-3), the loss and the metric to the trainer.
    setup_keras_trainer(trainer,
                        model=M,
                        input=QueueInput(dataset_train),
                        optimizer=tf.train.AdamOptimizer(1e-3),
                        loss='categorical_crossentropy',
                        metrics=['accuracy'])
    # Train with a model saver and an inference pass over `dataset_test`;
    # steps per epoch is taken from `dataset_train.size()`.
    trainer.train_with_defaults(
        callbacks=[
            ModelSaver(),
            InferenceRunner(dataset_test,
                            [ScalarStats(['total_loss', 'accuracy'])]),
        ],
        steps_per_epoch=dataset_train.size(),
    )
Esempio n. 12
0
def critic_train(ctrl,
                 data,
                 log_dir,
                 model_dir,
                 prev_dir,
                 vs_name,
                 split_train_val=False):
    """Train the critic model on ``data`` with a ``SimpleTrainer``.

    Args:
        ctrl: Controller/options object; ``critic_train_epoch``,
            ``critic_init_lr`` and ``critic_type`` are read, and it is passed
            to the model and dataflow factories.
        data: Sequence of columns; ``len(data[0])`` is the dataset size and
            every column is indexed with the same shuffled indices.
        log_dir: Not used by this function.
        model_dir: Checkpoint directory; created when it does not exist.
        prev_dir: When truthy, the latest checkpoint is looked up here
            instead of in ``model_dir``.
        vs_name: Variable-scope name passed to ``critic_factory`` and used in
            the monitored cost name.
        split_train_val: When true, the first nine tenths of the shuffled
            data are used for training and the rest for validation.
            Otherwise all data is used for training and the last tenth is
            also used for validation.
    """
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    max_epoch = ctrl.critic_train_epoch
    lr = ctrl.critic_init_lr
    # One schedule entry per epoch (1-based), each 0.9x the previous one.
    lr_schedule = []
    for epoch in range(1, max_epoch + 1):
        lr_schedule.append((epoch, lr))
        lr *= 0.9

    ds_size = len(data[0])
    shuffled = list(range(ds_size))
    np.random.shuffle(shuffled)

    nine_tenths = ds_size * 9 // 10
    if split_train_val:
        # Fall back to the whole dataset when nine tenths rounds to zero.
        train_size = nine_tenths or ds_size
        val_start = train_size
    else:
        train_size = ds_size
        val_start = nine_tenths
    if val_start == ds_size:
        # The validation slice would be empty; validate on everything.
        val_start = 0

    train_idxs = shuffled[:train_size]
    val_idxs = shuffled[val_start:]
    data_train = [[column[i] for i in train_idxs] for column in data]
    data_val = [[column[i] for i in val_idxs] for column in data]

    model = critic_factory(ctrl, is_train=True, vs_name=vs_name)
    ds_train = critic_dataflow_factory(ctrl, data_train, is_train=True)
    ds_val = critic_dataflow_factory(ctrl, data_val, is_train=False)

    if ctrl.critic_type == CriticTypes.LSTM:
        # LSTM critics get a session with zero GPU devices and device -1.
        session_config = tf.ConfigProto(device_count={'GPU': 0})
        device = -1
    else:
        session_config = None
        device = 0

    extra_callbacks = [
        cb for cb in DEFAULT_CALLBACKS() if not isinstance(cb, ProgressBar)
    ]
    logger.info("Extra callbacks are {}".format(
        [cb.__class__ for cb in extra_callbacks]))

    # Put this into callbacks for in-training validation/inferencing
    # (it is built here but not added to the config below).
    inference_callback = InferenceRunner(
        ds_val, [ScalarStats('{}/cost'.format(vs_name))], device=device)

    saver = ModelSaver(checkpoint_dir=model_dir,
                       max_to_keep=1,
                       keep_checkpoint_every_n_hours=100)
    lr_setter = ScheduledHyperParamSetter('learning_rate', lr_schedule)
    config = TrainConfig(
        dataflow=ds_train,
        callbacks=[saver, lr_setter],
        extra_callbacks=extra_callbacks,
        model=model,
        monitors=[JSONWriter(), ScalarPrinter()],
        steps_per_epoch=ds_train.size(),
        max_epoch=max_epoch,
        session_config=session_config)

    ckpt = tf.train.latest_checkpoint(prev_dir or model_dir)
    if ckpt:
        config.session_init = SaverRestore(ckpt)
    launch_train_with_config(config, SimpleTrainer())