Example 1
def main(cfg):
    print(cfg)
    
    tf.reset_default_graph()
    
    logger.set_logger_dir('tflogs', action='d')

    copyfile(hydra.utils.to_absolute_path('model.py'), 'model.py')
    copyfile(hydra.utils.to_absolute_path('dataflow.py'), 'dataflow.py')
    
    if cfg.cat_name == 'smpl':
        train_df = SMPLDataFlow(cfg, True, 1000)
        val_df = VisSMPLDataFlow(cfg, True, 1000, port=1080)
    else:
        train_df = ShapeNetDataFlow(cfg, cfg.data.train_txt, True)
        val_df = VisDataFlow(cfg, cfg.data.val_txt, False, port=1080)
    
    config = TrainConfig(
        model=Model(cfg),
        dataflow=BatchData(PrefetchData(train_df, cpu_count() // 2, cpu_count() // 2), cfg.batch_size),
        callbacks=[
            ModelSaver(),
            SimpleMovingAverage(['recon_loss', 'GAN/loss_d', 'GAN/loss_g', 'GAN/gp_loss', 'symmetry_loss'], 100),
            PeriodicTrigger(val_df, every_k_steps=30)
        ],
        monitors=tensorpack.train.DEFAULT_MONITORS() + [ScalarPrinter(enable_step=True, enable_epoch=False)],
        max_epoch=10
    )
    launch_train_with_config(config, SimpleTrainer())
Example 2
def train_vqvae(params, dataset, checkpoint_dir, recover=True):
    logger.set_logger_dir(checkpoint_dir)

    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']
    image_shape = model_params['image_shape']

    train_ds, val_ds, sample_train, sample_test = load_toy_dataset(
        dataset, trainer_params['batch_size'],
        trainer_params['num_parallel'])

    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    model = BaseVQVAE.from_params(model_params)

    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            Reconstruct(model, sample_train, sample_test,
                        os.path.join(checkpoint_dir, 'images')),
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            InferenceRunner(input=val_ds,
                            infs=ScalarStats(['loss', 'perplexity'])),
            MaxSaver(monitor_stat='validation_loss'),
            CompressResource(os.path.join(checkpoint_dir, 'images'),
                             os.path.join(checkpoint_dir, 'images.zip'))
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs']
    )
    launch_train_with_config(trainer_config, SimpleTrainer())
Example 3
def main():
    args = parse_args()
    args.seed = init_rand(seed=args.seed)

    _, log_file_exist = initialize_logging(
        logging_dir_path=args.save_dir,
        logging_file_name=args.logging_file_name,
        script_args=args,
        log_packages=args.log_packages,
        log_pip_packages=args.log_pip_packages)
    logger.set_logger_dir(args.save_dir)

    batch_size = prepare_tf_context(num_gpus=args.num_gpus,
                                    batch_size=args.batch_size)

    classes = 1000
    net, inputs_desc = prepare_model(
        model_name=args.model,
        classes=classes,
        use_pretrained=args.use_pretrained,
        pretrained_model_file_path=args.resume.strip())

    train_dataflow = get_data(is_train=True,
                              batch_size=batch_size,
                              data_dir_path=args.data_dir)
    val_dataflow = get_data(is_train=False,
                            batch_size=batch_size,
                            data_dir_path=args.data_dir)

    train_net(net=net,
              session_init=inputs_desc,
              batch_size=batch_size,
              num_epochs=args.num_epochs,
              train_dataflow=train_dataflow,
              val_dataflow=val_dataflow)
Example 4
def train(args, logdir):

    # model
    # ;model = Net1()

    # dataflow
    # ;df = Net1DataFlow(hp.train1.data_path, hp.train1.batch_size)

    # set logger for event and model saver
    logger.set_logger_dir(logdir)

    session_conf = tf.ConfigProto(gpu_options=tf.GPUOptions(
        allow_growth=True, ), )

    train_conf = TrainConfig(
        # ;model=model,
        # ;data=QueueInput(df(n_prefetch=1000, n_thread=4)),
        callbacks=[
            ModelSaver(checkpoint_dir=logdir),
            # TODO EvalCallback()
        ],
        # ;max_epoch=hp.train1.num_epochs,
        # ;steps_per_epoch=hp.train1.steps_per_epoch,
        # session_config=session_conf
    )
    ckpt = '{}/{}'.format(
        logdir, args.ckpt) if args.ckpt else tf.train.latest_checkpoint(logdir)
    if ckpt:
        train_conf.session_init = SaverRestore(ckpt)

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        train_conf.nr_tower = len(args.gpu.split(','))
Example 5
def train(args, logdir1, logdir2):
    # model
    model = Net2()

    # NOTE: `data_path` is not defined in this snippet; it is assumed to come from
    # the hyperparameter config (cf. hp.train2.data_path in the later Net2 example).
    data_path = hp.train2.data_path
    preprocessing(data_path, logdir2)

    # dataflow
    df = Net2DataFlow(data_path, hp.train2.batch_size)

    # set logger for event and model saver
    logger.set_logger_dir(logdir2)

    # session_conf = tf.ConfigProto(
    #     gpu_options=tf.GPUOptions(
    #         allow_growth=True,
    #         per_process_gpu_memory_fraction=0.6,
    #     ),
    # )

    dataset_size = len(glob.glob(data_path + '/wav/*.wav'))
    print("\t\data_path : ", data_path)
    print("\t\tDataset Size : ", dataset_size)
    print("\t\tBatch Size : ", hp.train2.batch_size)
    print("\t\tSteps per epoch : ", (dataset_size // hp.train2.batch_size))
    from time import sleep
    sleep(10)

    session_inits = []
    ckpt2 = '{}/{}'.format(
        logdir2,
        args.ckpt) if args.ckpt else tf.train.latest_checkpoint(logdir2)
    if ckpt2:
        session_inits.append(SaverRestore(ckpt2))
    ckpt1 = tf.train.latest_checkpoint(logdir1)
    if ckpt1:
        session_inits.append(SaverRestore(ckpt1, ignore=['global_step']))
    train_conf = AutoResumeTrainConfig(
        model=model,
        data=QueueInput(df(n_prefetch=1000, n_thread=8)),
        callbacks=[
            # TODO save on prefix net2
            ModelSaver(checkpoint_dir=logdir2),
            # ConvertCallback(logdir2, hp.train2.test_per_epoch),
        ],
        max_epoch=hp.train2.num_epochs,
        steps_per_epoch=dataset_size // hp.train2.batch_size,
        session_init=ChainInit(session_inits))
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        train_conf.nr_tower = len(args.gpu.split(','))
        gpu_list = list(map(int, args.gpu.split(',')))
        # trainer = AsyncMultiGPUTrainer(gpu_list, False)
        trainer = SyncMultiGPUTrainerReplicated(gpu_list)
    else:
        # no GPUs specified: fall back to a single-device trainer
        trainer = SimpleTrainer()

    launch_train_with_config(train_conf, trainer=trainer)
Example 6
def _setup_logging(logdir, is_horovod):
    # Setup logging ...

    if is_horovod:
        hvd.init()
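    # under Horovod, only the chief process (rank 0) creates the log directory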
    if not is_horovod or hvd.rank() == 0:
        logger.set_logger_dir(logdir, 'd')

    logger.info("Environment Information:\n" + collect_env_info())
Example 7
def get_avatar_synth_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_dir',
                        help='Directory of train data',
                        default='./data/bitmoji/train')
    parser.add_argument('--test_dir',
                        help='Directory of test data',
                        default='./data/bitmoji/test')
    parser.add_argument('--logger_dir',
                        help='Directory to save logs and model checkpoints',
                        default=os.path.join('save', 'log', date_str()))
    parser.add_argument('--load_path',
                        help='Path of the model checkpoint to load')
    parser.add_argument('--epochs',
                        help='Number of epochs to train',
                        default=100000,
                        type=int)
    parser.add_argument('--batch_size',
                        help='Minibatch size',
                        default=512,
                        type=int)
    parser.add_argument('--lr', help='Learning rate', default=1e-4, type=float)
    parser.add_argument(
        '--lr_decay',
        help='The multiple by which to decay the learning rate every epoch',
        default=0.96,
        type=float)
    parser.add_argument('--resume_lr',
                        help='Resume the learning rate from the previous run',
                        action='store_true')
    parser.add_argument(
        '--keep_prob',
        help='The keep probability for dropout (always 1 for testing)',
        default=0.5,
        type=float)
    parser.add_argument(
        '--summary_freq',
        help='Frequency (in steps) with which to write tensorboard summaries',
        default=100,
        type=int)
    parser.add_argument('--gpu',
                        help='Comma separated list of GPU(s) to use',
                        default='0')
    parser.add_argument('--num_threads',
                        help='The number of threads to read and process data',
                        default=32,
                        type=int)

    args = parser.parse_args()

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    set_logger_dir(args.logger_dir)

    return args
Example 8
def train(args, logdir):

    # model
    model = Net1()

    preprocessing(data_path)
    preprocessing(test_path)

    # dataflow
    df = Net1DataFlow(data_path, hp.train1.batch_size)
    df_test = Net1DataFlow(test_path, hp.train1.batch_size)

    #datas = df.get_data()
    #print(datas[1])
    # set logger for event and model saver
    logger.set_logger_dir(logdir)
    #session_conf = tf.ConfigProto(
    #    gpu_options=tf.GPUOptions(
    #        allow_growth=True,
    #    ),)

    # cv test code
    # https://github.com/tensorpack/tensorpack/blob/master/examples/boilerplate.py

    train_conf = AutoResumeTrainConfig(
        model=model,
        data=QueueInput(df(n_prefetch=hp.train1.batch_size * 10, n_thread=1)),
        callbacks=[
            ModelSaver(checkpoint_dir=logdir),
            InferenceRunner(
                df_test(n_prefetch=1),
                ScalarStats(['net1/eval/loss', 'net1/eval/acc'], prefix='')),
        ],
        max_epoch=hp.train1.num_epochs,
        steps_per_epoch=hp.train1.steps_per_epoch,
        #session_config=session_conf
    )
    ckpt = '{}/{}'.format(
        logdir, args.ckpt) if args.ckpt else tf.train.latest_checkpoint(logdir)
    num_gpu = hp.train1.num_gpu

    if ckpt:
        train_conf.session_init = SaverRestore(ckpt)

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        train_conf.nr_tower = len(args.gpu.split(','))
        num_gpu = len(args.gpu.split(','))
        trainer = SyncMultiGPUTrainerReplicated(num_gpu)
    else:
        trainer = SimpleTrainer()

    launch_train_with_config(train_conf, trainer=trainer)
Example 9
def train_pixelcnn_prior(params, checkpoint_dir, recover=True, force=False):
    if force and os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    logger.set_logger_dir(checkpoint_dir)

    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']

    train_ds, val_ds, sample_train, sample_val, sample_train_label, \
        sample_val_label = get_dataflow(
            dataset_params['path'], False,
            dataset_params['train_val_split'], trainer_params['batch_size'],
            trainer_params['num_parallel'])

    vqvae_checkpoint_path = trainer_params['vqvae_checkpoint_path']
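    # the VQ-VAE config.json is expected to sit next to its checkpoint file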
    vqvae_config_path = os.path.join(os.path.split(vqvae_checkpoint_path)[0],
                                     'config.json')
    model_params['vqvae_model_params'] = vqvae_config_path

    latent_shape = model_params['latent_shape']
    num_labels = model_params['num_labels']

    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    model = BasePixelCNNPrior.from_params(model_params)

    trainer = SyncMultiGPUTrainerParameterServer(
        gpus=trainer_params['num_gpus'], ps_device=None)
    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            SequentialSampling(trainer_params['num_examples_to_generate'],
                               latent_shape, num_labels, model,
                               os.path.join(checkpoint_dir, 'images')),
            Reconstruct(model, sample_train, sample_val,
                        os.path.join(checkpoint_dir, 'images'),
                        sample_train_label, sample_val_label),
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            InferenceRunner(input=val_ds,
                            infs=ScalarStats(['loss'])),
            MinSaver(monitor_stat='validation_loss'),
            CompressResource(os.path.join(checkpoint_dir, 'images'),
                             os.path.join(checkpoint_dir, 'images.zip')),
            RestoreWeights(vqvae_checkpoint_path),
            Notification('Training status', 'Complete')
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs']
    )
    launch_train_with_config(trainer_config, trainer)
Example 10
    def _dataflow(self, validation: bool = False) -> DataFlow:
        assert self.step is not None
        assert isinstance(self.step, df.Dataflows)

        logger.set_logger_dir(self.args.save, action="k")
        return df.get_data(
            self.step,
            self.args.validation
            if self.args.validation is not None else self.args.data,
            self.args.batch_size,
            n_proc=self.args.nproc,
            n_gpus=get_num_gpu(),
        )
Example 11
def train_image_embedding_softmax(params, checkpoint_dir, recover=True,
                                  force=False):
    if force and os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    logger.set_logger_dir(checkpoint_dir)

    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']

    train_ds, val_ds, _, _, _, _ = get_dataflow(
        dataset_params['path'], False, dataset_params['train_val_split'],
        trainer_params['batch_size'], trainer_params['num_parallel'])

    vqvae_checkpoint_path = trainer_params['vqvae_checkpoint_path']
    vqvae_config_path = os.path.join(os.path.split(vqvae_checkpoint_path)[0],
                                     'config.json')
    model_params['vqvae_model_params'] = vqvae_config_path

    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    model = BaseImageEmbedding.from_params(model_params)

    trainer = SyncMultiGPUTrainerParameterServer(
        gpus=trainer_params['num_gpus'], ps_device=None)
    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            InferenceRunner(input=val_ds, infs=[
                ScalarStats('loss'),
                ClassificationError('correct_prediction',
                                    'val-correct_prediction')]),
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            MinSaver(monitor_stat='val-correct_prediction'),
            RestoreWeights(vqvae_checkpoint_path),
            SendStat('Training status', [
                'loss', 'accuracy',
                'validation_loss', 'val-correct_prediction'],
                after_every=2),
            Notification('Training status', 'Complete')
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs']
    )
    launch_train_with_config(trainer_config, trainer)
Example 12
def train(args, logdir1, logdir2):
    # model
    model = Net2()

    # dataflow
    df = Net2DataFlow(hp.train2.data_path, hp.train2.batch_size)

    # set logger for event and model saver
    logger.set_logger_dir(logdir2)

    session_conf = tf.ConfigProto(
    #    log_device_placement=True,
        allow_soft_placement=True,
        gpu_options=tf.GPUOptions(
    #         allow_growth=True,
            per_process_gpu_memory_fraction=0.6,
        ),
    )

    session_inits = []
    ckpt2 = '{}/{}'.format(logdir2, args.ckpt) if args.ckpt else tf.train.latest_checkpoint(logdir2)
    if ckpt2:
        session_inits.append(SaverRestore(ckpt2))
    ckpt1 = tf.train.latest_checkpoint(logdir1)
    if ckpt1:
        session_inits.append(SaverRestore(ckpt1, ignore=['global_step']))
    train_conf = TrainConfig(
        model=model,
        data=QueueInput(df(n_prefetch=1000, n_thread=4)),
        callbacks=[
            # TODO save on prefix net2
            ModelSaver(checkpoint_dir=logdir2),
            # ConvertCallback(logdir2, hp.train2.test_per_epoch),
        ],
        max_epoch=hp.train2.num_epochs,
        steps_per_epoch=hp.train2.steps_per_epoch,
        session_init=ChainInit(session_inits),
        session_config=session_conf
    )
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        train_conf.nr_tower = len(args.gpu.split(','))

    #trainer = SyncMultiGPUTrainerParameterServer(hp.train2.num_gpu)
    trainer = SimpleTrainer()
    launch_train_with_config(train_conf, trainer=trainer)
Example 13
def local_crawler_main(auto_dir,
                       nr_gpu,
                       launch_log_dir,
                       n_parallel=10000,
                       num_init_use_all_gpu=2):
    """
    Args:
    auto_dir (str) : dir for looking for xxx.sh to run
    nr_gpu (int): Number of gpu on local contaienr
    launch_log_dir (str) : where the launcher logs stuff and hold tmp scripts.
    n_parallel (int) : maximum number of parallel jobs.
    num_init_use_all_gpu (int) : num of init jobs that will use all gpu
    """
    logger.set_logger_dir(launch_log_dir, action='d')
    launcher = os.path.basename(os.path.normpath(launch_log_dir))
    crawl_local_auto_scripts_and_launch(auto_dir, nr_gpu, launcher, n_parallel,
                                        num_init_use_all_gpu)
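
A minimal, hypothetical invocation of local_crawler_main based on the docstring above; the paths and counts below are placeholders rather than values from the source:

if __name__ == '__main__':
    # Hypothetical usage sketch: poll ./auto_jobs for *.sh scripts and launch them
    # on a 4-GPU container, logging under ./launch_logs/worker0 (placeholder paths).
    local_crawler_main(auto_dir='./auto_jobs',
                       nr_gpu=4,
                       launch_log_dir='./launch_logs/worker0',
                       n_parallel=8,
                       num_init_use_all_gpu=2)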
Example 14
def train(args, logdir):

    # model
    print("####model")
    model = Net1()

    # dataflow
    print("####dataflow")
    df = Net1DataFlow(hp.Train1.data_path, hp.Train1.batch_size)

    # set logger for event and model saver
    print("####logger")
    logger.set_logger_dir(logdir)

    print("####session_conf")
    session_conf = tf.ConfigProto(gpu_options=tf.GPUOptions(
        allow_growth=True, ),
                                  allow_soft_placement=True)

    print("####train_conf")
    train_conf = TrainConfig(
        model=model,
        data=QueueInput(df(n_prefetch=1000, n_thread=5)),
        callbacks=[
            ModelSaver(checkpoint_dir=logdir),
            # TODO EvalCallback()
        ],
        max_epoch=hp.Train1.num_epochs,
        steps_per_epoch=hp.Train1.steps_per_epoch,
        session_config=session_conf)
    print("####ckpt")
    ckpt = '{}/{}'.format(
        logdir, args.ckpt) if args.ckpt else tf.train.latest_checkpoint(logdir)
    if ckpt:
        train_conf.session_init = SaverRestore(ckpt)

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        train_conf.nr_tower = len(args.gpu.split(','))

    print("####trainer")
    trainer = SyncMultiGPUTrainerReplicated(hp.Train1.num_gpu)

    print("####launch_train_with_config")
    launch_train_with_config(train_conf, trainer=trainer)
Example 15
def main(_):
    args = parse_args()
    # set gpu/cpu mode
    if int(args.gpu_id) >= 0:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = ''

    checkpoint_dir = os.path.join('./checkpoints/', args.name)
    logger.set_logger_dir(checkpoint_dir)

    # set up deblur models
    M = model.DEBLUR(args)

    ds_train = get_data(args.dataroot,
                        phase='train',
                        crop_size=args.cropSize,
                        batch_size=args.batchSize)
    ds_val = get_data(args.dataroot,
                      phase='val',
                      crop_size=args.cropSize,
                      batch_size=args.batchSize)

    trainer = SeparateGANTrainer(ds_train, M, g_period=6)
    trainer.train_with_defaults(
        callbacks=[
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            ScheduledHyperParamSetter('learning_rate',
                                      [(300, args.learning_rate),
                                       (args.max_epoch, 0)],
                                      interp='linear'),
            InferenceRunner(ds_val, [
                ScalarStats('PSNR_BASE'),
                ScalarStats('PSNR_2'),
                ScalarStats('PSNR_IMPRO2'),
                ScalarStats('pixel_loss2'),
                ScalarStats('feature_loss2')
            ])
        ],
        session_init=SaverRestore(checkpoint_dir +
                                  '/model-431249.data-00000-of-00001')
        if args.continue_train else None,
        starting_epoch=1,
        steps_per_epoch=args.steps_per_epoch,
        max_epoch=args.max_epoch)
Example 16
def train_vae(params, checkpoint_dir, recover=True, force=False):
    if force and os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    logger.set_logger_dir(checkpoint_dir)

    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']

    train_ds, val_ds, sample_train, sample_val, _, _ = \
        get_dataflow(dataset_params['path'],
                     dataset_params['binarizer'],
                     dataset_params['train_val_split'],
                     trainer_params['batch_size'],
                     trainer_params['num_parallel'])

    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    latent_dim = model_params['latent_dim']
    model = BaseVAE.from_params(model_params)

    trainer = SyncMultiGPUTrainerParameterServer(
        gpus=trainer_params['num_gpus'], ps_device=None)
    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            Sampling(model, trainer_params['num_examples_to_generate'],
                     latent_dim, os.path.join(checkpoint_dir, 'images')),
            Reconstruct(model, sample_train, sample_val,
                        os.path.join(checkpoint_dir, 'images')),
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            InferenceRunner(input=val_ds,
                            infs=ScalarStats(['avg_logpx_z', 'neg_elbo'])),
            MinSaver(monitor_stat='validation_neg_elbo'),
            CompressResource(os.path.join(checkpoint_dir, 'images'),
                             os.path.join(checkpoint_dir, 'images.zip')),
            Notification('Training status', 'Complete')
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs']
    )
    launch_train_with_config(trainer_config, trainer)
Example 17
def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_path',
                        help='Path to train csv.',
                        default='./data/ljspeech-processed/train.txt')
    parser.add_argument('--test_path',
                        help='Path to test csv.',
                        default='./data/ljspeech-processed/test.txt')
    parser.add_argument('--save_dir',
                        help='Directory to save logs and model checkpoints',
                        default=os.path.join('save', 'wavenet', date_str()))
    parser.add_argument('--load_path',
                        help='Path of the model checkpoint to load')
    parser.add_argument(
        '--summary_freq',
        help=
        'Frequency (in train steps) with which to write tensorboard summaries',
        default=20,
        type=int)
    parser.add_argument('--steps_per_epoch',
                        help='Steps per epoch, defaults to the batch size',
                        default=None,
                        type=int)
    parser.add_argument('--skip_inferencing',
                        help='Whether or not to skip inferencing after epochs',
                        action='store_true')
    parser.add_argument('--gpu', help='Which GPU to use')
    parser.add_argument('--n_threads',
                        help='The number of threads to read and process data',
                        default=2,
                        type=int)
    parser.add_argument('--resume_lr',
                        help='Resume the learning rate from the loaded run',
                        action='store_true')

    args = parser.parse_args()

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    set_logger_dir(args.save_dir)

    return args
Example 18
def train(args, logdir):
    # model
    model = Net1()

    # dataflow
    TIMIT_TRAIN_WAV = 'TIMIT/TRAIN/*/*/*.npz'
    TIMIT_TEST_WAV = 'TIMIT/TEST/*/*/*.npz'

    print(os.path.join(hp.train1.preproc_data_path, args.case, TIMIT_TRAIN_WAV))
    print(os.path.join(hp.train1.preproc_data_path, args.case, TIMIT_TEST_WAV))

    df = Net1DataFlow(os.path.join(hp.train1.preproc_data_path, args.case, TIMIT_TRAIN_WAV), hp.train1.batch_size)
    df_test = Net1DataFlow(os.path.join(hp.train1.preproc_data_path, args.case, TIMIT_TEST_WAV), hp.train1.batch_size)

    # set logger for event and model saver
    logger.set_logger_dir(logdir)
    train_conf = AutoResumeTrainConfig(
        model=model,
        data=QueueInput(df(n_prefetch=1000, n_thread=8)),
        callbacks=[
            ModelSaver(checkpoint_dir=logdir),
            InferenceRunner(df_test(n_prefetch=1),
                            ScalarStats(['net1/eval/loss', 'net1/eval/acc'],prefix='')),
        ],
        max_epoch=hp.train1.num_epochs,
        steps_per_epoch=hp.train1.steps_per_epoch,
        #session_config=session_conf
    )
    ckpt = '{}/{}'.format(logdir, args.ckpt) if args.ckpt else tf.train.latest_checkpoint(logdir)

    if ckpt:
        train_conf.session_init = SaverRestore(ckpt)

    if hp.default.use_gpu == True:
        os.environ['CUDA_VISIBLE_DEVICES'] = hp.default.gpu_list
        train_conf.nr_tower = len(hp.default.gpu_list.split(','))
        num_gpu = len(hp.default.gpu_list.split(','))
        trainer = SyncMultiGPUTrainerReplicated(num_gpu)
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
        trainer = SimpleTrainer()

    launch_train_with_config(train_conf, trainer=trainer)
Example 19
def train_image_embedding_triplet(params, checkpoint_dir, recover=True,
                                  force=False):
    if force and os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    logger.set_logger_dir(checkpoint_dir)

    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']

    train_ds = get_triplet_dataflow(
        dataset_params['path'], trainer_params['items_per_batch'],
        trainer_params['images_per_item'], trainer_params['num_parallel'])

    vqvae_checkpoint_path = trainer_params['vqvae_checkpoint_path']
    vqvae_config_path = os.path.join(os.path.split(vqvae_checkpoint_path)[0],
                                     'config.json')
    model_params['vqvae_model_params'] = vqvae_config_path

    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    model = BaseImageEmbedding.from_params(model_params)

    trainer = SyncMultiGPUTrainerParameterServer(
        gpus=trainer_params['num_gpus'], ps_device=None)
    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            MinSaver(monitor_stat='loss'),
            RestoreWeights(vqvae_checkpoint_path),
            SendStat('Training status', ['loss', 'pos_triplet_frac'],
                     after_every=2),
            Notification('Training status', 'Complete')
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs']
    )
    launch_train_with_config(trainer_config, trainer)
Example 20
def train(args, logdir):

    # model
    model = Net()

    # dataflow
    df = NetDataFlow(hp.train.data_path, hp.train.batch_size)

    # set logger for event and model saver
    logger.set_logger_dir(logdir)

    session_conf = tf.ConfigProto(
        gpu_options=tf.GPUOptions(
            allow_growth=True,
        ),)
    session_conf.gpu_options.per_process_gpu_memory_fraction = 0.45  # limit the fraction of GPU memory this process may claim

    train_conf = TrainConfig(
        model=model,
        data=QueueInput(df(n_prefetch=1000, n_thread=4)),
        callbacks=[
            ModelSaver(checkpoint_dir=logdir),
            # TODO EvalCallback()
        ],
        max_epoch=hp.train.num_epochs,
        steps_per_epoch=hp.train.steps_per_epoch,
        # session_config=session_conf
    )
    ckpt = '{}/{}'.format(logdir, args.ckpt) if args.ckpt else tf.train.latest_checkpoint(logdir)
    if ckpt:
        train_conf.session_init = SaverRestore(ckpt)

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        train_conf.nr_tower = len(args.gpu.split(','))

    trainer = SyncMultiGPUTrainerReplicated(hp.train.num_gpu)

    launch_train_with_config(train_conf, trainer=trainer)
Example 21
            temp = temp[keys[i]]
        temp[keys[-1]] = value

    # set GPU machine
    if config['gpu'] in [None, 'None', '']:
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
        num_gpu = 0
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = config['gpu']
        num_gpu = max(get_num_gpu(), 1)
    config['num_gpu'] = num_gpu

    # set log directory
    if config['logdir'] in [None, 'None', '']:
        logger.auto_set_dir()
    else:
        logger.set_logger_dir('train_log/' + config['logdir'], action='d')
    # save configuration
    with open(logger.get_logger_dir() + '/config.json', 'w') as outfile:
        json.dump(config, outfile)

    # get train config
    train_config = get_train_config(config)

    # train the model
    if num_gpu > 1:
        launch_train_with_config(train_config,
                                 SyncMultiGPUTrainerReplicated(num_gpu))
    else:
        launch_train_with_config(train_config, SimpleTrainer())
Example 22
        # manually build the graph with batch=1
        input_desc = [
            InputDesc(tf.float32, [1, 224, 224, 3], 'input'),
            InputDesc(tf.int32, [1], 'label')
        ]
        input = PlaceholderInput()
        input.setup(input_desc)
        with TowerContext('', is_training=False):
            model.build_graph(*input.get_input_tensors())
        model_utils.describe_trainable_vars()

        tf.profiler.profile(
            tf.get_default_graph(),
            cmd='op',
            options=tf.profiler.ProfileOptionBuilder.float_operation())
        logger.info("Note that TensorFlow counts flops in a different way from the paper.")
        logger.info("TensorFlow counts multiply+add as two flops, however the paper counts them "
                    "as 1 flop because it can be executed in one instruction.")
    else:
        if args.v2:
            name = "ShuffleNetV2-{}x".format(args.ratio)
        else:
            name = "ShuffleNetV1-{}x-g{}".format(args.ratio, args.group)
        logger.set_logger_dir(os.path.join('train_log', name))

        nr_tower = max(get_num_gpu(), 1)
        config = get_config(model, nr_tower)
        if args.load:
            config.session_init = get_model_loader(args.load)
        launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(nr_tower))
Example 23
              kernel_initializer=tf.keras.initializers.VarianceScaling(
                  scale=2.0, mode='fan_in'),
              kernel_regularizer=tf.keras.regularizers.l2(5e-5))(x)

    M = tf.keras.models.Model(input, x, name='resnet50')
    return M


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', help='ILSVRC dataset dir')
    parser.add_argument('--fake',
                        help='use fakedata to test or benchmark this model',
                        action='store_true')
    args = parser.parse_args()
    logger.set_logger_dir(os.path.join("train_log", "imagenet-resnet-keras"))

    tf.keras.backend.set_image_data_format('channels_first')

    num_gpu = get_num_gpu()
    if args.fake:
        df_train = FakeData([[64, 224, 224, 3], [64, 1000]],
                            5000,
                            random=False,
                            dtype='uint8')
        df_val = FakeData([[64, 224, 224, 3], [64, 1000]], 5000, random=False)
    else:
        batch_size = TOTAL_BATCH_SIZE // num_gpu
        assert args.data is not None
        df_train = get_imagenet_dataflow(args.data, 'train', batch_size,
                                         fbresnet_augmentor(True))
Example 24
    x = Flatten()(x)
    x = Dense(1000, activation='softmax', name='fc1000',
              kernel_initializer=tf.keras.initializers.VarianceScaling(
                  scale=2.0, mode='fan_in'),
              kernel_regularizer=tf.keras.regularizers.l2(5e-5))(x)

    M = tf.keras.models.Model(input, x, name='resnet50')
    return M


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', help='ILSVRC dataset dir')
    parser.add_argument('--fake', help='use fakedata to test or benchmark this model', action='store_true')
    args = parser.parse_args()
    logger.set_logger_dir(os.path.join("train_log", "imagenet-resnet-keras"))

    tf.keras.backend.set_image_data_format('channels_first')

    nr_gpu = get_nr_gpu()
    if args.fake:
        df_train = FakeData([[64, 224, 224, 3], [64, 1000]], 5000, random=False, dtype='uint8')
        df_val = FakeData([[64, 224, 224, 3], [64, 1000]], 5000, random=False)
    else:
        batch_size = TOTAL_BATCH_SIZE // nr_gpu
        assert args.data is not None
        df_train = get_imagenet_dataflow(
            args.data, 'train', batch_size, fbresnet_augmentor(True))
        df_val = get_imagenet_dataflow(
            args.data, 'val', batch_size, fbresnet_augmentor(False))
Example 25
    x = Flatten()(x)
    x = Dense(1000, activation='softmax', name='fc1000',
              kernel_initializer=tf.keras.initializers.VarianceScaling(
                  scale=2.0, mode='fan_in'),
              kernel_regularizer=tf.keras.regularizers.l2(5e-5))(x)

    M = tf.keras.models.Model(input, x, name='resnet50')
    return M


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', help='ILSVRC dataset dir')
    parser.add_argument('--fake', help='use fakedata to test or benchmark this model', action='store_true')
    args = parser.parse_args()
    logger.set_logger_dir("train_log/imagenet-resnet-keras")

    tf.keras.backend.set_image_data_format('channels_first')

    nr_gpu = get_nr_gpu()
    if args.fake:
        df_train = FakeData([[64, 224, 224, 3], [64, 1000]], 5000, random=False, dtype='uint8')
        df_val = FakeData([[64, 224, 224, 3], [64, 1000]], 5000, random=False)
    else:
        batch_size = TOTAL_BATCH_SIZE // nr_gpu
        assert args.data is not None
        df_train = get_imagenet_dataflow(
            args.data, 'train', batch_size, fbresnet_augmentor(True))
        df_val = get_imagenet_dataflow(
            args.data, 'val', batch_size, fbresnet_augmentor(False))
Example 26
def get_config(nr_tower, args):
    
    TOTAL_BATCH_SIZE = args.batch_size
    batchsize = TOTAL_BATCH_SIZE // nr_tower
    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batchsize))
    
    max_epoch = args.num_epochs
    lr = args.initial_learning_rate
    num_epochs_before_decay = args.num_epochs_before_decay
    decay_factor = args.decay_factor
    num_decay = int(max_epoch/num_epochs_before_decay)
    
    if args.dataset_mode == 'train_fine':
        dataset_size = 2975
    elif args.dataset_mode == 'validation_fine':
        dataset_size = 500
    elif args.dataset_mode == 'train_patches':
        dataset_size = 2975*8 #23800
    elif args.dataset_mode == 'validation_patches':
        dataset_size = 500*8
    elif args.dataset_mode == 'train_coarse':
        dataset_size = 14440
    elif args.dataset_mode == 'combine_patches':
        dataset_size = 23800 + 3000
    elif args.dataset_mode == 'combine_val_patches':
        dataset_size = 1000
        
    steps_per_epoch = int(dataset_size / TOTAL_BATCH_SIZE)
    max_iter = max_epoch * steps_per_epoch
    
    schedule=[]
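    # build a list of (epoch, learning-rate) pairs for ScheduledHyperParamSetter below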
    if args.lr_type == 'poly':
        end_lr = 2e-5
        for i in range(max_epoch):
            ep = i 
            val = (lr - end_lr) * np.power((1 - 1.*i / num_epochs_before_decay), 0.9) + end_lr
            schedule.append((ep, val))
    if args.lr_type == 'exponential_decay':
        for i in range(num_decay):
            ep = i * num_epochs_before_decay
            val = lr * np.power(decay_factor, i)
            schedule.append((ep, val))
    
    model = ShuffleTensorpack(args, sub_rate=args.sub_rate, batchsize=batchsize)
    
    dataset_train = get_city_dataflow(args.dataset_mode, batchsize, args.sub_rate, 
                                      is_train=True, random_crop=args.random_crop)
      
    logger.set_logger_dir(os.path.join('log', args.exp_name+'_'+str(datetime.date.today())))
    checkpoint_dir = os.path.join('log', args.exp_name+'_'+str(datetime.date.today()),'save')
    infs = [ScalarStats(names='mean_iou', prefix='val')] # val_mean_IoU
    callbacks = [
        PeriodicTrigger(ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),every_k_steps=250),       
        ScheduledHyperParamSetter('learning_rate', schedule=schedule),
        EstimatedTimeLeft(),
        MergeAllSummaries(period=250),
        ]
    
    if args.save_val_max is True:
        dataset_val = get_city_dataflow(args.dataset_val_mode, TOTAL_BATCH_SIZE, args.sub_rate, 
                                        is_train=False, random_crop=args.random_crop)     
        callbacks.extend([PeriodicTrigger(DataParallelInferenceRunner(dataset_val, infs, [0,1,2,3]),every_k_steps=250),
                         PeriodicTrigger(MaxSaver(monitor_stat='val_mean_iou', checkpoint_dir=checkpoint_dir),every_k_steps=250)])
    
    return AutoResumeTrainConfig(model=model,
                                dataflow=dataset_train,
                                callbacks=callbacks,
                                steps_per_epoch=steps_per_epoch,
                                max_epoch=max_epoch,)
Example 27
def mvsnet_main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--logdir', help='path to save model ckpt', default='.')
    parser.add_argument('--data', help='path to dataset', required=True)
    parser.add_argument('--load', help='load a model for training or evaluation')
    parser.add_argument('--exp_name', help='model ckpt name')
    parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.')
    parser.add_argument('--mode', '-m', help='train / val / test', choices=['train', 'val', 'test', 'fake'])
    parser.add_argument('--out', default='./',
                        help='output path for evaluation and test, default to current folder')
    parser.add_argument('--batch', default=1, type=int, help="Batch size per tower.")
    parser.add_argument('--max_d', help='depth num for MVSNet', required=True, type=int)
    parser.add_argument('--max_h', help='max image height for MVSNet', required=True, type=int)
    parser.add_argument('--max_w', help='max image width for MVSNet', required=True, type=int)
    parser.add_argument('--interval_scale', required=True, type=float)
    parser.add_argument('--view_num', required=True, type=int)
    parser.add_argument('--refine', default=False)
    parser.add_argument('--feature', help='feature extraction branch', choices=['uninet', 'unet'], default='unet')
    parser.add_argument('--threshold', type=float)
    parser.add_argument('--regularize', default='3DCNN', choices=['3DCNN', 'GRU'])

    args = parser.parse_args()

    if args.feature == 'unet':
        feature_branch_function = unet_feature_extraction_branch
    else:
        feature_branch_function = uni_feature_extraction_branch

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    if args.mode == 'train' or args.mode == 'fake':

        model = MVSNet(depth_num=args.max_d, bn_training=None, bn_trainable=None, batch_size=args.batch,
                       branch_function=feature_branch_function, is_refine=args.refine, height=args.max_h,
                       width=args.max_w, view_num=args.view_num, regularize_type=args.regularize)

        if args.exp_name is None:
            if not args.refine:
                exp_name = '{}-{}-b{}-{}-{}-{}-no-refine'.format(args.max_d, args.interval_scale, args.batch, os.path.basename(args.data),
                                                                args.feature,
                                            datetime.datetime.now().strftime("%m%d-%H%M"))
            else:
                exp_name = '{}-{}-b{}-{}-{}-{}-refine'.format(args.max_d, args.interval_scale, args.batch,
                                                    os.path.basename(args.data),
                                                    args.feature,
                                                    datetime.datetime.now().strftime("%m%d-%H%M"))
        else:
            exp_name = args.exp_name
        logger.set_logger_dir(os.path.join(args.logdir, exp_name))
        config = get_train_conf(model, args)
        if args.load:
            config.session_init = get_model_loader(args.load)
        gpus = len(args.gpu.split(',')) if args.gpu else 1
        logger.info('num of gpus to use: {}'.format(gpus))
        if gpus > 1:
            trainer = SyncMultiGPUTrainerParameterServer(gpus)
            # trainer = SyncMultiGPUTrainerReplicated(gpus, mode='cpu')
        else:
            trainer = SimpleTrainer()
        # trainer = SimpleTrainer()
        launch_train_with_config(config, trainer)

    elif args.mode == 'val':
        assert args.load, 'in eval mode, you have to specify a trained model'
        assert args.out, 'in eval mode, you have to specify the output dir path'
        logger.set_logger_dir(args.out)
        model = MVSNet(depth_num=args.max_d, bn_training=None, bn_trainable=None, batch_size=args.batch,
                       branch_function=feature_branch_function, is_refine=args.refine, height=args.max_h,
                       width=args.max_w, view_num=args.view_num)
        sess_init = get_model_loader(args.load)
        avg_loss, avg_less_three_acc, avg_less_one_acc = evaluate(model, sess_init, args)
        logger.info(f'val loss: {avg_loss}')
        logger.info(f'val less three acc: {avg_less_three_acc}')
        logger.info(f'val less one acc: {avg_less_one_acc}')

    else:  # test
        assert args.load, 'in eval mode, you have to specify a trained model'
        assert args.out, 'in eval mode, you have to specify the output dir path'
        assert args.data, 'in eval mode, you have to specify the data dir path'
        logger.set_logger_dir(args.out)
        model = MVSNet(depth_num=args.max_d, bn_training=None, bn_trainable=None, batch_size=args.batch,
                       branch_function=feature_branch_function, is_refine=args.refine, height=args.max_h,
                       width=args.max_w, view_num=args.view_num, regularize_type=args.regularize)
        sess_init = get_model_loader(args.load)
        test(model, sess_init, args)
Example 28
                predictions = inference(pred, x_test, tta=False, mode='test')
                submit(predictions, fnames)

    else:
        train_df = pd.read_csv(os.path.join('/data/kaggle/HPA', 'train.csv'))
        num_training = len(train_df)
        if config.EXTRA:
            extra_df = pd.read_csv(
                os.path.join('/data/kaggle/HPA',
                             'HPAv18RGBY_WithoutUncertain_wodpl.csv'))
            num_training += len(extra_df)

        num_training = int(num_training * 0.85 * 0.8)
        print("num_training", num_training)

        logger.set_logger_dir(args.logdir)
        training_callbacks = [
            ModelSaver(max_to_keep=100, keep_checkpoint_every_n_hours=1),
            GPUUtilizationTracker(),
        ]
        # heuristic setting for baseline
        # 105678 train+extra

        stepnum = num_training // (config.BATCH * get_nr_gpu()) + 1
        max_epoch = 50

        if config.FREEZE:
            max_epoch = 4
            TRAINING_SCHEDULE = ScheduledHyperParamSetter(
                'learning_rate', [(0, 1e-3)])
        else:
Example 29
        batch = 128    # something that can run on one gpu
        ds = get_data('val', batch)
        eval_classification(model, SmartInit(args.load), ds)
    elif args.flops:
        # manually build the graph with batch=1
        with TowerContext('', is_training=False):
            model.build_graph(
                tf.placeholder(tf.float32, [1, 224, 224, 3], 'input'),
                tf.placeholder(tf.int32, [1], 'label')
            )
        model_utils.describe_trainable_vars()

        tf.profiler.profile(
            tf.get_default_graph(),
            cmd='op',
            options=tf.profiler.ProfileOptionBuilder.float_operation())
        logger.info("Note that TensorFlow counts flops in a different way from the paper.")
        logger.info("TensorFlow counts multiply+add as two flops, however the paper counts them "
                    "as 1 flop because it can be executed in one instruction.")
    else:
        if args.v2:
            name = "ShuffleNetV2-{}x".format(args.ratio)
        else:
            name = "ShuffleNetV1-{}x-g{}".format(args.ratio, args.group)
        logger.set_logger_dir(os.path.join('train_log', name))

        nr_tower = max(get_num_gpu(), 1)
        config = get_config(model, nr_tower)
        config.session_init = SmartInit(args.load)
        launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(nr_tower))
Example 30
    hp.set_hparam_yaml(case)
    if r:
        remove_all_files(hp.logdir)

    # model
    model = IAFVocoder(batch_size=hp.train.batch_size, length=hp.signal.length)

    # dataset
    dataset = Dataset(hp.data_path,
                      hp.train.batch_size,
                      length=hp.signal.length)
    print('dataset size is {}'.format(len(dataset.wav_files)))

    # set logger for event and model saver
    logger.set_logger_dir(hp.logdir)

    train_conf = TrainConfig(
        model=model,
        data=TFDatasetInput(dataset()),
        callbacks=[
            ModelSaver(checkpoint_dir=hp.logdir),
            RunUpdateOps()  # for batch norm, exponential moving average
            # TODO GenerateCallback()
        ],
        max_epoch=hp.train.num_epochs,
        steps_per_epoch=hp.train.steps_per_epoch,
    )
    ckpt = '{}/{}'.format(
        hp.logdir, ckpt) if ckpt else tf.train.latest_checkpoint(hp.logdir)
    if ckpt:
Example 31
    parser.add_argument('--num-layers', type=int, default=2)
    parser.add_argument('--batch-size', type=int, default=20)
    parser.add_argument('--keep-prob', type=float, default=0.5)
    parser.add_argument('--init-lr', type=float, default=1.0)
#    parser.add_argument('--warmup-epochs', type=int, default=6)
    parser.add_argument('--epochs', type=int, default=40)
    parser.add_argument('--vocab-size', type=int)
    global args
    args = parser.parse_args()
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)

    config = get_config()
    config.session_init = SmartInit(args.load)
    global trainer
    trainer = HorovodTrainer()
    if not trainer.is_chief:
        os.environ['WANDB_MODE'] = 'dryrun'
    elif not args.logdir:
        logger.auto_set_dir(action="d")
    else:
        logger.set_logger_dir(args.logdir, action="d")
    wandb_id = os.environ.get('WANDB_ID', None)
    if wandb_id is None:
        wandb.init(config=vars(args))
    else:
        wandb.init(config=vars(args), id=f"{wandb_id}{trainer._rank}")
    wandb.config.update({'SLURM_JOB_ID': os.environ.get('SLURM_JOB_ID', None)})
    wandb.tensorboard.patch(save=False)
    launch_train_with_config(config, trainer)
Example 32
def get_config(train_or_test, train_config=None, load_model=None):
    isTrain = train_or_test == 'train'
    if args.model_name is None:
        args.model_name = "no_name"
    log_dir = 'train_log/' + args.model_name
    logger.set_logger_dir(log_dir, 'n')

    dataset_train = 1
    dataset_val = None
    steps_per_epoch = 0

    # prepare dataset
    # dataflow structure [im, label] in parralel
    if isTrain:
        print(">>>>>> Loading training and validation sets")
        dataset_train = get_data('train',
                                 image_size=args.image_size,
                                 scale_size=args.scale_size,
                                 scale=args.scale,
                                 multi_crop=args.multi_crop,
                                 crop_per_case=args.crop_per_case,
                                 normalize=args.aug_norm,
                                 shuffle=True)

        # steps per epoch = |data| / (batch size * num gpu); optionally divide by
        # args.num_gpu when multiprocessing is enabled
        steps_per_epoch = dataset_train.size()

        dataset_val = get_data('val',
                               image_size=args.image_size,
                               scale_size=args.scale_size,
                               scale=args.scale,
                               multi_crop=args.multi_crop,
                               crop_per_case=args.crop_per_case,
                               normalize=args.aug_norm,
                               shuffle=False)

    drop_rate = args.drop_out if args.drop_out is not None else 0.0

    print(" >>>>>>>>>> Steps Per Epoch: ", steps_per_epoch)
    print(">>>>>> Constructing Neural Network...")

    denseModel = Model(depth=args.depth,
                       image_size=args.scale_size,
                       lr_init=args.lr_init,
                       kernels=args.kernels,
                       kernel_size=args.kernel_size,
                       expansion=args.expansion,
                       class_0=args.class_0,
                       class_1=args.class_1,
                       drop_rate=drop_rate,
                       drop_pattern=args.drop_pattern,
                       bn_momentum=args.bn_momentum,
                       skip_norm=args.skip_norm,
                       train_or_test=isTrain)

    if isTrain:
        print(
            "Setting up training configuration: callbacks, validation checks and hyperparameter scheduling."
        )
        return TrainConfig(
            dataflow=dataset_train,
            callbacks=[
                MovingAverageSummary(),
                ModelSaver(),  # Record state graph at intervals during epochs
                InferenceRunner(
                    input=dataset_val,
                    infs=[ScalarStats('cost'),
                          ClassificationError()],
                ),
                MinSaver(
                    'validation_error'
                ),  #save model with min val-error, must be after inference
                #ScheduledHyperParamSetter('learning_rate',
                #                          [(args.drop_0, args.scale_lr*args.lr_0),
                #                           (args.drop_1,  args.scale_lr*args.lr_1)]),
                #HyperParamSetterWithFunc('learning_rate',
                #                         lambda e, x: x * float(0.1) if e % 15 == 0 and e > args.drop_2 else x),# (1+e)/(2*20) #ScheduledHyperParamSetter('learning_rate',[(args.drop_0, args.scale_lr*args.lr_0), (args.drop_1,  args.scale_lr*args.lr_1), (args.drop_2,  args.scale_lr*args.lr_2), (args.drop_3,  args.scale_lr*args.lr_3)]), # denote current hyperparameter)
                StatMonitorParamSetter('learning_rate',
                                       'validation_error',
                                       lambda x: x * 0.1,
                                       threshold=1e-15,
                                       last_k=20),
                MergeAllSummaries()
            ],
            model=denseModel,
            session_creator=None,
            session_config=train_config,
            steps_per_epoch=steps_per_epoch,
            max_epoch=args.max_epoch,
        )
    else:
        """
      Predictive model configuration for testing 
      and classifying.
      """
        class TestParamSetter(Callback):
            #def _before_run(self, _):
            #   return tf.train.SessionRunArgs(fetches=[],feed_dict={'PlaceholderWithDefault_1:0':1.0, 'PlaceholderWithDefault_2:0':False})#'drop_rate:0':1, 'train_or_test:0':False
            def _setup_graph(self):
                self._drop_rate = [
                    k for k in tf.global_variables()
                    if k.name == 'PlaceholderWithDefault_1:0'
                ][0]
                self._train_or_test = [
                    k for k in tf.global_variables()
                    if k.name == 'PlaceholderWithDefault_2:0'
                ][0]

            def _trigger_step(self):
                self._drop_rate.load(1.0)
                self._train_or_test.load(False)

        print(">>>>>> Constructing prediction variables.")
        return PredictConfig(
            model=denseModel,
            input_names=['input', 'label'],  #denseModel._get_inputs(),
            output_names=[
                'output', 'train_error', 'cross_entropy_loss', 'input'
            ],
        )
Example 33
def get_s2b_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_dir_bitmoji',
                        help='Directory of bitmoji train data',
                        default='./data/bitmoji/train')
    parser.add_argument('--test_dir_bitmoji',
                        help='Directory of bitmoji test data',
                        default='./data/bitmoji/test')
    parser.add_argument('--train_dir_face',
                        help='Directory of real face train data',
                        default='./data/celeba/train')
    parser.add_argument('--test_dir_face',
                        help='Directory of real face test data',
                        default='./data/celeba/test')
    parser.add_argument('--logger_dir',
                        help='Directory to save logs and model checkpoints',
                        default=os.path.join('save', 's2b', date_str()))
    parser.add_argument('--load_path',
                        help='Path of the model checkpoint to load',
                        default=os.path.join('save', 's2b', 'default',
                                             'model'))
    parser.add_argument('--epochs',
                        help='Number of epochs to train',
                        default=100000,
                        type=int)
    parser.add_argument('--batch_size',
                        help='Minibatch size',
                        default=128,
                        type=int)
    parser.add_argument('--lr', help='Learning rate', default=1e-4, type=float)
    parser.add_argument(
        '--decay',
        help=
        'The multiple by which to decay learning rate, instance noise stddev '
        'and discriminator uncertainty threshhold every epoch',
        default=0.98,
        type=float)
    parser.add_argument('--resume_lr',
                        help='Resume the learning rate from the previous run',
                        action='store_true')
    parser.add_argument(
        '--keep_prob',
        help='The keep probability for dropout (always 1 for testing)',
        default=0.5,
        type=float)
    parser.add_argument(
        '--summary_freq',
        help='Frequency (in steps) with which to write tensorboard summaries',
        default=20,
        type=int)
    parser.add_argument('--gpu', help='Which GPU to use')
    parser.add_argument('--num_threads',
                        help='The number of threads to read and process data',
                        default=32,
                        type=int)

    args = parser.parse_args()

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    set_logger_dir(args.logger_dir)

    return args