Ejemplo n.º 1
0
def train(args, logdir1, logdir2):
    """Train ``Net2`` on one or more GPUs, resuming from existing checkpoints.

    Weights are restored first from ``logdir2`` (either the checkpoint named
    by ``args.ckpt`` or the latest one found there) and then from the latest
    checkpoint in ``logdir1``, ignoring its ``global_step``.

    Args:
        args: Parsed options. ``args.gpu`` is a required comma-separated
            string of integer GPU ids (e.g. ``"0,1"``); ``args.ckpt`` is an
            optional checkpoint file name relative to ``logdir2``.
        logdir1: Directory searched for the latest checkpoint to chain-load.
        logdir2: Directory used for preprocessing output, logs and the
            checkpoints saved by this run.

    Raises:
        ValueError: If ``args.gpu`` is empty or contains a non-integer id.
    """
    # The trainer used below is multi-GPU only, so a GPU list is mandatory.
    # Validate it before any expensive work instead of failing later with a
    # NameError on an unassigned local.
    if not args.gpu:
        raise ValueError(
            "args.gpu must be a comma-separated list of GPU ids, e.g. '0,1'")
    gpu_ids = [int(gpu_id) for gpu_id in args.gpu.split(',')]
    num_gpu = len(gpu_ids)

    # model
    model = Net2()

    preprocessing(data_path, logdir2)

    # dataflow
    df = Net2DataFlow(data_path, hp.train2.batch_size)

    # set logger for event and model saver
    logger.set_logger_dir(logdir2)

    dataset_size = len(glob.glob(data_path + '/wav/*.wav'))
    steps_per_epoch = dataset_size // hp.train2.batch_size
    print("\t\tdata_path : ", data_path)
    print("\t\tDataset Size : ", dataset_size)
    print("\t\tBatch Size : ", hp.train2.batch_size)
    print("\t\tSteps per epoch : ", steps_per_epoch)
    # NOTE(review): presumably a pause so the summary above can be read
    # before training output starts - confirm it is still wanted.
    from time import sleep
    sleep(10)

    # Restore order matters: logdir2 first, then logdir1 on top of it.
    session_inits = []
    if args.ckpt:
        ckpt2 = '{}/{}'.format(logdir2, args.ckpt)
    else:
        ckpt2 = tf.train.latest_checkpoint(logdir2)
    if ckpt2:
        session_inits.append(SaverRestore(ckpt2))
    ckpt1 = tf.train.latest_checkpoint(logdir1)
    if ckpt1:
        session_inits.append(SaverRestore(ckpt1, ignore=['global_step']))

    train_conf = AutoResumeTrainConfig(
        model=model,
        data=QueueInput(df(n_prefetch=1000, n_thread=8)),
        callbacks=[
            # TODO save on prefix net2
            ModelSaver(checkpoint_dir=logdir2),
        ],
        max_epoch=hp.train2.num_epochs,
        steps_per_epoch=steps_per_epoch,
        session_init=ChainInit(session_inits))

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    train_conf.nr_tower = num_gpu

    # Once CUDA_VISIBLE_DEVICES is set, the visible devices are renumbered
    # starting at 0, so the trainer must be given the device count rather
    # than the original physical ids (which may no longer exist).
    trainer = SyncMultiGPUTrainerReplicated(num_gpu)

    launch_train_with_config(train_conf, trainer=trainer)
Ejemplo n.º 2
0
def train(args, logdir):
    """Train ``Net1`` and report evaluation loss/accuracy.

    Args:
        args: Parsed options. ``args.ckpt`` optionally names a checkpoint
            file inside ``logdir``; ``args.gpu`` is an optional
            comma-separated string of GPU ids.
        logdir: Directory for logs and saved checkpoints; also searched for
            the latest checkpoint when ``args.ckpt`` is not given.
    """
    net = Net1()

    # Preprocess the training set first, then the test set.
    for path in (data_path, test_path):
        preprocessing(path)

    # dataflow
    train_flow = Net1DataFlow(data_path, hp.train1.batch_size)
    eval_flow = Net1DataFlow(test_path, hp.train1.batch_size)

    # set logger for event and model saver
    logger.set_logger_dir(logdir)

    # cv test code
    # https://github.com/tensorpack/tensorpack/blob/master/examples/boilerplate.py

    train_input = QueueInput(
        train_flow(n_prefetch=hp.train1.batch_size * 10, n_thread=1))
    saver = ModelSaver(checkpoint_dir=logdir)
    evaluator = InferenceRunner(
        eval_flow(n_prefetch=1),
        ScalarStats(['net1/eval/loss', 'net1/eval/acc'], prefix=''))
    train_conf = AutoResumeTrainConfig(
        model=net,
        data=train_input,
        callbacks=[saver, evaluator],
        max_epoch=hp.train1.num_epochs,
        steps_per_epoch=hp.train1.steps_per_epoch,
    )

    # An explicitly requested checkpoint wins over the latest one on disk.
    if args.ckpt:
        restore_from = '{}/{}'.format(logdir, args.ckpt)
    else:
        restore_from = tf.train.latest_checkpoint(logdir)
    # NOTE(review): this configured default is overwritten when args.gpu is
    # set and unused otherwise - confirm whether it should drive the trainer.
    num_gpu = hp.train1.num_gpu

    if restore_from:
        train_conf.session_init = SaverRestore(restore_from)

    if not args.gpu:
        trainer = SimpleTrainer()
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        num_gpu = len(args.gpu.split(','))
        train_conf.nr_tower = num_gpu
        trainer = SyncMultiGPUTrainerReplicated(num_gpu)

    launch_train_with_config(train_conf, trainer=trainer)
Ejemplo n.º 3
0
def get_config(model, fake=False):
    """Build the training configuration for ``model``.

    The total batch size is read from the module-level ``args.batch`` and
    split evenly across the available towers (at least one).

    Args:
        model: Model description handed to ``AutoResumeTrainConfig``.
        fake: When true, train on synthetic ``FakeData`` with no callbacks
            and a short 100-step epoch; otherwise use the real train/val
            data with checkpointing, a learning-rate schedule and
            validation-error inference.

    Returns:
        An ``AutoResumeTrainConfig`` running for 60 epochs.

    Raises:
        AssertionError: If ``args.batch`` is not divisible by the number of
            towers.
    """
    nr_tower = max(get_num_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(
        nr_tower, batch))
    if batch < 32 or batch > 64:
        logger.warn(
            "Batch size per tower not in [32, 64]. This probably will lead to worse accuracy than reported."
        )
    if fake:
        data = QueueInput(
            FakeData(
                [[batch, 224, 224, 3], [batch], [batch, 224, 224, 3], [batch]],
                1000,
                random=False,
                dtype='uint8'))
        callbacks = []
    else:
        data = QueueInput(get_data('train', batch))

        # The base learning rate scales linearly with the total batch size,
        # relative to 0.1 at batch 256.
        START_LR = 0.1
        BASE_LR = START_LR * (args.batch / 256.0)
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter('learning_rate',
                                      [(0, min(START_LR, BASE_LR)),
                                       (30, BASE_LR * 1e-1),
                                       (45, BASE_LR * 1e-2),
                                       (55, BASE_LR * 1e-3)]),
        ]
        if BASE_LR > START_LR:
            # Large batches: ramp linearly from START_LR up to BASE_LR
            # between schedule points 0 and 5.
            callbacks.append(
                ScheduledHyperParamSetter('learning_rate', [(0, START_LR),
                                                            (5, BASE_LR)],
                                          interp='linear'))

        infs = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        dataset_val = get_data('val', batch)
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))

    return AutoResumeTrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        # Follow the ``fake`` parameter (not the global ``args.fake``) so the
        # epoch length always matches the data source chosen above.
        steps_per_epoch=100 if fake else 1280000 // args.batch,
        max_epoch=60,
    )
Ejemplo n.º 4
0
def get_config(model, scales, distill=False, fake=False, data_aug=True):
    """Assemble the training configuration for a multi-scale model.

    Args:
        model: Model description handed to ``AutoResumeTrainConfig``.
        scales: Iterable of integer scales; a top-1 and a top-5 validation
            error monitor is registered for each one.
        distill: When true, also monitor the ``scale_ensemble`` top-1/top-5
            errors.
        fake: When true, train on synthetic ``FakeData`` (fixed batch of 64
            per tower) without any callbacks.
        data_aug: Forwarded to ``get_data``; also enables the scheduled
            learning-rate decay and selects 120 instead of 64 epochs.

    Returns:
        The resulting ``AutoResumeTrainConfig``.
    """
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if not fake:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))
        dataset_train = get_data('train', batch, data_aug)
        dataset_val = get_data('val', batch, data_aug)

        callbacks = [ModelSaver()]
        if data_aug:
            lr_schedule = [(30, 1e-2),
                           (60, 1e-3),
                           (85, 1e-4),
                           (95, 1e-5),
                           (105, 1e-6)]
            callbacks.append(
                ScheduledHyperParamSetter('learning_rate', lr_schedule))
        callbacks.append(HumanHyperParamSetter('learning_rate'))

        # One top-1 and one top-5 error monitor per scale, in scale order.
        infs = []
        for scale in scales:
            infs.extend([
                ClassificationError('wrong-scale%03d-top1' % scale,
                                    'val-error-scale%03d-top1' % scale),
                ClassificationError('wrong-scale%03d-top5' % scale,
                                    'val-error-scale%03d-top5' % scale),
            ])
        if distill:
            infs.extend([
                ClassificationError('wrong-scale_ensemble-top1',
                                    'val-error-scale_ensemble-top1'),
                ClassificationError('wrong-scale_ensemble-top5',
                                    'val-error-scale_ensemble-top5'),
            ])

        if nr_tower > 1:
            # multi-GPU inference (with mandatory queue prefetch)
            runner = DataParallelInferenceRunner(dataset_val, infs,
                                                 list(range(nr_tower)))
        else:
            # single-GPU inference with queue prefetch
            runner = InferenceRunner(QueueInput(dataset_val), infs)
        callbacks.append(runner)
    else:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, 224, 224, 3], [64]],
                                 1000,
                                 random=False,
                                 dtype='uint8')
        callbacks = []

    if TOTAL_BATCH_SIZE == 256:
        steps_per_epoch = 5000
    else:
        steps_per_epoch = 10000
    if data_aug:
        max_epoch = 120
    else:
        max_epoch = 64

    return AutoResumeTrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=steps_per_epoch,
        max_epoch=max_epoch,
        nr_tower=nr_tower)
Ejemplo n.º 5
0
def train(args, logdir):
    """Train ``Net1`` on preprocessed TIMIT data and report eval metrics.

    Args:
        args: Parsed options. ``args.case`` selects the sub-directory of
            ``hp.train1.preproc_data_path`` to read from; ``args.ckpt``
            optionally names a checkpoint file inside ``logdir``.
        logdir: Directory for logs and saved checkpoints; also searched for
            the latest checkpoint when ``args.ckpt`` is not given.
    """
    net = Net1()

    # dataflow
    TIMIT_TRAIN_WAV = 'TIMIT/TRAIN/*/*/*.npz'
    TIMIT_TEST_WAV = 'TIMIT/TEST/*/*/*.npz'

    train_glob = os.path.join(
        hp.train1.preproc_data_path, args.case, TIMIT_TRAIN_WAV)
    test_glob = os.path.join(
        hp.train1.preproc_data_path, args.case, TIMIT_TEST_WAV)
    print(train_glob)
    print(test_glob)

    train_flow = Net1DataFlow(train_glob, hp.train1.batch_size)
    eval_flow = Net1DataFlow(test_glob, hp.train1.batch_size)

    # set logger for event and model saver
    logger.set_logger_dir(logdir)

    train_input = QueueInput(train_flow(n_prefetch=1000, n_thread=8))
    saver = ModelSaver(checkpoint_dir=logdir)
    evaluator = InferenceRunner(
        eval_flow(n_prefetch=1),
        ScalarStats(['net1/eval/loss', 'net1/eval/acc'], prefix=''))
    train_conf = AutoResumeTrainConfig(
        model=net,
        data=train_input,
        callbacks=[saver, evaluator],
        max_epoch=hp.train1.num_epochs,
        steps_per_epoch=hp.train1.steps_per_epoch,
    )

    # An explicitly requested checkpoint wins over the latest one on disk.
    if args.ckpt:
        restore_from = '{}/{}'.format(logdir, args.ckpt)
    else:
        restore_from = tf.train.latest_checkpoint(logdir)
    if restore_from:
        train_conf.session_init = SaverRestore(restore_from)

    # Exact comparison against True is kept on purpose: other truthy values
    # of use_gpu take the CPU path.
    if hp.default.use_gpu == True:
        os.environ['CUDA_VISIBLE_DEVICES'] = hp.default.gpu_list
        num_gpu = len(hp.default.gpu_list.split(','))
        train_conf.nr_tower = num_gpu
        trainer = SyncMultiGPUTrainerReplicated(num_gpu)
    else:
        # Hide every GPU so training runs on the CPU.
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
        trainer = SimpleTrainer()

    launch_train_with_config(train_conf, trainer=trainer)