Example #1
# tensorpack imports used by this example
from tensorpack import (ClassificationError, DataParallelInferenceRunner,
                        EstimatedTimeLeft, InferenceRunner, ModelSaver,
                        QueueInput, ScheduledHyperParamSetter,
                        SyncMultiGPUTrainerParameterServer, TrainConfig,
                        launch_train_with_config)
from tensorpack.utils import logger
from tensorpack.utils.gpu import get_num_gpu


def train_net(net,
              session_init,
              batch_size,
              num_epochs,
              train_dataflow,
              val_dataflow):

    num_towers = max(get_num_gpu(), 1)
    batch_per_tower = batch_size // num_towers
    logger.info("Running on {} towers. Batch size per tower: {}".format(num_towers, batch_per_tower))

    num_training_samples = 1281167  # number of images in the ImageNet-1k training set
    step_size = num_training_samples // batch_size  # iterations per epoch
    max_iter = (num_epochs - 1) * step_size  # total steps over which the learning rate is decayed
    callbacks = [
        ModelSaver(),
        ScheduledHyperParamSetter(
            'learning_rate',
            [(0, 0.5), (max_iter, 0)],
            interp='linear',
            step_based=True),
        EstimatedTimeLeft()]

    infs = [ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')]
    if num_towers == 1:
        # single-GPU inference with queue prefetch
        callbacks.append(InferenceRunner(
            input=QueueInput(val_dataflow),
            infs=infs))
    else:
        # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(DataParallelInferenceRunner(
            input=val_dataflow,
            infs=infs,
            gpus=list(range(num_towers))))

    config = TrainConfig(
        dataflow=train_dataflow,
        model=net,
        callbacks=callbacks,
        session_init=session_init,
        steps_per_epoch=step_size,
        max_epoch=num_epochs)

    launch_train_with_config(
        config=config,
        trainer=SyncMultiGPUTrainerParameterServer(num_towers))
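A minimal call sketch for the function above; MyModel, get_train_dataflow, and get_val_dataflow are hypothetical stand-ins for a tensorpack ModelDesc and two DataFlow factories, and the batch size and epoch count are only illustrative:

# Hypothetical driver code; only train_net itself is defined above.
net = MyModel()                      # a ModelDesc built elsewhere
train_df = get_train_dataflow()      # hypothetical DataFlow factories
val_df = get_val_dataflow()
train_net(net,
          session_init=None,         # or SmartInit('/path/to/checkpoint')
          batch_size=256,
          num_epochs=120,
          train_dataflow=train_df,
          val_dataflow=val_df)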
Example #2
# tensorpack imports used by this example; get_data() is defined elsewhere in the project
from tensorpack import (ClassificationError, DataParallelInferenceRunner,
                        FakeData, HumanHyperParamSetter, InferenceRunner,
                        ModelSaver, QueueInput, ScheduledHyperParamSetter,
                        TrainConfig)
from tensorpack.utils import logger
from tensorpack.utils.gpu import get_nr_gpu


def get_config(model, conf):
    nr_tower = max(get_nr_gpu(), 1)
    batch = conf.batch
    if conf.fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, 224, 224, 3], [64]],
                                 1000,
                                 random=False,
                                 dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))
        dataset_train = get_data(conf.data_dir, 'train', batch)
        dataset_val = get_data(conf.data_dir, 'val', batch)
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate', [(45, 1e-2), (60, 1e-3),
                                                        (65, 1e-4), (70, 1e-5),
                                                        (75, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        infs = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))
    return TrainConfig(model=model,
                       dataflow=dataset_train,
                       callbacks=callbacks,
                       steps_per_epoch=5000,
                       max_epoch=80,
                       nr_tower=nr_tower)
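A short driver sketch for get_config; the argparse setup below is hypothetical and only needs to supply the batch, fake, and data_dir attributes the function reads, and MyModel again stands in for a ModelDesc defined elsewhere. The returned TrainConfig would then be handed to a tensorpack trainer.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--data_dir', help='path to the ImageNet directory')
parser.add_argument('--batch', type=int, default=64, help='batch size per tower')
parser.add_argument('--fake', action='store_true', help='use FakeData for benchmarking')
conf = parser.parse_args()

config = get_config(MyModel(), conf)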
Example #3
    if save_dir is None:
        logger.auto_set_dir()
    else:
        logger.set_logger_dir(save_dir)

    dataset_train = get_data('train')
    dataset_test = get_data('test')

    config = TrainConfig(
        model=CifarResNet(n=NUM_UNITS,
                          mult_decay=mult_decay,
                          lr_init=lr_base * 0.1),
        dataflow=dataset_train,
        callbacks=[
            ModelSaver(),
            InferenceRunner(
                dataset_test,
                [ScalarStats('cost'),
                 ClassificationError('wrong_vector')]),
            ScheduledHyperParamSetter('learning_rate',
                                      [(1, lr_base), (82, lr_base * 0.1),
                                       (123, lr_base * 0.01),
                                       (164, lr_base * 0.002)])
        ],
        max_epoch=200,
        session_init=SmartInit(args.load),
    )
    num_gpu = max(get_num_gpu(), 1)
    launch_train_with_config(config,
                             SyncMultiGPUTrainerParameterServer(num_gpu))
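This last snippet is the tail of a training script, so the names it reads (save_dir, NUM_UNITS, mult_decay, lr_base, args.load) come from setup code that is not shown. A purely hypothetical sketch of what that missing setup could look like:

import argparse

# Hypothetical argument handling; flag names and defaults are assumptions.
parser = argparse.ArgumentParser()
parser.add_argument('--load', help='checkpoint to restore from')
parser.add_argument('--logdir', help='log/checkpoint directory; auto-chosen if omitted')
parser.add_argument('--lr', type=float, default=0.1, help='base learning rate')
parser.add_argument('--mult_decay', type=float, default=1.0)
args = parser.parse_args()

NUM_UNITS = 18                 # 6n+2 layers: n=18 gives a 110-layer CIFAR ResNet
save_dir = args.logdir
lr_base = args.lr
mult_decay = args.mult_decay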