Example No. 1
def get_config():
    nr_tower = max(get_num_gpu(), 1)
    batch = args.batch
    total_batch = batch * nr_tower
    if total_batch != 128:
        logger.warn("AlexNet needs to be trained with a total batch size of 128.")
    BASE_LR = 0.01 * (total_batch / 128.)

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    dataset_train = get_data('train', batch)
    dataset_val = get_data('val', batch)

    infs = [ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')]
    callbacks = [
        ModelSaver(),
        GPUUtilizationTracker(),
        EstimatedTimeLeft(),
        ScheduledHyperParamSetter(
            'learning_rate',
            [(0, BASE_LR), (30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2), (80, BASE_LR * 1e-3)]),
        DataParallelInferenceRunner(
            dataset_val, infs, list(range(nr_tower))),
    ]

    return TrainConfig(
        model=Model(),
        data=StagingInput(QueueInput(dataset_train)),
        callbacks=callbacks,
        steps_per_epoch=1281167 // total_batch,
        max_epoch=100,
    )
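A minimal sketch of how a config like the one returned above is typically launched with tensorpack. The trainer class chosen here is an assumption for illustration, not taken from this example, and the sketch relies on the script's own args/Model globals being defined.

# Hedged sketch: one common way to launch the TrainConfig built by get_config()
# above (the trainer choice is an assumption; other tensorpack trainers work too).
from tensorpack import launch_train_with_config
from tensorpack.train import SyncMultiGPUTrainerReplicated
from tensorpack.utils.gpu import get_num_gpu

if __name__ == '__main__':
    config = get_config()
    trainer = SyncMultiGPUTrainerReplicated(max(get_num_gpu(), 1))
    launch_train_with_config(config, trainer)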
Example No. 2
def train():
    dirname = os.path.join('train_log', 'train-atari-{}'.format(ENV_NAME))
    logger.set_logger_dir(dirname)

    # assign GPUs for training & inference
    num_gpu = get_num_gpu()
    global PREDICTOR_THREAD
    if num_gpu > 0:
        if num_gpu > 1:
            # use half gpus for inference
            predict_tower = list(range(num_gpu))[-num_gpu // 2:]
        else:
            predict_tower = [0]
        PREDICTOR_THREAD = len(predict_tower) * PREDICTOR_THREAD_PER_GPU
        train_tower = list(range(num_gpu))[:-num_gpu // 2] or [0]
        logger.info("[Batch-A3C] Train on gpu {} and infer on gpu {}".format(
            ','.join(map(str, train_tower)), ','.join(map(str, predict_tower))))
    else:
        logger.warn("Without GPU this model will never learn! CPU is only useful for debug.")
        PREDICTOR_THREAD = 1
        predict_tower, train_tower = [0], [0]

    # setup simulator processes
    name_base = str(uuid.uuid1())[:6]
    prefix = '@' if sys.platform.startswith('linux') else ''
    namec2s = 'ipc://{}sim-c2s-{}'.format(prefix, name_base)
    names2c = 'ipc://{}sim-s2c-{}'.format(prefix, name_base)
    procs = [MySimulatorWorker(k, namec2s, names2c) for k in range(SIMULATOR_PROC)]
    ensure_proc_terminate(procs)
    start_proc_mask_signal(procs)

    master = MySimulatorMaster(namec2s, names2c, predict_tower)
    dataflow = BatchData(DataFromQueue(master.queue), BATCH_SIZE)
    config = TrainConfig(
        model=Model(),
        dataflow=dataflow,
        callbacks=[
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate', [(20, 0.0003), (120, 0.0001)]),
            ScheduledHyperParamSetter('entropy_beta', [(80, 0.005)]),
            HumanHyperParamSetter('learning_rate'),
            HumanHyperParamSetter('entropy_beta'),
            master,
            StartProcOrThread(master),
            PeriodicTrigger(Evaluator(
                EVAL_EPISODE, ['state'], ['policy'], get_player),
                every_k_epochs=3),
        ],
        session_creator=sesscreate.NewSessionCreator(
            config=get_default_sess_config(0.5)),
        steps_per_epoch=STEPS_PER_EPOCH,
        session_init=get_model_loader(args.load) if args.load else None,
        max_epoch=1000,
    )
    trainer = SimpleTrainer() if config.nr_tower == 1 else AsyncMultiGPUTrainer(train_tower)
    launch_train_with_config(config, trainer)
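The train/predict GPU split above is just list slicing; a small hedged sketch (the helper name split_towers is mine, not from the example) that reproduces it so the behaviour can be checked on a CPU-only machine.

# Hedged sketch: the tower-splitting arithmetic from the example, pulled into a
# hypothetical helper so the slicing can be inspected without GPUs.
def split_towers(num_gpu):
    if num_gpu > 1:
        predict_tower = list(range(num_gpu))[-num_gpu // 2:]   # later half for inference
    else:
        predict_tower = [0]
    train_tower = list(range(num_gpu))[:-num_gpu // 2] or [0]
    return train_tower, predict_tower

print(split_towers(1))   # ([0], [0])
print(split_towers(2))   # ([0], [1])
print(split_towers(4))   # ([0, 1], [2, 3])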
Example No. 3
def init_config():
    if config.TRAINER == 'horovod':
        ngpu = hvd.size()
    else:
        ngpu = get_num_gpu()
    assert ngpu % 8 == 0 or 8 % ngpu == 0, ngpu
    if config.NUM_GPUS is None:
        config.NUM_GPUS = ngpu
    else:
        if config.TRAINER == 'horovod':
            assert config.NUM_GPUS == ngpu
        else:
            assert config.NUM_GPUS <= ngpu
    print_config()
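The assertion ngpu % 8 == 0 or 8 % ngpu == 0 above only admits GPU counts that either divide 8 or are multiples of 8; a one-line hedged check of which counts pass:

# Hedged sketch: GPU counts accepted by the assertion used above.
print([n for n in range(1, 33) if n % 8 == 0 or 8 % n == 0])   # [1, 2, 4, 8, 16, 24, 32]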
Example No. 4
def get_config(model, fake=False):
    nr_tower = max(get_num_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    if batch < 32 or batch > 64:
        logger.warn("Batch size per tower not in [32, 64]. This probably will lead to worse accuracy than reported.")
    if fake:
        data = QueueInput(FakeData(
            [[batch, 224, 224, 3], [batch]], 1000, random=False, dtype='uint8'))
        callbacks = []
    else:
        data = QueueInput(get_data('train', batch))

        START_LR = 0.1
        BASE_LR = START_LR * (args.batch / 256.0)
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter(
                'learning_rate', [
                    (0, min(START_LR, BASE_LR)), (30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2),
                    (90, BASE_LR * 1e-3), (100, BASE_LR * 1e-4)]),
        ]
        if BASE_LR > START_LR:
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, START_LR), (5, BASE_LR)], interp='linear'))

        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        dataset_val = get_data('val', batch)
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1281167 // args.batch,
        max_epoch=105,
    )
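To make the two learning-rate schedules above concrete, a hedged sketch of the values they produce for an assumed total batch of 1024 (the batch size is only an illustration, not part of the example):

# Hedged sketch: LR values produced by the formulas above for args.batch = 1024.
START_LR = 0.1
total_batch = 1024                                     # assumed value
BASE_LR = START_LR * (total_batch / 256.0)             # 0.4
step_schedule = [(0, min(START_LR, BASE_LR)), (30, BASE_LR * 1e-1),
                 (60, BASE_LR * 1e-2), (90, BASE_LR * 1e-3), (100, BASE_LR * 1e-4)]
print(step_schedule)     # roughly (0, 0.1), (30, 0.04), (60, 0.004), (90, 4e-4), (100, 4e-5)
if BASE_LR > START_LR:   # true here, so a linear warmup from 0.1 to 0.4 over 5 epochs is added
    print([(0, START_LR), (5, BASE_LR)])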
Example No. 5
def finalize_configs(is_training):
    """
    Run some sanity checks, and populate some configs from others
    """
    _C.DATA.NUM_CLASS = _C.DATA.NUM_CATEGORY + 1  # +1 background
    _C.RPN.NUM_ANCHOR = len(_C.RPN.ANCHOR_SIZES) * len(_C.RPN.ANCHOR_RATIOS)
    assert len(_C.FPN.ANCHOR_STRIDES) == len(_C.RPN.ANCHOR_SIZES)
    # image size into the backbone has to be multiple of this number
    _C.FPN.RESOLUTION_REQUIREMENT = _C.FPN.ANCHOR_STRIDES[3]  # [3] because we build FPN with features r2,r3,r4,r5

    if _C.MODE_FPN:
        size_mult = _C.FPN.RESOLUTION_REQUIREMENT * 1.
        _C.PREPROC.MAX_SIZE = np.ceil(_C.PREPROC.MAX_SIZE / size_mult) * size_mult

    if is_training:
        os.environ['TF_AUTOTUNE_THRESHOLD'] = '1'
        assert _C.TRAINER in ['horovod', 'replicated'], _C.TRAINER

        # setup NUM_GPUS
        if _C.TRAINER == 'horovod':
            import horovod.tensorflow as hvd
            ngpu = hvd.size()
        else:
            assert 'OMPI_COMM_WORLD_SIZE' not in os.environ
            ngpu = get_num_gpu()
        assert ngpu % 8 == 0 or 8 % ngpu == 0, ngpu
        if _C.TRAIN.NUM_GPUS is None:
            _C.TRAIN.NUM_GPUS = ngpu
        else:
            if _C.TRAINER == 'horovod':
                assert _C.TRAIN.NUM_GPUS == ngpu
            else:
                assert _C.TRAIN.NUM_GPUS <= ngpu
    else:
        # autotune is too slow for inference
        os.environ['TF_CUDNN_USE_AUTOTUNE'] = '0'

    logger.info("Config: ------------------------------------------\n" + str(_C))
Example No. 6
def get_config():
    nr_tower = max(get_num_gpu(), 1)
    batch = args.batch
    total_batch = batch * nr_tower
    assert total_batch >= 256   # otherwise the learning rate warmup is wrong.
    BASE_LR = 0.01 * (total_batch / 256.)

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    dataset_train = get_data('train', batch)
    dataset_val = get_data('val', batch)

    infs = [ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')]
    callbacks = [
        ModelSaver(),
        GPUUtilizationTracker(),
        EstimatedTimeLeft(),
        ScheduledHyperParamSetter(
            'learning_rate',
            [(0, 0.01), (3, max(BASE_LR, 0.01))], interp='linear'),
        ScheduledHyperParamSetter(
            'learning_rate',
            [(30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2), (80, BASE_LR * 1e-3)]),
        DataParallelInferenceRunner(
            dataset_val, infs, list(range(nr_tower))),
    ]

    input = QueueInput(dataset_train)
    input = StagingInput(input, nr_stage=1)
    return TrainConfig(
        model=Model(),
        data=input,
        callbacks=callbacks,
        steps_per_epoch=1281167 // total_batch,
        max_epoch=100,
    )
Example No. 7
            InferenceRunner(dataset_val, [
                ClassificationError('wrong-top1', 'val-top1-error'),
                ClassificationError('wrong-top5', 'val-top5-error')]),
            ScheduledHyperParamSetter('learning_rate',
                                      [(8, 0.03), (14, 0.02), (17, 5e-3),
                                       (19, 3e-3), (24, 1e-3), (26, 2e-4),
                                       (30, 5e-5)])
        ],
        model=Model(),
        steps_per_epoch=5000,
        max_epoch=80,
    )


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.')
    parser.add_argument('--load', help='load model')
    parser.add_argument('--data', help='ImageNet data root directory', required=True)
    args = parser.parse_args()

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    config = get_config()
    if args.load:
        config.session_init = SaverRestore(args.load)
    nr_tower = get_num_gpu()
    assert nr_tower == NUM_GPU
    launch_train_with_config(config, SyncMultiGPUTrainer(NUM_GPU))
Example No. 8
    M = tf.keras.models.Model(input, x, name='resnet50')
    return M


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', help='ILSVRC dataset dir')
    parser.add_argument('--fake',
                        help='use fakedata to test or benchmark this model',
                        action='store_true')
    args = parser.parse_args()
    logger.set_logger_dir(os.path.join("train_log", "imagenet-resnet-keras"))

    tf.keras.backend.set_image_data_format('channels_first')

    num_gpu = get_num_gpu()
    if args.fake:
        df_train = FakeData([[64, 224, 224, 3], [64, 1000]],
                            5000,
                            random=False,
                            dtype='uint8')
        df_val = FakeData([[64, 224, 224, 3], [64, 1000]], 5000, random=False)
    else:
        batch_size = TOTAL_BATCH_SIZE // num_gpu
        assert args.data is not None
        df_train = get_imagenet_dataflow(args.data, 'train', batch_size,
                                         fbresnet_augmentor(True))
        df_val = get_imagenet_dataflow(args.data, 'val', batch_size,
                                       fbresnet_augmentor(False))

        def one_hot(label):
Example No. 9
def get_data():
    train = BatchData(dataset.Mnist('train'), 128)
    test = BatchData(dataset.Mnist('test'), 256, remainder=True)
    return train, test


if __name__ == '__main__':
    logger.auto_set_dir()
    dataset_train, dataset_test = get_data()

    cfg = TrainConfig(
        model=Model(),
        dataflow=dataset_train,
        callbacks=[
            KerasPhaseCallback(True),  # for Keras training
            ModelSaver(),
            InferenceRunner(dataset_test,
                            ScalarStats(['cross_entropy_loss', 'accuracy'])),
        ],
        max_epoch=100,
    )

    if get_num_gpu() <= 1:
        # single GPU:
        launch_train_with_config(cfg, QueueInputTrainer())
    else:
        # multi GPU:
        launch_train_with_config(cfg, SyncMultiGPUTrainerParameterServer(2))
        # "Replicated" multi-gpu trainer is not supported for Keras model
        # since Keras does not respect variable scopes.
Example No. 10
            ]),
            ScheduledHyperParamSetter('learning_rate', [(8, 0.03), (14, 0.02),
                                                        (17, 5e-3), (19, 3e-3),
                                                        (24, 1e-3), (26, 2e-4),
                                                        (30, 5e-5)])
        ],
        model=Model(),
        steps_per_epoch=5000,
        max_epoch=80,
    )


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.')
    parser.add_argument('--load', help='load model')
    parser.add_argument('--data',
                        help='ImageNet data root directory',
                        required=True)
    args = parser.parse_args()

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    config = get_config()
    if args.load:
        config.session_init = SaverRestore(args.load)
    nr_tower = get_num_gpu()
    assert nr_tower == NUM_GPU
    launch_train_with_config(config, SyncMultiGPUTrainer(NUM_GPU))
Example No. 11
            if cnt == 500:
                return


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.')
    parser.add_argument('--data', help='ILSVRC dataset dir')
    parser.add_argument('--depth', type=int, default=18)
    parser.add_argument('--load', help='load model')
    parser.add_argument('--cam', action='store_true')
    args = parser.parse_args()

    DEPTH = args.depth
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    num_gpu = get_num_gpu()
    BATCH_SIZE = TOTAL_BATCH_SIZE // num_gpu

    if args.cam:
        BATCH_SIZE = 128    # something that can run on one gpu
        viz_cam(args.load, args.data)
        sys.exit()

    logger.auto_set_dir()
    config = get_config()
    if args.load:
        config.session_init = get_model_loader(args.load)
    launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(num_gpu))
Example No. 12
    parser.add_argument('--eval', action='store_true', help='run offline evaluation instead of training')
    parser.add_argument('--batch', default=256, type=int,
                        help="total batch size. "
                        "Note that it's best to keep per-GPU batch size in [32, 64] to obtain the best accuracy."
                        "Pretrained models listed in README were trained with batch=32x8.")
    parser.add_argument('--mode', choices=['resnet', 'preact', 'se'],
                        help='variants of resnet to use', default='resnet')
    args = parser.parse_args()

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    model = Model(args.depth, args.mode)
    model.data_format = args.data_format
    if args.eval:
        batch = 128    # something that can run on one gpu
        ds = get_data('val', batch)
        eval_on_ILSVRC12(model, get_model_loader(args.load), ds)
    else:
        if args.fake:
            logger.set_logger_dir(os.path.join('train_log', 'tmp'), 'd')
        else:
            logger.set_logger_dir(
                os.path.join('train_log', 'imagenet-{}-d{}-batch{}'.format(args.mode, args.depth, args.batch)))

        config = get_config(model, fake=args.fake)
        if args.load:
            config.session_init = get_model_loader(args.load)
        trainer = SyncMultiGPUTrainerReplicated(max(get_num_gpu(), 1))
        launch_train_with_config(config, trainer)
Example No. 13
def finalize_configs(is_training):
    """
    Run some sanity checks, and populate some configs from others
    """
    _C.freeze(False)  # populate new keys now
    if isinstance(_C.DATA.VAL, six.string_types
                  ):  # support single string (the typical case) as well
        _C.DATA.VAL = (_C.DATA.VAL, )
    if isinstance(_C.DATA.TRAIN, six.string_types):  # support single string
        _C.DATA.TRAIN = (_C.DATA.TRAIN, )

    # finalize dataset definitions ...
    from mot.object_detection.dataset import DatasetRegistry
    datasets = list(_C.DATA.TRAIN) + list(_C.DATA.VAL)
    _C.DATA.CLASS_NAMES = DatasetRegistry.get_metadata(datasets[0],
                                                       "class_names")
    _C.DATA.NUM_CATEGORY = len(_C.DATA.CLASS_NAMES) - 1

    assert _C.BACKBONE.NORM in ['FreezeBN', 'SyncBN', 'GN',
                                'None'], _C.BACKBONE.NORM
    if _C.BACKBONE.NORM != 'FreezeBN':
        assert not _C.BACKBONE.FREEZE_AFFINE
    assert _C.BACKBONE.FREEZE_AT in [0, 1, 2]

    _C.RPN.NUM_ANCHOR = len(_C.RPN.ANCHOR_SIZES) * len(_C.RPN.ANCHOR_RATIOS)
    assert len(_C.FPN.ANCHOR_STRIDES) == len(_C.RPN.ANCHOR_SIZES)
    # image size into the backbone has to be multiple of this number
    _C.FPN.RESOLUTION_REQUIREMENT = _C.FPN.ANCHOR_STRIDES[
        3]  # [3] because we build FPN with features r2,r3,r4,r5

    if _C.MODE_FPN:
        size_mult = _C.FPN.RESOLUTION_REQUIREMENT * 1.
        _C.PREPROC.MAX_SIZE = np.ceil(
            _C.PREPROC.MAX_SIZE / size_mult) * size_mult
        assert _C.FPN.PROPOSAL_MODE in ['Level', 'Joint']
        assert _C.FPN.FRCNN_HEAD_FUNC.endswith('_head')
        assert _C.FPN.MRCNN_HEAD_FUNC.endswith('_head')
        assert _C.FPN.NORM in ['None', 'GN']

        if _C.FPN.CASCADE:
            # the first threshold is the proposal sampling threshold
            assert _C.CASCADE.IOUS[0] == _C.FRCNN.FG_THRESH
            assert len(_C.CASCADE.BBOX_REG_WEIGHTS) == len(_C.CASCADE.IOUS)

    if is_training:
        train_scales = _C.PREPROC.TRAIN_SHORT_EDGE_SIZE
        if isinstance(
                train_scales,
            (list, tuple)) and train_scales[1] - train_scales[0] > 100:
            # don't autotune if augmentation is on
            os.environ['TF_CUDNN_USE_AUTOTUNE'] = '0'
        os.environ['TF_AUTOTUNE_THRESHOLD'] = '1'
        assert _C.TRAINER in ['horovod', 'replicated'], _C.TRAINER

        lr = _C.TRAIN.LR_SCHEDULE
        if isinstance(lr, six.string_types):
            if lr.endswith("x"):
                LR_SCHEDULE_KITER = {
                    "{}x".format(k): [180 * k - 120, 180 * k - 40, 180 * k]
                    for k in range(2, 10)
                }
                LR_SCHEDULE_KITER["1x"] = [120, 160, 180]
                _C.TRAIN.LR_SCHEDULE = [
                    x * 1000 for x in LR_SCHEDULE_KITER[lr]
                ]
            else:
                _C.TRAIN.LR_SCHEDULE = eval(lr)

        # setup NUM_GPUS
        if _C.TRAINER == 'horovod':
            import horovod.tensorflow as hvd
            ngpu = hvd.size()
            logger.info("Horovod Rank={}, Size={}, LocalRank={}".format(
                hvd.rank(), hvd.size(), hvd.local_rank()))
        else:
            assert 'OMPI_COMM_WORLD_SIZE' not in os.environ
            ngpu = get_num_gpu()
        assert ngpu > 0, "Has to train with GPU!"
        assert ngpu % 8 == 0 or 8 % ngpu == 0, "Can only train with 1,2,4 or >=8 GPUs, but found {} GPUs".format(
            ngpu)
    else:
        # autotune is too slow for inference
        os.environ['TF_CUDNN_USE_AUTOTUNE'] = '0'
        ngpu = get_num_gpu()

    if _C.TRAIN.NUM_GPUS is None:
        _C.TRAIN.NUM_GPUS = ngpu
    else:
        if _C.TRAINER == 'horovod':
            assert _C.TRAIN.NUM_GPUS == ngpu
        else:
            assert _C.TRAIN.NUM_GPUS <= ngpu

    _C.freeze()
    logger.info("Config: ------------------------------------------\n" +
                str(_C))
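The "Nx" schedule strings handled above expand into iteration milestones; a hedged sketch of that mapping in isolation, following the dictionary built in the code:

# Hedged sketch: expanding the "Nx" learning-rate schedule strings used above.
LR_SCHEDULE_KITER = {"{}x".format(k): [180 * k - 120, 180 * k - 40, 180 * k]
                     for k in range(2, 10)}
LR_SCHEDULE_KITER["1x"] = [120, 160, 180]
print([x * 1000 for x in LR_SCHEDULE_KITER["1x"]])   # [120000, 160000, 180000]
print([x * 1000 for x in LR_SCHEDULE_KITER["2x"]])   # [240000, 320000, 360000]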
Example No. 14
def finalize_configs(is_training):
    """
    Run some sanity checks, and populate some configs from others
    """
    _C.freeze(False)  # populate new keys now
    if isinstance(_C.DATA.VAL, six.string_types
                  ):  # support single string (the typical case) as well
        _C.DATA.VAL = (_C.DATA.VAL, )
    if isinstance(_C.DATA.TRAIN, six.string_types):  # support single string
        _C.DATA.TRAIN = (_C.DATA.TRAIN, )

    # finalize dataset definitions ...
    from dataset import DatasetRegistry
    datasets = list(_C.DATA.TRAIN) + list(_C.DATA.VAL)

    # _C.DATA.CLASS_NAMES = ["BG", "class1", "class2", "class3", "class4", "class5", "class6"]
    # _C.DATA.CLASS_NAMES = [
    #     "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"]  # noqa
    # _C.DATA.CLASS_NAMES = ["BG"] + _C.DATA.CLASS_NAMES
    # print(datasets[0])
    _C.DATA.CLASS_NAMES = DatasetRegistry.get_metadata(datasets[0],
                                                       "class_names")
    # print(_C.DATA.CLASS_NAMES)
    _C.DATA.NUM_CATEGORY = len(_C.DATA.CLASS_NAMES) - 1

    assert _C.BACKBONE.NORM in ['FreezeBN', 'SyncBN', 'GN',
                                'None'], _C.BACKBONE.NORM
    if _C.BACKBONE.NORM != 'FreezeBN':
        assert not _C.BACKBONE.FREEZE_AFFINE
    assert _C.BACKBONE.FREEZE_AT in [0, 1, 2]

    _C.RPN.NUM_ANCHOR = len(_C.RPN.ANCHOR_SIZES) * len(_C.RPN.ANCHOR_RATIOS)
    assert len(_C.FPN.ANCHOR_STRIDES) == len(_C.RPN.ANCHOR_SIZES)
    # image size into the backbone has to be multiple of this number
    _C.FPN.RESOLUTION_REQUIREMENT = _C.FPN.ANCHOR_STRIDES[
        3]  # [3] because we build FPN with features r2,r3,r4,r5

    if _C.MODE_FPN:
        size_mult = _C.FPN.RESOLUTION_REQUIREMENT * 1.
        _C.PREPROC.MAX_SIZE = np.ceil(
            _C.PREPROC.MAX_SIZE / size_mult) * size_mult
        assert _C.FPN.PROPOSAL_MODE in ['Level', 'Joint']
        assert _C.FPN.FRCNN_HEAD_FUNC.endswith('_head')
        assert _C.FPN.MRCNN_HEAD_FUNC.endswith('_head')
        assert _C.FPN.NORM in ['None', 'GN']

        if _C.FPN.CASCADE:
            # the first threshold is the proposal sampling threshold
            assert _C.CASCADE.IOUS[0] == _C.FRCNN.FG_THRESH
            assert len(_C.CASCADE.BBOX_REG_WEIGHTS) == len(_C.CASCADE.IOUS)

    if is_training:
        train_scales = _C.PREPROC.TRAIN_SHORT_EDGE_SIZE
        if isinstance(
                train_scales,
            (list, tuple)) and train_scales[1] - train_scales[0] > 100:
            # don't autotune if augmentation is on
            os.environ['TF_CUDNN_USE_AUTOTUNE'] = '0'
        os.environ['TF_AUTOTUNE_THRESHOLD'] = '1'
        assert _C.TRAINER in ['horovod', 'replicated'], _C.TRAINER

        lr = _C.TRAIN.LR_SCHEDULE
        if isinstance(lr, six.string_types):
            if lr.endswith("x"):
                LR_SCHEDULE_KITER = {
                    "{}x".format(k): [180 * k - 120, 180 * k - 40, 180 * k]
                    for k in range(2, 10)
                }
                LR_SCHEDULE_KITER["1x"] = [120, 160, 180]
                _C.TRAIN.LR_SCHEDULE = [
                    x * 1000 for x in LR_SCHEDULE_KITER[lr]
                ]
            else:
                _C.TRAIN.LR_SCHEDULE = eval(lr)

        # setup NUM_GPUS
        if _C.TRAINER == 'horovod':
            import horovod.tensorflow as hvd
            ngpu = hvd.size()
            logger.info("Horovod Rank={}, Size={}, LocalRank={}".format(
                hvd.rank(), hvd.size(), hvd.local_rank()))
        else:
            assert 'OMPI_COMM_WORLD_SIZE' not in os.environ
            ngpu = get_num_gpu()
        assert ngpu > 0, "Has to train with GPU!"
        assert ngpu % 8 == 0 or 8 % ngpu == 0, "Can only train with 1,2,4 or >=8 GPUs, but found {} GPUs".format(
            ngpu)
    else:
        # autotune is too slow for inference
        os.environ['TF_CUDNN_USE_AUTOTUNE'] = '0'
        ngpu = get_num_gpu()

    if _C.TRAIN.NUM_GPUS is None:
        _C.TRAIN.NUM_GPUS = ngpu
    else:
        if _C.TRAINER == 'horovod':
            assert _C.TRAIN.NUM_GPUS == ngpu
        else:
            assert _C.TRAIN.NUM_GPUS <= ngpu

    _C.freeze()
    logger.info("Config: ------------------------------------------\n" +
                str(_C))
Example No. 15
def tp_evaluation(args, cfg, sess, model):
    num_gpu = get_num_gpu()
    df = PneuSegDF(args.mode, None, args.train_dir, args.testset_dir,
                   args.min_num_workers, cfg)
    ds = df.eval_prepared(num_gpu, args.batch_size)
    tf.train.Saver().restore(sess, args.model_file)
    if os.path.exists(args.pkl_dir):
        input(
            "Result file already exists. Press enter to continue and overwrite it when inference is done..."
        )
    pbar = tqdm(total=len(ds))
    ds.reset_state()
    # for i in range(0, len(ds), cfg.batch_size):
    #     batch = ds[i:i + cfg.batch_size]
    #     pbar.update(cfg.batch_size)
    pneu_eval = Evaluation()
    in_ims, in_gts, p_infos, in_og_ims = [], [], [], []
    bad_slice_dirs = []
    for idx, in_data in enumerate(ds):
        if not df.ex_process.og_shape:
            df.ex_process.og_shape = in_data[1].shape[:-1]
        in_ims.append(in_data[0])
        in_gts.append(in_data[1])
        if args.viz:
            in_og_ims.append(in_data[4])
        df.ex_process.tl_list.append(in_data[2])
        data_dir = in_data[3]
        pneu_eval.add_to_p_map(data_dir)
        p_id = data_dir.split('/')[-3]
        assert len(p_id) == 32
        p_infos.append((p_id, data_dir))
        if len(in_ims) == cfg.batch_size or idx == len(ds) - 1:
            assert len(in_ims) == len(in_gts) and len(in_ims) == len(p_infos)
            im_batch, in_gts = np.array(in_ims), np.array(in_gts)
            pred = sess.run(model.ops["seg_map"],
                            feed_dict={model.in_im: im_batch})
            pred = (1 / (1 + np.exp(-pred))) > .5
            pred = df.ex_process.batch_postprocess(pred)
            if args.viz:
                og_im_batch = np.array(in_og_ims)
                og_im_batch = im_normalize(
                    og_im_batch, cfg.preprocess["normalize"]["ct_interval"],
                    cfg.preprocess["normalize"]["norm_by_interval"])
                viz_patient(og_im_batch, pred, in_gts, True)
            intersection = pred * in_gts
            for i in range(pred.shape[0]):
                itsect = np.sum(intersection[i, :, :, :])
                ga = np.sum(in_gts[i, :, :, :])
                pa = np.sum(pred[i, :, :, :])
                pneu_eval.person_map[p_infos[i]
                                     [0]].pixel_info["intersection"] += itsect
                pneu_eval.person_map[p_infos[i][0]].pixel_info["gt_area"] += ga
                pneu_eval.person_map[p_infos[i]
                                     [0]].pixel_info["pred_area"] += pa
                if args.bad_slice_output:
                    if (ga != 0 or pa != 0
                        ) and (2 * itsect + 1e-8) / (ga + pa + 1e-8) < .3:
                        bad_slice_dirs.append(p_infos[i][1])
                        if args.eval_debug:
                            print(p_infos[i][1])
            pbar.update(cfg.batch_size)
            in_ims, in_gts, p_infos, in_og_ims = [], [], [], []
        if idx == len(ds) - 1:
            break
        if args.eval_debug and idx == 2000: break
    pbar.close()
    if args.bad_slice_output:
        try:
            json.dump(
                bad_slice_dirs,
                open(args.pkl_dir.replace(".pkl", "_bad_slice.json"), "w"))
        except:
            print("Failed saving as json. Save as pickle instead...")
            pickle.dump(
                bad_slice_dirs,
                open(args.pkl_dir.replace(".pkl", "_bad_slice.pkl"), "wb"))
    pneu_eval.pixel_wise_result(args.pkl_dir, True)
Example No. 16
def train(args, cfg):
    out_dirs = gen_outdirs(args, "tp")
    output_dir, out_res_dir = out_dirs["output_dir"], out_dirs["out_res_dir"]
    df = PneuSegDF(args.mode, out_res_dir, args.train_dir, args.testset_dir,
                   args.min_num_workers, cfg)
    num_gpu = max(get_num_gpu(), 1)
    ds = df.prepared(num_gpu, cfg.batch_size)

    # Avoid overwriting the config file
    if os.path.exists(pj(output_dir, os.path.basename(args.config))):
        input(
            "Config file will NOT be overwritten. Press Enter to continue...")
    else:
        shutil.copy(args.config, output_dir)
    logger.set_logger_dir(pj(output_dir, "log"))
    callback_list = [
        # PeriodicCallback overrides the frequency of the callback it wraps
        PeriodicCallback(ModelSaver(50, checkpoint_dir=output_dir),
                         every_k_epochs=1),
        GPUUtilizationTracker(),
        MergeAllSummaries(1 if args.train_debug else 0),
        # ProgressBar(["Loss"])
    ]
    if cfg.network["norm_layer"] == "BN_layers":
        callback_list.append(BN_layers_update())
    if cfg.lr_schedule["type"] == "epoch_wise_constant":
        schedule = [(ep, lr / num_gpu) for ep, lr in zip(
            [0] + cfg.lr_schedule["epoch_to_drop_lr"], cfg.lr_schedule["lr"])]
        callback_list.append(
            ScheduledHyperParamSetter("learning_rate", schedule))
    elif cfg.lr_schedule["type"] == "halved":
        schedule = [(0, cfg.lr_schedule["init_lr"])]
        for i in range(cfg.lr_schedule["first_epoch2drop"], cfg.max_epoch,
                       cfg.lr_schedule["period"]):
            schedule.append(
                (i, schedule[int((i - cfg.lr_schedule["first_epoch2drop"]) /
                                 cfg.lr_schedule["period"])][1] /
                 (cfg.lr_schedule["decay_rate"] * num_gpu)))
        print(schedule)
        callback_list.append(
            ScheduledHyperParamSetter("learning_rate", schedule))
    steps_per_epoch = len(ds) // num_gpu + 1
    train_cfg = TrainConfig(
        model=Tensorpack_model(cfg, steps_per_epoch),
        data=QueueInput(ds),
        steps_per_epoch=steps_per_epoch,
        callbacks=callback_list,
        monitors=[
            # ScalarPrinter(True, whitelist=["Loss", "LR"]),
            ScalarPrinter(True),
            # ScalarPrinter(),
            TFEventWriter(),
            # JSONWriter()
        ],
        max_epoch=cfg.max_epoch,
        session_init=SmartInit(args.resume),
        starting_epoch=args.resume_epoch)
    launch_train_with_config(
        train_cfg,
        SyncMultiGPUTrainerReplicated(num_gpu)
        if num_gpu > 1 else SimpleTrainer())
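The "halved" learning-rate schedule above is built iteratively; a hedged sketch with assumed config values (the init LR, drop epoch, period and decay rate are illustrative, not from the example) showing the schedule it produces:

# Hedged sketch: the "halved" schedule logic above, for assumed values
# init_lr=0.01, first_epoch2drop=10, period=10, decay_rate=2, max_epoch=50, num_gpu=1.
init_lr, first, period, decay, max_epoch, num_gpu = 0.01, 10, 10, 2, 50, 1
schedule = [(0, init_lr)]
for i in range(first, max_epoch, period):
    prev = schedule[int((i - first) / period)][1]      # previously appended LR
    schedule.append((i, prev / (decay * num_gpu)))
print(schedule)   # [(0, 0.01), (10, 0.005), (20, 0.0025), (30, 0.00125), (40, 0.000625)]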
Example No. 17
def get_config(model, fake=False, start_epoch=1):
    nr_tower = max(get_num_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(
        nr_tower, batch))
    if batch < 32 or batch > 64:
        logger.warn(
            "Batch size per tower not in [32, 64]. This probably will lead to worse accuracy than reported."
        )
    if fake:
        data = QueueInput(
            FakeData([[batch, 224, 224, 3], [batch]],
                     1000,
                     random=False,
                     dtype='uint8'))
        callbacks = []
    else:
        data = QueueInput(get_data('train', batch))

        START_LR = 0.1
        BASE_LR = START_LR * (args.batch / 256.0)
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            # ScheduledHyperParamSetter(
            #     'learning_rate', [
            #         (0, min(START_LR, BASE_LR)), (30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2),
            #         (90, BASE_LR * 1e-3), (100, BASE_LR * 1e-4)]),
            StatMonitorParamSetter('learning_rate', 'val-error-top1',
                                   lambda x: x * 0.1, 1e-4, 5),
        ]
        if BASE_LR > START_LR:
            callbacks.append(
                ScheduledHyperParamSetter('learning_rate', [(0, START_LR),
                                                            (5, BASE_LR)],
                                          interp='linear'))

        infs = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        dataset_val = get_data('val', batch)
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))

    return TrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else (419654) // args.batch,
        max_epoch=105,
        starting_epoch=start_epoch,
    )
Example No. 18
    parser.add_argument('-n', '--num_units',
                        help='number of units in each stage',
                        type=int, default=18)
    parser.add_argument('--load', help='load model for training')
    args = parser.parse_args()
    NUM_UNITS = args.num_units

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    logger.auto_set_dir()

    dataset_train = get_data('train')
    dataset_test = get_data('test')

    config = TrainConfig(
        model=Model(n=NUM_UNITS),
        dataflow=dataset_train,
        callbacks=[
            ModelSaver(),
            InferenceRunner(dataset_test,
                            [ScalarStats('cost'), ClassificationError('wrong_vector')]),
            ScheduledHyperParamSetter('learning_rate',
                                      [(1, 0.1), (82, 0.01), (123, 0.001), (300, 0.0002)])
        ],
        max_epoch=400,
        session_init=SaverRestore(args.load) if args.load else None
    )
    num_gpu = max(get_num_gpu(), 1)
    launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(num_gpu))
Example No. 19
            cv2.imwrite("out{}.png".format('-fused' if k == 5 else str(k + 1)),
                        pred * 255)
        logger.info("Results saved to out*.png")
    else:
        pred = outputs[5][0]
        cv2.imwrite(output, pred * 255)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.')
    parser.add_argument('--load', help='load model')
    parser.add_argument('--view', help='view dataset', action='store_true')
    parser.add_argument('--run', help='run model on images')
    parser.add_argument('--output',
                        help='fused output filename. default to out-fused.png')
    args = parser.parse_args()
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    if args.view:
        view_data()
    elif args.run:
        run(args.load, args.run, args.output)
    else:
        config = get_config()
        if args.load:
            config.session_init = get_model_loader(args.load)
        launch_train_with_config(config,
                                 SyncMultiGPUTrainer(max(get_num_gpu(), 1)))
Example No. 20
def finalize_configs(is_training):
    """
    Run some sanity checks, and populate some configs from others
    """
    _C.freeze(False)  # populate new keys now
    _C.DATA.NUM_CLASS = _C.DATA.NUM_CATEGORY + 1  # +1 background
    _C.DATA.BASEDIR = os.path.expanduser(_C.DATA.BASEDIR)
    if isinstance(_C.DATA.VAL, six.string_types
                  ):  # support single string (the typical case) as well
        _C.DATA.VAL = (_C.DATA.VAL, )

    assert _C.BACKBONE.NORM in ['FreezeBN', 'SyncBN', 'GN',
                                'None'], _C.BACKBONE.NORM
    if _C.BACKBONE.NORM != 'FreezeBN':
        assert not _C.BACKBONE.FREEZE_AFFINE
    assert _C.BACKBONE.FREEZE_AT in [0, 1, 2]

    _C.RPN.NUM_ANCHOR = len(_C.RPN.ANCHOR_SIZES) * len(_C.RPN.ANCHOR_RATIOS)
    assert len(_C.FPN.ANCHOR_STRIDES) == len(_C.RPN.ANCHOR_SIZES)
    # image size into the backbone has to be multiple of this number
    _C.FPN.RESOLUTION_REQUIREMENT = _C.FPN.ANCHOR_STRIDES[
        3]  # [3] because we build FPN with features r2,r3,r4,r5

    if _C.MODE_FPN:
        size_mult = _C.FPN.RESOLUTION_REQUIREMENT * 1.
        _C.PREPROC.MAX_SIZE = np.ceil(
            _C.PREPROC.MAX_SIZE / size_mult) * size_mult
        assert _C.FPN.PROPOSAL_MODE in ['Level', 'Joint']
        assert _C.FPN.FRCNN_HEAD_FUNC.endswith('_head')
        assert _C.FPN.MRCNN_HEAD_FUNC.endswith('_head')
        assert _C.FPN.NORM in ['None', 'GN']

        if _C.FPN.CASCADE:
            # the first threshold is the proposal sampling threshold
            assert _C.CASCADE.IOUS[0] == _C.FRCNN.FG_THRESH
            assert len(_C.CASCADE.BBOX_REG_WEIGHTS) == len(_C.CASCADE.IOUS)

    if is_training:
        train_scales = _C.PREPROC.TRAIN_SHORT_EDGE_SIZE
        if isinstance(
                train_scales,
            (list, tuple)) and train_scales[1] - train_scales[0] > 100:
            # don't autotune if augmentation is on
            os.environ['TF_CUDNN_USE_AUTOTUNE'] = '0'
        os.environ['TF_AUTOTUNE_THRESHOLD'] = '1'
        assert _C.TRAINER in ['horovod', 'replicated'], _C.TRAINER

        # setup NUM_GPUS
        if _C.TRAINER == 'horovod':
            import horovod.tensorflow as hvd
            ngpu = hvd.size()

            if ngpu == hvd.local_size():
                logger.warn(
                    "It's not recommended to use horovod for single-machine training. "
                    "Replicated trainer is more stable and has the same efficiency."
                )
        else:
            assert 'OMPI_COMM_WORLD_SIZE' not in os.environ
            ngpu = get_num_gpu()
        assert ngpu % 8 == 0 or 8 % ngpu == 0, "Can only train with 1,2,4 or >=8 GPUs, but found {} GPUs".format(
            ngpu)
    else:
        # autotune is too slow for inference
        os.environ['TF_CUDNN_USE_AUTOTUNE'] = '0'
        ngpu = get_num_gpu()

    assert ngpu > 0, "Has to run with GPU!"
    if _C.TRAIN.NUM_GPUS is None:
        _C.TRAIN.NUM_GPUS = ngpu
    else:
        if _C.TRAINER == 'horovod':
            assert _C.TRAIN.NUM_GPUS == ngpu
        else:
            assert _C.TRAIN.NUM_GPUS <= ngpu

    _C.freeze()
    logger.info("Config: ------------------------------------------\n" +
                str(_C))
Example No. 21
            pred = outputs[k][0]
            cv2.imwrite("out{}.png".format(
                '-fused' if k == 5 else str(k + 1)), pred * 255)
    else:
        pred = outputs[5][0]
        cv2.imwrite(output, pred * 255)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.')
    parser.add_argument('--load', help='load model')
    parser.add_argument('--view', help='view dataset', action='store_true')
    parser.add_argument('--run', help='run model on images')
    parser.add_argument('--output', help='fused output filename. default to out-fused.png')
    args = parser.parse_args()
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    if args.view:
        view_data()
    elif args.run:
        run(args.load, args.run, args.output)
    else:
        config = get_config()
        if args.load:
            config.session_init = get_model_loader(args.load)
        launch_train_with_config(
            config,
            SyncMultiGPUTrainer(max(get_num_gpu(), 1)))
Example No. 22
    ]

    input = QueueInput(dataset_train)
    input = StagingInput(input, nr_stage=1)
    return TrainConfig(
        model=Model(),
        data=input,
        callbacks=callbacks,
        steps_per_epoch=1281167 // total_batch,
        max_epoch=100,
    )


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.')
    parser.add_argument('--data', help='ILSVRC dataset dir')
    parser.add_argument('--batch', type=int, default=32, help='batch per GPU')
    parser.add_argument('--norm', choices=['none', 'bn', 'gn'], default='none')
    args = parser.parse_args()

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    logger.set_logger_dir(os.path.join('train_log', 'vgg16-norm={}'.format(args.norm)))

    config = get_config()
    nr_tower = max(get_num_gpu(), 1)
    trainer = SyncMultiGPUTrainerReplicated(nr_tower)
    launch_train_with_config(config, trainer)
Example No. 23
def train():
    assert tf.test.is_gpu_available(), "Training requires GPUs!"
    dirname = os.path.join('train_log', 'train-atari-{}'.format(ENV_NAME))
    logger.set_logger_dir(dirname)

    # assign GPUs for training & inference
    num_gpu = get_num_gpu()
    global PREDICTOR_THREAD
    if num_gpu > 0:
        if num_gpu > 1:
            # use half gpus for inference
            predict_tower = list(range(num_gpu))[-num_gpu // 2:]
        else:
            predict_tower = [0]
        PREDICTOR_THREAD = len(predict_tower) * PREDICTOR_THREAD_PER_GPU
        train_tower = list(range(num_gpu))[:-num_gpu // 2] or [0]
        logger.info("[Batch-A3C] Train on gpu {} and infer on gpu {}".format(
            ','.join(map(str, train_tower)), ','.join(map(str,
                                                          predict_tower))))
    else:
        logger.warn(
            "Without GPU this model will never learn! CPU is only useful for debug."
        )
        PREDICTOR_THREAD = 1
        predict_tower, train_tower = [0], [0]

    # setup simulator processes
    name_base = str(uuid.uuid1())[:6]
    prefix = '@' if sys.platform.startswith('linux') else ''
    namec2s = 'ipc://{}sim-c2s-{}'.format(prefix, name_base)
    names2c = 'ipc://{}sim-s2c-{}'.format(prefix, name_base)
    procs = [
        MySimulatorWorker(k, namec2s, names2c) for k in range(SIMULATOR_PROC)
    ]
    ensure_proc_terminate(procs)
    start_proc_mask_signal(procs)

    master = MySimulatorMaster(namec2s, names2c, predict_tower)
    dataflow = BatchData(DataFromQueue(master.queue), BATCH_SIZE)
    config = TrainConfig(
        model=Model(),
        dataflow=dataflow,
        callbacks=[
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate', [(20, 0.0003),
                                                        (120, 0.0001)]),
            ScheduledHyperParamSetter('entropy_beta', [(80, 0.005)]),
            HumanHyperParamSetter('learning_rate'),
            HumanHyperParamSetter('entropy_beta'),
            master,
            StartProcOrThread(master),
            PeriodicTrigger(Evaluator(EVAL_EPISODE, ['state'], ['policy'],
                                      get_player),
                            every_k_epochs=3),
        ],
        session_creator=sesscreate.NewSessionCreator(
            config=get_default_sess_config(0.5)),
        steps_per_epoch=STEPS_PER_EPOCH,
        session_init=get_model_loader(args.load) if args.load else None,
        max_epoch=1000,
    )
    trainer = SimpleTrainer() if num_gpu == 1 else AsyncMultiGPUTrainer(
        train_tower)
    launch_train_with_config(config, trainer)
Example No. 24
    parser.add_argument('--mode', choices=['resnet', 'preact', 'se', 'resnext32x4d'],
                        help='variants of resnet to use', default='resnet')
    args = parser.parse_args()

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    model = Model(args.depth, args.mode)
    model.data_format = args.data_format
    if args.weight_decay_norm:
        model.weight_decay_pattern = ".*/W|.*/gamma|.*/beta"

    if args.eval:
        batch = 128    # something that can run on one gpu
        ds = get_imagenet_dataflow(args.data, 'val', batch)
        eval_classification(model, get_model_loader(args.load), ds)
    else:
        if args.fake:
            logger.set_logger_dir(os.path.join('train_log', 'tmp'), 'd')
        else:
            logger.set_logger_dir(
                os.path.join('train_log',
                             'imagenet-{}-d{}-batch{}'.format(
                                 args.mode, args.depth, args.batch)))

        config = get_config(model)
        if args.load:
            config.session_init = get_model_loader(args.load)
        trainer = SyncMultiGPUTrainerReplicated(max(get_num_gpu(), 1))
        launch_train_with_config(config, trainer)
Example No. 25
            imgaug.ToFloat32(),
        ]
        ds_test2.reset_state()
        ds_test2 = AugmentImageComponent(ds_test2, ag_test2, 0)
        ds_test2 = BatchData(ds_test2, args.batch)
        ds_test2 = PrintData(ds_test2)

        # Setup the config
        config = TrainConfig(
            model=model,
            dataflow=ds_train,
            callbacks=[
                ModelSaver(),
                MinSaver('cost'),
                ScheduledHyperParamSetter('learning_rate',
                                          [(0, 1e-2), (50, 1e-3), (100, 1e-4), (150, 1e-5), (200, 1e-6)]),
                InferenceRunner(ds_valid, [CustomBinaryClassificationStats('estim', 'label', args, prefix='valid'),
                                           ScalarStats(['loss_xent', 'cost'], prefix='valid'),
                                           ], tower_name='ValidTower'),
                InferenceRunner(ds_test2, [CustomBinaryClassificationStats('estim', 'label', args, prefix='test2'),
                                           ScalarStats(['loss_xent', 'cost'], prefix='test2'),
                                           ], tower_name='Test2Tower'),
            ],
            max_epoch=250,
            session_init=SmartInit(args.load),
        )


        trainer = SyncMultiGPUTrainerParameterServer(max(get_num_gpu(), 1))

        launch_train_with_config(config, trainer)
Example No. 26
def finalize_configs(is_training):
    """
    Run some sanity checks, and populate some configs from others
    """
    _C.DATA.NUM_CLASS = _C.DATA.NUM_CATEGORY + 1  # +1 background
    _C.DATA.BASEDIR = os.path.expanduser(_C.DATA.BASEDIR)

    assert _C.BACKBONE.NORM in ['FreezeBN', 'SyncBN', 'GN'], _C.BACKBONE.NORM
    if _C.BACKBONE.NORM != 'FreezeBN':
        assert not _C.BACKBONE.FREEZE_AFFINE
    assert _C.BACKBONE.FREEZE_AT in [0, 1, 2]

    _C.RPN.NUM_ANCHOR = len(_C.RPN.ANCHOR_SIZES) * len(_C.RPN.ANCHOR_RATIOS)
    assert len(_C.FPN.ANCHOR_STRIDES) == len(_C.RPN.ANCHOR_SIZES)
    # image size into the backbone has to be multiple of this number
    _C.FPN.RESOLUTION_REQUIREMENT = _C.FPN.ANCHOR_STRIDES[3]  # [3] because we build FPN with features r2,r3,r4,r5

    if _C.MODE_FPN:
        size_mult = _C.FPN.RESOLUTION_REQUIREMENT * 1.
        _C.PREPROC.MAX_SIZE = np.ceil(_C.PREPROC.MAX_SIZE / size_mult) * size_mult
        assert _C.FPN.PROPOSAL_MODE in ['Level', 'Joint']
        assert _C.FPN.FRCNN_HEAD_FUNC.endswith('_head')
        assert _C.FPN.MRCNN_HEAD_FUNC.endswith('_head')
        assert _C.FPN.NORM in ['None', 'GN']

        if _C.FPN.CASCADE:
            num_cascade = _C.CASCADE.NUM_STAGES
            # the first threshold is the proposal sampling threshold
            assert len(_C.CASCADE.IOUS) == num_cascade
            assert _C.CASCADE.IOUS[0] == _C.FRCNN.FG_THRESH
            assert len(_C.CASCADE.BBOX_REG_WEIGHTS) == num_cascade

    if is_training:
        train_scales = _C.PREPROC.TRAIN_SHORT_EDGE_SIZE
        if train_scales[1] - train_scales[0] > 100:
            # don't autotune if augmentation is on
            os.environ['TF_CUDNN_USE_AUTOTUNE'] = '0'
        os.environ['TF_AUTOTUNE_THRESHOLD'] = '1'
        assert _C.TRAINER in ['horovod', 'replicated'], _C.TRAINER

        # setup NUM_GPUS
        if _C.TRAINER == 'horovod':
            import horovod.tensorflow as hvd
            ngpu = hvd.size()
        else:
            assert 'OMPI_COMM_WORLD_SIZE' not in os.environ
            ngpu = get_num_gpu()
        assert ngpu > 0, "Has to run with GPU!"
        assert ngpu % 8 == 0 or 8 % ngpu == 0, ngpu
        if _C.TRAIN.NUM_GPUS is None:
            _C.TRAIN.NUM_GPUS = ngpu
        else:
            if _C.TRAINER == 'horovod':
                assert _C.TRAIN.NUM_GPUS == ngpu
            else:
                assert _C.TRAIN.NUM_GPUS <= ngpu
    else:
        # autotune is too slow for inference
        os.environ['TF_CUDNN_USE_AUTOTUNE'] = '0'

    _C.freeze()
    logger.info("Config: ------------------------------------------\n" + str(_C))
Example No. 27
    parser.add_argument('--load', help='load model for training')
    args = parser.parse_args()
    NUM_UNITS = args.num_units

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    logger.auto_set_dir()

    dataset_train = get_data('train')
    dataset_test = get_data('test')

    config = TrainConfig(
        model=Model(n=NUM_UNITS),
        dataflow=dataset_train,
        callbacks=[
            ModelSaver(),
            InferenceRunner(
                dataset_test,
                [ScalarStats('cost'),
                 ClassificationError('wrong_vector')]),
            ScheduledHyperParamSetter('learning_rate', [(1, 0.1), (82, 0.01),
                                                        (123, 0.001),
                                                        (300, 0.0002)])
        ],
        max_epoch=400,
        session_init=SaverRestore(args.load) if args.load else None)
    num_gpu = max(get_num_gpu(), 1)
    launch_train_with_config(config,
                             SyncMultiGPUTrainerParameterServer(num_gpu))
Example No. 28
def finalize_configs(is_training):
    """
    Run some sanity checks, and populate some configs from others
    """
    _C.DATA.NUM_CLASS = _C.DATA.NUM_CATEGORY + 1  # +1 background
    _C.DATA.BASEDIR = os.path.expanduser(_C.DATA.BASEDIR)

    assert _C.BACKBONE.NORM in ['FreezeBN', 'SyncBN', 'GN'], _C.BACKBONE.NORM
    if _C.BACKBONE.NORM != 'FreezeBN':
        assert not _C.BACKBONE.FREEZE_AFFINE
    assert _C.BACKBONE.FREEZE_AT in [0, 1, 2]

    _C.RPN.NUM_ANCHOR = len(_C.RPN.ANCHOR_SIZES) * len(_C.RPN.ANCHOR_RATIOS)
    assert len(_C.FPN.ANCHOR_STRIDES) == len(_C.RPN.ANCHOR_SIZES)
    # image size into the backbone has to be multiple of this number
    _C.FPN.RESOLUTION_REQUIREMENT = _C.FPN.ANCHOR_STRIDES[
        3]  # [3] because we build FPN with features r2,r3,r4,r5

    if _C.MODE_FPN:
        size_mult = _C.FPN.RESOLUTION_REQUIREMENT * 1.
        _C.PREPROC.MAX_SIZE = np.ceil(
            _C.PREPROC.MAX_SIZE / size_mult) * size_mult
        assert _C.FPN.PROPOSAL_MODE in ['Level', 'Joint']
        assert _C.FPN.FRCNN_HEAD_FUNC.endswith('_head')
        assert _C.FPN.MRCNN_HEAD_FUNC.endswith('_head')
        assert _C.FPN.NORM in ['None', 'GN']

        if _C.FPN.CASCADE:
            num_cascade = _C.CASCADE.NUM_STAGES
            # the first threshold is the proposal sampling threshold
            assert len(_C.CASCADE.IOUS) == num_cascade
            assert _C.CASCADE.IOUS[0] == _C.FRCNN.FG_THRESH
            assert len(_C.CASCADE.BBOX_REG_WEIGHTS) == num_cascade

    if is_training:
        train_scales = _C.PREPROC.TRAIN_SHORT_EDGE_SIZE
        if train_scales[1] - train_scales[0] > 100:
            # don't autotune if augmentation is on
            os.environ['TF_CUDNN_USE_AUTOTUNE'] = '0'
        os.environ['TF_AUTOTUNE_THRESHOLD'] = '1'
        assert _C.TRAINER in ['horovod', 'replicated'], _C.TRAINER

        # setup NUM_GPUS
        if _C.TRAINER == 'horovod':
            import horovod.tensorflow as hvd
            ngpu = hvd.size()
        else:
            assert 'OMPI_COMM_WORLD_SIZE' not in os.environ
            ngpu = get_num_gpu()
        assert ngpu > 0, "Has to run with GPU!"
        assert ngpu % 8 == 0 or 8 % ngpu == 0, ngpu
        if _C.TRAIN.NUM_GPUS is None:
            _C.TRAIN.NUM_GPUS = ngpu
        else:
            if _C.TRAINER == 'horovod':
                assert _C.TRAIN.NUM_GPUS == ngpu
            else:
                assert _C.TRAIN.NUM_GPUS <= ngpu
    else:
        # autotune is too slow for inference
        os.environ['TF_CUDNN_USE_AUTOTUNE'] = '0'

    _C.freeze()
    logger.info("Config: ------------------------------------------\n" +
                str(_C))
Example No. 29
def train(checkpoint_dir,
          model_name,
          dataset,
          num_epochs,
          quant_type,
          batch_size_per_gpu,
          lr=None,
          post_quantize_only=False):
    train_data, test_data, (img_shape,
                            label_shape) = datasets.DATASETS[dataset]()

    num_gpus = max(gpu.get_num_gpu(), 1)
    effective_batch_size = batch_size_per_gpu * num_gpus
    train_data = BatchData(train_data, batch_size_per_gpu)
    test_data = BatchData(test_data, batch_size_per_gpu, remainder=True)
    steps_per_epoch = len(train_data) // num_gpus

    if lr:
        if isinstance(lr, str):
            lr = ast.literal_eval(lr)
        if isinstance(lr, float):
            lr_schedule = [(0, lr)]
        else:
            lr_schedule = lr
    else:
        lr_schedule = [(0, 0.005), (8, 0.1), (25, 0.005), (30, 0)]

    if num_epochs is None:
        num_epochs = lr_schedule[-1][0]
    if post_quantize_only:
        start_quantising_at_epoch = 0
    else:
        start_quantising_at_epoch = lr_schedule[-2][0] if len(
            lr_schedule) > 1 else max(0, num_epochs - 5)

    logger.info(f"Training with LR schedule: {str(lr_schedule)}")
    logger.info(f"Quantising at epoch {start_quantising_at_epoch}")

    # train_data = FakeData([(batch_size_per_gpu,) + img_shape, (batch_size_per_gpu, ) + label_shape])

    model_func, input_spec, output_spec = get_model_func(
        "train",
        model_name,
        quant_type,
        img_shape,
        num_classes=label_shape[0],
        quant_delay=steps_per_epoch * start_quantising_at_epoch)
    target_spec = [
        tf.TensorSpec(t.shape, t.dtype, name=t.name.split("/")[-1] + "_target")
        for t in output_spec
    ]
    model = KerasModel(get_model=model_func,
                       input_signature=input_spec,
                       target_signature=target_spec,
                       input=train_data,
                       trainer=SyncMultiGPUTrainerParameterServer(
                           num_gpus, ps_device='gpu'))

    lr = tf.get_variable('learning_rate',
                         initializer=lr_schedule[0][1],
                         trainable=False)
    tf.summary.scalar('learning_rate-summary', lr)
    model.compile(optimizer=tf.train.MomentumOptimizer(learning_rate=lr,
                                                       momentum=0.9),
                  loss="categorical_crossentropy",
                  metrics=["categorical_accuracy"])

    model.fit(steps_per_epoch=steps_per_epoch,
              max_epoch=num_epochs,
              callbacks=[
                  ModelSaver(max_to_keep=1, checkpoint_dir=checkpoint_dir),
                  DataParallelInferenceRunner(
                      test_data, ScalarStats(model._stats_to_inference),
                      num_gpus),
                  ScheduledHyperParamSetter('learning_rate',
                                            lr_schedule,
                                            interp="linear"),
                  StatMonitorParamSetter('learning_rate',
                                         'validation_categorical_accuracy',
                                         lambda x: x / 2,
                                         threshold=0.001,
                                         last_k=10,
                                         reverse=True)
              ],
              session_init=SaverRestore(checkpoint_dir + "/checkpoint")
              if post_quantize_only else None)
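The lr argument above may arrive as a string; a hedged sketch of the ast.literal_eval parsing path with an assumed command-line value:

# Hedged sketch: parsing a string lr argument as done above (the value is assumed).
import ast

lr = "[(0, 0.01), (10, 0.001), (20, 0.0001)]"
if isinstance(lr, str):
    lr = ast.literal_eval(lr)
lr_schedule = [(0, lr)] if isinstance(lr, float) else lr
print(lr_schedule)   # [(0, 0.01), (10, 0.001), (20, 0.0001)]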
Example No. 30
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    if args.apply:
        apply(args.load, args.lowres, args.output)
    else:
        logger.auto_set_dir()

        if args.load:
            session_init = SaverRestore(args.load)
        else:
            assert os.path.isfile(args.vgg19)
            param_dict = dict(np.load(args.vgg19))
            param_dict = {'VGG19/' + name: value for name, value in six.iteritems(param_dict)}
            session_init = DictRestore(param_dict)

        nr_tower = max(get_num_gpu(), 1)
        data = QueueInput(get_data(args.data))
        model = Model()

        trainer = SeparateGANTrainer(data, model, d_period=3)

        trainer.train_with_defaults(
            callbacks=[
                ModelSaver(keep_checkpoint_every_n_hours=2)
            ],
            session_init=session_init,
            steps_per_epoch=len(data) // 4,
            max_epoch=300
        )
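A hedged sketch of the scope-prefixing applied to the VGG19 weights above, using a toy dictionary instead of the real .npz file (the key name is a made-up stand-in):

# Hedged sketch: prefixing parameter names with the variable scope, as above.
import six

param_dict = {'conv1_1/W': [0.0]}   # toy stand-in for the npz contents
param_dict = {'VGG19/' + name: value for name, value in six.iteritems(param_dict)}
print(list(param_dict))             # ['VGG19/conv1_1/W']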
Example No. 31
    parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.')
    parser.add_argument('--load', help='load model')
    parser.add_argument('--sample', action='store_true', help='run sampling')
    parser.add_argument('--data', help='Image directory', required=True)
    parser.add_argument('--mode', choices=['AtoB', 'BtoA'], default='AtoB')
    parser.add_argument('-b', '--batch', type=int, default=1)
    args = parser.parse_args()
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    BATCH = args.batch

    if args.sample:
        assert args.load
        sample(args.data, args.load)
    else:
        logger.auto_set_dir()

        data = QueueInput(get_data())
        trainer = GANTrainer(data, Model(), get_num_gpu())

        trainer.train_with_defaults(
            callbacks=[
                PeriodicTrigger(ModelSaver(), every_k_epochs=3),
                ScheduledHyperParamSetter('learning_rate', [(200, 1e-4)])
            ],
            steps_per_epoch=data.size(),
            max_epoch=300,
            session_init=SaverRestore(args.load) if args.load else None
        )