def get_config(model, fake=False):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate',
                                      [(30, 1e-2), (60, 1e-3), (85, 1e-4), (95, 1e-5), (105, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=5000,
        max_epoch=110,
        nr_tower=nr_tower
    )
Example #2
def get_config(model, nr_tower):
    batch = TOTAL_BATCH_SIZE // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(
        nr_tower, batch))
    dataset_train = get_data('train', batch)
    dataset_val = get_data('val', batch)
    callbacks = [
        ModelSaver(),
        ScheduledHyperParamSetter('learning_rate', [(0, 3e-1), (30, 3e-2),
                                                    (60, 3e-3), (90, 3e-4)]),
        HumanHyperParamSetter('learning_rate'),
    ]
    infs = [
        ClassificationError('wrong-top1', 'val-error-top1'),
        ClassificationError('wrong-top5', 'val-error-top5')
    ]
    if nr_tower == 1:
        # single-GPU inference with queue prefetch
        callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
    else:
        # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(
            DataParallelInferenceRunner(dataset_val, infs,
                                        list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=5000,
        max_epoch=100,
    )
Example #3
def get_config(model, fake=False):
    nr_tower = max(get_num_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(
        nr_tower, batch))
    if batch < 32 or batch > 64:
        logger.warn(
            "Batch size per tower not in [32, 64]. This probably will lead to worse accuracy than reported."
        )
    if fake:
        data = QueueInput(
            FakeData([[batch, 224, 224, 3], [batch]],
                     1000,
                     random=False,
                     dtype='uint8'))
        callbacks = []
    else:
        data = QueueInput(get_data('train', batch))

        START_LR = 0.1
        BASE_LR = START_LR * (args.batch / 256.0)
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter('learning_rate',
                                      [(0, min(START_LR, BASE_LR)),
                                       (30, BASE_LR * 1e-1),
                                       (60, BASE_LR * 1e-2),
                                       (90, BASE_LR * 1e-3),
                                       (100, BASE_LR * 1e-4)]),
        ]
        if BASE_LR > START_LR:
            callbacks.append(
                ScheduledHyperParamSetter('learning_rate', [(0, START_LR),
                                                            (5, BASE_LR)],
                                          interp='linear'))

        infs = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        dataset_val = get_data('val', batch)
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))

    return TrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1281167 // args.batch,
        max_epoch=105,
    )
Example #4
def get_config(model, nr_tower):
    batch = TOTAL_BATCH_SIZE // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    dataset_train = get_data('train', batch)
    dataset_val = get_data('val', batch)
    callbacks = [
        ModelSaver(),
        ScheduledHyperParamSetter('learning_rate',
                                  [(0, 3e-1), (30, 3e-2), (60, 3e-3), (90, 3e-4)]),
        HumanHyperParamSetter('learning_rate'),
    ]
    infs = [ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')]
    if nr_tower == 1:
        # single-GPU inference with queue prefetch
        callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
    else:
        # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(DataParallelInferenceRunner(
            dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=5000,
        max_epoch=100,
    )
Example #5
def get_config(fake=False, data_format='NCHW'):
    nr_tower = max(get_nr_gpu(), 1)
    global BATCH_SIZE
    BATCH_SIZE = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = dataset_val = FakeData([[64, 224, 224, 3], [64]],
                                               1000,
                                               random=False,
                                               dtype='uint8')
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, BATCH_SIZE))
        dataset_train = get_data('train')
        dataset_val = get_data('val')

    return TrainConfig(
        model=Model(data_format=data_format),
        dataflow=dataset_train,
        callbacks=[
            ModelSaver(),
            InferenceRunner(dataset_val, [
                ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')
            ]),
            ScheduledHyperParamSetter('learning_rate', [(30, 1e-2), (60, 1e-3),
                                                        (85, 1e-4), (95, 1e-5),
                                                        (105, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ],
        steps_per_epoch=5000,
        max_epoch=110,
        nr_tower=nr_tower)
Example #6
    def save(self, path, force=False):
        """Save the fitted model in the given path."""
        if os.path.exists(path) and not force:
            logger.info(
                'The indicated path already exists. Use `force=True` to overwrite.'
            )
            return

        base_path = os.path.dirname(path)
        if not os.path.exists(base_path):
            os.makedirs(base_path)

        model = self.model
        dataset_predictor = self.simple_dataset_predictor

        self.model = None
        self.simple_dataset_predictor = None

        with open('{}/TGANModel'.format(self.output), 'wb') as f:
            pickle.dump(self, f)

        self.model = model
        self.simple_dataset_predictor = dataset_predictor

        self.tar_folder(path)

        logger.info('Model saved successfully.')
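
# Hypothetical call site for save(); `tgan` stands in for a fitted instance
# of the surrounding class and the path is illustrative.
tgan.save('output/my_model.tar')              # logs and returns if the path exists
tgan.save('output/my_model.tar', force=True)  # overwrites an existing path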
Example #7
def get_config(model, nr_tower):
    batch = TOTAL_BATCH_SIZE // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    dataset_train = get_data('train', batch)
    dataset_val = get_data('val', batch)

    step_size = 1280000 // TOTAL_BATCH_SIZE
    max_iter = 3 * 10**5
    max_epoch = (max_iter // step_size) + 1
    callbacks = [
        ModelSaver(),
        ScheduledHyperParamSetter('learning_rate',
                                  [(0, 0.5), (max_iter, 0)],
                                  interp='linear', step_based=True),
    ]
    infs = [ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')]
    if nr_tower == 1:
        # single-GPU inference with queue prefetch
        callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
    else:
        # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(DataParallelInferenceRunner(
            dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=step_size,
        max_epoch=max_epoch,
    )
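
# Worked example of the step-based schedule arithmetic above, assuming
# TOTAL_BATCH_SIZE = 256 (illustrative value):
TOTAL_BATCH_SIZE = 256
step_size = 1280000 // TOTAL_BATCH_SIZE  # 5000 steps per epoch
max_iter = 3 * 10**5                     # LR decays linearly to 0 at this step
max_epoch = (max_iter // step_size) + 1  # 61 epochs
print(step_size, max_epoch)              # 5000 61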
Example #8
def convert_param_name(param):
    print('--> convert_param_name ...')
    resnet_param = {}
    for k in param.keys():
        logger.info("Load the weights of the module {}".format(k.split(":")[0]))
        resnet_param[k.split(":")[0]] = param[k]

    return resnet_param
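
# Illustrative round-trip for convert_param_name, which strips the ':0'
# tensor suffix TensorFlow appends to variable names; the arrays are dummies
# and the script's `logger` import is assumed.
import numpy as np

param = {'conv0/W:0': np.zeros((3, 3, 3, 64)), 'conv0/b:0': np.zeros(64)}
resnet_param = convert_param_name(param)
print(sorted(resnet_param))  # ['conv0/W', 'conv0/b']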
Example #9
def get_config(model, fake=False):
    start_ = 0
    nr_tower = max(get_nr_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    if fake:
        dataset_train = FakeData(
            [[batch, 224, 224, 3], [batch]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)

        START_LR = 0.1
        BASE_LR = START_LR * (args.batch / 256.0)
        if start_ < 31:
            lr_setting = [(max(30 - start_, 0), BASE_LR * 1e-1),
                          (60 - start_, BASE_LR * 1e-2),
                          (90 - start_, BASE_LR * 1e-3),
                          (105 - start_, BASE_LR * 1e-4)]
        elif start_ < 61:
            lr_setting = [(max(60 - start_, 0), BASE_LR * 1e-2),
                          (90 - start_, BASE_LR * 1e-3),
                          (105 - start_, BASE_LR * 1e-4)]
        elif start_ < 91:
            lr_setting = [(max(90 - start_, 0), BASE_LR * 1e-3),
                          (105 - start_, BASE_LR * 1e-4)]
        else:
            # lr_setting would otherwise be unbound below; fail loudly instead
            raise ValueError('No learning rate setting for start_ = {}'.format(start_))

        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter(
                'learning_rate', lr_setting),
            # TensorPrinter(['tower1/group3/block2/conv2/Abs_0', 'tower1/group3/block2/conv2/Abs_1:0', 'tower1/group3/block2/conv2/Abs_2:0'])
        ]
        if BASE_LR > START_LR:
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, START_LR), (5, BASE_LR)], interp='linear'))

        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1280000 // args.batch,
        max_epoch=120,
    )
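
# Illustrative check of the start_-shifted schedule above: resuming at a
# hypothetical start_ = 20 moves the 30/60/90/105-epoch decay points 20
# epochs earlier.
BASE_LR = 0.1
start_ = 20
lr_setting = [(max(30 - start_, 0), BASE_LR * 1e-1),
              (60 - start_, BASE_LR * 1e-2),
              (90 - start_, BASE_LR * 1e-3),
              (105 - start_, BASE_LR * 1e-4)]
print([(e, round(lr, 6)) for e, lr in lr_setting])
# [(10, 0.01), (40, 0.001), (70, 0.0001), (85, 1e-05)]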
Example #10
def get_config(model, scales, distill=False, fake=False, data_aug=True):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, 224, 224, 3], [64]],
                                 1000,
                                 random=False,
                                 dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))
        dataset_train = get_data('train', batch, data_aug)
        dataset_val = get_data('val', batch, data_aug)
        callbacks = [ModelSaver()]
        if data_aug:
            callbacks.append(
                ScheduledHyperParamSetter('learning_rate', [(30, 1e-2),
                                                            (60, 1e-3),
                                                            (85, 1e-4),
                                                            (95, 1e-5),
                                                            (105, 1e-6)]))
        callbacks.append(HumanHyperParamSetter('learning_rate'))
        infs = []
        for scale in scales:
            infs.append(
                ClassificationError('wrong-scale%03d-top1' % scale,
                                    'val-error-scale%03d-top1' % scale))
            infs.append(
                ClassificationError('wrong-scale%03d-top5' % scale,
                                    'val-error-scale%03d-top5' % scale))
        if distill:
            infs.append(
                ClassificationError('wrong-scale_ensemble-top1',
                                    'val-error-scale_ensemble-top1'))
            infs.append(
                ClassificationError('wrong-scale_ensemble-top5',
                                    'val-error-scale_ensemble-top5'))
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))

    return AutoResumeTrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=5000 if TOTAL_BATCH_SIZE == 256 else 10000,
        max_epoch=120 if data_aug else 64,
        nr_tower=nr_tower)
Example #11
def get_config(model, fake=False):
    nr_tower = max(get_nr_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, 224, 224, 3], [64]],
                                 1000,
                                 random=False,
                                 dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)

        BASE_LR = 0.1 * (args.batch / 256.0)
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter('learning_rate',
                                      [(30, BASE_LR * 1e-1),
                                       (60, BASE_LR * 1e-2),
                                       (85, BASE_LR * 1e-3),
                                       (95, BASE_LR * 1e-4),
                                       (105, BASE_LR * 1e-5)]),
        ]
        if BASE_LR > 0.1:
            callbacks.append(
                ScheduledHyperParamSetter('learning_rate', [(0, 0.1),
                                                            (3, BASE_LR)],
                                          interp='linear'))

        infs = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1280000 // args.batch,
        max_epoch=110,
    )
Example #12
def get_config(model):
    batch = 1

    logger.info("For benchmark, batch size is fixed to 1 per tower.")
    data = QueueInput(
        FakeData([[1, 224, 224, 3], [1]], 1, random=False, dtype='uint8'))

    return TrainConfig(model=model,
                       data=data,
                       callbacks=[],
                       steps_per_epoch=1,
                       max_epoch=1)
Example #13
def get_config(model, fake=False):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, 224, 224, 3], [64]],
                                 1000,
                                 random=False,
                                 dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate',
                                      [(0, 5e-4), (20, 2.5e-4), (40, 1.25e-4),
                                       (60, 5e-5), (80, 2.5e-5),
                                       (100, 1.25e-5), (120, 5e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        # Finetune COCO
        # [(0, 5e-4), (20, 2.5e-4), (40, 1.25e-4), (60, 5e-5), (80, 2.5e-5), (100, 1.25e-5), (120, 5e-6)]
        # JT COCO
        # [(0, 2.5e-4), (20, 1.25e-4), (40, 5e-5), (60, 2.5e-5), (80, 1e-5), (100, 5e-6), (120, 2.5e-6)]
        # Finetune to VOC
        # [(0, 1.25e-4), (20, 5e-5), (40, 2.5e-5), (60, 1.25e-5), (80, 5e-6), (100, 2.5e-6), (120, 1.25e-6)]

        # infs = [ClassificationError('wrong-top1', 'val-error-top1'),
        #         ClassificationError('wrong-top5', 'val-error-top5')]
        infs = [
            ClassificationError('loss-wrong-top1', 'loss-val-error-top1'),
            ClassificationError('loss-wrong-top5', 'loss-val-error-top5')
        ]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))

    return TrainConfig(model=model,
                       dataflow=dataset_train,
                       callbacks=callbacks,
                       steps_per_epoch=1522,
                       max_epoch=140,
                       nr_tower=nr_tower)
Example #14
def guess_inputs(input_dir):
    meta_candidates = []
    model_candidates = []
    for path in os.listdir(input_dir):
        if path.startswith('graph-') and path.endswith('.meta'):
            meta_candidates.append(path)
        if path.startswith('model-') and path.endswith('.index'):
            modelid = int(path[len('model-'):-len('.index')])
            model_candidates.append((path, modelid))
    assert len(meta_candidates)
    meta = sorted(meta_candidates)[-1]
    if len(meta_candidates) > 1:
        logger.info("Choosing {} from {} as graph file.".format(
            meta, meta_candidates))
    else:
        logger.info("Choosing {} as graph file.".format(meta))

    assert len(model_candidates)
    model = sorted(model_candidates, key=lambda x: x[1])[-1][0]
    if len(model_candidates) > 1:
        logger.info("Choosing {} from {} as model file.".format(
            model, [x[0] for x in model_candidates]))
    else:
        logger.info("Choosing {} as model file.".format(model))
    return os.path.join(input_dir, model), os.path.join(input_dir, meta)
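
# Usage sketch for guess_inputs, assuming a tensorpack train_log directory
# containing graph-*.meta and model-*.index files (path illustrative):
model_path, meta_path = guess_inputs('train_log/resnet')
print(model_path, meta_path)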
Example #15
def log_config_info(config):
    config_keys = sorted(config.keys())
    data = []
    for k in config_keys:
        data.append([k, config[k]])
    headers = ['config_name', 'content']

    table = tabulate(data, headers=headers)
    logger.info(colored("List of Config Args: \n", 'cyan') + table)

    # save as json
    writefile = os.path.join(logger.get_logger_dir(), 'config.json')
    with open(writefile, 'w') as f:
        json.dump(config, f)
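
# Hypothetical usage of log_config_info, assuming logger.set_logger_dir(...)
# was called beforehand (get_logger_dir() requires it) and that json,
# tabulate and termcolor's colored are imported as in the original script:
log_config_info({'depth': 50, 'batch': 256, 'lr': 0.1})
# prints a two-column table and writes <logdir>/config.json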
Example #16
def get_config(model, fake=False, xla=False):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, 224, 224, 3], [64]],
                                 1000,
                                 random=False,
                                 dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate', [(10, 1e-2), (20, 1e-3),
                                                        (85, 1e-4), (95, 1e-5),
                                                        (105, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        infs = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))

        config = tf.ConfigProto()
        jit_level = 0
        if xla:
            # Turns on XLA JIT compilation
            jit_level = tf.OptimizerOptions.ON_1
        config.graph_options.optimizer_options.global_jit_level = jit_level

    return TrainConfig(model=model,
                       dataflow=dataset_train,
                       callbacks=callbacks,
                       steps_per_epoch=10,
                       max_epoch=1,
                       nr_tower=nr_tower)
Example #17
def get_config(model, fake=False):
    nr_tower = max(get_num_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    if batch < 32 or batch > 64:
        logger.warn("Batch size per tower not in [32, 64]. This probably will lead to worse accuracy than reported.")
    if fake:
        data = QueueInput(FakeData(
            [[batch, 224, 224, 3], [batch]], 1000, random=False, dtype='uint8'))
        callbacks = []
    else:
        data = QueueInput(get_data('train', batch))

        START_LR = 0.1
        BASE_LR = START_LR * (args.batch / 256.0)
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter(
                'learning_rate', [
                    (0, min(START_LR, BASE_LR)), (30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2),
                    (90, BASE_LR * 1e-3), (100, BASE_LR * 1e-4)]),
        ]
        if BASE_LR > START_LR:
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, START_LR), (5, BASE_LR)], interp='linear'))

        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        dataset_val = get_data('val', batch)
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1281167 // args.batch,
        max_epoch=105,
    )
Example #18
def get_config(args, model, num_gpus, num_towers):
    """
    Create the TensorPack Trainer configuration.

    :param args: The cli arguments.
    :param model: The model object to train.
    :param num_gpus: The number of gpus on which to train
    :param num_towers: The number of data parallel towers to create

    :return: A TrainConfig object.
    """
    logger.info("Running on {} towers. Batch size per tower: {}".format(
        num_towers, args.batch_size))

    df_train = avatar_synth_df(args.train_dir, args.batch_size,
                               args.num_threads)
    df_test = avatar_synth_df(args.test_dir, args.batch_size, args.num_threads)

    def update_lr(epoch, cur_lr):
        """ Approximate exponential decay of the learning rate """
        if args.resume_lr:
            return cur_lr * args.lr_decay
        else:
            return args.lr * args.lr_decay**epoch

    callbacks = [
        cb.ModelSaver(),
        cb.MinSaver('val-error-top1'),
        cb.HyperParamSetterWithFunc('tower0/Avatar_Synth/LR:0', update_lr),
        cb.MergeAllSummaries(period=args.summary_freq),
    ]
    infs = [cb.ScalarStats('Avatar_Synth/Cost')]

    if num_gpus > 0:
        callbacks.append(cb.GPUUtilizationTracker())

    if num_towers == 1:  # single-GPU inference with queue prefetch
        callbacks.append(cb.InferenceRunner(QueueInput(df_test), infs))
    else:  # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(
            cb.DataParallelInferenceRunner(df_test, infs,
                                           list(range(num_towers))))

    return TrainConfig(model=model,
                       dataflow=df_train,
                       callbacks=callbacks,
                       max_epoch=args.epochs,
                       nr_tower=num_towers)
Example #19
def get_config(model, checkpoint_dir, target_shape, fake=False):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, target_shape, target_shape, 3], [64]],
                                 1000,
                                 random=False,
                                 dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))
        dataset_train = get_data('train', batch, target_shape)
        dataset_val = get_data('val', batch, target_shape)
        callbacks = [
            ModelSaver(checkpoint_dir=checkpoint_dir),
            ScheduledHyperParamSetter('learning_rate', [(30, 1e-2), (60, 1e-3),
                                                        (85, 1e-4), (95, 1e-5),
                                                        (105, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        infs = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))


    # 7.5 it / sec testing
    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 300,  # 5000
        max_epoch=110,
        nr_tower=nr_tower)
Example #20
def get_config(model, fake=False):
    nr_tower = max(get_nr_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)

        BASE_LR = 0.1 * (args.batch / 256.0)
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter(
                'learning_rate', [(30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2),
                                  (90, BASE_LR * 1e-3), (100, BASE_LR * 1e-4)]),
        ]
        if BASE_LR > 0.1:
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, 0.1), (3, BASE_LR)], interp='linear'))

        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1280000 // args.batch,
        max_epoch=105,
    )
Example #21
def get_config(model):
    nr_tower = get_nr_gpu()

    logger.info("Running on {} towers. Batch size per tower: {}".format(
        nr_tower, args.batch_size_per_gpu))
    dataset_train = get_data('train', args.batch_size_per_gpu)
    dataset_val = get_data('val', args.batch_size_per_gpu)

    BASE_LR = 1e-3 * (args.batch_size_per_gpu * nr_tower / 256.0)
    callbacks = [
        ModelSaver(),
        ScheduledHyperParamSetter('learning_rate', [(0, BASE_LR),
                                                    (60, BASE_LR * 1e-1),
                                                    (90, BASE_LR * 1e-2)]),
        HumanHyperParamSetter('learning_rate'),
    ]
    '''
    if BASE_LR > 0.1:
        callbacks.append(
            ScheduledHyperParamSetter(
                'learning_rate', [(0, 0.1), (3, BASE_LR)], interp='linear'))
    '''

    infs = [
        ClassificationError('wrong-top1', 'val-error-top1'),
        ClassificationError('wrong-top5', 'val-error-top5')
    ]
    if nr_tower == 1:
        # single-GPU inference with queue prefetch
        callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
    else:
        # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(
            DataParallelInferenceRunner(dataset_val, infs,
                                        list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=1280000 // (args.batch_size_per_gpu * nr_tower),
        max_epoch=110,
    )
Example #22
    def _restart_episode(self):
        """
        restart current episode
        """
        logger.info("Medical Player restarting episode")
        self.terminal = [False] * self.agents
        self.reward = np.zeros((self.agents, ))
        self.cnt = 0  # counter to limit number of steps per episodes
        self.num_games.feed(1)
        self._loc_history = [[(0, ) * self.dims
                              for _ in range(self._history_length)]
                             for _ in range(self.agents)]
        # list of q-value lists
        self._qvalues_history = [[(0, ) * self.actions
                                  for _ in range(self._history_length)]
                                 for _ in range(self.agents)]
        for i in range(0, self.agents):
            self.current_episode_score[i].reset()
        self.new_random_game()
Example #23
def get_config(model, data_dir, crop_method_TR, color_augmentation, crop_method_TS):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))

    # data pipelines of train and validation
    dataset_train = get_data('train', data_dir, batch, crop_method_TR,
                             color_augmentation=color_augmentation,
                             CAM_dir_pkl=CAM_DIR_PKL)
    dataset_val = get_data('val', data_dir, batch, crop_method_TS)

    # TODO
    callbacks = [
        # class callbacks.ModelSaver(max_to_keep = 10, keep_checkpoint_every_n_hours = 0.5,
        #     checkpoint_dir = None, var_collections = 'variables')
        ModelSaver(max_to_keep=MAX_EPOCH),
        # @ 20171129: finetune on ResNet d18 from ImageNet
        # maybe a moderate learning_rate is preferable
        ScheduledHyperParamSetter('learning_rate',
                                  [(0, 1e-3), (20, 5e-4), (40, 1e-4), (60, 1e-5)]),
        HumanHyperParamSetter('learning_rate'),
    ]

    # 0 or 1
    infs = [ClassificationError('wrong-top1', 'val-error-top1')]

    if nr_tower == 1:
        # single-GPU inference with queue prefetch
        callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))

    else:
        # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(DataParallelInferenceRunner(dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        # steps_per_epoch=5000,
        max_epoch=MAX_EPOCH,
        nr_tower=nr_tower
    )
Example #24
def _import_external_ops(message):
    if "horovod" in message.lower():
        logger.info("Importing horovod ...")
        import horovod.tensorflow  # noqa
        return
    if "MaxBytesInUse" in message:
        logger.info("Importing memory_stats ...")
        from tensorflow.contrib.memory_stats import MaxBytesInUse  # noqa
        return
    if 'Nccl' in message:
        logger.info("Importing nccl ...")
        if TF_version <= (1, 12):
            try:
                from tensorflow.contrib.nccl.python.ops.nccl_ops import _validate_and_load_nccl_so
            except Exception:
                pass
            else:
                _validate_and_load_nccl_so()
            from tensorflow.contrib.nccl.ops import gen_nccl_ops  # noqa
        else:
            from tensorflow.python.ops import gen_nccl_ops  # noqa
        return
    if 'ZMQConnection' in message:
        import zmq_ops  # noqa
        return
    logger.error("Unhandled error: " + message)
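
# Sketch of the intended use of _import_external_ops: retry a graph import
# after loading the op library named in the error message. The meta path and
# the exception type caught here are assumptions for illustration.
try:
    tf.train.import_meta_graph('graph.meta')
except KeyError as e:
    _import_external_ops(str(e))
    tf.train.import_meta_graph('graph.meta')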
Example #25
def get_config(model, conf):
    nr_tower = max(get_nr_gpu(), 1)
    batch = conf.batch
    if conf.fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, 224, 224, 3], [64]],
                                 1000,
                                 random=False,
                                 dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))
        dataset_train = get_data(conf.data_dir, 'train', batch)
        dataset_val = get_data(conf.data_dir, 'val', batch)
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate', [(45, 1e-2), (60, 1e-3),
                                                        (65, 1e-4), (70, 1e-5),
                                                        (75, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        infs = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))
    return TrainConfig(model=model,
                       dataflow=dataset_train,
                       callbacks=callbacks,
                       steps_per_epoch=5000,
                       max_epoch=80,
                       nr_tower=nr_tower)
Example #26
def get_config(model, fake=False, data_aug=True):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
        dataset_train = get_data('train', batch, data_aug)
        dataset_val = get_data('val', batch, data_aug)
        callbacks = [
            ModelSaver(),
        ]
        if data_aug:
            callbacks.append(ScheduledHyperParamSetter('learning_rate',
                                                       [(30, 1e-2), (60, 1e-3), (85, 1e-4), (95, 1e-5), (105, 1e-6)]))
        callbacks.append(HumanHyperParamSetter('learning_rate'))
        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return AutoResumeTrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=5000 if TOTAL_BATCH_SIZE == 256 else 10000,
        max_epoch=110 if data_aug else 64,
        nr_tower=nr_tower
    )
Example #27
def get_config(model, nr_tower):
    batch = TOTAL_BATCH_SIZE // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(
        nr_tower, batch))
    dataset_train = get_data('train', batch)
    dataset_val = get_data('val', batch)

    step_size = 1280000 // TOTAL_BATCH_SIZE
    max_iter = 3 * 10**5
    max_epoch = (max_iter // step_size) + 1
    callbacks = [
        ModelSaver(),
        ScheduledHyperParamSetter('learning_rate', [(0, 0.5), (max_iter, 0)],
                                  interp='linear',
                                  step_based=True),
    ]
    infs = [
        ClassificationError('wrong-top1', 'val-error-top1'),
        ClassificationError('wrong-top5', 'val-error-top5')
    ]
    if nr_tower == 1:
        # single-GPU inference with queue prefetch
        callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
    else:
        # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(
            DataParallelInferenceRunner(dataset_val, infs,
                                        list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=step_size,
        max_epoch=max_epoch,
    )
Example #28
def get_config(model, fake=False):
    nr_tower = max(get_num_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
        steps_per_epoch = 100
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))

        dataset_train = get_imagenet_dataflow(args.data, 'train', batch)
        dataset_val = get_imagenet_dataflow(args.data, 'val', min(64, batch))
        steps_per_epoch = 1281167 // args.batch

        BASE_LR = 0.1 * args.batch / 256.0
        logger.info("BASELR: {}".format(BASE_LR))
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            GPUUtilizationTracker()
        ]
        if not args.cosine_lr:
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, BASE_LR), (30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2),
                                      (90, BASE_LR * 1e-3)]))
        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=steps_per_epoch,
        max_epoch=100,
    )
Example #29
            input.setup(M.get_inputs_desc())
            M.build_graph(input)
    else:
        tf.train.import_meta_graph(args.meta)

    # loading...
    init = get_model_loader(args.model)
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())
    init.init(sess)

    # dump ...
    with sess.as_default():
        if args.output.endswith('npy') or args.output.endswith('npz'):
            varmanip.dump_session_params(args.output)
        else:
            var = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
            var.extend(tf.get_collection(tf.GraphKeys.MODEL_VARIABLES))
            gvars = set([k.name for k in tf.global_variables()])
            var = [v for v in var if v.name in gvars]
            var_dict = {}
            for v in var:
                name = varmanip.get_savename_from_varname(v.name)
                var_dict[name] = v
            logger.info("Variables to dump:")
            logger.info(", ".join(var_dict.keys()))
            saver = tf.train.Saver(var_list=var_dict,
                                   write_version=tf.train.SaverDef.V2)
            saver.save(sess, args.output, write_meta_graph=False)
Example #30
    def new_random_game(self):
        """
        load image,
        set dimensions,
        randomize start point,
        init _screen, qvals,
        calc distance to goal
        """
        self.terminal = [False] * self.agents
        self.viewer = None
        # ######################################################################
        # ## generate evaluation results from 19 different points
        # if self.count_points ==0:
        #     print('\n============== new game ===============\n')
        #     # save results
        #     if self.total_loc:
        #         with open(self.csvfile, 'a') as outcsv:
        #             fields= [self.filename, self.cur_dist]
        #             writer = csv.writer(outcsv)
        #             writer.writerow(map(lambda x: x, fields))
        #         self.total_loc = []
        #     # sample a new image
        #     self._image, self._target_loc, self.filepath, self.spacing = next(self.sampled_files)
        #     scale = next(self.start_points)
        #     self.count_points +=1
        # else:
        #     self.count_points += 1
        #     logger.info('count_points {}'.format(self.count_points))
        #     scale = next(self.start_points)
        #
        # x = int(scale[0] * self._image.dims[0])
        # y = int(scale[1] * self._image.dims[1])
        # z = int(scale[2] * self._image.dims[2])
        # logger.info('starting point {}-{}-{}'.format(x,y,z))
        # ######################################################################

        # # sample a new image
        self._image, self._target_loc, self.filepath, self.spacing = next(
            self.sampled_files)
        self.filename = [
            os.path.basename(self.filepath[i]) for i in range(self.agents)
        ]

        # multiscale (e.g. start with 3 -> 2 -> 1)
        # scale can be thought of as sampling stride
        if self.multiscale:
            ## brain
            self.action_step = 9
            self.xscale = 3
            self.yscale = 3
            self.zscale = 3
            ## cardiac
            # self.action_step = 6
            # self.xscale = 2
            # self.yscale = 2
            # self.zscale = 2
        else:
            self.action_step = 1
            self.xscale = 1
            self.yscale = 1
            self.zscale = 1
        # image volume size
        self._image_dims = self._image[0].dims

        #######################################################################
        ## select random starting point
        # add padding to avoid start right on the border of the image
        if self.task == 'train':
            skip_thickness = ((int)(self._image_dims[0] / 5),
                              (int)(self._image_dims[1] / 5),
                              (int)(self._image_dims[2] / 5))
        else:
            skip_thickness = (int(self._image_dims[0] / 4),
                              int(self._image_dims[1] / 4),
                              int(self._image_dims[2] / 4))

        # TODO: should agents start at the same random points, agents get stuck
        #x=[self.rng.randint(0 + skip_thickness[0], self._image_dims[0] - skip_thickness[0])] * self.agents
        #y=[self.rng.randint(0 + skip_thickness[1], self._image_dims[1] - skip_thickness[1])] * self.agents
        #z=[self.rng.randint(0 + skip_thickness[2], self._image_dims[2] - skip_thickness[2])] * self.agents

        x = [
            self.rng.randint(0 + skip_thickness[0],
                             self._image_dims[0] - skip_thickness[0])
            for _ in range(self.agents)
        ]
        y = [
            self.rng.randint(0 + skip_thickness[1],
                             self._image_dims[1] - skip_thickness[1])
            for _ in range(self.agents)
        ]
        z = [
            self.rng.randint(0 + skip_thickness[2],
                             self._image_dims[2] - skip_thickness[2])
            for _ in range(self.agents)
        ]

        #######################################################################

        self._location = [(x[i], y[i], z[i]) for i in range(self.agents)]
        self._start_location = [(x[i], y[i], z[i]) for i in range(self.agents)]
        self._qvalues = [[0] * self.actions] * self.agents
        self._screen = self._current_state()

        if self.task == 'play':
            self.cur_dist = [0] * self.agents
        else:
            self.cur_dist = [
                self.calcDistance(self._location[i], self._target_loc[i],
                                  self.spacing) for i in range(self.agents)
            ]
        logger.info("Current distance is " + str(self.cur_dist))
Example #31
                        choices=[50, 101])
    parser.add_argument('--logdir', default='train_log/ResNet-GN')
    parser.add_argument('--WS',
                        action='store_true',
                        help='Use Weight Standardization')
    args = parser.parse_args()

    model = Model()
    model.depth = args.depth
    model.use_WS = args.WS
    if args.eval:
        batch = 128  # something that can run on one gpu
        ds = get_imagenet_dataflow(args.data, 'val', batch)
        eval_on_ILSVRC12(model, get_model_loader(args.load), ds)
    else:
        if args.fake:
            logger.set_logger_dir(os.path.join('train_log', 'tmp'), 'd')
        else:
            logger.set_logger_dir(args.logdir, 'd')

        try:
            from tensorpack.tfutils import collect_env_info
            logger.info("\n" + collect_env_info())
        except Exception:
            pass
        config = get_config(model, fake=args.fake)
        if args.load:
            config.session_init = get_model_loader(args.load)
        trainer = SyncMultiGPUTrainerReplicated(max(get_num_gpu(), 1))
        launch_train_with_config(config, trainer)
Example #32
def log_tensor_info(tensors):
    for t in tensors:
        logger.info("name: {}, shape: {}".format(t.name, t.get_shape()))
    args = parser.parse_args()

    tf.train.import_meta_graph(args.meta, clear_devices=True)

    # loading...
    init = get_model_loader(args.input)
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())
    init.init(sess)

    # dump ...
    with sess.as_default():
        if args.output.endswith('npy') or args.output.endswith('npz'):
            varmanip.dump_session_params(args.output)
        else:
            var = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
            var.extend(tf.get_collection(tf.GraphKeys.MODEL_VARIABLES))
            gvars = set([k.name for k in tf.global_variables()])
            var = [v for v in var if v.name in gvars]
            var_dict = {}
            for v in var:
                name = varmanip.get_savename_from_varname(v.name)
                var_dict[name] = v
            logger.info("Variables to dump:")
            logger.info(", ".join(var_dict.keys()))
            saver = tf.train.Saver(
                var_list=var_dict,
                write_version=tf.train.SaverDef.V2)
            saver.save(sess, args.output, write_meta_graph=False)