Example no. 1
def get_config(model, fake=False):
    nr_tower = max(get_num_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(
        nr_tower, batch))
    if batch < 32 or batch > 64:
        logger.warn(
            "Batch size per tower not in [32, 64]. This probably will lead to worse accuracy than reported."
        )
    if fake:
        data = QueueInput(
            FakeData([[batch, 224, 224, 3], [batch]],
                     1000,
                     random=False,
                     dtype='uint8'))
        callbacks = []
    else:
        data = QueueInput(get_data('train', batch))

        START_LR = 0.1
        BASE_LR = START_LR * (args.batch / 256.0)
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter('learning_rate',
                                      [(0, min(START_LR, BASE_LR)),
                                       (30, BASE_LR * 1e-1),
                                       (60, BASE_LR * 1e-2),
                                       (90, BASE_LR * 1e-3),
                                       (100, BASE_LR * 1e-4)]),
        ]
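        # When BASE_LR > START_LR (i.e. total batch size > 256), add a linear
        # warmup from START_LR to BASE_LR over the first 5 epochs; within that
        # window this second setter overrides the step schedule above.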
        if BASE_LR > START_LR:
            callbacks.append(
                ScheduledHyperParamSetter('learning_rate', [(0, START_LR),
                                                            (5, BASE_LR)],
                                          interp='linear'))

        infs = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        dataset_val = get_data('val', batch)
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))

    return TrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1281167 // args.batch,
        max_epoch=105,
    )
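A config like this one is not run by itself; in tensorpack it is handed to a trainer. A minimal launcher sketch (assuming the get_config above, a Model class, and parsed args; the trainer choice is illustrative, not prescribed by the example):

from tensorpack import SyncMultiGPUTrainerReplicated, launch_train_with_config
from tensorpack.utils.gpu import get_num_gpu

model = Model()  # assumed model definition
config = get_config(model, fake=args.fake)
trainer = SyncMultiGPUTrainerReplicated(max(get_num_gpu(), 1))  # one tower per GPU
launch_train_with_config(config, trainer)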
Example no. 2
def get_config(fake=False, data_format='NCHW'):
    nr_tower = max(get_nr_gpu(), 1)
    global BATCH_SIZE
    BATCH_SIZE = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = dataset_val = FakeData([[64, 224, 224, 3], [64]],
                                               1000,
                                               random=False,
                                               dtype='uint8')
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, BATCH_SIZE))
        dataset_train = get_data('train')
        dataset_val = get_data('val')

    return TrainConfig(
        model=Model(data_format=data_format),
        dataflow=dataset_train,
        callbacks=[
            ModelSaver(),
            InferenceRunner(dataset_val, [
                ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')
            ]),
            ScheduledHyperParamSetter('learning_rate', [(30, 1e-2), (60, 1e-3),
                                                        (85, 1e-4), (95, 1e-5),
                                                        (105, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ],
        steps_per_epoch=5000,
        max_epoch=110,
        nr_tower=nr_tower)
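The (epoch, value) pairs passed to ScheduledHyperParamSetter define a piecewise-constant schedule: each value takes effect once its epoch is reached, and the optimizer's initial value applies before the first point. A standalone sketch of that lookup (hypothetical helper, not tensorpack's implementation):

def lr_at_epoch(schedule, epoch, initial=0.1):
    # return the last scheduled value whose epoch has been reached
    lr = initial
    for e, v in schedule:
        if epoch >= e:
            lr = v
    return lr

schedule = [(30, 1e-2), (60, 1e-3), (85, 1e-4), (95, 1e-5), (105, 1e-6)]
assert lr_at_epoch(schedule, 1) == 0.1    # before epoch 30: initial LR
assert lr_at_epoch(schedule, 72) == 1e-3  # between epochs 60 and 85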
Example no. 3
def get_config(model, fake=False):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate',
                                      [(30, 1e-2), (60, 1e-3), (85, 1e-4), (95, 1e-5), (105, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=5000,
        max_epoch=110,
        nr_tower=nr_tower
    )
Example no. 4
def get_config(model, fake=False):
    start_ = 0
    nr_tower = max(get_nr_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    if fake:
        dataset_train = FakeData(
            [[batch, 224, 224, 3], [batch]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)

        START_LR = 0.1
        BASE_LR = START_LR * (args.batch / 256.0)
        if start_ < 31:
            lr_setting = [(max(30 - start_, 0), BASE_LR * 1e-1),
                          (60 - start_, BASE_LR * 1e-2),
                          (90 - start_, BASE_LR * 1e-3),
                          (105 - start_, BASE_LR * 1e-4)]
        elif start_ < 61:
            lr_setting = [(max(60 - start_, 0), BASE_LR * 1e-2),
                          (90 - start_, BASE_LR * 1e-3),
                          (105 - start_, BASE_LR * 1e-4)]
        elif start_ < 91:
            lr_setting = [(max(90 - start_, 0), BASE_LR * 1e-3),
                          (105 - start_, BASE_LR * 1e-4)]
        else:
            raise ValueError('no learning rate setting found for start_={}'.format(start_))

        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter(
                'learning_rate', lr_setting),
            # TensorPrinter(['tower1/group3/block2/conv2/Abs_0', 'tower1/group3/block2/conv2/Abs_1:0', 'tower1/group3/block2/conv2/Abs_2:0'])
        ]
        if BASE_LR > START_LR:
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, START_LR), (5, BASE_LR)], interp='linear'))

        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1280000 // args.batch,
        max_epoch=120,
    )
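The start_ branches above shift an epoch-based decay schedule left when training resumes part-way through a run. For the ranges they cover (start_ < 91) they are equivalent to this compact form (a sketch; the helper name is hypothetical):

def shifted_lr_schedule(start, base_lr):
    # (epoch, multiplier) boundaries of the full schedule
    boundaries = [(30, 1e-1), (60, 1e-2), (90, 1e-3), (105, 1e-4)]
    # keep the boundaries not yet passed, shifted left by `start` epochs
    sched = [(max(e - start, 0), base_lr * m) for e, m in boundaries if e >= start]
    if not sched:
        raise ValueError('no learning rate setting found for start={}'.format(start))
    return sched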
Example no. 5
def get_config(model):
    nr_tower = max(get_num_gpu(), 1)
    batch = args.batch // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))

    callbacks = [ThroughputTracker(args.batch)]
    if args.fake:
        data = QueueInput(FakeData(
            [[batch, 224, 224, 3], [batch]], 1000, random=False, dtype='uint8'))
    else:
        data = QueueInput(
            get_imagenet_dataflow(args.data, 'train', batch),
            # use a larger queue
            queue=tf.FIFOQueue(200, [tf.uint8, tf.int32], [[batch, 224, 224, 3], [batch]])
        )

        BASE_LR = 30
        SCALED_LR = BASE_LR * (args.batch / 256.0)
        callbacks.extend([
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter(
                'learning_rate', [
                    (0, SCALED_LR),
                    (60, SCALED_LR * 1e-1),
                    (70, SCALED_LR * 1e-2),
                    (80, SCALED_LR * 1e-3),
                    (90, SCALED_LR * 1e-4),
                ]),
        ])

        dataset_val = get_imagenet_dataflow(args.data, 'val', 64)
        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    if args.load.endswith(".npz"):
        # a released model in npz format
        init = SmartInit(args.load)
    else:
        # a pre-trained checkpoint
        init = SaverRestore(args.load, ignore=("learning_rate", "global_step"))
    return TrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1281167 // args.batch,
        session_init=init,
        max_epoch=100,
    )
Example no. 6
def get_config(model, fake=False):
    nr_tower = max(get_nr_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, 224, 224, 3], [64]],
                                 1000,
                                 random=False,
                                 dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)

        BASE_LR = 0.1 * (args.batch / 256.0)
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate',
                                      [(30, BASE_LR * 1e-1),
                                       (60, BASE_LR * 1e-2),
                                       (85, BASE_LR * 1e-3),
                                       (95, BASE_LR * 1e-4),
                                       (105, BASE_LR * 1e-5)]),
        ]
        if BASE_LR > 0.1:
            callbacks.append(
                ScheduledHyperParamSetter('learning_rate', [(0, 0.1),
                                                            (3, BASE_LR)],
                                          interp='linear'))

        infs = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1280000 // args.batch,
        max_epoch=110,
    )
Example no. 7
def get_config(model):
    batch = 1

    logger.info("For benchmark, batch size is fixed to 1 per tower.")
    data = QueueInput(
        FakeData([[1, 224, 224, 3], [1]], 1, random=False, dtype='uint8'))

    return TrainConfig(model=model,
                       data=data,
                       callbacks=[],
                       steps_per_epoch=1,
                       max_epoch=1)
Example no. 8
def get_config(model, fake=False):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, 224, 224, 3], [64]],
                                 1000,
                                 random=False,
                                 dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate',
                                      [(0, 5e-4), (20, 2.5e-4), (40, 1.25e-4),
                                       (60, 5e-5), (80, 2.5e-5),
                                       (100, 1.25e-5), (120, 5e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        # Finetune COCO
        # [(0, 5e-4), (20, 2.5e-4), (40, 1.25e-4), (60, 5e-5), (80, 2.5e-5), (100, 1.25e-5), (120, 5e-6)]
        # JT COCO
        # [(0, 2.5e-4), (20, 1.25e-4), (40, 5e-5), (60, 2.5e-5), (80, 1e-5), (100, 5e-6), (120, 2.5e-6)]
        # Finetune to VOC
        # [(0, 1.25e-4), (20, 5e-5), (40, 2.5e-5), (60, 1.25e-5), (80, 5e-6), (100, 2.5e-6), (120, 1.25e-6)]

        #infs = [ClassificationError('wrong-top1', 'val-error-top1'),
        #        ClassificationError('wrong-top5', 'val-error-top5')]
        infs = [
            ClassificationError('loss-wrong-top1', 'loss-val-error-top1'),
            ClassificationError('loss-wrong-top5', 'loss-val-error-top5')
        ]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))

    return TrainConfig(model=model,
                       dataflow=dataset_train,
                       callbacks=callbacks,
                       steps_per_epoch=1522,
                       max_epoch=140,
                       nr_tower=nr_tower)
Example no. 9
def get_config(model):
    input_sig = model.get_input_signature()
    nr_tower = max(hvd.size(), 1)
    batch = args.batch // nr_tower
    logger.info("Running on {} towers. Batch size per tower: {}".format(
        nr_tower, batch))

    callbacks = [ThroughputTracker(args.batch), UpdateMomentumEncoder()]

    if args.fake:
        data = QueueInput(
            FakeData([x.shape for x in input_sig],
                     1000,
                     random=False,
                     dtype='uint8'))
    else:
        zmq_addr = 'ipc://@imagenet-train-b{}'.format(batch)
        data = ZMQInput(zmq_addr, 25, bind=False)

        dataset = data.to_dataset(input_sig).repeat().prefetch(15)
        dataset = dataset.apply(
            tf.data.experimental.prefetch_to_device('/gpu:0'))
        data = TFDatasetInput(dataset)

        callbacks.extend([
            ModelSaver(),
            EstimatedTimeLeft(),
        ])

        if not args.v2:
            # step-wise LR in v1
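            # BASE_LR is assumed to be defined elsewhere (e.g. a module-level constant)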
            SCALED_LR = BASE_LR * (args.batch / 256.0)
            callbacks.append(
                ScheduledHyperParamSetter('learning_rate',
                                          [(0, min(BASE_LR, SCALED_LR)),
                                           (120, SCALED_LR * 1e-1),
                                           (160, SCALED_LR * 1e-2)]))
            if SCALED_LR > BASE_LR:
                callbacks.append(
                    ScheduledHyperParamSetter('learning_rate',
                                              [(0, BASE_LR), (5, SCALED_LR)],
                                              interp='linear'))

    return TrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1281167 // args.batch,
        max_epoch=200,
    )
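The ZMQInput above connects (bind=False) to an abstract IPC socket, so a separate process must bind that address and push data into it. A minimal sender sketch using tensorpack's send_dataflow_zmq (the dataflow df and the per-tower batch are assumed to be built as in the serving scripts below):

from tensorpack.dataflow import send_dataflow_zmq

# the address must match the training side; bind=True because ZMQInput used bind=False
send_dataflow_zmq(df, 'ipc://@imagenet-train-b{}'.format(batch),
                  hwm=200, format='zmq_ops', bind=True)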
Example no. 10
def get_config(model, fake=False, xla=False):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, 224, 224, 3], [64]],
                                 1000,
                                 random=False,
                                 dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate', [(10, 1e-2), (20, 1e-3),
                                                        (85, 1e-4), (95, 1e-5),
                                                        (105, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        infs = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))

        config = tf.ConfigProto()
        jit_level = 0
        if xla:
            # Turns on XLA JIT compilation
            jit_level = tf.OptimizerOptions.ON_1
        config.graph_options.optimizer_options.global_jit_level = jit_level
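        # NOTE: as written, this ConfigProto is built but never attached to the
        # training session; for XLA to take effect it would have to be passed
        # through tensorpack's session-config machinery.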

    return TrainConfig(model=model,
                       dataflow=dataset_train,
                       callbacks=callbacks,
                       steps_per_epoch=10,
                       max_epoch=1,
                       nr_tower=nr_tower)
Example no. 11
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', help='ILSVRC dataset dir')
    parser.add_argument('--fake', action='store_true')
    parser.add_argument('--batch',
                        help='per-GPU batch size',
                        default=32,
                        type=int)
    parser.add_argument('--benchmark', action='store_true')
    parser.add_argument('--no-zmq-ops', action='store_true')
    args = parser.parse_args()

    os.environ['CUDA_VISIBLE_DEVICES'] = ''  # the data-serving process only does CPU work

    if args.fake:
        ds = FakeData([[args.batch, 224, 224, 3], [args.batch]],
                      1000,
                      random=False,
                      dtype=['uint8', 'int32'])
    else:
        augs = fbresnet_augmentor(True)
        ds = get_data(args.batch, augs)

    logger.info("Serving data on {}".format(socket.gethostname()))

    if args.benchmark:
        from zmq_ops import dump_arrays
        ds = MapData(ds, dump_arrays)
        TestDataSpeed(ds, warmup=300).start()
    else:
        format = None if args.no_zmq_ops else 'zmq_ops'
        send_dataflow_zmq(ds,
                          'ipc://@imagenet-train-b{}'.format(args.batch),
                          format=format)
def get_config(model, fake=False):
    start_ = 0
    # nr_tower: number of available GPUs
    nr_tower = max(get_nr_gpu(), 1)
    assert args.batch % nr_tower == 0
    # per-GPU batch size
    batch = args.batch // nr_tower
    # log how many towers we run on and the per-tower batch size
    logger.info("Running on {} towers. Batch size per tower: {}".format(
        nr_tower, batch))
    if fake:
        # FakeData arguments:
        #   shapes (list): a list of lists/tuples, the shape of each component.
        #   size (int): the size of this dataflow.
        #   random (bool): whether to randomly generate data at each iteration
        #     (note that merely generating the data can sometimes be time-consuming!).
        #   dtype (str or list): the data type as a string, or a list of data types.
        # FakeData is used here to test or benchmark the model.
        dataset_train = FakeData([[batch, 224, 224, 3], [batch]],
                                 1000,
                                 random=False,
                                 dtype='uint8')
        callbacks = []  # no callbacks are needed for the fake-data benchmark run
    else:
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)

        START_LR = 0.1  # initial learning rate
        BASE_LR = START_LR * (args.batch / 256.0)  # base LR, scaled linearly with the total batch size
        if start_ < 31:
            lr_setting = [(max(30 - start_, 0), BASE_LR * 1e-1),
                          (60 - start_, BASE_LR * 1e-2),
                          (90 - start_, BASE_LR * 1e-3),
                          (105 - start_, BASE_LR * 1e-4)]
        elif start_ < 61:
            lr_setting = [(max(60 - start_, 0), BASE_LR * 1e-2),
                          (90 - start_, BASE_LR * 1e-3),
                          (105 - start_, BASE_LR * 1e-4)]
        elif start_ < 91:
            lr_setting = [(max(90 - start_, 0), BASE_LR * 1e-3),
                          (105 - start_, BASE_LR * 1e-4)]
        else:
            raise ValueError('no learning rate setting found for start_={}'.format(start_))
        # callbacks: save the model, estimate the time left, set hyperparameters
        # on a predefined epoch-based schedule, etc.
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter('learning_rate', lr_setting),
            # TensorPrinter(['tower1/group3/block2/conv2/Abs_0', 'tower1/group3/block2/conv2/Abs_1:0', 'tower1/group3/block2/conv2/Abs_2:0'])
        ]
        if BASE_LR > START_LR:
            callbacks.append(
                ScheduledHyperParamSetter('learning_rate', [(0, START_LR),
                                                            (5, BASE_LR)],
                                          interp='linear'))

        infs = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        # choose the inference runner based on the number of GPUs
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1280000 // args.batch,
        max_epoch=120,
    )  # return the model, the dataflow, the callbacks (model saving, time estimation, etc.) and the step counts
    parser.add_argument('--fake', action='store_true')
    parser.add_argument('--batch', help='per-GPU batch size',
                        default=32, type=int)
    parser.add_argument('--warmup', help='prefetch buffer size',
                        default=150, type=int)
    parser.add_argument('--port', help='server port',
                        default=1000, type=int)
    parser.add_argument('--benchmark', action='store_true')
    parser.add_argument('--no-zmq-ops', action='store_true')
    args = parser.parse_args()

    os.environ['CUDA_VISIBLE_DEVICES'] = ''

    if args.fake:
        ds = FakeData(
            [[args.batch, args.image_size, args.image_size, 3], [args.batch]],
            1000, random=False, dtype=['uint8', 'int32'])
    else:
        augs = fbresnet_augmentor(True, image_size=args.image_size)
        ds = get_data(args.batch, augs, args.worker)

    logger.info("Serving data on {}".format(socket.gethostname()))

    if args.benchmark:
        from zmq_ops import dump_arrays
        ds = MapData(ds, dump_arrays)
        TestDataSpeed(ds, warmup=300).start()
    else:
        format = None if args.no_zmq_ops else 'zmq_ops'
        send_dataflow_zmq(
            ds, 'ipc://@imagenet-train-b{}-p{}'.format(args.batch, args.port),
            format=format)
    parser.add_argument('--data', help='ILSVRC dataset dir')
    parser.add_argument('--fake', action='store_true')
    parser.add_argument('--batch',
                        help='per-GPU batch size',
                        default=32,
                        type=int)
    parser.add_argument('--benchmark', action='store_true')
    parser.add_argument('--v2', action='store_true')
    parser.add_argument('--no-zmq-ops', action='store_true')
    args = parser.parse_args()

    os.environ['CUDA_VISIBLE_DEVICES'] = ''

    if args.fake:
        ds = FakeData([[args.batch, 224, 224, 3], [args.batch, 224, 224, 3]],
                      9999999,
                      random=False,
                      dtype=['uint8', 'uint8'])
    else:
        aug = get_moco_v2_augmentor() if args.v2 else get_moco_v1_augmentor()
        ds = get_moco_dataflow(args.data, args.batch, aug)

    logger.info("Serving data on {}".format(socket.gethostname()))

    if args.benchmark:
        from zmq_ops import dump_arrays
        ds = MapData(ds, dump_arrays)
        TestDataSpeed(ds, size=99999, warmup=300).start()
    else:
        format = None if args.no_zmq_ops else 'zmq_ops'
        send_dataflow_zmq(ds,
                          'ipc://@imagenet-train-b{}'.format(args.batch),
                          hwm=200,
                          format=format)