Example #1
    """
    Sec 2.3: We keep the per-worker sample size n constant when we change the number of workers k.
    In this work, we use n = 32 which has performed well for a wide range of datasets and networks.
    """
    parser.add_argument('--batch', help='per-GPU batch size', default=32, type=int)
    args = parser.parse_args()

    model = Model(args.depth, args.norm, args.use_ws)
    model.accum_grad = args.accum_grad
    if args.weight_decay_norm:
        model.weight_decay_pattern = ".*/W|.*/gamma|.*/beta"

    if args.eval:
        batch = 128    # a batch size that fits on a single GPU
        ds = get_val_dataflow(args.data, batch, fbresnet_augmentor(False))
        eval_classification(model, get_model_loader(args.load), ds)
        sys.exit()

    logger.info("Training on {}".format(socket.gethostname()))
    # Print some information for a sanity check.
    os.system("nvidia-smi")
    assert args.load is None  # --load is only used by the --eval path above

    hvd.init()

    if args.logdir is None:
        args.logdir = os.path.join('train_log', 'Horovod-{}GPUs-{}Batch'.format(hvd.size(), args.batch))

    if hvd.rank() == 0:
        logger.set_logger_dir(args.logdir, 'd')
    logger.info("Rank={}, Local Rank={}, Size={}".format(hvd.rank(), hvd.local_rank(), hvd.size()))
Example #2
            raise
        logger.info("Name Transform: " + k + ' --> ' + newname)
        resnet_param[newname] = v
    return resnet_param


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--load', required=True,
                        help='.npz model file generated by tensorpack.utils.loadcaffe')
    parser.add_argument('-d', '--depth', help='resnet depth', required=True, type=int, choices=[50, 101, 152])
    parser.add_argument('--input', help='an input image')
    parser.add_argument('--convert', help='npz output file to save the converted model')
    parser.add_argument('--eval', help='ILSVRC dir to run validation on')

    args = parser.parse_args()
    DEPTH = args.depth

    param = dict(np.load(args.load))
    param = convert_param_name(param)

    if args.convert:
        assert args.convert.endswith('.npz')
        np.savez_compressed(args.convert, **param)

    if args.eval:
        ds = get_imagenet_dataflow(args.eval, 'val', 128, get_inference_augmentor())
        eval_classification(Model(), DictRestore(param), ds)
    elif args.input:
        run_test(param, args.input)
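
A self-contained sketch of the npz round trip that --convert and --load rely on; the layer name and shape below are illustrative assumptions, not values from the script.

import numpy as np

param = {'conv0/W': np.zeros((7, 7, 3, 64), dtype=np.float32)}
np.savez_compressed('converted.npz', **param)  # what --convert writes
restored = dict(np.load('converted.npz'))      # what --load reads back
assert restored['conv0/W'].shape == (7, 7, 3, 64)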
Example #3
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', help='the physical ids of GPUs to use')
    parser.add_argument('--load', help='load a checkpoint, or an npz file (given as the pretrained model)')
    parser.add_argument('--data', help='ILSVRC dataset dir')
    parser.add_argument('--run', help='run on a list of images with the pretrained model', nargs='*')
    parser.add_argument('--eval', action='store_true')
    args = parser.parse_args()

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    if args.run:
        assert args.load.endswith('.npz')
        run_image(Model(), SmartInit(args.load), args.run)
        sys.exit()
    if args.eval:
        BATCH_SIZE = 128  # read by get_data() through the module-level global
        ds = get_data('val')
        eval_classification(Model(), SmartInit(args.load), ds)
        sys.exit()

    nr_tower = max(get_nr_gpu(), 1)
    BATCH_SIZE = TOTAL_BATCH_SIZE // nr_tower
    logger.set_logger_dir(os.path.join(
        'train_log', 'alexnet-dorefa-{}'.format(args.dorefa)))
    logger.info("Batch per tower: {}".format(BATCH_SIZE))

    config = get_config()
    config.session_init = SmartInit(args.load)
    launch_train_with_config(config, SyncMultiGPUTrainerReplicated(nr_tower))
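
Two details in this example, shown as a hedged sketch: CUDA_VISIBLE_DEVICES must be set before TensorFlow creates a CUDA context (which is why the script exports it right after parsing --gpu), and the module-level TOTAL_BATCH_SIZE (its value below is an assumption for illustration) is split evenly across towers.

import os

os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'  # set before TF touches any GPU

TOTAL_BATCH_SIZE = 256                      # assumed value for illustration
nr_tower = 2                                # e.g. what get_nr_gpu() returns here
BATCH_SIZE = TOTAL_BATCH_SIZE // nr_tower   # 128 samples per GPU per step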
Example #4
                        help='variants of resnet to use',
                        default='resnet')
    args = parser.parse_args()

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    model = Model(args.depth, args.mode)
    model.data_format = args.data_format
    if args.weight_decay_norm:
        model.weight_decay_pattern = ".*/W|.*/gamma|.*/beta"

    if args.eval:
        batch = 128  # a batch size that fits on a single GPU
        ds = get_imagenet_dataflow(args.data, 'val', batch)
        eval_classification(model, SmartInit(args.load), ds)
    else:
        if args.fake:
            logger.set_logger_dir(os.path.join('train_log', 'tmp'), 'd')
        else:
            logger.set_logger_dir(
                os.path.join(
                    '/data0/wangguangrun/tensorflow_log/train_log',
                    'imagenet-{}-d{}-batch{}'.format(args.mode, args.depth,
                                                     args.batch)))

        config = get_config(model)
        config.session_init = SmartInit(args.load)
        trainer = SyncMultiGPUTrainerReplicated(max(get_num_gpu(), 1))
        launch_train_with_config(config, trainer)
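
tensorpack's SmartInit picks a session initializer from whatever it is given: a TF checkpoint path, an .npz weight file, or None for no restore. A short sketch with hypothetical paths:

from tensorpack import SmartInit

init = SmartInit('train_log/imagenet-resnet/checkpoint')  # hypothetical checkpoint
# init = SmartInit('pretrained-resnet.npz')               # or a hypothetical npz file
# init = SmartInit(None)                                  # no-op: train from scratch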
Example #5
        '--load',
        help='load a checkpoint, or an npz file (given as the pretrained model)')
    parser.add_argument('--data', help='ILSVRC dataset dir')
    parser.add_argument(
        '--dorefa',
        required=True,
        help='number of bits for W,A,G, separated by comma. W="t" means TTQ')
    parser.add_argument(
        '--run',
        help='run on a list of images with the pretrained model',
        nargs='*')
    parser.add_argument('--eval', action='store_true')
    args = parser.parse_args()

    if args.eval:
        BATCH_SIZE = 128
        data_test = dataset.SVHNDigit('test')
        augmentors = [
            imgaug.Resize((40, 40)),
            imgaug.Brightness(30),
            imgaug.Contrast((0.5, 1.5)),
        ]
        data_test = AugmentImageComponent(data_test, augmentors)
        data_test = BatchData(data_test, BATCH_SIZE, remainder=True)
        eval_classification(Model(), get_model_loader(args.load), data_test)
        sys.exit()

    BITW, BITA, BITG = map(int, args.dorefa.split(','))
    config = get_config()
    launch_train_with_config(config, SimpleTrainer())
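
The int() parse above assumes all three fields of --dorefa are numeric, while the --help text also advertises W="t" for TTQ; below is a hedged sketch of a parser that honors that case (parse_dorefa is a hypothetical helper, not part of the script).

def parse_dorefa(spec):
    # spec is 'W,A,G', e.g. '1,2,4'; W may be the letter 't' for TTQ.
    w, a, g = spec.split(',')
    bitw = w if w == 't' else int(w)
    return bitw, int(a), int(g)

assert parse_dorefa('1,2,4') == (1, 2, 4)
assert parse_dorefa('t,32,32') == ('t', 32, 32)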