    def create_model(self):
        def input_builder_fun(model):
            model.param_init_net.UniformFill([], ["data"], shape=[32, 8])

        def model_build_fun(model, loss_scale):
            fc1 = brew.fc(model, "data", "fc1", dim_in=8, dim_out=8)
            fc2 = brew.fc(model, fc1, "fc2", dim_in=8, dim_out=8)
            fc3 = brew.fc(model, fc2, "fc3", dim_in=8, dim_out=8)
            fc4 = brew.fc(model, fc3, "fc4", dim_in=8, dim_out=8)
            fc5 = brew.fc(model, fc4, "fc5", dim_in=8, dim_out=8)
            loss = model.net.SumElements([fc5], ["loss"])
            return [loss]

        def add_optimizer(model):
            return optimizer.build_sgd(model, 0.1, policy="fixed")

        model = model_helper.ModelHelper()
        data_parallel_model.Parallelize(
            model,
            input_builder_fun=input_builder_fun,
            forward_pass_builder_fun=model_build_fun,
            optimizer_builder_fun=add_optimizer,
            devices=[0, 1, 2, 3],
        )
        return model
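A minimal sketch of how the parallelized model returned above might be executed (an assumption, not part of the original test; it mirrors the init/create/run sequence used by the training examples further down this page):

    def run_created_model(self):
        # Hypothetical driver method (sketch only), following the run pattern
        # used in the examples below.
        model = self.create_model()
        workspace.RunNetOnce(model.param_init_net)   # parameter initialization
        workspace.CreateNet(model.net)               # instantiate the parallelized net
        workspace.RunNet(model.net.Proto().name)     # one forward/backward pass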
Example 2
def Benchmark(args, model_map):

    arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        'ws_nbytes_limit': args.cudnn_ws * 1024 * 1024,
    }
    model = model_helper.ModelHelper(name=args.model, arg_scope=arg_scope)

    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    # Verify valid batch size
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus
    assert \
        total_batch_size % num_gpus == 0, \
        "Number of GPUs must divide batch size"

    def add_image_input(model):
        AddNullInput(
            model,
            batch_size=batch_per_device,
            img_size=model_map[args.model][1],
            dtype=args.dtype,
        )

    data_parallel_model.Parallelize(
        model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=partial(model_map[args.model][0],
                                         dtype=args.dtype),
        optimizer_builder_fun=add_optimizer if not args.forward_only else None,
        post_sync_builder_fun=add_post_sync_ops,
        devices=gpus,
        optimize_gradient_memory=False,
        cpu_device=args.cpu,
        num_threads_per_device=args.num_workers_per_device,
    )

    if not args.forward_only:
        data_parallel_model.OptimizeGradientMemory(model, {}, set(), False)

    model.Proto().type = args.net_type

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)
    ms_per_iter = workspace.BenchmarkNet(model.net.Proto().name,
                                         args.warmup_iterations,
                                         args.iterations,
                                         args.layer_wise_benchmark)
    print("number of images/sec: {}".format(
        round(args.batch_size * 1000 / ms_per_iter[0], 2)))
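The AddNullInput helper called by add_image_input above is not shown on this page. A possible implementation, sketched here as an assumption (modeled on the synthetic GaussianFill/ConstantFill input of Example 6, with an optional reader argument so it matches both call styles seen in these examples), could be:

def AddNullInput(model, reader=None, batch_size=32, img_size=227, dtype='float'):
    """Sketch of a synthetic-input builder (assumption, not the original helper).

    Fills "data" with random values and "label" with a constant so the net can
    run without a real database reader; intended only for benchmarking.
    """
    suffix = "_fp16" if dtype == "float16" else ""
    model.param_init_net.GaussianFill(
        [],
        ["data" + suffix],
        shape=[batch_size, 3, img_size, img_size],
    )
    if dtype == "float16":
        model.param_init_net.FloatToHalf("data" + suffix, "data")
    model.param_init_net.ConstantFill(
        [],
        ["label"],
        shape=[batch_size],
        value=1,
        dtype=core.DataType.INT32,
    )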
Example 3
    def prep_a_data_parallel_model(self, model, dataset, is_train):
        if model is None:
            return

        log.info('in prep_a_data_parallel_model')

        param_update = \
            self.gen_param_update_builder_fun(model, dataset, is_train) \
            if self.gen_param_update_builder_fun is not None else None
        log.info('in prep_a_data_parallel_model param_update done ')

        optimizer = \
            self.gen_optimizer_fun(model, dataset, is_train) \
            if self.gen_optimizer_fun is not None else None
        log.info('in prep_a_data_parallel_model optimizer done ')

        max_ops = self.opts['model_param']['max_concurrent_distributed_ops']
        data_parallel_model.Parallelize(
            model,
            input_builder_fun=self.gen_input_builder_fun(
                model, dataset, is_train),
            forward_pass_builder_fun=self.gen_forward_pass_builder_fun(
                model, dataset, is_train),
            param_update_builder_fun=param_update,
            optimizer_builder_fun=optimizer,
            devices=self.xpus,
            rendezvous=self.gen_rendezvous_ctx(model, dataset, is_train),
            broadcast_computed_params=False,
            optimize_gradient_memory=self.opts['model_param']['memonger'],
            use_nccl=self.opts['model_param']['cuda_nccl'],
            max_concurrent_distributed_ops=max_ops,
            cpu_device=(self.opts['distributed']['device'] == 'cpu'),
            # "shared model" will only keep model parameters for cpu_0 or gpu_0
            # will cause issue when initialize each gpu_0, gpu_1, gpu_2 ...
            # shared_model=(self.opts['distributed']['device'] == 'cpu'),
            combine_spatial_bn=self.opts['model_param']['combine_spatial_bn'],
        )
        log.info('in prep_a_data_parallel_model Parallelize done ')

        # log.info("Current blobs in workspace: {}".format(workspace.Blobs()))

        workspace.RunNetOnce(model.param_init_net)
        log.info('in prep_a_data_parallel_model RunNetOnce done ')

        # for op in model.net.Proto().op:
        #     log.info('op type engine {} {}'.format(op.type, op.engine))

        log.info('model.net.Proto() {}'.format(model.net.Proto()))

        workspace.CreateNet(model.net)

        # for op in model.net.Proto().op:
        #     log.info('after CreateNet op type engine {} {}'.
        #         format(op.type, op.engine))

        log.info('in prep_a_data_parallel_model CreateNet done ')
Example 4
def Train(args):
    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # Verify valid batch size
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus
    assert \
        total_batch_size % num_gpus == 0, \
        "Number of GPUs must divide batch size"

    # Round down epoch size to closest multiple of batch size across machines
    global_batch_size = total_batch_size * args.num_shards
    epoch_iters = int(args.epoch_size / global_batch_size)

    assert \
        epoch_iters > 0, \
        "Epoch size must be larger than batch size times shard count"

    args.epoch_size = epoch_iters * global_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    # Create ModelHelper object
    train_arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        'cudnn_exhaustive_search': True,
        'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024),
    }
    train_model = model_helper.ModelHelper(
        name="ban-pc-resnet50", arg_scope=train_arg_scope
    )

    num_shards = args.num_shards
    shard_id = args.shard_id

    # Expect interfaces to be comma separated.
    # Use of multiple network interfaces is not yet complete,
    # so simply use the first one in the list.
    interfaces = args.distributed_interfaces.split(",")

    # Rendezvous using MPI when run with mpirun
    if os.getenv("OMPI_COMM_WORLD_SIZE") is not None:
        num_shards = int(os.getenv("OMPI_COMM_WORLD_SIZE", 1))
        shard_id = int(os.getenv("OMPI_COMM_WORLD_RANK", 0))
        if num_shards > 1:
            rendezvous = dict(
                kv_handler=None,
                num_shards=num_shards,
                shard_id=shard_id,
                engine="GLOO",
                transport=args.distributed_transport,
                interface=interfaces[0],
                mpi_rendezvous=True,
                exit_nets=None)

    elif num_shards > 1:
        # Create rendezvous for distributed computation
        store_handler = "store_handler"
        if args.redis_host is not None:
            # Use Redis for rendezvous if Redis host is specified
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate", [], [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                )
            )
        else:
            # Use filesystem for rendezvous otherwise
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate", [], [store_handler],
                    path=args.file_store_path,
                    prefix=args.run_id,
                )
            )

        rendezvous = dict(
            kv_handler=store_handler,
            shard_id=shard_id,
            num_shards=num_shards,
            engine="GLOO",
            transport=args.distributed_transport,
            interface=interfaces[0],
            exit_nets=None)

    else:
        rendezvous = None

    # Model configs for constructing model
    with open(args.model_config) as f:
        model_config = yaml.safe_load(f)

    # Model building functions
    def create_target_model_ops(model, loss_scale):
        initializer = (PseudoFP16Initializer if args.dtype == 'float16'
                       else Initializer)
        with brew.arg_scope([brew.conv, brew.fc],
                            WeightInitializer=initializer,
                            BiasInitializer=initializer,
                            enable_tensor_core=args.enable_tensor_core,
                            float16_compute=args.float16_compute):
            pred = add_se_model(model, model_config, "data", is_test=False)

        if args.dtype == 'float16':
            pred = model.net.HalfToFloat(pred, pred + '_fp32')

        loss = add_softmax_loss(model, pred, 'label')
        brew.accuracy(model, ['softmax', 'label'], 'accuracy')
        return [loss]

    def add_optimizer(model):
        '''
        stepsz = int(30 * args.epoch_size / total_batch_size / num_shards)
        optimizer.add_weight_decay(model, args.weight_decay)
        opt = optimizer.build_multi_precision_sgd(
            model,
            args.base_learning_rate,
            momentum=0.9,
            nesterov=1,
            policy="step",
            stepsize=stepsz,
            gamma=0.1
        )
        '''

        optimizer.add_weight_decay(model, args.weight_decay)
        opt = optimizer.build_multi_precision_sgd(
            model,
            base_learning_rate=args.base_learning_rate,
            momentum=model_config['solver']['momentum'],
            nesterov=model_config['solver']['nesterov'],
            policy=model_config['solver']['lr_policy'],
            power=model_config['solver']['power'],
            max_iter=model_config['solver']['max_iter'],
        )
        return opt

    # Define add_image_input function.
    # Depends on the "train_data" argument.
    # Note that the reader will be shared between all GPUs.
    reader = train_model.CreateDB(
        "reader",
        db=args.train_data,
        db_type=args.db_type,
        num_shards=num_shards,
        shard_id=shard_id,
    )

    def add_image_input(model):
        AddImageInput(
            model,
            reader,
            batch_size=batch_per_device,
            img_size=args.image_size,
            dtype=args.dtype,
            is_test=False,
        )

    def add_post_sync_ops(model):
        """Add ops applied after initial parameter sync."""
        for param_info in model.GetOptimizationParamInfo(model.GetParams()):
            if param_info.blob_copy is not None:
                model.param_init_net.HalfToFloat(
                    param_info.blob,
                    param_info.blob_copy[core.DataType.FLOAT]
                )

    # Create parallelized model
    data_parallel_model.Parallelize(
        train_model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=create_target_model_ops,
        optimizer_builder_fun=add_optimizer,
        post_sync_builder_fun=add_post_sync_ops,
        devices=gpus,
        rendezvous=rendezvous,
        optimize_gradient_memory=False,
        cpu_device=args.use_cpu,
        shared_model=args.use_cpu,
        combine_spatial_bn=args.use_cpu,
    )

    if args.model_parallel:
        # Shift half of the activations to another GPU
        assert workspace.NumCudaDevices() >= 2 * args.num_gpus
        activations = data_parallel_model_utils.GetActivationBlobs(train_model)
        data_parallel_model_utils.ShiftActivationDevices(
            train_model,
            activations=activations[len(activations) // 2:],
            shifts={g: args.num_gpus + g for g in range(args.num_gpus)},
        )

    data_parallel_model.OptimizeGradientMemory(train_model, {}, set(), False)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    # Add test model, if specified
    test_model = None
    if (args.test_data is not None):
        log.info("----- Create test net ----")
        test_arg_scope = {
            'order': "NCHW",
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
        }
        test_model = model_helper.ModelHelper(
            name="ban-pc-resnet50_test", arg_scope=test_arg_scope, init_params=False
        )

        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=True,
            )

        data_parallel_model.Parallelize(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_target_model_ops,
            post_sync_builder_fun=add_post_sync_ops,
            param_update_builder_fun=None,
            devices=gpus,
            cpu_device=args.use_cpu,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

    epoch = 0
    # load the pre-trained model and reset epoch
    if args.load_model_path is not None:
        LoadModel(args.load_model_path, train_model)

        # Sync the model params
        data_parallel_model.FinalizeAfterCheckpoint(train_model)

        # reset epoch. load_model_path should end with *_X.mdl,
        # where X is the epoch number
        last_str = args.load_model_path.split('_')[-1]
        if last_str.endswith('.mdl'):
            epoch = int(last_str[:-4])
            log.info("Reset epoch to {}".format(epoch))
        else:
            log.warning("The format of load_model_path doesn't match!")

    expname = "log/{}/resnet50_gpu{}_b{}_L{}_lr{:.2f}_v2".format(
        args.dataset_name,
        args.num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )
    explog = experiment_util.ModelTrainerLog(expname, args)

    # Load pretrained param_init_net
    load_init_net_multigpu(args)

    # Run the training one epoch a time
    best_accuracy = 0
    while epoch < args.num_epochs:
        epoch, best_accuracy = RunEpoch(
            args,
            epoch,
            train_model,
            test_model,
            total_batch_size,
            num_shards,
            expname,
            explog,
            best_accuracy,
        )

        # Save the model for each epoch
        SaveModel(args, train_model, epoch)

        model_path = "%s/%s_" % (
            args.file_store_path,
            args.save_model_name
        )
        # remove the saved model from the previous epoch if it exists
        if os.path.isfile(model_path + str(epoch - 1) + ".mdl"):
            os.remove(model_path + str(epoch - 1) + ".mdl")
Example 5
def Train(args):
    if args.model == "resnext":
        model_name = "resnext" + str(args.num_layers)
    elif args.model == "shufflenet":
        model_name = "shufflenet"

    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # Verify valid batch size
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus
    assert \
        total_batch_size % num_gpus == 0, \
        "Number of GPUs must divide batch size"

    # Verify valid image mean/std per channel
    if args.image_mean_per_channel:
        assert \
            len(args.image_mean_per_channel) == args.num_channels, \
            "The number of channels of image mean doesn't match input"

    if args.image_std_per_channel:
        assert \
            len(args.image_std_per_channel) == args.num_channels, \
            "The number of channels of image std doesn't match input"

    # Round down epoch size to closest multiple of batch size across machines
    global_batch_size = total_batch_size * args.num_shards
    epoch_iters = int(args.epoch_size / global_batch_size)

    assert \
        epoch_iters > 0, \
        "Epoch size must be larger than batch size times shard count"

    args.epoch_size = epoch_iters * global_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    # Create ModelHelper object
    if args.use_ideep:
        train_arg_scope = {
            'use_cudnn': False,
            'cudnn_exhaustive_search': False,
            'training_mode': 1
        }
    else:
        train_arg_scope = {
            'order': 'NCHW',
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
            'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024),
        }
    train_model = model_helper.ModelHelper(
        name=model_name, arg_scope=train_arg_scope
    )

    num_shards = args.num_shards
    shard_id = args.shard_id

    # Expect interfaces to be comma separated.
    # Use of multiple network interfaces is not yet complete,
    # so simply use the first one in the list.
    interfaces = args.distributed_interfaces.split(",")

    # Rendezvous using MPI when run with mpirun
    if os.getenv("OMPI_COMM_WORLD_SIZE") is not None:
        num_shards = int(os.getenv("OMPI_COMM_WORLD_SIZE", 1))
        shard_id = int(os.getenv("OMPI_COMM_WORLD_RANK", 0))
        if num_shards > 1:
            rendezvous = dict(
                kv_handler=None,
                num_shards=num_shards,
                shard_id=shard_id,
                engine="GLOO",
                transport=args.distributed_transport,
                interface=interfaces[0],
                mpi_rendezvous=True,
                exit_nets=None)

    elif num_shards > 1:
        # Create rendezvous for distributed computation
        store_handler = "store_handler"
        if args.redis_host is not None:
            # Use Redis for rendezvous if Redis host is specified
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate", [], [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                )
            )
        else:
            # Use filesystem for rendezvous otherwise
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate", [], [store_handler],
                    path=args.file_store_path,
                    prefix=args.run_id,
                )
            )

        rendezvous = dict(
            kv_handler=store_handler,
            shard_id=shard_id,
            num_shards=num_shards,
            engine="GLOO",
            transport=args.distributed_transport,
            interface=interfaces[0],
            exit_nets=None)

    else:
        rendezvous = None

    # Model building functions
    def create_resnext_model_ops(model, loss_scale):
        initializer = (PseudoFP16Initializer if args.dtype == 'float16'
                       else Initializer)

        with brew.arg_scope([brew.conv, brew.fc],
                            WeightInitializer=initializer,
                            BiasInitializer=initializer,
                            enable_tensor_core=args.enable_tensor_core,
                            float16_compute=args.float16_compute):
            pred = resnet.create_resnext(
                model,
                "data",
                num_input_channels=args.num_channels,
                num_labels=args.num_labels,
                num_layers=args.num_layers,
                num_groups=args.resnext_num_groups,
                num_width_per_group=args.resnext_width_per_group,
                no_bias=True,
                no_loss=True,
            )

        if args.dtype == 'float16':
            pred = model.net.HalfToFloat(pred, pred + '_fp32')

        softmax, loss = model.SoftmaxWithLoss([pred, 'label'],
                                              ['softmax', 'loss'])
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy", top_k=1)
        brew.accuracy(model, [softmax, "label"], "accuracy_top5", top_k=5)
        return [loss]

    def create_shufflenet_model_ops(model, loss_scale):
        initializer = (PseudoFP16Initializer if args.dtype == 'float16'
                       else Initializer)

        with brew.arg_scope([brew.conv, brew.fc],
                            WeightInitializer=initializer,
                            BiasInitializer=initializer,
                            enable_tensor_core=args.enable_tensor_core,
                            float16_compute=args.float16_compute):
            pred = shufflenet.create_shufflenet(
                model,
                "data",
                num_input_channels=args.num_channels,
                num_labels=args.num_labels,
                no_loss=True,
            )

        if args.dtype == 'float16':
            pred = model.net.HalfToFloat(pred, pred + '_fp32')

        softmax, loss = model.SoftmaxWithLoss([pred, 'label'],
                                              ['softmax', 'loss'])
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy", top_k=1)
        brew.accuracy(model, [softmax, "label"], "accuracy_top5", top_k=5)
        return [loss]

    def add_optimizer(model):
        stepsz = int(30 * args.epoch_size / total_batch_size / num_shards)

        if args.float16_compute:
            # TODO: merge with multi-precision optimizer
            opt = optimizer.build_fp16_sgd(
                model,
                args.base_learning_rate,
                momentum=0.9,
                nesterov=1,
                weight_decay=args.weight_decay,   # weight decay included
                policy="step",
                stepsize=stepsz,
                gamma=0.1
            )
        else:
            optimizer.add_weight_decay(model, args.weight_decay)
            opt = optimizer.build_multi_precision_sgd(
                model,
                args.base_learning_rate,
                momentum=0.9,
                nesterov=1,
                policy="step",
                stepsize=stepsz,
                gamma=0.1
            )
        return opt

    # Define add_image_input function.
    # Depends on the "train_data" argument.
    # Note that the reader will be shared between all GPUs.
    if args.train_data == "null":
        def add_image_input(model):
            AddNullInput(
                model,
                None,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
            )
    else:
        reader = train_model.CreateDB(
            "reader",
            db=args.train_data,
            db_type=args.db_type,
            num_shards=num_shards,
            shard_id=shard_id,
        )

        def add_image_input(model):
            AddImageInput(
                model,
                reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=False,
                mean_per_channel=args.image_mean_per_channel,
                std_per_channel=args.image_std_per_channel,
            )

    def add_post_sync_ops(model):
        """Add ops applied after initial parameter sync."""
        for param_info in model.GetOptimizationParamInfo(model.GetParams()):
            if param_info.blob_copy is not None:
                model.param_init_net.HalfToFloat(
                    param_info.blob,
                    param_info.blob_copy[core.DataType.FLOAT]
                )

    data_parallel_model.Parallelize(
        train_model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=create_resnext_model_ops
        if args.model == "resnext" else create_shufflenet_model_ops,
        optimizer_builder_fun=add_optimizer,
        post_sync_builder_fun=add_post_sync_ops,
        devices=gpus,
        rendezvous=rendezvous,
        optimize_gradient_memory=False,
        use_nccl=args.use_nccl,
        cpu_device=args.use_cpu,
        ideep=args.use_ideep,
        shared_model=args.use_cpu,
        combine_spatial_bn=args.use_cpu,
    )

    data_parallel_model.OptimizeGradientMemory(train_model, {}, set(), False)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    # Add test model, if specified
    test_model = None
    if (args.test_data is not None):
        log.info("----- Create test net ----")
        if args.use_ideep:
            test_arg_scope = {
                'use_cudnn': False,
                'cudnn_exhaustive_search': False,
            }
        else:
            test_arg_scope = {
                'order': "NCHW",
                'use_cudnn': True,
                'cudnn_exhaustive_search': True,
            }
        test_model = model_helper.ModelHelper(
            name=model_name + "_test",
            arg_scope=test_arg_scope,
            init_params=False,
        )

        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=True,
                mean_per_channel=args.image_mean_per_channel,
                std_per_channel=args.image_std_per_channel,
            )

        data_parallel_model.Parallelize(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_resnext_model_ops
            if args.model == "resnext" else create_shufflenet_model_ops,
            post_sync_builder_fun=add_post_sync_ops,
            param_update_builder_fun=None,
            devices=gpus,
            use_nccl=args.use_nccl,
            cpu_device=args.use_cpu,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

    epoch = 0
    # load the pre-trained model and reset epoch
    if args.load_model_path is not None:
        LoadModel(args.load_model_path, train_model, args.use_ideep)

        # Sync the model params
        data_parallel_model.FinalizeAfterCheckpoint(train_model)

        # reset epoch. load_model_path should end with *_X.mdl,
        # where X is the epoch number
        last_str = args.load_model_path.split('_')[-1]
        if last_str.endswith('.mdl'):
            epoch = int(last_str[:-4])
            log.info("Reset epoch to {}".format(epoch))
        else:
            log.warning("The format of load_model_path doesn't match!")

    expname = "%s_gpu%d_b%d_L%d_lr%.2f_v2" % (
        model_name,
        args.num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )

    explog = experiment_util.ModelTrainerLog(expname, args)

    # Run the training one epoch a time
    while epoch < args.num_epochs:
        epoch = RunEpoch(
            args,
            epoch,
            train_model,
            test_model,
            total_batch_size,
            num_shards,
            expname,
            explog
        )

        # Save the model for each epoch
        SaveModel(args, train_model, epoch, args.use_ideep)

        model_path = "%s/%s_" % (
            args.file_store_path,
            args.save_model_name
        )
        # remove the saved model from the previous epoch if it exists
        if os.path.isfile(model_path + str(epoch - 1) + ".mdl"):
            os.remove(model_path + str(epoch - 1) + ".mdl")
Example 6
def build_resnet50_dataparallel_model(
        num_gpus,
        batch_size,
        epoch_size,
        cudnn_workspace_limit_mb=64,
        num_channels=3,
        num_labels=1000,
        weight_decay=1e-4,
        base_learning_rate=0.1,
        image_size=227,
        use_cpu=False):

    batch_per_device = batch_size // num_gpus

    train_arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        'cudnn_exhaustive_search': False,
        'ws_nbytes_limit': (cudnn_workspace_limit_mb * 1024 * 1024),
        'deterministic': True,
    }
    train_model = model_helper.ModelHelper(
        name="test_resnet50", arg_scope=train_arg_scope
    )

    def create_resnet50_model_ops(model, loss_scale):
        with brew.arg_scope([brew.conv, brew.fc],
                            WeightInitializer=Initializer,
                            BiasInitializer=Initializer,
                            enable_tensor_core=0):
            pred = resnet.create_resnet50(
                model,
                "data",
                num_input_channels=num_channels,
                num_labels=num_labels,
                no_bias=True,
                no_loss=True,
            )

        softmax, loss = model.SoftmaxWithLoss([pred, 'label'],
                                              ['softmax', 'loss'])
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy")
        return [loss]

    def add_optimizer(model):
        stepsz = int(30 * epoch_size / batch_size)
        optimizer.add_weight_decay(model, weight_decay)
        opt = optimizer.build_multi_precision_sgd(
            model,
            base_learning_rate,
            momentum=0.9,
            nesterov=1,
            policy="step",
            stepsize=stepsz,
            gamma=0.1
        )
        return opt

    def add_image_input(model):
        model.param_init_net.GaussianFill(
            [],
            ["data"],
            shape=[batch_per_device, 3, image_size, image_size],
            dtype='float',
        )
        model.param_init_net.ConstantFill(
            [],
            ["label"],
            shape=[batch_per_device],
            value=1,
            dtype=core.DataType.INT32,
        )

    def add_post_sync_ops(model):
        for param_info in model.GetOptimizationParamInfo(model.GetParams()):
            if param_info.blob_copy is not None:
                model.param_init_net.HalfToFloat(
                    param_info.blob,
                    param_info.blob_copy[core.DataType.FLOAT])

    # Create parallelized model
    data_parallel_model.Parallelize(
        train_model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=create_resnet50_model_ops,
        optimizer_builder_fun=add_optimizer,
        post_sync_builder_fun=add_post_sync_ops,
        devices=list(range(num_gpus)),
        rendezvous=None,
        optimize_gradient_memory=True,
        cpu_device=use_cpu,
        shared_model=use_cpu,
    )

    return train_model
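A brief usage sketch for the builder above (an assumption added for illustration; the argument values are placeholders), run with the same init/create sequence used by the training scripts on this page:

# Hypothetical usage of build_resnet50_dataparallel_model (sketch only).
train_model = build_resnet50_dataparallel_model(
    num_gpus=2,
    batch_size=64,      # must be divisible by num_gpus
    epoch_size=6400,
)
workspace.RunNetOnce(train_model.param_init_net)
workspace.CreateNet(train_model.net)
workspace.RunNet(train_model.net.Proto().name)  # one training iteration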
Example 7
    def run_model(self, devices, gpu):
        '''
        Helper function for test_equiv
        '''
        def input_builder_fun(model):
            return None

        def model_build_fun(model, loss_scale):
            fc = model.FC("data", "fc", 16, 1,
                          ("ConstantFill", {}), ("ConstantFill", {}))
            fc_fl = model.FlattenToVec(fc, "fc_fl")
            sigm = model.Sigmoid(fc_fl, "sigm")
            sq = model.SquaredL2Distance([sigm, "label"], "sq")
            loss = model.AveragedLoss(sq, "loss")
            loss = model.Scale(loss, scale=loss_scale)
            return [loss]

        def add_optimizer(model):
            optimizer.build_sgd(model, 0.1, policy="fixed")

        workspace.ResetWorkspace()
        model = cnn.CNNModelHelper(
            order="NHWC",
            name="test{}".format(devices),
        )
        data_parallel_model.Parallelize(
            model,
            input_builder_fun=input_builder_fun,
            forward_pass_builder_fun=model_build_fun,
            optimizer_builder_fun=add_optimizer,
            devices=devices,
            cpu_device=not gpu,
        )

        np.random.seed(2603)

        # Each run has same input, independent of number of gpus
        batch_size = 64
        for i in range(0, 10):
            full_data = np.random.rand(batch_size, 16)
            full_labels = np.round(full_data[:, 0])
            batch_per_device = batch_size // len(devices)

            for (j, g) in enumerate(devices):
                st = j * batch_per_device
                en = st + batch_per_device
                data = full_data[st:en, :].astype(np.float32)
                labels = full_labels[st:en].astype(np.float32)
                with core.DeviceScope(core.DeviceOption(model._device_type, g)):
                    workspace.FeedBlob(
                        "{}_{}/data".format(model._device_prefix, g), data
                    )
                    workspace.FeedBlob(
                        "{}_{}/label".format(model._device_prefix, g), labels
                    )

            if i == 0:
                workspace.RunNetOnce(model.param_init_net)
                workspace.CreateNet(model.net)

            workspace.RunNet(model.net.Proto().name)
        return workspace.FetchBlob("{}_0/fc_w".format(model._device_prefix))
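The docstring above refers to a test_equiv check; a minimal sketch of how such a caller might compare the weights returned by run_model across device counts (an assumption, with a hypothetical method name and tolerance):

    def test_equiv_sketch(self):
        # Hypothetical equivalence check (sketch only): the final fc_w weights
        # should match regardless of how many devices shared the same total batch.
        result_one_gpu = self.run_model(devices=[0], gpu=True)
        result_two_gpus = self.run_model(devices=[0, 1], gpu=True)
        np.testing.assert_allclose(result_one_gpu, result_two_gpus, atol=1e-4)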
Example 8
    def run_model(self, devices, gpu):

        '''
        Helper function for test_equiv
        '''
        def input_builder_fun(model):
            return None

        def model_build_fun(model, loss_scale):
            workspace.FeedBlob(
                core.ScopedBlobReference("seq_lengths"),
                np.array([self.T] * self.batch_per_device, dtype=np.int32)
            )
            model.param_init_net.ConstantFill(
                [],
                "hidden_init",
                value=0.0,
                shape=[1, self.batch_per_device, self.hidden_dim]
            )
            model.param_init_net.ConstantFill(
                [],
                "cell_init",
                value=0.0,
                shape=[1, self.batch_per_device, self.hidden_dim]
            )

            output, _last_hidden, _, _last_state = rnn_cell.LSTM(
                model=model,
                input_blob="data",
                seq_lengths="seq_lengths",
                initial_states=("hidden_init", "cell_init"),
                dim_in=self.input_dim,
                dim_out=self.hidden_dim,
                scope="partest",
            )

            # A silly loss function
            loss = model.AveragedLoss(
                model.Sub([output, "target"], "dist"),
                "loss",
            )
            loss = model.Scale(loss, "loss_scaled", scale=loss_scale)
            return [loss]

        def param_update_fun(model):
            ITER = model.Iter("ITER")
            LR = model.net.LearningRate(
                [ITER],
                "LR",
                base_lr=(-0.1),
                policy="fixed",
            )
            ONE = model.param_init_net.ConstantFill(
                [], "ONE", shape=[1], value=1.0,
            )
            for param in model.GetParams():
                param_grad = model.param_to_grad[param]
                model.WeightedSum([param, ONE, param_grad, LR], param)

            assert len(model.GetParams()) == len(model.params) // len(model._devices)

        workspace.ResetWorkspace()
        model = cnn.CNNModelHelper(
            name="recurrent_test{}".format(devices),
        )

        self.T = 8
        self.batch_size = 64
        self.input_dim = 8
        self.hidden_dim = 31
        self.batch_per_device = self.batch_size // len(devices)

        data_parallel_model.Parallelize(
            model,
            input_builder_fun=input_builder_fun,
            forward_pass_builder_fun=model_build_fun,
            param_update_builder_fun=param_update_fun,
            devices=devices,
            optimize_gradient_memory=True,
            cpu_device=not gpu,
        )

        # Change all initialization to ConstantFills so that
        # everything is deterministic
        for op in model.param_init_net.Proto().op:
            if op.type.endswith('Fill'):
                op.type = 'ConstantFill'

        # Each run has same input, independent of number of gpus
        np.random.seed(20150210)
        for i in range(0, 10):
            full_data = np.random.rand(self.T, self.batch_size, self.input_dim)
            full_target = np.random.rand(
                self.T, self.batch_size, self.hidden_dim
            )

            for (j, g) in enumerate(devices):
                st = j * self.batch_per_device
                en = st + self.batch_per_device
                data = full_data[:, st:en, :].astype(np.float32)
                targets = full_target[:, st:en, :].astype(np.float32)
                with core.DeviceScope(core.DeviceOption(model._device_type, g)):
                    workspace.FeedBlob(
                        "{}_{}/data".format(model._device_prefix, g), data
                    )
                    workspace.FeedBlob(
                        "{}_{}/target".format(model._device_prefix, g), targets
                    )

            if i == 0:
                workspace.RunNetOnce(model.param_init_net)
                workspace.CreateNet(model.net)

            workspace.RunNet(model.net.Proto().name)

        return workspace.FetchBlob("{}_0/partest/i2h_w".format(model._device_prefix))
Example 9
def Train(args):
    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # Verify valid batch size
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus
    assert \
        total_batch_size % num_gpus == 0, \
        "Number of GPUs must divide batch size"

    # Round down epoch size to closest multiple of batch size across machines
    global_batch_size = total_batch_size * args.num_shards
    epoch_iters = int(args.epoch_size / global_batch_size)
    args.epoch_size = epoch_iters * global_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    # Create ModelHelper object
    train_arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        'cudnn_exhaustive_search': True,
        'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024),
    }
    train_model = model_helper.ModelHelper(
        name="resnet50", arg_scope=train_arg_scope
    )

    num_shards = args.num_shards
    shard_id = args.shard_id
    if num_shards > 1:
        # Create rendezvous for distributed computation
        store_handler = "store_handler"
        if args.redis_host is not None:
            # Use Redis for rendezvous if Redis host is specified
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate", [], [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                )
            )
        else:
            # Use filesystem for rendezvous otherwise
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate", [], [store_handler],
                    path=args.file_store_path,
                )
            )
        rendezvous = dict(
            kv_handler=store_handler,
            shard_id=shard_id,
            num_shards=num_shards,
            engine="GLOO",
            exit_nets=None)
    else:
        rendezvous = None

    # Model building functions
    def create_resnet50_model_ops(model, loss_scale):
        [softmax, loss] = resnet.create_resnet50(
            model,
            "data",
            num_input_channels=args.num_channels,
            num_labels=args.num_labels,
            label="label",
            no_bias=True,
        )
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy")
        return [loss]

    def add_optimizer(model):
        stepsz = int(30 * args.epoch_size / total_batch_size / num_shards)
        optimizer.add_weight_decay(model, args.weight_decay)
        opt = optimizer.build_sgd(
            model,
            args.base_learning_rate,
            momentum=0.9,
            nesterov=1,
            policy="step",
            stepsize=stepsz,
            gamma=0.1
        )
        return opt

    # Input. Note that the reader must be shared with all GPUs.
    reader = train_model.CreateDB(
        "reader",
        db=args.train_data,
        db_type=args.db_type,
        num_shards=num_shards,
        shard_id=shard_id,
    )

    def add_image_input(model):
        AddImageInput(
            model,
            reader,
            batch_size=batch_per_device,
            img_size=args.image_size,
        )

    # Create parallelized model
    data_parallel_model.Parallelize(
        train_model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=create_resnet50_model_ops,
        optimizer_builder_fun=add_optimizer,
        devices=gpus,
        rendezvous=rendezvous,
        optimize_gradient_memory=True,
        cpu_device=args.use_cpu,
    )

    # Add test model, if specified
    test_model = None
    if (args.test_data is not None):
        log.info("----- Create test net ----")
        test_arg_scope = {
            'order': "NCHW",
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
        }
        test_model = model_helper.ModelHelper(
            name="resnet50_test", arg_scope=test_arg_scope, init_params=False
        )

        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
            )

        data_parallel_model.Parallelize(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_resnet50_model_ops,
            param_update_builder_fun=None,
            devices=gpus,
            cpu_device=args.use_cpu,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    epoch = 0
    # load the pre-trained model and reset epoch
    if args.load_model_path is not None:
        LoadModel(args.load_model_path, train_model)

        # Sync the model params
        data_parallel_model.FinalizeAfterCheckpoint(train_model)

        # reset epoch. load_model_path should end with *_X.mdl,
        # where X is the epoch number
        last_str = args.load_model_path.split('_')[-1]
        if last_str.endswith('.mdl'):
            epoch = int(last_str[:-4])
            log.info("Reset epoch to {}".format(epoch))
        else:
            log.warning("The format of load_model_path doesn't match!")

    expname = "resnet50_gpu%d_b%d_L%d_lr%.2f_v2" % (
        args.num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )
    explog = experiment_util.ModelTrainerLog(expname, args)

    # Run the training one epoch a time
    while epoch < args.num_epochs:
        epoch = RunEpoch(
            args,
            epoch,
            train_model,
            test_model,
            total_batch_size,
            num_shards,
            expname,
            explog
        )

        # Save the model for each epoch
        SaveModel(args, train_model, epoch)

        model_path = "%s/%s_" % (
            args.file_store_path,
            args.save_model_name
        )
        # remove the saved model from the previous epoch if it exists
        if os.path.isfile(model_path + str(epoch - 1) + ".mdl"):
            os.remove(model_path + str(epoch - 1) + ".mdl")
Example 10
def Train(args):
    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # Verify valid batch size
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus

    global_batch_size = total_batch_size * args.num_shards
    epoch_iters = int(args.epoch_size / global_batch_size)
    args.epoch_size = epoch_iters * global_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    train_arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        'cudnn_exhaustive_search': True,
        'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024),
    }
    train_model = model_helper.ModelHelper(name="resnet101",
                                           arg_scope=train_arg_scope)

    num_shards = args.num_shards
    shard_id = args.shard_id
    interfaces = args.distributed_interfaces.split(",")

    if os.getenv("OMPI_COMM_WORLD_SIZE") is not None:
        num_shards = int(os.getenv("OMPI_COMM_WORLD_SIZE", 1))
        shard_id = int(os.getenv("OMPI_COMM_WORLD_RANK", 0))
        if num_shards > 1:
            rendezvous = dict(kv_handler=None,
                              num_shards=num_shards,
                              shard_id=shard_id,
                              engine="GLOO",
                              transport=args.distributed_transport,
                              interface=interfaces[0],
                              mpi_rendezvous=True,
                              exit_nets=None)

    elif num_shards > 1:
        store_handler = "store_handler"
        if args.redis_host is not None:
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate",
                    [],
                    [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                ))
        else:
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate",
                    [],
                    [store_handler],
                    path=args.file_store_path,
                    prefix=args.run_id,
                ))

        rendezvous = dict(kv_handler=store_handler,
                          shard_id=shard_id,
                          num_shards=num_shards,
                          engine="GLOO",
                          transport=args.distributed_transport,
                          interface=interfaces[0],
                          exit_nets=None)

    else:
        rendezvous = None

    def create_resnet101_model_ops(model, loss_scale):
        initializer = (pFP16Initializer
                       if args.dtype == 'float16' else Initializer)

        with brew.arg_scope([brew.conv, brew.fc],
                            WeightInitializer=initializer,
                            BiasInitializer=initializer,
                            enable_tensor_core=args.enable_tensor_core,
                            float16_compute=args.float16_compute):
            pred = resnet.create_resnet101(
                model,
                "data",
                num_input_channels=args.num_channels,
                num_labels=args.num_labels,
                no_bias=True,
                no_loss=True,
            )

        if args.dtype == 'float16':
            pred = model.net.HalfToFloat(pred, pred + '_fp32')

        softmax, loss = model.SoftmaxWithLoss([pred, 'label'],
                                              ['softmax', 'loss'])
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy")
        return [loss]

    def add_optimizer(model):
        stepsz = int(30 * args.epoch_size / total_batch_size / num_shards)

        if args.float16_compute:
            opt = optimizer.build_fp16_sgd(model,
                                           args.base_learning_rate,
                                           momentum=0.9,
                                           nesterov=1,
                                           weight_decay=args.weight_decay,
                                           policy="step",
                                           stepsize=stepsz,
                                           gamma=0.1)
        else:
            optimizer.add_weight_decay(model, args.weight_decay)
            opt = optimizer.build_multi_precision_sgd(model,
                                                      args.base_learning_rate,
                                                      momentum=0.9,
                                                      nesterov=1,
                                                      policy="step",
                                                      stepsize=stepsz,
                                                      gamma=0.1)
        return opt

    if args.train_data == "null":

        def add_image_input(model):
            AddNullInput(
                model,
                None,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
            )
    else:
        reader = train_model.CreateDB(
            "reader",
            db=args.train_data,
            db_type=args.db_type,
            num_shards=num_shards,
            shard_id=shard_id,
        )

        def add_image_input(model):
            AddImageInput(
                model,
                reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=False,
            )

    def add_post_sync_ops(model):
        for param_info in model.GetOptimizationParamInfo(model.GetParams()):
            if param_info.blob_copy is not None:
                model.param_init_net.HalfToFloat(
                    param_info.blob, param_info.blob_copy[core.DataType.FLOAT])

    data_parallel_model.Parallelize(
        train_model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=create_resnet101_model_ops,
        optimizer_builder_fun=add_optimizer,
        post_sync_builder_fun=add_post_sync_ops,
        devices=gpus,
        rendezvous=rendezvous,
        optimize_gradient_memory=False,
        cpu_device=args.use_cpu,
        shared_model=args.use_cpu,
    )

    if args.model_parallel:
        activations = data_parallel_model_utils.GetActivationBlobs(train_model)
        data_parallel_model_utils.ShiftActivationDevices(
            train_model,
            activations=activations[len(activations) // 2:],
            shifts={g: args.num_gpus + g
                    for g in range(args.num_gpus)},
        )

    data_parallel_model.OptimizeGradientMemory(train_model, {}, set(), False)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    test_model = None
    if (args.test_data is not None):
        log.info("----- Create test net ----")
        test_arg_scope = {
            'order': "NCHW",
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
        }
        test_model = model_helper.ModelHelper(name="resnet101_test",
                                              arg_scope=test_arg_scope,
                                              init_params=False)

        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=True,
            )

        data_parallel_model.Parallelize(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_resnet101_model_ops,
            post_sync_builder_fun=add_post_sync_ops,
            param_update_builder_fun=None,
            devices=gpus,
            cpu_device=args.use_cpu,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

    epoch = 0
    if args.load_model_path is not None:
        LoadModel(args.load_model_path, train_model)
        data_parallel_model.FinalizeAfterCheckpoint(train_model)
        last_str = args.load_model_path.split('_')[-1]
        if last_str.endswith('.mdl'):
            epoch = int(last_str[:-4])
            log.info("Reset epoch to {}".format(epoch))
        else:
            log.warning("The format of load_model_path doesn't match!")

    expname = "resnet101_gpu%d_b%d_L%d_lr%.2f_v2" % (
        args.num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )
    explog = experiment_util.ModelTrainerLog(expname, args)

    while epoch < args.num_epochs:
        epoch = RunEpoch(args, epoch, train_model, test_model,
                         total_batch_size, num_shards, expname, explog)
    # final save
    SaveModel(workspace, train_model)
Example 11
def Train(args):
    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # Verify valid batch size
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus
    assert \
        total_batch_size % num_gpus == 0, \
        "Number of GPUs must divide batch size"

    # Round down epoch size to closest multiple of batch size across machines
    global_batch_size = total_batch_size * args.num_shards
    epoch_iters = int(args.epoch_size / global_batch_size)

    assert \
        epoch_iters > 0, \
        "Epoch size must be larger than batch size times shard count"

    args.epoch_size = epoch_iters * global_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    # Create ModelHelper object
    train_arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        'cudnn_exhaustive_search': True,
        'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024),
    }
    train_model = model_helper.ModelHelper(name="resnet50",
                                           arg_scope=train_arg_scope)

    num_shards = args.num_shards
    shard_id = args.shard_id

    # Expect interfaces to be comma separated.
    # Use of multiple network interfaces is not yet complete,
    # so simply use the first one in the list.
    interfaces = args.distributed_interfaces.split(",")

    # Rendezvous using MPI when run with mpirun
    if os.getenv("OMPI_COMM_WORLD_SIZE") is not None:
        num_shards = int(os.getenv("OMPI_COMM_WORLD_SIZE", 1))
        shard_id = int(os.getenv("OMPI_COMM_WORLD_RANK", 0))
        if num_shards > 1:
            rendezvous = dict(kv_handler=None,
                              num_shards=num_shards,
                              shard_id=shard_id,
                              engine="GLOO",
                              transport=args.distributed_transport,
                              interface=interfaces[0],
                              mpi_rendezvous=True,
                              exit_nets=None)

    elif num_shards > 1:
        # Create rendezvous for distributed computation
        store_handler = "store_handler"
        if args.redis_host is not None:
            # Use Redis for rendezvous if Redis host is specified
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate",
                    [],
                    [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                ))
        else:
            # Use filesystem for rendezvous otherwise
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate",
                    [],
                    [store_handler],
                    path=args.file_store_path,
                    prefix=args.run_id,
                ))

        rendezvous = dict(kv_handler=store_handler,
                          shard_id=shard_id,
                          num_shards=num_shards,
                          engine="GLOO",
                          transport=args.distributed_transport,
                          interface=interfaces[0],
                          exit_nets=None)

    else:
        rendezvous = None
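    # With a single shard, `rendezvous` stays None and Parallelize below adds
    # no cross-machine synchronization; otherwise the dict configures Gloo
    # collectives over the chosen store handler and network interface.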

    # Model building functions
    def create_model_ops(model, loss_scale):
        return create_model_ops_testable(model, loss_scale, is_test=False)

    def create_model_ops_test(model, loss_scale):
        return create_model_ops_testable(model, loss_scale, is_test=True)

    def create_model_ops_testable(model, loss_scale, is_test=False):
        initializer = (PseudoFP16Initializer
                       if args.dtype == 'float16' else Initializer)

        with brew.arg_scope([brew.conv, brew.fc],
                            WeightInitializer=initializer,
                            BiasInitializer=initializer,
                            enable_tensor_core=args.enable_tensor_core,
                            float16_compute=args.float16_compute):

            if args.model == "cifar10":
                if args.image_size != 32:
                    log.warn("Cifar10 expects a 32x32 image.")
                pred = models.cifar10.create_cifar10(
                    model,
                    "data",
                    image_channels=args.num_channels,
                    num_classes=args.num_labels,
                    image_height=args.image_size,
                    image_width=args.image_size,
                )
            elif args.model == "resnet32x32":
                if args.image_size != 32:
                    log.warn("ResNet32x32 expects a 32x32 image.")
                pred = models.resnet.create_resnet32x32(
                    model,
                    "data",
                    num_layers=args.num_layers,
                    num_input_channels=args.num_channels,
                    num_labels=args.num_labels,
                    is_test=is_test)
            elif args.model == "resnet":
                if args.image_size != 224:
                    log.warn(
                        "ResNet expects a 224x224 image. input image = %d" %
                        args.image_size)
                pred = resnet.create_resnet50(
                    #args.layers,
                    model,
                    "data",
                    num_input_channels=args.num_channels,
                    num_labels=args.num_labels,
                    no_bias=True,
                    no_loss=True,
                )
            elif args.model == "vgg":
                if args.image_size != 224:
                    log.warn("VGG expects a 224x224 image.")
                pred = vgg.create_vgg(model,
                                      "data",
                                      num_input_channels=args.num_channels,
                                      num_labels=args.num_labels,
                                      num_layers=args.num_layers,
                                      is_test=is_test)
            elif args.model == "googlenet":
                if args.image_size != 224:
                    log.warn("GoogLeNet expects a 224x224 image.")
                pred = googlenet.create_googlenet(
                    model,
                    "data",
                    num_input_channels=args.num_channels,
                    num_labels=args.num_labels,
                    is_test=is_test)
            elif args.model == "alexnet":
                if args.image_size != 224:
                    log.warn("Alexnet expects a 224x224 image.")
                pred = alexnet.create_alexnet(
                    model,
                    "data",
                    num_input_channels=args.num_channels,
                    num_labels=args.num_labels,
                    is_test=is_test)
            elif args.model == "alexnetv0":
                if args.image_size != 224:
                    log.warn("Alexnet v0 expects a 224x224 image.")
                pred = alexnet.create_alexnetv0(
                    model,
                    "data",
                    num_input_channels=args.num_channels,
                    num_labels=args.num_labels,
                    is_test=is_test)
            else:
                raise NotImplementedError("Network {} not found.".format(
                    args.model))

        if args.dtype == 'float16':
            pred = model.net.HalfToFloat(pred, pred + '_fp32')

        softmax, loss = model.SoftmaxWithLoss([pred, 'label'],
                                              ['softmax', 'loss'])
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy")
        return [loss]

    def add_optimizer(model):
        stepsz = int(30 * args.epoch_size / total_batch_size / num_shards)

        if args.float16_compute:
            # TODO: merge with the multi-precision optimizer
            opt = optimizer.build_fp16_sgd(
                model,
                args.base_learning_rate,
                momentum=0.9,
                nesterov=1,
                weight_decay=args.weight_decay,  # weight decay included
                policy="step",
                stepsize=stepsz,
                gamma=0.1)
        else:
            optimizer.add_weight_decay(model, args.weight_decay)
            opt = optimizer.build_multi_precision_sgd(model,
                                                      args.base_learning_rate,
                                                      momentum=0.9,
                                                      nesterov=1,
                                                      policy="step",
                                                      stepsize=stepsz,
                                                      gamma=0.1)
            print("info:===============================" + str(opt))
        return opt
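    # A sketch of the resulting schedule (Caffe2's "step" LR policy):
    #   lr(t) = base_learning_rate * gamma ** (t // stepsz)
    # so with gamma=0.1 and the stepsz computed above, the learning rate is
    # divided by 10 roughly every 30 epochs.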

    # Define add_image_input function.
    # Depends on the "train_data" argument.
    # Note that the reader will be shared between all GPUs.
    if args.train_data == "null":

        def add_image_input(model):
            AddNullInput(
                model,
                None,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
            )
    else:
        reader = train_model.CreateDB(
            "reader",
            db=args.train_data,
            db_type=args.db_type,
            num_shards=num_shards,
            shard_id=shard_id,
        )

        def add_image_input(model):
            AddImageInput(
                model,
                reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=False,
            )

    def add_post_sync_ops(model):
        """Add ops applied after initial parameter sync."""
        for param_info in model.GetOptimizationParamInfo(model.GetParams()):
            if param_info.blob_copy is not None:
                model.param_init_net.HalfToFloat(
                    param_info.blob, param_info.blob_copy[core.DataType.FLOAT])
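        # The HalfToFloat above refreshes the fp32 master copy of each fp16
        # parameter (param_info.blob_copy) after the initial parameter sync.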

    # Create parallelized model
    data_parallel_model.Parallelize(train_model,
                                    input_builder_fun=add_image_input,
                                    forward_pass_builder_fun=create_model_ops,
                                    optimizer_builder_fun=add_optimizer,
                                    post_sync_builder_fun=add_post_sync_ops,
                                    devices=gpus,
                                    rendezvous=rendezvous,
                                    optimize_gradient_memory=False,
                                    cpu_device=args.use_cpu,
                                    shared_model=args.use_cpu,
                                    combine_spatial_bn=args.use_cpu,
                                    use_nccl=args.use_nccl)
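    # Roughly speaking, Parallelize replicates the input/forward/optimizer
    # builders on each device under per-device name scopes (e.g. "gpu_0/"),
    # inserts cross-device gradient reduction, and, when a rendezvous is given,
    # cross-shard (multi-machine) synchronization as well.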

    if args.model_parallel:
        # Shift half of the activations to another GPU
        assert workspace.NumCudaDevices() >= 2 * args.num_gpus
        activations = data_parallel_model_utils.GetActivationBlobs(train_model)
        data_parallel_model_utils.ShiftActivationDevices(
            train_model,
            activations=activations[len(activations) // 2:],
            shifts={g: args.num_gpus + g
                    for g in range(args.num_gpus)},
        )

    data_parallel_model.OptimizeGradientMemory(train_model, {}, set(), False)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    if "GLOO_ALGORITHM" in os.environ and os.environ[
            "GLOO_ALGORITHM"] == "PHUB":
        # PHub needs to know which gradient blobs require aggregation and how
        # large they are; at this stage only the key names and this shard's
        # key ID have to be communicated.
        grad_names = list(reversed(train_model._grad_names))
        phubKeyNames = ["allreduce_{}_status".format(x) for x in grad_names]
        caffe2GradSizes = dict(
            zip([
                data_parallel_model.stripBlobName(name) + "_grad"
                for name in train_model._parameters_info.keys()
            ], [x.size for x in train_model._parameters_info.values()]))
        phubKeySizes = [str(caffe2GradSizes[x]) for x in grad_names]
        if rendezvous["shard_id"] == 0:
            # Only shard 0 needs to publish the keys to the rendezvous.
            r = redis.StrictRedis()
            # For each key, assign an ID.
            joinedStr = ",".join(phubKeyNames)
            r.set("[PLink]IntegrationKeys", joinedStr)
            joinedStr = ",".join(phubKeySizes)
            r.set("[PLink]IntegrationKeySizes", joinedStr)

    # Add test model, if specified
    test_model = None
    if (args.test_data is not None):
        log.info("----- Create test net ----")
        test_arg_scope = {
            'order': "NCHW",
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
        }
        test_model = model_helper.ModelHelper(name="resnet50_test",
                                              arg_scope=test_arg_scope,
                                              init_params=False)

        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=True,
            )

        data_parallel_model.Parallelize(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_model_ops_test,
            post_sync_builder_fun=add_post_sync_ops,
            param_update_builder_fun=None,
            devices=gpus,
            cpu_device=args.use_cpu,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

    epoch = 0
    # load the pre-trained model and reset epoch
    if args.load_model_path is not None:
        LoadModel(args.load_model_path, train_model)

        # Sync the model params
        data_parallel_model.FinalizeAfterCheckpoint(train_model)

        # reset epoch. load_model_path should end with *_X.mdl,
        # where X is the epoch number
        last_str = args.load_model_path.split('_')[-1]
        if last_str.endswith('.mdl'):
            epoch = int(last_str[:-4])
            log.info("Reset epoch to {}".format(epoch))
        else:
            log.warning("The format of load_model_path doesn't match!")

    expname = "resnet50_gpu%d_b%d_L%d_lr%.2f_v2" % (
        args.num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )

    explog = experiment_util.ModelTrainerLog(expname, args)

    # Run the training one epoch a time
    while epoch < args.num_epochs:
        epoch = RunEpoch(args, epoch, train_model, test_model,
                         total_batch_size, num_shards, expname, explog)

        # Save the model for each epoch
        SaveModel(args, train_model, epoch)

        model_path = "%s/%s_" % (args.file_store_path, args.save_model_name)
        # remove the saved model from the previous epoch if it exists
        if os.path.isfile(model_path + str(epoch - 1) + ".mdl"):
            os.remove(model_path + str(epoch - 1) + ".mdl")
Example no. 12
def Train(args):
    subdir = datetime.strftime(datetime.now(), '%Y%m%d-%H%M%S')
    save_dir = os.path.join(args.file_store_path, subdir)
    if not os.path.exists(save_dir):  # Create the model directory if it doesn't exist
        os.mkdir(save_dir)

    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # Verify valid batch size
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus
    assert \
        total_batch_size % num_gpus == 0, \
        "Number of GPUs must divide batch size"

    # Round down epoch size to closest multiple of batch size across machines
    global_batch_size = total_batch_size * args.num_shards
    epoch_iters = int(args.epoch_size / global_batch_size)
    args.epoch_size = epoch_iters * global_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    # Create ModelHelper object
    train_arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        'cudnn_exhaustive_search': True,
        'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024),
    }
    train_model = model_helper.ModelHelper(
        name="sphereface", arg_scope=train_arg_scope
    )

    num_shards = args.num_shards
    shard_id = args.shard_id

    # Expect interfaces to be comma separated.
    # Use of multiple network interfaces is not yet complete,
    # so simply use the first one in the list.
    interfaces = args.distributed_interfaces.split(",")

    # Rendezvous using MPI when run with mpirun
    if os.getenv("OMPI_COMM_WORLD_SIZE") is not None:
        num_shards = int(os.getenv("OMPI_COMM_WORLD_SIZE", 1))
        shard_id = int(os.getenv("OMPI_COMM_WORLD_RANK", 0))
        if num_shards > 1:
            rendezvous = dict(
                kv_handler=None,
                num_shards=num_shards,
                shard_id=shard_id,
                engine="GLOO",
                transport=args.distributed_transport,
                interface=interfaces[0],
                mpi_rendezvous=True,
                exit_nets=None)

    elif num_shards > 1:
        # Create rendezvous for distributed computation
        store_handler = "store_handler"
        if args.redis_host is not None:
            # Use Redis for rendezvous if Redis host is specified
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate", [], [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                )
            )
        else:
            # Use filesystem for rendezvous otherwise
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate", [], [store_handler],
                    path=args.file_store_path,
                    prefix=args.run_id,
                )
            )

        rendezvous = dict(
            kv_handler=store_handler,
            shard_id=shard_id,
            num_shards=num_shards,
            engine="GLOO",
            transport=args.distributed_transport,
            interface=interfaces[0],
            exit_nets=None)

    else:
        rendezvous = None

    # Model building functions
    def create_sphereface_model_ops(model, loss_scale):
        initializer = (pFP16Initializer if args.dtype == 'float16'
                       else Initializer)

        with brew.arg_scope([brew.conv, brew.fc],
                            WeightInitializer=initializer,
                            BiasInitializer=initializer,
                            enable_tensor_core=args.enable_tensor_core):
            pred = sphereface.create_net(
                model,
                "data",
                "label",
                in_dim=args.num_channels,
                class_num=args.num_labels,
                feature_dim=args.feature_dim,
                is_test=False,
                no_loss=True,
                fp16_data=True if args.dtype == 'float16' else False,
            )

        if args.dtype == 'float16':
            pred = model.net.HalfToFloat(pred, pred + '_fp32')

        softmax, loss = model.SoftmaxWithLoss([pred, 'label'],
                                              ['softmax', 'loss'])
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy")
        return [loss]

    def add_optimizer(model):
        # stepsz = int(30 * args.epoch_size / total_batch_size / num_shards)
        stepsz = 1
        if args.dtype == 'float16':
            opt = optimizer.build_fp16_sgd(
                model,
                args.base_learning_rate,
                momentum=0.9,
                nesterov=1,
                weight_decay=args.weight_decay,   # weight decay included
                policy="step",
                stepsize=stepsz,
                gamma=0.9999
            )
        else:
            optimizer.add_weight_decay(model, args.weight_decay)
            opt = optimizer.build_multi_precision_sgd(
                model,
                args.base_learning_rate,
                momentum=0.9,
                nesterov=1,
                policy="step",
                stepsize=stepsz,
                gamma=0.9999
            )
        return opt

    # Define add_image_input function.
    # Depends on the "train_data" argument.
    # Note that the reader will be shared between all GPUs.
    if args.train_data == "null":
        def add_image_input(model):
            AddNullInput(
                model,
                None,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
            )
    else:
        reader = train_model.CreateDB(
            "reader",
            db=args.train_data,
            db_type=args.db_type,
            num_shards=num_shards,
            shard_id=shard_id,
        )

        def add_image_input(model):
            AddImageInput(
                model,
                reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=False,
            )

    def add_post_sync_ops(model):
        """Add ops applied after initial parameter sync."""
        for param_info in model.GetOptimizationParamInfo(model.GetParams()):
            if param_info.blob_copy is not None:
                model.param_init_net.HalfToFloat(
                    param_info.blob,
                    param_info.blob_copy[core.DataType.FLOAT]
                )

    # Create parallelized model
    data_parallel_model.Parallelize(
        train_model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=create_sphereface_model_ops,
        optimizer_builder_fun=add_optimizer,
        post_sync_builder_fun=add_post_sync_ops,
        devices=gpus,
        rendezvous=rendezvous,
        optimize_gradient_memory=True,
        cpu_device=args.use_cpu,
        shared_model=args.use_cpu,
    )

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)


    # Add test model, if specified
    test_model = None
    if (args.test_data is not None):
        log.info("----- Create test net ----")
        test_arg_scope = {
            'order': "NCHW",
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
        }
        test_model = model_helper.ModelHelper(
            name="sphereface_test", arg_scope=test_arg_scope, init_params=False
        )

        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=True,
            )

        data_parallel_model.Parallelize(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_sphereface_model_ops,
            post_sync_builder_fun=add_post_sync_ops,
            param_update_builder_fun=None,
            devices=gpus,
            cpu_device=args.use_cpu,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)
        graph = net_drawer2.GetPydotGraphMinimal(
            test_model.net.Proto(), "sphereface", rankdir="TB")
        graph.write(os.path.join(save_dir, "sphereface.pdf"), format='pdf')

    epoch = 0
    # load the pre-trained model and reset epoch
    if args.load_model_path is not None:
        LoadModel(args.load_model_path, train_model)

        # Sync the model params
        data_parallel_model.FinalizeAfterCheckpoint(train_model)

        # reset epoch. load_model_path should end with *_X.mdl,
        # where X is the epoch number
        last_str = args.load_model_path.split('_')[-1]
        if last_str.endswith('.mdl'):
            epoch = int(last_str[:-4])
            log.info("Reset epoch to {}".format(epoch))
        else:
            log.warning("The format of load_model_path doesn't match!")

    expname = "sphereface_gpu%d_b%d_L%d_lr%.2f_v2" % (
        args.num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )

    explog = experiment_util.ModelTrainerLog(os.path.join(save_dir, expname), args)

    kernel_fig, plt_kernel = plt.subplots(nrows=4, ncols=5, figsize=(14, 14))
    loss_fig, plt_loss = plt.subplots(1)
    plt.tight_layout(h_pad=0, w_pad=0)
    plt.ion()

    iterations = 0
    old_x = 0
    old_loss = 0
    old_acc = 0
    while epoch < args.num_epochs or args.test_data_type != 'VAL':
        epoch, epoch_loss, epoch_accuracy = RunEpoch(
            args,
            epoch,
            train_model,
            test_model,
            total_batch_size,
            num_shards,
            explog,
            plt_kernel
        )

        x = list(range(iterations, iterations + len(epoch_loss)))
        x.insert(0, old_x)
        epoch_loss.insert(0, old_loss)
        epoch_accuracy.insert(0, old_acc)
        plt_loss.plot(x, epoch_loss, 'b')
        plt_loss.plot(x, epoch_accuracy, 'r')
        iterations += len(epoch_loss)
        old_x = iterations - 2
        old_loss = epoch_loss[-1]
        old_acc = epoch_accuracy[-1]

        log.info('Save checkpoint {}'.format(epoch))
        model_path = '{:s}/{:s}_{:d}.mdl'.format(save_dir, args.save_model_name, epoch)
        SaveModel(args, train_model, model_path)
        if DEBUG_TRAINING:
            kernel_fig_path = '%s/%s_%d.jpg' % (save_dir, 'activation', epoch)
            loss_fig_path = '%s/%s_%d.jpg' % (save_dir, 'loss', epoch)
            kernel_fig.savefig(kernel_fig_path)
            loss_fig.savefig(loss_fig_path)
Example no. 13
    def run_model(self, devices, gpu):
        '''
        Helper function for test_equiv
        '''
        def input_builder_fun(model):
            return None

        def model_build_fun(model, loss_scale):
            fc = model.FC("data", "fc", 16, 1, ("ConstantFill", {}),
                          ("ConstantFill", {}))
            fc_fl = model.FlattenToVec(fc, "fc_fl")
            sigm = model.Sigmoid(fc_fl, "sigm")
            sq = model.SquaredL2Distance([sigm, "label"], "sq")
            loss = model.AveragedLoss(sq, "loss")
            loss = model.Scale(loss, scale=loss_scale)

            # For testing explicit sync
            model.param_init_net.UniformFill([], ["sync_num"], shape=[1])
            return [loss]

        def add_optimizer(model):
            return optimizer.build_sgd(
                model,
                0.1,
                policy="fixed",
                max_gradient_norm=5.0,
                allow_lr_injection=True,
            )

        workspace.ResetWorkspace()
        model = cnn.CNNModelHelper(
            order="NHWC",
            name="test{}".format(devices),
        )
        data_parallel_model.Parallelize(
            model,
            input_builder_fun=input_builder_fun,
            forward_pass_builder_fun=model_build_fun,
            optimizer_builder_fun=add_optimizer,
            devices=devices,
            cpu_device=not gpu,
            shared_model=not gpu,
        )
        data_parallel_model.AddBlobSync(model, ["sync_num"])
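        # AddBlobSync keeps the listed blobs identical across devices; the check
        # at the end of each iteration below verifies that "sync_num", fed only
        # on device 0, has propagated to every device.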

        # Light test for LR names
        lr_names = data_parallel_model.GetLearningRateBlobNames(model)
        self.assertGreater(len(lr_names), 0)

        np.random.seed(2603)

        # Each run has same input, independent of number of gpus
        batch_size = 64
        for i in range(0, 10):
            full_data = np.random.rand(batch_size, 16)
            full_labels = np.round(full_data[:, 0])
            batch_per_device = batch_size // len(devices)

            for (j, g) in enumerate(devices):
                st = j * batch_per_device
                en = st + batch_per_device
                data = full_data[st:en, :].astype(np.float32)
                labels = full_labels[st:en].astype(np.float32)
                with core.DeviceScope(core.DeviceOption(model._device_type,
                                                        g)):
                    workspace.FeedBlob(
                        "{}_{}/data".format(model._device_prefix, g), data)
                    workspace.FeedBlob(
                        "{}_{}/label".format(model._device_prefix, g), labels)

            if i == 0:
                workspace.RunNetOnce(model.param_init_net)
                workspace.CreateNet(model.net)

            workspace.FeedBlob(model._device_prefix + "_0/sync_num",
                               np.array([i * 2]).astype(np.float32),
                               device_option=core.DeviceOption(
                                   model._device_type, 0))
            workspace.RunNet(model.net.Proto().name)

            # Test AddBlobSync
            for j in model._devices:
                sync = workspace.FetchBlob(model._device_prefix +
                                           "_{}/sync_num".format(j))[0]
                self.assertTrue(abs(sync - i * 2) < 0.01)

        return workspace.FetchBlob("{}_0/fc_w".format(model._device_prefix))
Example no. 14
def benchmark_training(model, opts):
    """ Runs N training batches and returns array of batch times in seconds.

    For some impl details see https://caffe2.ai/docs/SynchronousSGD.html.

    :param model: Caffe2's model helper class instances.
    :type model: :py:class:`caffe2.python.model_helper.ModelHelper`
    :param dict opts: Options for the inference benchmark. Must contain `device`,\
                      `num_gpus` if device is gpu, `enable_tensor_core`,\
                      `num_warmup_batches` and `num_batches`.
    :return: Tuple of model title and numpy array containing batch times.
    :rtype: (string, numpy array)
    """
    model_builder = ModelFactory.get_model(opts)
    assert model_builder.phase == 'training',\
           "Internal error, invalid phase was set. "\
           "Expecting 'training' but found %s" % (model_builder.phase)

    # The reader must be shared by all GPUs on a single machine.
    reader = None
    if 'data_dir' in opts and opts['data_dir']:
        reader = model.CreateDB(
            "reader",
            db=opts['data_dir'],            # (str, path to training data)
            db_type=opts['data_backend'],   # (str, 'lmdb' or 'leveldb')
            num_shards=1,                   # (int, number of machines)
            shard_id=0,                     # (int, machine id)
        )

    def add_inputs(model):
        if reader is None:
            print("[INFO] Adding synthetic data input for Caffe2 training benchmarks")
            model_builder.add_synthetic_inputs(model, add_labels=True)
        else:
            print("[INFO] Adding real data inputs (%s) for Caffe2 training benchmarks" %\
                  (opts['data_dir']))
            model_builder.add_data_inputs(
                model, reader, use_gpu_transform=(opts['device'] == 'gpu'),
                num_decode_threads=opts['num_decode_threads']
            )

    def create_net(model, loss_scale):
        return create_model(model_builder, model, opts['enable_tensor_core'],
                            opts['float16_compute'], loss_scale)

    def add_post_sync_ops(model):
        """Add ops applied after initial parameter sync."""
        for param_info in model.GetOptimizationParamInfo(model.GetParams()):
            if param_info.blob_copy is not None:
                model.param_init_net.HalfToFloat(
                    param_info.blob,
                    param_info.blob_copy[core.DataType.FLOAT]
                )

    def add_optimizer(model):
        return build_optimizer(model, float16_compute=opts['float16_compute'])

    if opts['device'] == 'gpu':
        rendezvous = setup_rendezvous(opts)
        print("rendezvous: %s" % str(rendezvous))
        dpm.Parallelize(
            model,
            input_builder_fun=add_inputs,
            forward_pass_builder_fun=create_net,
            optimizer_builder_fun=add_optimizer,
            #param_update_builder_fun=Model.add_parameter_update_ops,
            post_sync_builder_fun=add_post_sync_ops,
            devices=range(opts['num_gpus']),
            rendezvous=rendezvous,
            optimize_gradient_memory=True,
            cpu_device=(opts['device'] == 'cpu'),
            shared_model=(opts['device'] == 'cpu')
        )
    else:
        with core.DeviceScope(Model.get_device_option(gpu=None)):
            add_inputs(model)
            losses = create_net(model, 1.0)
            blobs_to_gradients = model.AddGradientOperators(losses)
            Model.add_parameter_update_ops(model)
        Model.optimize_gradient_memory(model, [blobs_to_gradients[losses[0]]])

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)
    return (model_builder.name, run_n_times(model, opts['num_warmup_batches'], opts['num_batches']))
Example no. 15
    def _spatial_bn_check(self, device_type, seed, batch_size):
        devices = [0, 1, 2]
        epsilon = 1e-3
        tolerance = 1e-3
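        # The checks below run SpatialBN with combine_spatial_bn=True and
        # verify, per device, that forward and backward results match batch-norm
        # statistics computed over the concatenation of all devices' inputs.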

        def _test_forward_pass(x, devices, device_type, scale, bias, epsilon):
            x_concat = np.concatenate(x)
            mean = np.mean(x_concat, axis=0)
            var = np.var(x_concat, axis=0)
            for device in devices:
                x_i = x[device]
                x_hat = (x_i - mean) / (np.sqrt(var + epsilon))
                expected_out = scale * x_hat + bias
                spatial_out = workspace.FetchBlob("{}_{}/sp_out".format(
                    device_type, device))
                rel_error = np.linalg.norm(spatial_out - expected_out) \
                            / np.linalg.norm(expected_out)
                self.assertTrue(rel_error < 0.005)

        def _test_backward_pass(x, devices, device_type, scale, tolerance):
            dBias_arr = []
            dY_arr = []
            dGamma_arr = []
            num_devices = len(devices)
            mean = np.array(workspace.FetchBlob(
                "{}_0/sp_out_sm".format(device_type)),
                            dtype=np.float32)
            inv_var = np.array(workspace.FetchBlob(
                "{}_0/sp_out_siv".format(device_type)),
                               dtype=np.float32)

            # dBias
            # Sum dBias values over all devices to find the average gradient
            for device in devices:
                dY_blob = workspace.FetchBlob("{}_{}/sp_out_grad".format(
                    device_type, device))
                dY = np.array(dY_blob, dtype=np.float32)
                dY_arr.append(dY)
                dBias_arr.append(np.array(np.sum(dY, axis=0),
                                          dtype=np.float32))
            dBias = np.sum(dBias_arr, dtype=np.float32)
            dBias_avg = dBias / num_devices
            for device in devices:
                dBiasActual = np.sum(workspace.FetchBlob(
                    "{}_{}/sp_out_b_grad".format(device_type, device)),
                                     dtype=np.float32)
                self.assertTrue(
                    np.isclose([dBiasActual], [dBias], atol=tolerance))

            # dGamma
            # Sum dGamma values over all devices to find the average gradient
            for device in devices:
                dGamma = np.sum((x[device] - mean) * inv_var * dY_arr[device],
                                axis=0,
                                dtype=np.float32)
                dGamma_arr.append(dGamma)
            dGamma = np.sum(dGamma_arr, axis=0, dtype=np.float32)
            dGamma_avg = dGamma / num_devices
            for device in devices:
                dGammaActual = workspace.FetchBlob(
                    "{}_{}/sp_out_s_grad".format(device_type, device))
                self.assertTrue(
                    np.isclose([dGamma], [dGammaActual], atol=tolerance))

            # dX
            scale_inv_var = scale * inv_var / batch_size
            for device in devices:
                dX = scale_inv_var * (
                    dY_arr[device] * batch_size - dBias_avg -
                    (x[device] - mean) * dGamma_avg * inv_var)
                dX_actual = workspace.FetchBlob("{}_{}/tanh_grad".format(
                    device_type, device))
                self.assertTrue(
                    np.isclose([dX], [dX_actual], atol=tolerance).all())

        def add_input_ops(model):
            for device in devices:
                data = np.random.rand(batch_size, 1, 1, 1).astype(np.float32)
                workspace.FeedBlob("{}_{}/data".format(device_type, device),
                                   data)

        def add_model_ops(model, loss_scale):
            model.Tanh("data", "tanh")
            model.SpatialBN("tanh",
                            "sp_out",
                            1,
                            epsilon=epsilon,
                            is_test=False)
            model.Sqr("sp_out", "sqr")
            loss = model.SumElements("sqr", "loss")
            return [loss]

        def add_optimizer(model):
            return optimizer.build_sgd(model, 0.1)

        np.random.seed(seed)
        workspace.ResetWorkspace()
        model = cnn.CNNModelHelper(order="NCHW", name="test")
        data_parallel_model.Parallelize(
            model,
            input_builder_fun=add_input_ops,
            forward_pass_builder_fun=add_model_ops,
            optimizer_builder_fun=add_optimizer,
            devices=devices,
            cpu_device=device_type == "cpu",
            shared_model=False,
            combine_spatial_bn=True,
        )

        workspace.RunNetOnce(model.param_init_net)
        scale = workspace.FetchBlob("{}_0/sp_out_s".format(device_type))
        bias = workspace.FetchBlob("{}_0/sp_out_b".format(device_type))
        workspace.RunNetOnce(model.net)

        x = []
        for device in devices:
            x_blob = workspace.FetchBlob("{}_{}/tanh".format(
                device_type, device))
            x_i = np.array(x_blob, dtype=np.float32)
            x.append(x_i)

        _test_forward_pass(x, devices, device_type, scale, bias, epsilon)
        _test_backward_pass(x, devices, device_type, scale, tolerance)
Example no. 16
def network_eval(args):
    """
    Runs network benchmarking on either a single or multiple nodes
    """
    # Define some parameters for the model instantiation
    if args.use_ideep:
        train_arg_scope = {
            'use_cudnn': False,
            'cudnn_exhaustive_search': False,
            'training_mode': 1
        }
    else:
        train_arg_scope = {
            'order': 'NCHW',
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
            # 1048576 = 2 ^ 20 (1 MB)
            'ws_nbytes_limit': (args.cudnn_ws_lim * 1048576),
        }
    # Create the model for evaluation
    evaluation_model = model_helper.ModelHelper(name='evaluation_model',
                                                arg_scope=train_arg_scope)

    evaluation_model.Proto().num_workers = 16

    # Default the model for accuracy testing to None
    accuracy_time_model = None

    # Compute batch and epoch sizes
    # Per CPU / GPU batch size
    per_local_device_batch = (
        args.batch_size //
        len(args.gpu_devices)) if args.gpu_devices else args.batch_size
    # Total batch size (over all the devices)
    global_batch_size = args.batch_size * args.num_shards
    # Number of epoch iterations
    epoch_iters = args.epoch_size // global_batch_size
    # Adjust the true number of examples per epoch
    args.epoch_size = global_batch_size * epoch_iters

    if args.training_data:
        log.info("Running experiments with user provided data: %s",
                 args.training_data)

        # Create a reader, which can also help distribute data when running on multiple nodes
        reader = evaluation_model.CreateDB(
            "reader",
            db=args.training_data,
            db_type=args.db_type,
            num_shards=args.num_shards,
            shard_id=args.shard_id,
        )

        def image_input(model):
            AddImageInput(model, reader, per_local_device_batch,
                          min(args.height, args.width), args.data_type,
                          args.use_cpu)
    else:
        input_shape = [args.batch_size, args.channels, args.height, args.width]
        log.info("Running experiments with synthetic data w/ shape: %s",
                 input_shape)

        def image_input(model):
            AddSyntheticInput(model, args.data_type, input_shape,
                              args.num_labels)

    # Create the network, and normalize the loss
    def create_model(model, loss_scale):
        initializer = (PseudoFP16Initializer
                       if args.data_type == 'float16' else Initializer)

        with brew.arg_scope([brew.conv, brew.fc],
                            WeightInitializer=initializer,
                            BiasInitializer=initializer,
                            enable_tensor_core=False,
                            float16_compute=False):
            pred = resnet.create_resnet50(
                model,
                "data",
                num_input_channels=args.channels,
                num_labels=args.num_labels,
                # num_groups=args.resnext_num_groups,
                # num_width_per_group=args.resnext_width_per_group,
                no_bias=True,
                no_loss=True)

        # If we're using 2-byte floats (float16), convert the predictions back
        # to the 4-byte float representation
        if args.data_type == 'float16':
            pred = model.net.HalfToFloat(pred, pred + '_fp32')

        # Compute the softmax probabilities and the loss
        softmax, loss = model.SoftmaxWithLoss([pred, 'label'],
                                              ['softmax', 'loss'])

        # Normalize the loss, and compute the top-k accuracies for k in {1, 5}
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy", top_k=1)
        brew.accuracy(model, [softmax, "label"], "accuracy_top5", top_k=5)
        return [loss]

    def add_optimizer(model):
        """
        Optimizer function called once for the entire model, as opposed to once
        for each CPU / GPU individually. The optimizer is SGD with momentum,
        weight decay, and a stepwise learning-rate schedule.

        :return: the optimizer
        """
        stepsz = int(30 * args.epoch_size / args.batch_size / args.num_shards)
        stepsz = stepsz if stepsz else 100

        optimizer.add_weight_decay(model, 1e-4)

        opt = optimizer.build_sgd(model,
                                  args.base_learning_rate,
                                  momentum=0.9,
                                  nesterov=1,
                                  policy="step",
                                  stepsize=stepsz,
                                  gamma=0.1)
        return opt

    def add_parameter_update(model):
        """
        Add a simple gradient based parameter update with stepwise adaptive learning rate.
        """
        # This counts the number of iterations we have run
        ITER = brew.iter(model, "iter")
        # Add an LR blob to the model, decayed with a simple step policy every
        # 1000 iterations; gamma is the decay factor
        LR = model.LearningRate(ITER,
                                "LR",
                                base_lr=-args.base_learning_rate,
                                policy="step",
                                stepsize=1000,
                                gamma=0.999)
        # This is a constant used in the following loop
        ONE = model.param_init_net.ConstantFill([],
                                                "ONE",
                                                shape=[1],
                                                value=1.0)
        # Here we are essentially applying the gradients to the weights (using the classical method)
        for param in model.params:
            param_grad = model.param_to_grad[param]
            model.WeightedSum([param, ONE, param_grad, LR], param)
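        # The WeightedSum above computes param <- 1.0 * param + LR * param_grad;
        # since base_lr is passed in negated, this is plain SGD:
        #   param <- param - lr(t) * grad, with lr(t) decayed by the step policy.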

    def add_post_sync_ops(model):
        """
        Add ops applied after initial parameter sync.
        """
        for param_info in model.GetOptimizationParamInfo(model.GetParams()):
            if param_info.blob_copy is not None:
                model.param_init_net.HalfToFloat(
                    param_info.blob, param_info.blob_copy[core.DataType.FLOAT])

    if args.num_shards > 1:
        log.info("Distributed benchmarking is enabled")
        log.info("Num shards: %d", args.num_shards)
        log.info("My shard ID: %d", args.shard_id)
        if args.redis_host:
            log.info("Using Redis server at %s:%d", args.redis_host,
                     args.redis_port)
        else:
            log.info("Rendevous at: %s", args.rendezvous_path)

        # Prepare the required parameters for distribution
        store_handler = "store_handler"

        # We'll use the shared file system for rendezvous
        if args.redis_host:
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate",
                    [],
                    [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                ))
        else:
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate",
                    [],
                    [store_handler],
                    path=args.rendezvous_path,
                    prefix=args.run_id,
                ))

        rendezvous = dict(kv_handler=store_handler,
                          shard_id=args.shard_id,
                          num_shards=args.num_shards,
                          engine="GLOO",
                          transport=args.distributed_transport,
                          interface=args.network_interface,
                          exit_nets=None)

        # Parallelize the model (data parallel)
        data_parallel_model.Parallelize(
            evaluation_model,
            input_builder_fun=image_input,
            forward_pass_builder_fun=create_model,
            optimizer_builder_fun=None if not args.backward else
            (add_optimizer if not args.per_device_optimization else None),
            param_update_builder_fun=None if not args.backward else
            (add_parameter_update if args.per_device_optimization else None),
            post_sync_builder_fun=add_post_sync_ops
            if args.post_sync else None,
            devices=(args.gpu_devices if not args.use_cpu else [0]),
            rendezvous=rendezvous,
            # Although this is a parameter (broadcast params) of this function, it is
            # currently not implemented in Caffe2's source code
            broadcast_computed_params=args.broadcast_params,
            optimize_gradient_memory=args.optimize_gradient_memory,
            dynamic_memory_management=args.dynamic_memory_management,
            max_concurrent_distributed_ops=args.max_distributed_ops,
            num_threads_per_device=args.max_threads,
            use_nccl=args.use_nccl,
            cpu_device=args.use_cpu,
            ideep=args.use_ideep,
            shared_model=args.shared_model,
            combine_spatial_bn=args.use_cpu,
        )

        if args.backward:
            data_parallel_model.OptimizeGradientMemory(evaluation_model, {},
                                                       set(), False)

        instantiate_and_create_net(evaluation_model)

        # If we're testing for the time it takes to reach a particular accuracy, then we'll need to create
        # a new model just for this
        if args.test_accuracy:
            # Test for the existence of testing data
            assert args.testing_data, "We must have testing data if we're measuring the time to accuracy"

            log.info("We're running time to test accuracy")
            log.info("The accuracy we're looking for: %f",
                     args.target_accuracy)
            log.info("Testing data provided in: %s", args.testing_data)

            # Create the model
            if args.use_ideep:
                test_arg_scope = {
                    'use_cudnn': False,
                    'cudnn_exhaustive_search': False,
                }
            else:
                test_arg_scope = {
                    'order': 'NCHW',
                    'use_cudnn': True,
                    'cudnn_exhaustive_search': True,
                }

            accuracy_time_model = model_helper.ModelHelper(
                name='accuracy_time_model',
                arg_scope=test_arg_scope,
                init_params=False)

            # Create the input function
            # Create a reader, which can also help distribute data when running on multiple nodes
            test_reader = accuracy_time_model.CreateDB("test_reader",
                                                       db=args.testing_data,
                                                       db_type=args.db_type)

            def test_image_input(model):
                AddImageInput(model,
                              test_reader,
                              per_local_device_batch,
                              min(args.height, args.width),
                              args.data_type,
                              args.use_cpu,
                              is_test=True)

            # Create the test model per se
            data_parallel_model.Parallelize(
                accuracy_time_model,
                input_builder_fun=test_image_input,
                forward_pass_builder_fun=create_model,
                post_sync_builder_fun=add_post_sync_ops
                if args.post_sync else None,
                param_update_builder_fun=None,
                devices=(args.gpu_devices if not args.use_cpu else [0]),
                cpu_device=args.use_cpu)

            instantiate_and_create_net(accuracy_time_model)
    else:
        print("Single node benchmarking is enabled")

        # Build the training model
        if args.use_cpu:
            image_input(evaluation_model)
            create_model(evaluation_model, 1.0)
            if args.backward:
                evaluation_model.AddGradientOperators(["loss"])
                add_parameter_update(evaluation_model)
        else:
            # We're running this on a single GPU on a single node, so create the net under the GPU's net
            with core.DeviceScope(
                    core.DeviceOption(caffe2_pb2.CUDA, args.gpu_devices[0])):
                image_input(evaluation_model)
                create_model(evaluation_model, 1.0)
                if args.backward:
                    evaluation_model.AddGradientOperators(["loss"])
                    add_parameter_update(evaluation_model)

        instantiate_and_create_net(evaluation_model)

        if args.test_accuracy:
            # Test for the existence of testing data
            assert args.testing_data, "We must have testing data if we're measuring the time to accuracy"

            log.info("We're running time to test accuracy")
            log.info("The accuracy we're looking for: %f",
                     args.target_accuracy)
            log.info("Testing data provided in: %s", args.testing_data)

            # Create the model
            if args.use_ideep:
                test_arg_scope = {
                    'use_cudnn': False,
                    'cudnn_exhaustive_search': False,
                }
            else:
                test_arg_scope = {
                    'order': 'NCHW',
                    'use_cudnn': True,
                    'cudnn_exhaustive_search': True,
                }

            accuracy_time_model = model_helper.ModelHelper(
                name='accuracy_time_model',
                arg_scope=test_arg_scope,
                init_params=False)

            # Create the input function
            # Create a reader, which can also help distribute data when running on multiple nodes
            test_reader = accuracy_time_model.CreateDB("test_reader",
                                                       db=args.testing_data,
                                                       db_type=args.db_type)

            def test_image_input(model):
                AddImageInput(model,
                              test_reader,
                              per_local_device_batch,
                              min(args.height, args.width),
                              args.data_type,
                              args.use_cpu,
                              is_test=True)

            # Create the test model per se
            test_image_input(accuracy_time_model)
            create_model(accuracy_time_model, 1.0)

            instantiate_and_create_net(accuracy_time_model)

    if not args.test_accuracy:
        workspace.BenchmarkNet(evaluation_model.net.Proto().name,
                               args.warmup_rounds, args.eval_rounds,
                               args.per_layer_eval)
    else:
        # Create a log for time to accuracy testing
        expname = "time_to_acc_model_%s_gpu%d_b%d_L%d_lr%.2f_shard%d" % (
            args.model_name, len(args.gpu_devices) if not args.use_cpu else 1,
            args.batch_size, args.num_labels, args.base_learning_rate,
            args.shard_id)

        explog = experiment_util.ModelTrainerLog(expname, args)

        # Run the epochs
        elapsed_training_time = 0.0
        for i in range(args.epoch_count):
            elapsed_training_time, on_target = RunEpoch(
                args, i, evaluation_model, accuracy_time_model, explog,
                elapsed_training_time)

            if args.terminate_on_target and on_target:
                log.info("Have reached the target accuracy: {} in {} seconds.".
                         format(args.target_accuracy, elapsed_training_time))
                break