    def test_equiv(self):
        '''
        Test that the model produces exactly the same results for a given
        total batch size, independent of the number of GPUs.
        '''
        for gpu in [True, False]:
            if gpu and (not workspace.has_gpu_support
                        or workspace.NumCudaDevices() < 2):
                continue
            result_2gpus = self.run_model([0, 1], gpu=gpu)
            result_1gpus = self.run_model([0], gpu=gpu)

            self.assertTrue(np.allclose(result_1gpus, result_2gpus))

            if not gpu or workspace.NumCudaDevices() >= 4:
                result_4gpus = self.run_model(list(range(4)), gpu=gpu)
                self.assertTrue(np.allclose(result_1gpus, result_4gpus))

            if not gpu or workspace.NumCudaDevices() >= 8:
                result_8gpus = self.run_model(list(range(8)), gpu=gpu)
                self.assertTrue(np.allclose(result_1gpus, result_8gpus))

            if not gpu or workspace.NumCudaDevices() >= 16:
                result_16gpus = self.run_model(list(range(16)), gpu=gpu)
                self.assertTrue(np.allclose(result_1gpus, result_16gpus))
Example n. 2
    def test_sparse_shared_indices_gpu(self):
        '''
            Test that the model has the same number of indices and gradient
            rows for a given total batch size, independent of the number of
            GPUs.
        '''
        V = 10000
        self.run_model(V, [0, 1])
        self.run_model(V, [0])

        if workspace.NumCudaDevices() >= 4:
            self.run_model(V, list(range(4)))

        if workspace.NumCudaDevices() >= 8:
            self.run_model(V, list(range(8)))
Example n. 3
    def get_device_option(gpu=None):
        """Constructs `core.DeviceOption` object

        :param int gpu: Identifier of GPU to use or None for CPU.
        :return: Instance of `core.DeviceOption`.
        """
        dev_opt = None
        if gpu is None:
            dev_opt = core.DeviceOption(caffe2_pb2.CPU)
        else:
            assert workspace.has_gpu_support, "Workspace does not support GPUs"
            assert gpu >= 0 and gpu < workspace.NumCudaDevices(),\
                   "Workspace does not provide this gpu (%d). "\
                   "Number of GPUs is %d" % (gpu, workspace.NumCudaDevices())
            dev_opt = core.DeviceOption(caffe2_pb2.CUDA, gpu)
        return dev_opt
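
A minimal usage sketch for get_device_option, not part of the original example: it treats the helper as a module-level function and assumes numpy plus the core/workspace imports shown elsewhere in these snippets; the blob names are illustrative.

# Hypothetical usage: feed one blob on CPU and, if a GPU is present, one on GPU 0.
import numpy as np

x = np.ones(4, dtype=np.float32)
workspace.FeedBlob('x_cpu', x, device_option=get_device_option())
if workspace.has_gpu_support and workspace.NumCudaDevices() > 0:
    workspace.FeedBlob('x_gpu', x, device_option=get_device_option(0))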
Example n. 4
def _test_reshape(old_shape,
                  new_shape,
                  expected_shape=None,
                  arg_shape=True,
                  in_place=False):
    devices = [core.DeviceOption(caffe2_pb2.CPU, 0)]
    if workspace.NumCudaDevices() > 0:
        devices.append(core.DeviceOption(caffe2_pb2.CUDA, 0))

    for device_opt in devices:
        with core.DeviceScope(device_opt):
            if expected_shape is None:
                expected_shape = new_shape
            X = np.random.rand(*old_shape).astype(np.float32)

            blob_in = 'X'
            blob_out = blob_in if in_place else blob_in + '_out'

            if arg_shape:
                op = core.CreateOperator('Reshape', [blob_in],
                                         [blob_out, 'old_shape'],
                                         shape=new_shape)
            else:
                op = core.CreateOperator('Reshape', [blob_in, 'new_shape'],
                                         [blob_out, 'old_shape'])
                workspace.FeedBlob('new_shape', np.asarray(new_shape))

            workspace.FeedBlob(blob_in, X)
            workspace.RunOperatorOnce(op)

            Y = workspace.FetchBlob(blob_out)
            np.testing.assert_allclose(Y, X.reshape(expected_shape))
Example n. 5
    def test_shift_gpu(self):
        model = self.create_model()
        data_parallel_model_utils.ShiftActivationDevices(
            model,
            activations=["fc4", "fc5"],
            shifts={
                0: 4,
                1: 4,
                2: 5,
                3: 5
            },
        )
        for op in model.param_init_net.Proto().op:
            for outp in op.output:
                prefix = outp.split("/")[0]
                if outp.split("/")[-1] in set(
                    ['fc4_w', 'fc5_w', 'fc4_b', 'fc5_b']):
                    if prefix == 'gpu_0' or prefix == 'gpu_1':
                        self.assertEqual(op.device_option.cuda_gpu_id, 4)
                    else:
                        self.assertEqual(op.device_option.cuda_gpu_id, 5)
                if outp.split("/")[-1] in set(
                    ['fc1_w', 'fc2_w', 'fc3_b', 'fc3_w']):
                    gpu_id = int(prefix.split("_")[-1])
                    self.assertEqual(gpu_id, op.device_option.cuda_gpu_id)

        # Test that we can run the net
        if workspace.NumCudaDevices() >= 6:
            workspace.RunNetOnce(model.param_init_net)
            workspace.CreateNet(model.net)
            workspace.RunNet(model.net.Proto().name)
Example n. 6
    def test_prepend_dim(self):
        devices = [core.DeviceOption(caffe2_pb2.CPU, 0)]
        if workspace.NumCudaDevices() > 0:
            devices.append(core.DeviceOption(caffe2_pb2.CUDA, 0))

        for device_opt in devices:
            with core.DeviceScope(device_opt):
                self._test_fwd_bwd()
Example n. 7
    def test_equiv(self):
        '''
        Test that the model produces exactly the same results for a given
        total batch size, independent of the number of GPUs.
        '''
        result_2gpus = self.run_model([0, 1])
        result_1gpus = self.run_model([0])

        self.assertTrue(np.allclose(result_1gpus, result_2gpus))

        if workspace.NumCudaDevices() >= 4:
            result_4gpus = self.run_model(range(4))
            self.assertTrue(np.allclose(result_1gpus, result_4gpus))

        if workspace.NumCudaDevices() >= 8:
            result_8gpus = self.run_model(range(8))
            self.assertTrue(np.allclose(result_1gpus, result_8gpus))
Example n. 8
class CudaProfileOpsTest(unittest.TestCase):
    @unittest.skipIf(workspace.NumCudaDevices() < 1, "Need at least 1 GPU")
    def test_run(self):
        net = core.Net("net")
        net.CudaProfileInitialize([], [], output="/tmp/cuda_profile_test")
        net.CudaProfileStart([], [])
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
            net.ConstantFill([], ["out"], shape=[1, 3, 244, 244])
        net.CudaProfileStop([], [])

        workspace.CreateNet(net)
        workspace.RunNet(net)
Example n. 9
    def _test_equiv_sparse(self, cpu_indices):
        '''
            Test that the model produces exactly the same results for a given
            total batch size, independent of the number of GPUs.
        '''
        V = 10000
        result_2gpus = self.run_model(V, [0, 1], cpu_indices)
        result_1gpus = self.run_model(V, [0], cpu_indices)

        self.assertTrue(np.allclose(result_1gpus[0], result_2gpus[0]))
        self.assertTrue(np.allclose(result_1gpus[1], result_2gpus[1]))

        if workspace.NumCudaDevices() >= 4:
            result_4gpus = self.run_model(V, range(4), cpu_indices)
            self.assertTrue(np.allclose(result_1gpus[0], result_4gpus[0]))
            self.assertTrue(np.allclose(result_1gpus[1], result_4gpus[1]))

        if workspace.NumCudaDevices() >= 8:
            result_8gpus = self.run_model(V, range(8), cpu_indices)
            self.assertTrue(np.allclose(result_1gpus[0], result_8gpus[0]))
            self.assertTrue(np.allclose(result_1gpus[1], result_8gpus[1]))
Example n. 10
    def test_executor(self, executor, num_workers):
        model = build_resnet50_dataparallel_model(
            num_gpus=workspace.NumCudaDevices(), batch_size=8, epoch_size=8)
        model.Proto().num_workers = num_workers

        def run_model():
            run_resnet50_epoch(model, batch_size=8, epoch_size=8)

        self.compare_executors(
            model,
            ref_executor="simple",
            test_executor=executor,
            model_run_func=run_model,
        )
Example n. 11
    def test_timings(self):
        for n in range(2, workspace.NumCudaDevices()):
            for in_place in [False, True]:
                xs = [np.random.randn(int(1e7)).astype(np.float32)
                      for i in range(n)]
                inputs = [str("x_{}".format(i)) for i in range(n)]
                prefix = "" if in_place else "o"
                outputs = [str("{}x_{}".format(prefix, i)) for i in range(n)]

                net = core.Net("test")
                net.NCCLAllreduce(inputs, outputs)
                net.RunAllOnGPU()
                for i in range(n):
                    self.ws.create_blob(inputs[i]).feed(xs[i], gpu_device(i))
                self.ws.run(net)
                net_time = benchmark(self.ws, net)
                vanilla = core.Net("vanilla")
                muji.Allreduce(vanilla, inputs)
                vanilla_time = benchmark(self.ws, vanilla)
                print("Speedup for NCCL: {:.2f}".format(
                    vanilla_time / net_time))
Example n. 12
def getArgs():
    """Return command-line arguments."""
    CURDIR = os.path.dirname(__file__)
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--train-lmdb',
                        help='Path to training LMDB',
                        default=os.path.join(CURDIR, 'cifar10_train_lmdb'))
    parser.add_argument('--test-lmdb',
                        help='Path to test LMDB',
                        default=os.path.join(CURDIR, 'cifar10_test_lmdb'))
    parser.add_argument('--dtype',
                        choices=['float', 'float16'],
                        default='float',
                        help='Data type used for training')
    parser.add_argument('--gpus',
                        help='Comma separated list of GPU devices to use')
    parser.add_argument('--num_gpus',
                        type=int,
                        default=1,
                        help='Number of GPU devices (instead of --gpus)')
    parser.add_argument('--all-gpus',
                        action='store_true',
                        help='Use all GPUs in the system')
    args = parser.parse_args()

    args.dtype = (DataType.FLOAT16
                  if args.dtype == 'float16' else DataType.FLOAT)

    if args.all_gpus:
        args.num_gpus = workspace.NumCudaDevices()
        args.gpus = list(range(args.num_gpus))
    else:
        if args.gpus is not None:
            args.gpus = [int(x) for x in args.gpus.split(',')]
            args.num_gpus = len(args.gpus)
        else:
            args.gpus = list(range(args.num_gpus))
    return args
Example n. 13
    def test_timings(self):
        for n in range(2, workspace.NumCudaDevices()):
            for in_place in [False, True]:
                xs = [
                    np.random.randn(int(1e7)).astype(np.float32) for i in range(n)
                ]
                inputs = [str("x_{}".format(i)) for i in range(n)]
                prefix = "" if in_place else "o"
                outputs = [str("{}x_{}".format(prefix, i)) for i in range(n)]

                net = core.Net("test")
                net.NCCLAllreduce(inputs, outputs)
                net.RunAllOnGPU()
                for i in range(n):
                    workspace.FeedBlob(inputs[i], xs[i],
                                       gpu_device(i).SerializeToString())
                workspace.RunNetOnce(net.Proto().SerializeToString())
                net_time = benchmark(net)
                vanilla = core.Net("vanilla")
                muji.Allreduce(vanilla, inputs)
                vanilla_time = benchmark(vanilla)
                print("Speedup for NCCL: {:.2f}".format(vanilla_time /
                                                        net_time))
Example n. 14
from __future__ import unicode_literals
import errno
import hypothesis.strategies as st
from hypothesis import given
import numpy as np
import os
import shutil
import tempfile
import unittest

from caffe2.proto import caffe2_pb2
from caffe2.python import core, test_util, workspace

if workspace.has_gpu_support:
    DEVICES = [caffe2_pb2.CPU, caffe2_pb2.CUDA]
    max_gpuid = workspace.NumCudaDevices() - 1
else:
    DEVICES = [caffe2_pb2.CPU]
    max_gpuid = 0


# Utility class for other loading tests; don't add test functions here.
# Inherit from this class instead: any test added here would be inherited
# by every derived class and cause test duplication.
class TestLoadSaveBase(test_util.TestCase):

    def __init__(self, methodName, db_type='minidb'):
        super(TestLoadSaveBase, self).__init__(methodName)
        self._db_type = db_type

    @given(src_device_type=st.sampled_from(DEVICES),
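
The snippet above is truncated, but the comment spells out the intended pattern: concrete test cases derive from TestLoadSaveBase rather than adding tests to it. A minimal sketch of such a derived class follows; the class name, db_type, and test body are illustrative assumptions, not part of the original file.

# Hypothetical derived class: reuses the shared machinery from TestLoadSaveBase
# and pins a concrete db type; only derived classes define test_* methods.
class TestLoadSaveLMDB(TestLoadSaveBase):
    def __init__(self, methodName='runTest'):
        super(TestLoadSaveLMDB, self).__init__(methodName, db_type='lmdb')

    def test_db_type_is_set(self):
        self.assertEqual(self._db_type, 'lmdb')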
Example n. 15
def Train(args):
    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # Verify valid batch size
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus
    assert \
        total_batch_size % num_gpus == 0, \
        "Number of GPUs must divide batch size"

    # Round down epoch size to closest multiple of batch size across machines
    global_batch_size = total_batch_size * args.num_shards
    epoch_iters = int(args.epoch_size / global_batch_size)

    assert \
        epoch_iters > 0, \
        "Epoch size must be larger than batch size times shard count"

    args.epoch_size = epoch_iters * global_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    # Create ModelHelper object
    train_arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        'cudnn_exhaustive_search': True,
        'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024),
    }
    train_model = model_helper.ModelHelper(name="resnet50",
                                           arg_scope=train_arg_scope)

    num_shards = args.num_shards
    shard_id = args.shard_id

    # Expect interfaces to be comma separated.
    # Use of multiple network interfaces is not yet complete,
    # so simply use the first one in the list.
    interfaces = args.distributed_interfaces.split(",")

    # Rendezvous using MPI when run with mpirun
    if os.getenv("OMPI_COMM_WORLD_SIZE") is not None:
        num_shards = int(os.getenv("OMPI_COMM_WORLD_SIZE", 1))
        shard_id = int(os.getenv("OMPI_COMM_WORLD_RANK", 0))
        if num_shards > 1:
            rendezvous = dict(kv_handler=None,
                              num_shards=num_shards,
                              shard_id=shard_id,
                              engine="GLOO",
                              transport=args.distributed_transport,
                              interface=interfaces[0],
                              mpi_rendezvous=True,
                              exit_nets=None)

    elif num_shards > 1:
        # Create rendezvous for distributed computation
        store_handler = "store_handler"
        if args.redis_host is not None:
            # Use Redis for rendezvous if Redis host is specified
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate",
                    [],
                    [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                ))
        else:
            # Use filesystem for rendezvous otherwise
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate",
                    [],
                    [store_handler],
                    path=args.file_store_path,
                    prefix=args.run_id,
                ))

        rendezvous = dict(kv_handler=store_handler,
                          shard_id=shard_id,
                          num_shards=num_shards,
                          engine="GLOO",
                          transport=args.distributed_transport,
                          interface=interfaces[0],
                          exit_nets=None)

    else:
        rendezvous = None

    # Model building functions
    # def create_resnet50_model_ops(model, loss_scale):
    #     initializer = (PseudoFP16Initializer if args.dtype == 'float16'
    #                    else Initializer)

    #     with brew.arg_scope([brew.conv, brew.fc],
    #                         WeightInitializer=initializer,
    #                         BiasInitializer=initializer,
    #                         enable_tensor_core=args.enable_tensor_core,
    #                         float16_compute=args.float16_compute):
    #         pred = resnet.create_resnet50(
    #             #args.layers,
    #             model,
    #             "data",
    #             num_input_channels=args.num_channels,
    #             num_labels=args.num_labels,
    #             no_bias=True,
    #             no_loss=True,
    #         )

    #     if args.dtype == 'float16':
    #         pred = model.net.HalfToFloat(pred, pred + '_fp32')

    #     softmax, loss = model.SoftmaxWithLoss([pred, 'label'],
    #                                           ['softmax', 'loss'])
    #     loss = model.Scale(loss, scale=loss_scale)
    #     brew.accuracy(model, [softmax, "label"], "accuracy")
    #     return [loss]

    def create_model_ops(model, loss_scale):
        return create_model_ops_testable(model, loss_scale, is_test=False)

    def create_model_ops_test(model, loss_scale):
        return create_model_ops_testable(model, loss_scale, is_test=True)

    # Model building functions
    def create_model_ops_testable(model, loss_scale, is_test=False):
        initializer = (PseudoFP16Initializer
                       if args.dtype == 'float16' else Initializer)

        with brew.arg_scope([brew.conv, brew.fc],
                            WeightInitializer=initializer,
                            BiasInitializer=initializer,
                            enable_tensor_core=args.enable_tensor_core,
                            float16_compute=args.float16_compute):

            if args.model == "cifar10":
                if args.image_size != 32:
                    log.warn("Cifar10 expects a 32x32 image.")
                pred = models.cifar10.create_cifar10(
                    model,
                    "data",
                    image_channels=args.num_channels,
                    num_classes=args.num_labels,
                    image_height=args.image_size,
                    image_width=args.image_size,
                )
            elif args.model == "resnet32x32":
                if args.image_size != 32:
                    log.warn("ResNet32x32 expects a 32x32 image.")
                pred = models.resnet.create_resnet32x32(
                    model,
                    "data",
                    num_layers=args.num_layers,
                    num_input_channels=args.num_channels,
                    num_labels=args.num_labels,
                    is_test=is_test)
            elif args.model == "resnet":
                if args.image_size != 224:
                    log.warn(
                        "ResNet expects a 224x224 image. input image = %d" %
                        args.image_size)
                pred = resnet.create_resnet50(
                    #args.layers,
                    model,
                    "data",
                    num_input_channels=args.num_channels,
                    num_labels=args.num_labels,
                    no_bias=True,
                    no_loss=True,
                )
            elif args.model == "vgg":
                if args.image_size != 224:
                    log.warn("VGG expects a 224x224 image.")
                pred = vgg.create_vgg(model,
                                      "data",
                                      num_input_channels=args.num_channels,
                                      num_labels=args.num_labels,
                                      num_layers=args.num_layers,
                                      is_test=is_test)
            elif args.model == "googlenet":
                if args.image_size != 224:
                    log.warn("GoogLeNet expects a 224x224 image.")
                pred = googlenet.create_googlenet(
                    model,
                    "data",
                    num_input_channels=args.num_channels,
                    num_labels=args.num_labels,
                    is_test=is_test)
            elif args.model == "alexnet":
                if args.image_size != 224:
                    log.warn("Alexnet expects a 224x224 image.")
                pred = alexnet.create_alexnet(
                    model,
                    "data",
                    num_input_channels=args.num_channels,
                    num_labels=args.num_labels,
                    is_test=is_test)
            elif args.model == "alexnetv0":
                if args.image_size != 224:
                    log.warn("Alexnet v0 expects a 224x224 image.")
                pred = alexnet.create_alexnetv0(
                    model,
                    "data",
                    num_input_channels=args.num_channels,
                    num_labels=args.num_labels,
                    is_test=is_test)
            else:
                raise NotImplementedError("Network {} not found.".format(
                    args.model))

        if args.dtype == 'float16':
            pred = model.net.HalfToFloat(pred, pred + '_fp32')

        softmax, loss = model.SoftmaxWithLoss([pred, 'label'],
                                              ['softmax', 'loss'])
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy")
        return [loss]

    def add_optimizer(model):
        stepsz = int(30 * args.epoch_size / total_batch_size / num_shards)

        if args.float16_compute:
            # TODO: merge with multi-precision optimizer
            opt = optimizer.build_fp16_sgd(
                model,
                args.base_learning_rate,
                momentum=0.9,
                nesterov=1,
                weight_decay=args.weight_decay,  # weight decay included
                policy="step",
                stepsize=stepsz,
                gamma=0.1)
        else:
            optimizer.add_weight_decay(model, args.weight_decay)
            opt = optimizer.build_multi_precision_sgd(model,
                                                      args.base_learning_rate,
                                                      momentum=0.9,
                                                      nesterov=1,
                                                      policy="step",
                                                      stepsize=stepsz,
                                                      gamma=0.1)
            print("info:===============================" + str(opt))
        return opt

    # Define add_image_input function.
    # Depends on the "train_data" argument.
    # Note that the reader will be shared between all GPUs.
    if args.train_data == "null":

        def add_image_input(model):
            AddNullInput(
                model,
                None,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
            )
    else:
        reader = train_model.CreateDB(
            "reader",
            db=args.train_data,
            db_type=args.db_type,
            num_shards=num_shards,
            shard_id=shard_id,
        )

        def add_image_input(model):
            AddImageInput(
                model,
                reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=False,
            )

    def add_post_sync_ops(model):
        """Add ops applied after initial parameter sync."""
        for param_info in model.GetOptimizationParamInfo(model.GetParams()):
            if param_info.blob_copy is not None:
                model.param_init_net.HalfToFloat(
                    param_info.blob, param_info.blob_copy[core.DataType.FLOAT])

    # Create parallelized model
    data_parallel_model.Parallelize(train_model,
                                    input_builder_fun=add_image_input,
                                    forward_pass_builder_fun=create_model_ops,
                                    optimizer_builder_fun=add_optimizer,
                                    post_sync_builder_fun=add_post_sync_ops,
                                    devices=gpus,
                                    rendezvous=rendezvous,
                                    optimize_gradient_memory=False,
                                    cpu_device=args.use_cpu,
                                    shared_model=args.use_cpu,
                                    combine_spatial_bn=args.use_cpu,
                                    use_nccl=args.use_nccl)

    if args.model_parallel:
        # Shift half of the activations to another GPU
        assert workspace.NumCudaDevices() >= 2 * args.num_gpus
        activations = data_parallel_model_utils.GetActivationBlobs(train_model)
        data_parallel_model_utils.ShiftActivationDevices(
            train_model,
            activations=activations[len(activations) // 2:],
            shifts={g: args.num_gpus + g
                    for g in range(args.num_gpus)},
        )

    data_parallel_model.OptimizeGradientMemory(train_model, {}, set(), False)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    if "GLOO_ALGORITHM" in os.environ and os.environ[
            "GLOO_ALGORITHM"] == "PHUB":
        # We need to communicate to PHub the elements that need aggregation,
        # as well as their sizes.
        # At this stage, all we need are the key names and our key ID.
        grad_names = list(reversed(train_model._grad_names))
        phubKeyNames = ["allreduce_{}_status".format(x) for x in grad_names]
        caffe2GradSizes = dict(
            zip([
                data_parallel_model.stripBlobName(name) + "_grad"
                for name in train_model._parameters_info.keys()
            ], [x.size for x in train_model._parameters_info.values()]))
        phubKeySizes = [str(caffe2GradSizes[x]) for x in grad_names]
        if rendezvous["shard_id"] == 0:
            # Only shard id 0 needs to send to the rendezvous.
            r = redis.StrictRedis()
            # For each key, an ID needs to be assigned.
            joinedStr = ",".join(phubKeyNames)
            r.set("[PLink]IntegrationKeys", joinedStr)
            joinedStr = ",".join(phubKeySizes)
            r.set("[PLink]IntegrationKeySizes", joinedStr)

    # Add test model, if specified
    test_model = None
    if (args.test_data is not None):
        log.info("----- Create test net ----")
        test_arg_scope = {
            'order': "NCHW",
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
        }
        test_model = model_helper.ModelHelper(name="resnet50_test",
                                              arg_scope=test_arg_scope,
                                              init_params=False)

        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=True,
            )

        data_parallel_model.Parallelize(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_model_ops_test,
            post_sync_builder_fun=add_post_sync_ops,
            param_update_builder_fun=None,
            devices=gpus,
            cpu_device=args.use_cpu,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

    epoch = 0
    # load the pre-trained model and reset epoch
    if args.load_model_path is not None:
        LoadModel(args.load_model_path, train_model)

        # Sync the model params
        data_parallel_model.FinalizeAfterCheckpoint(train_model)

        # reset epoch. load_model_path should end with *_X.mdl,
        # where X is the epoch number
        last_str = args.load_model_path.split('_')[-1]
        if last_str.endswith('.mdl'):
            epoch = int(last_str[:-4])
            log.info("Reset epoch to {}".format(epoch))
        else:
            log.warning("The format of load_model_path doesn't match!")

    expname = "resnet50_gpu%d_b%d_L%d_lr%.2f_v2" % (
        args.num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )

    explog = experiment_util.ModelTrainerLog(expname, args)

    # Run the training one epoch a time
    while epoch < args.num_epochs:
        epoch = RunEpoch(args, epoch, train_model, test_model,
                         total_batch_size, num_shards, expname, explog)

        # Save the model for each epoch
        SaveModel(args, train_model, epoch)

        model_path = "%s/%s_" % (args.file_store_path, args.save_model_name)
        # remove the saved model from the previous epoch if it exists
        if os.path.isfile(model_path + str(epoch - 1) + ".mdl"):
            os.remove(model_path + str(epoch - 1) + ".mdl")
Example n. 16
from caffe2.python import (
    brew,
    core,
    device_checker,
    gradient_checker,
    model_helper,
    test_util,
    workspace,
)
from caffe2.python.gradient_checker import NetGradientChecker
from caffe2.python.net_builder import ops, NetBuilder
from caffe2.proto import caffe2_pb2

import unittest

if workspace.has_gpu_support and workspace.NumCudaDevices() > 0:
    gpu_device_option = caffe2_pb2.DeviceOption()
    gpu_device_option.device_type = caffe2_pb2.CUDA
    cpu_device_option = caffe2_pb2.DeviceOption()
    gpu_device_checker = device_checker.DeviceChecker(0.01,
                                                      [gpu_device_option])
    device_checker = device_checker.DeviceChecker(
        0.01, [gpu_device_option, cpu_device_option])
    gpu_gradient_checkers = [
        gradient_checker.GradientChecker(0.005, 0.05, gpu_device_option,
                                         "gpu_checker_ws"),
    ]
    gradient_checkers = [
        gradient_checker.GradientChecker(0.005, 0.05, gpu_device_option,
                                         "gpu_checker_ws"),
        gradient_checker.GradientChecker(0.01, 0.05, cpu_device_option,
Example n. 17
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import unittest
from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace, data_parallel_model, cnn, recurrent
from caffe2.python.test_util import TestCase


@unittest.skipIf(not workspace.has_gpu_support, "No gpu support.")
@unittest.skipIf(workspace.NumCudaDevices() < 2, "Need at least 2 GPUs.")
class GPUDataParallelModelTest(TestCase):
    def run_model(self, gpu_devices):
        '''
        Helper function for test_equiv
        '''
        def input_builder_fun(model):
            return None

        def model_build_fun(model, loss_scale):
            fc = model.FC("data", "fc", 16, 1, ("ConstantFill", {}),
                          ("ConstantFill", {}))
            fc_fl = model.FlattenToVec(fc, "fc_fl")
            sigm = model.Sigmoid(fc_fl, "sigm")
            sq = model.SquaredL2Distance([sigm, "label"], "sq")
            loss = model.AveragedLoss(sq, "loss")
            loss = model.Scale(loss, scale=loss_scale)
            return [loss]
Example n. 18
class CopyOpsTest(unittest.TestCase):
    def tearDown(self):
        # Reset workspace after each test
        # Otherwise, the multi-GPU test will use previously created tensors,
        #   which may have been placed on the wrong device
        workspace.ResetWorkspace()

    def run_test_copy_gradient(self, device_opt):
        model = model_helper.ModelHelper(name="copy_test")
        with core.DeviceScope(device_opt):
            x = model.net.AddExternalInputs("x")
            y = model.Copy(x, "y")
            loss = model.AveragedLoss(y, "loss")
            gradient_map = model.AddGradientOperators([loss])
            workspace.FeedBlob(x, np.random.rand(32).astype(np.float32))
            workspace.RunNetOnce(model.param_init_net)
            workspace.RunNetOnce(model.net)
            self.assertTrue(
                np.array_equal(
                    workspace.FetchBlob(x),
                    workspace.FetchBlob(y),
                ))
            self.assertTrue(
                np.array_equal(
                    workspace.FetchBlob(gradient_map[x]),
                    workspace.FetchBlob(gradient_map[y]),
                ))

    def test_copy_gradient_cpu(self):
        self.run_test_copy_gradient(core.DeviceOption(caffe2_pb2.CPU, 0))

    @unittest.skipIf(workspace.NumCudaDevices() < 1, "Need at least 1 GPU.")
    def test_copy_gradient_gpu(self):
        self.run_test_copy_gradient(core.DeviceOption(caffe2_pb2.CUDA, 0))

    @unittest.skipIf(workspace.NumCudaDevices() < 2, "Need at least 2 GPU.")
    def test_copy_gradient_multiple_gpus(self):
        model = model_helper.ModelHelper(name="copy_test")

        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)):
            x_cpu = model.net.AddExternalInputs("x_cpu")

        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
            x_gpu_1 = model.CopyCPUToGPU(x_cpu, "x_gpu_1")

        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 1)):
            x_gpu_2 = model.Copy(x_gpu_1, "x_gpu_2")
            loss = model.AveragedLoss(x_gpu_2, "loss")
            gradient_map = model.AddGradientOperators([loss])

        workspace.FeedBlob("x_cpu", np.random.rand(32).astype(np.float32))
        workspace.RunNetOnce(model.param_init_net)
        workspace.RunNetOnce(model.net)

        self.assertTrue(
            np.array_equal(
                workspace.FetchBlob("x_gpu_1"),
                workspace.FetchBlob("x_gpu_2"),
            ))
        self.assertTrue(
            np.array_equal(
                workspace.FetchBlob(gradient_map["x_gpu_1"]),
                workspace.FetchBlob(gradient_map["x_gpu_2"]),
            ))

        def get_op_with_output(model, output_blob_name):
            for op in model.net.Proto().op:
                if len(op.output) == 1 and op.output[0] == output_blob_name:
                    return op
            return None

        self.assertEqual(
            get_op_with_output(model, "x_gpu_2_grad").device_option,
            core.DeviceOption(caffe2_pb2.CUDA, 1),
        )
        self.assertEqual(
            get_op_with_output(model, "x_cpu_grad").device_option,
            core.DeviceOption(caffe2_pb2.CUDA, 0),
        )

    @unittest.skipIf(workspace.NumCudaDevices() < 1, "Need at least 1 GPU.")
    def test_cpu2gpu_gpu2cpu_sparse_gradients(self):
        model = model_helper.ModelHelper(name="copy_test")
        v = model.param_init_net.UniformFill([], ["v"], shape=[16, 4])
        indices = model.param_init_net.UniformFill([], ["v"], shape=[16, 4])
        cpu_opt = core.DeviceOption(caffe2_pb2.CPU, 0)
        gpu_opt = core.DeviceOption(caffe2_pb2.CUDA, 0)

        with core.DeviceScope(gpu_opt):
            vcpu = model.CopyGPUToCPU(v, "vcpu")

        with core.DeviceScope(cpu_opt):
            g = model.Gather([vcpu, indices], "g")

        with core.DeviceScope(gpu_opt):
            ggpu = model.CopyCPUToGPU(g, "ggpu")
            f = brew.fc(model, ggpu, "out", dim_in=4, dim_out=6)
            (softmax, loss) = model.SoftmaxWithLoss(
                [f, "label"],
                ["softmax", "loss"],
            )
        gradient_map = model.AddGradientOperators([loss])
        self.assertTrue("v" in gradient_map)
        self.assertTrue(isinstance(gradient_map['v'], core.GradientSlice))

    @unittest.skipIf(workspace.NumCudaDevices() < 1, "Need at least 1 GPU.")
    def test_cpu2gpu_gpu2cpu_gradients(self):
        model = model_helper.ModelHelper(name="copy_test")

        batch = 32
        cpu_opt = core.DeviceOption(caffe2_pb2.CPU, 0)
        gpu_opt = core.DeviceOption(caffe2_pb2.CUDA, 0)

        with core.NameScope("cpu"):
            with core.DeviceScope(cpu_opt):
                x_cpu = brew.fc(model, 'data', 'x_cpu', 16, 8)

        with core.NameScope("gpu_0"):
            with core.DeviceScope(gpu_opt):
                x_gpu = model.CopyCPUToGPU(x_cpu, "x_gpu")
                pred_gpu = brew.fc(model, x_gpu, "pred_gpu", 8, 4)
                pred_cpu = model.CopyGPUToCPU(pred_gpu, "pred_cpu")

        with core.DeviceScope(cpu_opt):
            with core.NameScope("cpu"):
                (softmax, loss) = model.SoftmaxWithLoss(
                    [pred_cpu, "label"],
                    ["softmax", "loss"],
                )

        gradient_map = model.AddGradientOperators([loss])

        # Add param updates (for cpu and gpu)
        init_net = model.param_init_net
        with core.DeviceScope(cpu_opt):
            with core.NameScope("cpu"):
                ONE = init_net.ConstantFill([], "ONE", shape=[1], value=1.)
                LR = init_net.ConstantFill([], "LR", shape=[1], value=-2.0)
                for param in model.GetParams():
                    model.WeightedSum(
                        [param, ONE, gradient_map[param], LR],
                        param,
                    )

        with core.NameScope("gpu_0"):
            with core.DeviceScope(gpu_opt):
                ONE = init_net.ConstantFill([], "ONE", shape=[1], value=1.)
                LR = init_net.ConstantFill([], "LR", shape=[1], value=-2.0)
                for param in model.GetParams():
                    model.WeightedSum(
                        [param, ONE, gradient_map[param], LR],
                        param,
                    )

        with core.DeviceScope(cpu_opt):
            workspace.FeedBlob(
                'cpu/data',
                np.random.rand(batch, 16).astype(np.float32),
            )
            workspace.FeedBlob(
                'cpu/label',
                np.random.randint(4, size=batch).astype(np.int32),
            )

        workspace.RunNetOnce(model.param_init_net)
        workspace.CreateNet(model.net)

        initial_params = {p: workspace.FetchBlob(p) for p in model.GetParams()}
        workspace.RunNet(model.net.Proto().name)
        updated_params = {p: workspace.FetchBlob(p) for p in model.GetParams()}

        for p in model.GetParams():
            g = gradient_map[p]
            expected = initial_params[p] - 2.0 * workspace.FetchBlob(g)
            actual = updated_params[p]
            self.assertTrue(
                np.array_equal(expected, updated_params[p]),
                "Mismatch: {}: {}, {}".format(p, expected, actual),
            )
Example n. 19
    for scoped_name, blob in restored_all_params.items():
        unscoped_name = c2_utils.UnscopeName(scoped_name)
        np.testing.assert_array_equal(blob, orig_gpu_0_params[unscoped_name])


if __name__ == '__main__':
    workspace.GlobalInit(['caffe2', '--caffe2_log_level=0'])
    logger = utils.logging.setup_logging(__name__)
    logger.setLevel(logging.DEBUG)
    logging.getLogger('roi_data.loader').setLevel(logging.INFO)
    np.random.seed(cfg.RNG_SEED)
    output_dir = tempfile.mkdtemp()
    # Generate config for test
    cfg.MODEL.TYPE = 'generalized_rcnn'
    cfg.MODEL.CONV_BODY = 'FPN.add_fpn_ResNet50_conv5_body'
    cfg.MODEL.NUM_CLASSES = 81
    cfg.MODEL.FASTER_RCNN = True
    cfg.FPN.FPN_ON = True
    cfg.FPN.MULTILEVEL_ROIS = True
    cfg.FPN.MULTILEVEL_RPN = True
    cfg.FAST_RCNN.ROI_BOX_HEAD = 'fast_rcnn_heads.add_roi_2mlp_head'
    cfg.FAST_RCNN.ROI_XFORM_METHOD = 'RoIAlign'
    cfg.OUTPUT_DIR = output_dir
    cfg.TRAIN.DATASETS = ('coco_2014_minival',)
    cfg.TRAIN.WEIGHTS = b''
    for num_gpu in range(workspace.NumCudaDevices()):
        cfg.immutable(False)
        cfg.NUM_GPUS = num_gpu + 1
        assert_and_infer_cfg()
        test_restore_checkpoint()
Example n. 20
def bmuf_process(filestore_dir, process_id, shared_results, nesterov=False):
    # We need to import caffe2 in every process to initialize CUDA independently.
    from caffe2.python import core, cnn, data_parallel_model, workspace, dyndep
    from caffe2.proto import caffe2_pb2
    dyndep.InitOpsLibrary("@/caffe2/caffe2/distributed:file_store_handler_ops")

    if not workspace.has_gpu_support:
        log.info('No GPU support; test is ignored.')
        return

    if workspace.NumCudaDevices() < 4:
        log.info('Not enough GPUs; test is ignored.')
        return

    model = cnn.CNNModelHelper(order="NHWC", name="test")

    gpu_ids = [0, 1] if process_id == 0 else [2, 3]

    def _model_build_fun(model, loss_scale):
        fc = model.FC("data", "fc", 16, 1, ("ConstantFill", {}),
                      ("ConstantFill", {}))
        fc_fl = model.FlattenToVec(fc, "fc_fl")
        sigm = model.Sigmoid(fc_fl, "sigm")
        sq = model.SquaredL2Distance([sigm, "label"], "sq")
        loss = model.AveragedLoss(sq, "loss")
        loss = model.Scale(loss, scale=loss_scale)

        # For testing explicit sync
        model.param_init_net.UniformFill([], ["sync_num"], shape=[1])
        return [loss]

    def _input_builder_fun(model):
        return None

    def _param_update_fun(model):
        ITER = model.Iter("ITER")
        LR = model.net.LearningRate(
            [ITER],
            "LR",
            base_lr=(-0.1),
            policy="fixed",
        )
        ONE = model.param_init_net.ConstantFill(
            [],
            "ONE",
            shape=[1],
            value=1.0,
        )
        for param in model.GetParams():
            grad = model.param_to_grad[param]
            model.WeightedSum([param, ONE, grad, LR], param)

    def _generate_data(gpu_devices, process_id):
        np.random.seed(26 + process_id * 10)
        # Each run has the same input, independent of the number of GPUs
        batch_size = 64
        for _ in range(0, 10):
            full_data = np.random.rand(batch_size, 16)
            full_labels = np.round(full_data[:, 0])
            batch_per_device = batch_size // len(gpu_devices)

            for (j, g) in enumerate(gpu_devices):
                st = j * batch_per_device
                en = st + batch_per_device
                data = full_data[st:en, :].astype(np.float32)
                labels = full_labels[st:en].astype(np.float32)
                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                    workspace.FeedBlob("gpu_{}/data".format(g), data)
                    workspace.FeedBlob("gpu_{}/label".format(g), labels)

    _generate_data(gpu_ids, process_id)

    workspace.RunOperatorOnce(
        core.CreateOperator("FileStoreHandlerCreate", [], ["store_handler"],
                            path=filestore_dir))
    rendezvous = dict(kv_handler="store_handler",
                      shard_id=process_id,
                      num_shards=2,
                      engine="GLOO",
                      exit_nets=None)

    data_parallel_model.Parallelize_GPU_BMUF(
        model,
        _input_builder_fun,
        _model_build_fun,
        _param_update_fun,
        devices=gpu_ids,
        rendezvous=rendezvous,
        nesterov=nesterov,
        add_blobs_to_sync=["sync_num"],
    )

    data_parallel_model.RunInitNet(model)

    def _gpu_pid(gpu_id, pid):
        if pid == 1:
            return gpu_id + 2
        return gpu_id

    np.testing.assert_equal(
        workspace.FetchBlob("gpu_{}/fc_w_v".format(_gpu_pid(0, process_id))),
        np.zeros(16).astype(np.float32).reshape(1, 16))

    # Run the algorithm for one iteration to have non-zero params.
    data_parallel_model.RunNet(model, 1)

    # Save iteration momentum and post local update params
    results = {}
    v_b_ = workspace.FetchBlob("gpu_{}/fc_b_v".format(_gpu_pid(0, process_id)))
    v_w_ = workspace.FetchBlob("gpu_{}/fc_w_v".format(_gpu_pid(0, process_id)))

    results['v_b_'] = v_b_
    results['v_w_'] = v_w_

    workspace.RunNetOnce(model.net)

    b_0_ = workspace.FetchBlob("gpu_{}/fc_b".format(_gpu_pid(0, process_id)))
    w_0_ = workspace.FetchBlob("gpu_{}/fc_w".format(_gpu_pid(0, process_id)))
    b_1_ = workspace.FetchBlob("gpu_{}/fc_b".format(_gpu_pid(1, process_id)))
    w_1_ = workspace.FetchBlob("gpu_{}/fc_w".format(_gpu_pid(1, process_id)))

    results['b_0_'] = b_0_
    results['w_0_'] = w_0_
    results['b_1_'] = b_1_
    results['w_1_'] = w_1_

    # Test sync
    if process_id == 0:
        workspace.FeedBlob(model._device_prefix + "_0/sync_num",
                           np.array([2603]).astype(np.float32),
                           device_option=core.DeviceOption(
                               model._device_type, 0))

    # Compute block gradients.
    b_g_ = workspace.FetchBlob("gpu_{}/fc_b_g".format(_gpu_pid(0, process_id)))
    w_g_ = workspace.FetchBlob("gpu_{}/fc_w_g".format(_gpu_pid(0, process_id)))
    results['b_g_'] = b_g_
    results['w_g_'] = w_g_
    workspace.RunNetOnce(model._global_model_param_updates_net)

    #  g_b = (b_0_ + b_1_) / 2 - b_g_
    #  g_w = (w_0_ + w_1_) / 2 - w_g_
    v_b = workspace.FetchBlob("gpu_{}/fc_b_v".format(_gpu_pid(0, process_id)))
    v_w = workspace.FetchBlob("gpu_{}/fc_w_v".format(_gpu_pid(0, process_id)))
    w_g = workspace.FetchBlob("gpu_{}/fc_w_g".format(_gpu_pid(0, process_id)))
    b_g = workspace.FetchBlob("gpu_{}/fc_b_g".format(_gpu_pid(0, process_id)))
    w_0 = workspace.FetchBlob("gpu_{}/fc_w".format(_gpu_pid(0, process_id)))
    b_0 = workspace.FetchBlob("gpu_{}/fc_b".format(_gpu_pid(0, process_id)))
    w_1 = workspace.FetchBlob("gpu_{}/fc_w".format(_gpu_pid(1, process_id)))
    b_1 = workspace.FetchBlob("gpu_{}/fc_b".format(_gpu_pid(1, process_id)))
    results['v_b'] = v_b
    results['v_w'] = v_w
    results['w_g'] = w_g
    results['b_g'] = b_g
    results['w_0'] = w_0
    results['b_0'] = b_0
    results['w_1'] = w_1
    results['b_1'] = b_1

    # Test add_blobs_to_sync
    for j in model._devices:
        sync = workspace.FetchBlob(model._device_prefix +
                                   "_{}/sync_num".format(j))[0]
        results['sync_{}'.format(j)] = sync

    shared_results[process_id] = results
Example n. 21
def Parallelize_GPU(
    model_helper_obj,
    input_builder_fun,
    forward_pass_builder_fun,
    param_update_builder_fun,
    devices=range(0, workspace.NumCudaDevices()),
    rendezvous=None,
    net_type='dag',
    broadcast_computed_params=True,
):
    '''
    Function to create a model that can run on many GPUs.
      model_helper_obj: an object of ModelHelperBase, such as CNNModelHelper
      input_builder_fun:
                         Function that adds the input operators
                         Note: Remember to instantiate the reader outside of
                         this function so all GPUs share the same reader object.
                         Signature:  input_builder_fun(model)
      forward_pass_builder_fun:
                        Function to add the operators to the model.
                        Must return list of loss-blob references that
                        are used to build the gradient.
                        Signature: forward_pass_builder_fun(model)
      param_update_builder_fun:
                        Function that adds operators that are run after
                        gradient update, such as updating the weights and
                        weight decaying. Function is also passed the learning
                        rate scaling factor. You should multiply the learning
                        rate by this factor to keep results invariant for the
                        same total batch size, regardless of the number of
                        GPUs.
                        Signature: param_update_builder_fun(model, lr_scale)
      devices:          List of GPU ids, such as [0, 1, 2, 3],
      rendezvous:       used for rendezvous in distributed computation; if None,
                        only one node is used. To create a rendezvous,
                        use <TBD>.
      net_type:         Network type

    '''
    log.info("Parallelizing model for devices: {}".format(devices))
    extra_workers = 8 if rendezvous is not None else 0  # best-guess
    model_helper_obj.net.Proto().num_workers = len(devices) * 4 + extra_workers
    model_helper_obj.net.Proto().type = net_type

    # Store some information in the model -- a bit ugly
    model_helper_obj._devices = devices
    model_helper_obj._rendezvous = rendezvous
    model_helper_obj._grad_names = []

    assert isinstance(model_helper_obj, model_helper.ModelHelperBase)
    assert model_helper_obj.params == [], "Model needs to be empty"

    # Add input and model
    log.info("Create input and model training operators")

    losses_by_gpu = {}
    for device in devices:
        device_opt = core.DeviceOption(caffe2_pb2.CUDA, device)
        with core.DeviceScope(device_opt):
            with core.NameScope("gpu_{}".format(device)):
                log.info("Model for GPU: {}".format(device))
                input_builder_fun(model_helper_obj)
                losses = forward_pass_builder_fun(model_helper_obj)
                # Losses are not needed for test net
                if param_update_builder_fun is not None:
                    assert isinstance(losses, list), \
                        'Model builder function must return list of loss blobs'
                    for loss in losses:
                        assert isinstance(loss, core.BlobReference), \
                            'Model builder func must return list of loss blobs'

                losses_by_gpu[device] = losses

    # Create parameter map
    model_helper_obj._device_grouped_blobs =\
        _GroupByDevice(devices, model_helper_obj.params)

    # computed params
    computed_params_grouped =\
        _GroupByDevice(devices, model_helper_obj.computed_params)
    model_helper_obj._device_grouped_blobs.update(computed_params_grouped)

    model_helper_obj._param_names =\
        model_helper_obj._device_grouped_blobs.keys()
    model_helper_obj._computed_param_names = computed_params_grouped.keys()

    if (param_update_builder_fun is None):
        log.info("Parameter update function not defined --> only forward")
        return

    log.info("Adding gradient operators")
    _AddGradientOperators(devices, model_helper_obj, losses_by_gpu)

    # Group gradients by device and register to blob lookup
    param_to_grad = model_helper_obj.param_to_grad
    grads_ordered = [param_to_grad[p] for p in
                     model_helper_obj.params if p in param_to_grad]
    gradients_grouped = _GroupByDevice(
        devices,
        grads_ordered,
    )
    model_helper_obj._device_grouped_blobs.update(gradients_grouped)
    model_helper_obj._grad_names = gradients_grouped.keys()

    log.info("Add gradient all-reduces for SyncSGD")
    if broadcast_computed_params:
        _BroadcastComputedParams(devices, model_helper_obj, rendezvous)
    _AllReduceGradients(
        devices, model_helper_obj, rendezvous
    )

    log.info("Post-iteration operators for updating params")
    num_shards = 1 if rendezvous is None else rendezvous['num_shards']
    lr_scale = 1.0 / (len(devices) * num_shards)
    for device in devices:
        device_opt = core.DeviceOption(caffe2_pb2.CUDA, device)
        with core.DeviceScope(device_opt):
            with core.NameScope("gpu_{}".format(device)):
                param_update_builder_fun(model_helper_obj, lr_scale)

    _AnalyzeOperators(model_helper_obj)

    # Add initial parameter syncs
    log.info("Add initial parameter sync")
    if (rendezvous is not None):
        _AddDistributedParameterSync(
            devices,
            model_helper_obj,
            model_helper_obj.param_init_net,
            model_helper_obj.param_init_net,
            rendezvous,
        )

    _SyncParams(devices, model_helper_obj, model_helper_obj.param_init_net)
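
As a rough illustration of the call pattern described in the docstring above, a caller might wire up the three builder functions as sketched below. This is not part of the original example: the model, builder bodies, and device list are placeholder assumptions, and cnn.CNNModelHelper is assumed to be importable as in the other snippets.

# Hypothetical caller sketch for Parallelize_GPU.
model = cnn.CNNModelHelper(order="NHWC", name="parallel_example")

def input_fun(m):
    # Readers should be instantiated outside this function so all GPUs share them.
    return None

def forward_fun(m):
    # Must return a list of loss blob references.
    fc = m.FC("data", "fc", 16, 1, ("ConstantFill", {}), ("ConstantFill", {}))
    sigm = m.Sigmoid(m.FlattenToVec(fc, "fc_fl"), "sigm")
    sq = m.SquaredL2Distance([sigm, "label"], "sq")
    return [m.AveragedLoss(sq, "loss")]

def param_update_fun(m, lr_scale):
    # lr_scale already accounts for the number of devices and shards.
    ONE = m.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
    LR = m.param_init_net.ConstantFill([], "LR", shape=[1], value=-0.1 * lr_scale)
    for param in m.GetParams():
        m.WeightedSum([param, ONE, m.param_to_grad[param], LR], param)

Parallelize_GPU(model, input_fun, forward_fun, param_update_fun, devices=[0, 1])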
Example n. 22
    def testAllreduceSingleGPU(self):
        for i in range(workspace.NumCudaDevices()):
            self.RunningAllreduceWithGPUs([i], muji.Allreduce)
Example n. 23
class NCCLOpsTest(hu.HypothesisTestCase):
    @given(n=st.integers(min_value=2, max_value=workspace.NumCudaDevices()),
           m=st.integers(min_value=1, max_value=1000),
           in_place=st.booleans())
    def test_nccl_allreduce(self, n, m, in_place):
        xs = [np.random.randn(m).astype(np.float32) for i in range(n)]
        inputs = [str("x_{}".format(i)) for i in range(n)]
        prefix = "" if in_place else "o"
        outputs = [str("{}x_{}".format(prefix, i)) for i in range(n)]
        op = core.CreateOperator("NCCLAllreduce", inputs, outputs)
        input_device_options = {n: gpu_device(i) for i, n in enumerate(inputs)}

        def allreduce(*args):
            assert len(args) == n
            output = np.sum(args, axis=0)
            return [output for _ in range(n)]

        outputs = self.assertReferenceChecks(
            hu.gpu_do, op, [xs[i] for i, _ in enumerate(inputs)], allreduce,
            input_device_options)
        for output in outputs:
            np.testing.assert_array_equal(outputs[0], output)
            self.assertEqual(outputs[0].tobytes(), output.tobytes())

    @given(n=st.integers(min_value=2, max_value=workspace.NumCudaDevices()),
           m=st.integers(min_value=1, max_value=1000),
           root=st.integers(min_value=0,
                            max_value=workspace.NumCudaDevices() - 1))
    def test_nccl_broadcast(self, n, m, root):
        assume(root < n)
        xs = [np.random.randn(m).astype(np.float32) for i in range(n)]
        inputs = [str("x_{}".format(i)) for i in range(n)]
        op = core.CreateOperator("NCCLBroadcast", inputs, inputs, root=root)
        input_device_options = {n: gpu_device(i) for i, n in enumerate(inputs)}

        def broadcast(*args):
            assert len(args) == n
            return [args[root] for _ in range(n)]

        self.assertReferenceChecks(hu.gpu_do, op,
                                   [xs[i] for i, _ in enumerate(inputs)],
                                   broadcast, input_device_options)

    @given(
        n=st.integers(min_value=2, max_value=workspace.NumCudaDevices()),
        m=st.integers(min_value=1, max_value=1000),
        # NCCL Reduce seems to deadlock for non-zero roots.
        root=st.integers(min_value=0, max_value=0),
        in_place=st.booleans())
    def test_nccl_reduce(self, n, m, root, in_place):
        assume(in_place is False or root == 0)
        xs = [np.random.randn(m).astype(np.float32) for i in range(n)]
        inputs = [str("x_{}".format(i)) for i in range(n)]
        op = core.CreateOperator("NCCLReduce",
                                 inputs,
                                 inputs[root] if in_place else b"o",
                                 root=root)
        input_device_options = {n: gpu_device(i) for i, n in enumerate(inputs)}

        def reduce(*args):
            assert len(args) == n
            return [np.sum(args, axis=0)]

        self.assertReferenceChecks(hu.gpu_do, op,
                                   [xs[i] for i, _ in enumerate(inputs)],
                                   reduce, input_device_options)

    @given(n=st.integers(min_value=2, max_value=workspace.NumCudaDevices()),
           m=st.integers(min_value=1, max_value=1000))
    def test_nccl_allgather(self, n, m):
        xs = [np.random.randn(m).astype(np.float32) for i in range(n)]
        inputs = [str("x_{}".format(i)) for i in range(n)]
        outputs = [str("o_{}".format(i)) for i in range(n)]
        op = core.CreateOperator("NCCLAllGather", inputs, outputs)
        input_device_options = {n: gpu_device(i) for i, n in enumerate(inputs)}

        def allgather(*args):
            assert len(args) == n
            return [np.stack(args, axis=0) for _ in range(n)]

        outputs = self.assertReferenceChecks(
            hu.gpu_do, op, [xs[i] for i, _ in enumerate(inputs)], allgather,
            input_device_options)
        for output in outputs:
            np.testing.assert_array_equal(outputs[0], output)
            self.assertEqual(outputs[0].tobytes(), output.tobytes())

    @given(n=st.integers(min_value=2, max_value=workspace.NumCudaDevices()),
           m=st.integers(min_value=1, max_value=1000))
    def test_nccl_reduce_scatter(self, n, m):
        xs = [np.random.randn(n, m).astype(np.float32) for i in range(n)]
        inputs = [str("x_{}".format(i)) for i in range(n)]
        outputs = [str("o_{}".format(i)) for i in range(n)]
        op = core.CreateOperator("NCCLReduceScatter", inputs, outputs)
        input_device_options = {n: gpu_device(i) for i, n in enumerate(inputs)}

        def reduce_scatter(*args):
            assert len(args) == n
            reduced = sum(args)
            assert len(reduced.shape) > 1
            ref = [reduced[i, :] for i in range(n)]
            return ref

        self.assertReferenceChecks(hu.gpu_do, op,
                                   [xs[i] for i, _ in enumerate(inputs)],
                                   reduce_scatter, input_device_options)

    @given(n=st.integers(min_value=2, max_value=workspace.NumCudaDevices()),
           m=st.integers(min_value=100000, max_value=100000),
           iters=st.integers(min_value=1, max_value=100),
           net_type=st.sampled_from(["dag", "async_dag", "simple"]))
    def _test_nccl_sync(self, n, m, iters, net_type):
        inputs = [str("x_{}".format(i)) for i in range(n)]
        extra_inputs = [str("xe_{}".format(i)) for i in range(n)]
        net = core.Net("asdf")
        net.Proto().type = net_type
        net.Proto().num_workers = n
        for i in range(n):
            net.ConstantFill([],
                             inputs[i],
                             shape=[m],
                             value=0.0,
                             device_option=gpu_device(i))
            net.ConstantFill([],
                             extra_inputs[i],
                             shape=[m],
                             value=1.0,
                             device_option=gpu_device(i))
            for _ in range(iters):
                net.Sum([inputs[i], extra_inputs[i]], [inputs[i]],
                        device_option=gpu_device(i))
        net.NCCLReduce(inputs, [inputs[0]], device_option=gpu_device(0))
        self.ws.run(net)
        np.testing.assert_array_equal(
            self.ws.blobs[inputs[0]].fetch(),
            np.full(shape=(m, ), fill_value=iters * n, dtype=np.float32))

    @unittest.skipIf(not os.environ.get("CAFFE2_BENCHMARK"), "Benchmark")
    def test_timings(self):
        for n in range(2, workspace.NumCudaDevices()):
            for in_place in [False, True]:
                xs = [
                    np.random.randn(10 ** 7).astype(np.float32) for i in range(n)
                ]
                inputs = [str("x_{}".format(i)) for i in range(n)]
                prefix = "" if in_place else "o"
                outputs = [str("{}x_{}".format(prefix, i)) for i in range(n)]

                net = core.Net("test")
                net.NCCLAllreduce(inputs, outputs)
                net.RunAllOnGPU()
                for i in range(n):
                    self.ws.create_blob(inputs[i]).feed(xs[i], gpu_device(i))
                self.ws.run(net)
                net_time = benchmark(self.ws, net)
                vanilla = core.Net("vanilla")
                muji.Allreduce(vanilla, inputs)
                vanilla_time = benchmark(self.ws, vanilla)
                print("Speedup for NCCL: {:.2f}".format(vanilla_time /
                                                        net_time))
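
The tests above exercise the NCCL collective operators through assertReferenceChecks. Below is a minimal, hedged sketch of invoking NCCLAllreduce directly; the two-GPU assumption, blob names, and vector size are illustrative and not part of the test file above.

import numpy as np

from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace

if workspace.has_gpu_support and workspace.NumCudaDevices() >= 2:
    xs = [np.random.randn(16).astype(np.float32) for _ in range(2)]
    for i, x in enumerate(xs):
        # Place each input blob on its own GPU.
        workspace.FeedBlob("x_{}".format(i), x,
                           core.DeviceOption(caffe2_pb2.CUDA, i))
    op = core.CreateOperator(
        "NCCLAllreduce",
        ["x_0", "x_1"],
        ["x_0", "x_1"],  # in-place allreduce
        device_option=core.DeviceOption(caffe2_pb2.CUDA, 0))
    workspace.RunOperatorOnce(op)
    # Every participating blob now holds the elementwise sum of the inputs.
    assert np.allclose(workspace.FetchBlob("x_0"), np.sum(xs, axis=0))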
Example 24
def Parallelize_GPU(
    model_helper_obj,
    input_builder_fun,
    forward_pass_builder_fun,
    param_update_builder_fun,
    devices=range(0, workspace.NumCudaDevices()),
    rendezvous=None,
    net_type='dag',
    broadcast_computed_params=True,
    optimize_gradient_memory=False,
):
    '''
    Function to create a model that can run on many GPUs.
      model_helper_obj: an object of ModelHelperBase, such as CNNModelHelper
      input_builder_fun:
                         Function that adds the input operators
                         Note: Remember to instantiate reader outside of this
                         function so all GPUs share the same reader object.
                         Signature:  input_builder_fun(model)
      forward_pass_builder_fun:
                        Function to add the operators to the model.
                        Must return a list of loss-blob references that
                        are used to build the gradient. A loss-scale parameter
                        is passed in, since you should scale the loss of your
                        model by 1.0 / the total number of GPUs.
                        Signature: forward_pass_builder_fun(model, loss_scale)
      param_update_builder_fun:
                        Function that adds the operators run after the
                        gradients are computed, such as weight updates and
                        weight decay.
                        Signature: param_update_builder_fun(model)
      devices:          List of GPU ids, such as [0, 1, 2, 3].
      rendezvous:       Used for rendezvous in distributed computation; if None,
                        only one node is used. To create a rendezvous,
                        use <TBD>.
      net_type:         Network type

    '''
    log.info("Parallelizing model for devices: {}".format(devices))
    extra_workers = 8 if rendezvous is not None else 0  # best-guess
    model_helper_obj.net.Proto().num_workers = len(devices) * 4 + extra_workers
    model_helper_obj.net.Proto().type = net_type

    # Store some information in the model -- a bit ugly
    model_helper_obj._devices = devices
    model_helper_obj._rendezvous = rendezvous
    model_helper_obj._grad_names = []

    assert isinstance(model_helper_obj, model_helper.ModelHelperBase)

    # Keep track of params that were in the model before: they are not
    # data parallel, so we need to handle them separately
    non_datapar_params = copy.copy(model_helper_obj.params)

    # Add input and model
    log.info("Create input and model training operators")

    losses_by_gpu = {}
    num_shards = 1 if rendezvous is None else rendezvous['num_shards']
    loss_scale = 1.0 / (len(devices) * num_shards)

    for device in devices:
        device_opt = core.DeviceOption(caffe2_pb2.CUDA, device)
        with core.DeviceScope(device_opt):
            with core.NameScope("gpu_{}".format(device)):
                log.info("Model for GPU: {}".format(device))
                input_builder_fun(model_helper_obj)
                losses = forward_pass_builder_fun(model_helper_obj, loss_scale)
                # Losses are not needed for test net
                if param_update_builder_fun is not None:
                    assert isinstance(losses, list), \
                        'Model builder function must return list of loss blobs'
                    for loss in losses:
                        assert isinstance(loss, core.BlobReference), \
                            'Model builder func must return list of loss blobs'

                losses_by_gpu[device] = losses
    _ValidateParams(model_helper_obj.params)

    # Create parameter map
    model_helper_obj._device_grouped_blobs =\
        _GroupByDevice(devices, model_helper_obj.params, non_datapar_params)

    # computed params
    computed_params_grouped =\
        _GroupByDevice(devices, model_helper_obj.computed_params, [])
    model_helper_obj._device_grouped_blobs.update(computed_params_grouped)

    model_helper_obj._param_names =\
        model_helper_obj._device_grouped_blobs.keys()
    model_helper_obj._computed_param_names = computed_params_grouped.keys()

    if (param_update_builder_fun is None):
        log.info("Parameter update function not defined --> only forward")
        _InferBlobDevice(model_helper_obj)
        return

    log.info("Adding gradient operators")
    _AddGradientOperators(devices, model_helper_obj, losses_by_gpu)

    _ValidateParams(model_helper_obj.params)

    # Group gradients by device and register to blob lookup
    param_to_grad = model_helper_obj.param_to_grad
    grads_ordered = [
        param_to_grad[p] for p in model_helper_obj.params if p in param_to_grad
    ]
    non_datapar_grads = [param_to_grad[p] for p in non_datapar_params]

    gradients_grouped = _GroupByDevice(devices, grads_ordered,
                                       non_datapar_grads)
    model_helper_obj._device_grouped_blobs.update(gradients_grouped)
    model_helper_obj._grad_names = gradients_grouped.keys()

    _InferBlobDevice(model_helper_obj)

    log.info("Add gradient all-reduces for SyncSGD")
    if broadcast_computed_params:
        _BroadcastComputedParams(devices, model_helper_obj, rendezvous)

    _AllReduceGradients(devices, model_helper_obj, rendezvous)

    log.info("Post-iteration operators for updating params")
    num_shards = 1 if rendezvous is None else rendezvous['num_shards']
    # The following check is necessary for ring reduce to work
    if rendezvous is not None:
        assert num_shards > 1, \
            "Please use more than one shard for distributed training"
    for device in devices:
        device_opt = core.DeviceOption(caffe2_pb2.CUDA, device)
        with core.DeviceScope(device_opt):
            with core.NameScope("gpu_{}".format(device)):
                param_update_builder_fun(model_helper_obj)

    _InferBlobDevice(model_helper_obj)
    _AnalyzeOperators(model_helper_obj)

    # Configure dagnet to run with only one worker on the first iteration,
    # to prevent concurrency problems with allocs and nccl.
    arg = model_helper_obj.Proto().arg.add()
    arg.name = "first_iter_only_one_worker"
    arg.i = 1

    # Add initial parameter syncs
    log.info("Add initial parameter sync")
    if (rendezvous is not None):
        _AddDistributedParameterSync(
            devices,
            model_helper_obj,
            model_helper_obj.param_init_net,
            model_helper_obj.param_init_net,
            rendezvous,
        )

    _SyncParams(devices, model_helper_obj, model_helper_obj.param_init_net)

    if optimize_gradient_memory:
        _OptimizeGradientMemory(model_helper_obj, losses_by_gpu, devices)
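
The docstring above defines the builder-function contract for Parallelize_GPU. The following is a hedged usage sketch, assuming the function lives in caffe2.python.data_parallel_model as in the Caffe2 tree; the builder functions, blob names, synthetic input, and two-GPU device list are hypothetical and only illustrate the expected signatures (in particular, forward_pass_builder_fun returning a list of loss blobs).

from caffe2.python import cnn, core, data_parallel_model, workspace

model = cnn.CNNModelHelper(order="NCHW", name="parallel_example")


def add_input(model):
    # Hypothetical synthetic input so the sketch stays self-contained.
    model.param_init_net.GaussianFill([], "data", shape=[32, 256],
                                      mean=0.0, std=1.0)
    model.param_init_net.ConstantFill([], "label", shape=[32], value=0,
                                      dtype=core.DataType.INT32)


def add_forward(model, loss_scale):
    fc = model.FC("data", "fc", dim_in=256, dim_out=10)
    softmax, loss = model.SoftmaxWithLoss([fc, "label"], ["softmax", "loss"])
    loss = model.Scale(loss, "scaled_loss", scale=loss_scale)
    return [loss]


def add_param_update(model):
    # Plain SGD update; called once per GPU under its device/name scope.
    iteration = model.Iter("iter")
    lr = model.LearningRate(iteration, "lr", base_lr=-0.1, policy="fixed")
    one = model.param_init_net.ConstantFill([], "one", shape=[1], value=1.0)
    for param in model.GetParams():
        grad = model.param_to_grad[param]
        model.WeightedSum([param, one, grad, lr], param)


data_parallel_model.Parallelize_GPU(
    model,
    input_builder_fun=add_input,
    forward_pass_builder_fun=add_forward,
    param_update_builder_fun=add_param_update,
    devices=[0, 1],  # assumes at least two CUDA devices
)

workspace.RunNetOnce(model.param_init_net)
workspace.CreateNet(model.net)
workspace.RunNet(model.net.Proto().name)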
Example 25
    def testGetCudaPeerAccessPattern(self):
        pattern = workspace.GetCudaPeerAccessPattern()
        self.assertEqual(type(pattern), np.ndarray)
        self.assertEqual(pattern.ndim, 2)
        self.assertEqual(pattern.shape[0], pattern.shape[1])
        self.assertEqual(pattern.shape[0], workspace.NumCudaDevices())
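
As the test shows, GetCudaPeerAccessPattern returns a square boolean matrix with one row and column per visible GPU. A hedged sketch of a common use, checking whether a chosen subset of devices is fully peer-connected (the device list is illustrative):

import numpy as np

from caffe2.python import workspace

pattern = workspace.GetCudaPeerAccessPattern()
gpus = [0, 1]  # illustrative subset of devices
# pattern[i, j] is True when GPU i can directly access GPU j's memory.
fully_connected = bool(np.all(pattern[np.ix_(gpus, gpus)]))
print("GPUs {} fully peer-connected: {}".format(gpus, fully_connected))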
Example 26
# Signature head restored (truncated in the source); it matches
# caffe2.python.hypothesis_test_util.tensors.
def tensors(n,
            min_dim=1,
            max_dim=4,
            dtype=np.float32,
            elements=None,
            **kwargs):
    dims_ = st.lists(dims(**kwargs), min_size=min_dim, max_size=max_dim)
    return dims_.flatmap(lambda dims: st.lists(
        arrays(dims, dtype, elements), min_size=n, max_size=n))


cpu_do = caffe2_pb2.DeviceOption()
gpu_do = caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA)
device_options = [cpu_do] + ([gpu_do] if workspace.has_gpu_support else [])
# Include device option for each GPU
expanded_device_options = [cpu_do] + ([
    caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA, cuda_gpu_id=i)
    for i in range(workspace.NumCudaDevices())
] if workspace.has_gpu_support else [])


def device_checker_device_options():
    return st.just(device_options)


def gradient_checker_device_option():
    return st.sampled_from(device_options)


gcs = dict(gc=gradient_checker_device_option(),
           dc=device_checker_device_options())

gcs_cpu_only = dict(gc=st.sampled_from([cpu_do]), dc=st.just([cpu_do]))
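
The gcs and gcs_cpu_only dictionaries above are meant to be splatted into @given, so each test case draws a single gradient-checker device option (gc) and the full device-checker list (dc). A hedged sketch of that pattern for a hypothetical Relu operator test:

import numpy as np
from hypothesis import given

import caffe2.python.hypothesis_test_util as hu
from caffe2.python import core


class TestRelu(hu.HypothesisTestCase):
    @given(X=hu.tensor(), **hu.gcs)
    def test_relu(self, X, gc, dc):
        # Nudge values away from zero, where Relu is not differentiable.
        X += 0.02 * np.sign(X)
        X[X == 0.0] += 0.02

        op = core.CreateOperator("Relu", ["X"], ["Y"])
        self.assertDeviceChecks(dc, op, [X], [0])       # dc: list of devices
        self.assertGradientChecks(gc, op, [X], 0, [0])  # gc: single device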
Example 27
    # debugging and profiling
    parser.add_argument("--print-freq", type=int, default=1)
    parser.add_argument("--print-time", action="store_true", default=False)
    parser.add_argument("--debug-mode", action="store_true", default=False)
    parser.add_argument("--enable-profiling", action="store_true", default=False)
    parser.add_argument("--plot-compute-graph", action="store_true", default=False)
    args = parser.parse_args()

    ### some basic setup ###
    np.random.seed(args.numpy_rand_seed)
    np.set_printoptions(precision=args.print_precision)

    use_gpu = args.use_gpu
    if use_gpu:
        device_opt = core.DeviceOption(workspace.GpuDeviceType, 0)
        ngpus = workspace.NumCudaDevices()  # 1
        print("Using {} GPU(s)...".format(ngpus))
    else:
        device_opt = core.DeviceOption(caffe2_pb2.CPU)
        print("Using CPU...")

    ### prepare training data ###
    ln_bot = np.fromstring(args.arch_mlp_bot, dtype=int, sep="-")
    if args.data_generation == "dataset":
        # input and target data
        (nbatches, lX, lS_l, lS_i, lT,
         nbatches_test, lX_test, lS_l_test, lS_i_test, lT_test,
         ln_emb, m_den) = dc.read_dataset(
            args.data_set, args.mini_batch_size, args.data_randomize, args.num_batches,
            True, args.raw_data_file, args.processed_data_file)
        ln_bot[0] = m_den
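
Once device_opt has been chosen as above, Caffe2 graph construction is typically wrapped in a DeviceScope so that every created operator inherits that device. A hedged illustration; the net and blob names are made up for the sketch:

from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace

# The CPU option is shown here; on a GPU machine, device_opt would be the CUDA
# option selected above.
device_opt = core.DeviceOption(caffe2_pb2.CPU)

with core.DeviceScope(device_opt):
    init_net = core.Net("example_init")
    init_net.GaussianFill([], "W", shape=[4, 8], mean=0.0, std=0.1)
workspace.RunNetOnce(init_net)
print(workspace.FetchBlob("W").shape)  # (4, 8)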
Example 28
def Parallelize_GPU(
    model_helper_obj,
    input_builder_fun,
    forward_pass_builder_fun,
    param_update_builder_fun=None,
    optimizer_builder_fun=None,
    post_sync_builder_fun=None,
    devices=range(0, workspace.NumCudaDevices()),
    rendezvous=None,
    net_type='dag',
    broadcast_computed_params=True,
    optimize_gradient_memory=False,
    use_nccl=False,
    max_concurrent_distributed_ops=4,
):
    '''
    Function to create a model that can run on many GPUs.
      model_helper_obj: an object of ModelHelper, such as CNNModelHelper
      input_builder_fun:
                         Function that adds the input operators
                         Note: Remember to instantiate reader outside of this
                         function so all GPUs share the same reader object.
                         Signature:  input_builder_fun(model)
      forward_pass_builder_fun:
                        Function to add the operators to the model.
                        Must return a list of loss-blob references that
                        are used to build the gradient. A loss-scale parameter
                        is passed in, since you should scale the loss of your
                        model by 1.0 / the total number of GPUs.
                        Signature: forward_pass_builder_fun(model, loss_scale)
      param_update_builder_fun:
                        Function that adds the operators run after the
                        gradients are computed, such as weight updates and
                        weight decay. This is called for each GPU separately.
                        Signature: param_update_builder_fun(model)
      optimizer_builder_fun:
                        Alternative to param_update_builder_fun; allows one
                        to add an optimizer for the whole model. Called only
                        once, outside any name or device scope.
      post_sync_builder_fun:
                        Function applied after initial parameter sync has been
                        completed, such as keeping multi-precision parameters
                        in sync.
                        Signature: post_sync_builder_fun(model)
      devices:          List of GPU ids, such as [0, 1, 2, 3].
      rendezvous:       Used for rendezvous in distributed computation; if None,
                        only one node is used. To create a rendezvous,
                        use <TBD>.
      net_type:         Network type
      optimize_gradient_memory: whether to apply 'memonger' to share blobs
                        in gradient computation to reduce memory footprint

    '''
    log.info("Parallelizing model for devices: {}".format(devices))
    extra_workers = 8 if rendezvous is not None else 0  # best-guess
    num_workers = len(devices) * 4 + extra_workers
    max_concurrent_distributed_ops =\
        min(max_concurrent_distributed_ops, num_workers - 1)
    model_helper_obj.net.Proto().num_workers = num_workers
    model_helper_obj.net.Proto().type = net_type

    # Store some information in the model -- a bit ugly
    model_helper_obj._devices = devices
    model_helper_obj._rendezvous = rendezvous
    model_helper_obj._grad_names = []

    assert isinstance(model_helper_obj, model_helper.ModelHelper)

    # Keep track of params that were in the model before: they are not
    # data parallel, so we need to handle them separately
    non_datapar_params = copy.copy(model_helper_obj.params)

    # Add input and model
    log.info("Create input and model training operators")

    losses_by_gpu = {}
    num_shards = 1 if rendezvous is None else rendezvous['num_shards']
    loss_scale = 1.0 / (len(devices) * num_shards)

    has_parameter_updates = param_update_builder_fun is not None or \
        optimizer_builder_fun is not None
    assert not (
        param_update_builder_fun is not None and
        optimizer_builder_fun is not None
    ), 'Can only specify one of param_update_builder_fun, optimizer_builder_fun'

    for device in devices:
        device_opt = core.DeviceOption(caffe2_pb2.CUDA, device)
        with core.DeviceScope(device_opt):
            with core.NameScope("gpu_{}".format(device)):
                log.info("Model for GPU: {}".format(device))
                input_builder_fun(model_helper_obj)
                losses = forward_pass_builder_fun(model_helper_obj, loss_scale)
                # Losses are not needed for test net
                if has_parameter_updates:
                    assert isinstance(losses, list), \
                        'Model builder function must return list of loss blobs'
                    for loss in losses:
                        assert isinstance(loss, core.BlobReference), \
                            'Model builder func must return list of loss blobs'

                losses_by_gpu[device] = losses
    _ValidateParams(model_helper_obj.params)

    # Create parameter map
    model_helper_obj._device_grouped_blobs =\
        _GroupByDevice(devices, model_helper_obj.params, non_datapar_params)

    # computed params
    computed_params_grouped =\
        _GroupByDevice(devices, model_helper_obj.GetComputedParams(''), [])
    model_helper_obj._device_grouped_blobs.update(computed_params_grouped)

    model_helper_obj._param_names =\
        model_helper_obj._device_grouped_blobs.keys()
    model_helper_obj._computed_param_names = computed_params_grouped.keys()

    if not has_parameter_updates:
        log.info("Parameter update function not defined --> only forward")
        _InferBlobDevice(model_helper_obj)
        return

    log.info("Adding gradient operators")
    _AddGradientOperators(devices, model_helper_obj, losses_by_gpu)

    _ValidateParams(model_helper_obj.params)

    # Group gradients by device and register to blob lookup
    param_to_grad = model_helper_obj.param_to_grad
    grads_ordered = [param_to_grad[p] for p in
                     model_helper_obj.params if p in param_to_grad]
    non_datapar_grads = [param_to_grad[p] for p in non_datapar_params]

    gradients_grouped = _GroupByDevice(
        devices,
        grads_ordered,
        non_datapar_grads
    )
    model_helper_obj._device_grouped_blobs.update(gradients_grouped)
    model_helper_obj._grad_names = gradients_grouped.keys()
    model_helper_obj._losses_by_gpu = losses_by_gpu

    _InferBlobDevice(model_helper_obj)

    log.info("Add gradient all-reduces for SyncSGD")
    if broadcast_computed_params:
        _BroadcastComputedParams(devices, model_helper_obj, rendezvous, use_nccl)

    if len(model_helper_obj._grad_names) > 0:
        _AllReduceGradients(
            devices,
            model_helper_obj,
            rendezvous,
            use_nccl,
            max_concurrent_distributed_ops,
        )
    else:
        log.info("NOTE: Param builder function did not create any parameters.")

    log.info("Post-iteration operators for updating params")
    num_shards = 1 if rendezvous is None else rendezvous['num_shards']

    if param_update_builder_fun is not None:
        for device in devices:
            device_opt = core.DeviceOption(caffe2_pb2.CUDA, device)
            with core.DeviceScope(device_opt):
                with core.NameScope("gpu_{}".format(device)):
                    param_update_builder_fun(model_helper_obj)
    else:
        log.info("Calling optimizer builder function")
        optimizer_builder_fun(model_helper_obj)

    (sync_blobs, sync_names) = _ComputeBlobsToSync(model_helper_obj)
    sync_blobs_grouped = _GroupByDevice(
        devices,
        sync_blobs,
        [],
    )
    model_helper_obj._device_grouped_blobs.update(sync_blobs_grouped)

    _InferBlobDevice(model_helper_obj)
    _AnalyzeOperators(model_helper_obj)

    # Configure dagnet to run with only one worker on the first iteration,
    # to prevent concurrency problems with allocs and nccl.
    arg = model_helper_obj.Proto().arg.add()
    arg.name = "first_iter_only_one_worker"
    arg.i = 1

    # Add initial parameter syncs
    log.info("Add initial parameter sync")
    if (rendezvous is not None and num_shards > 1):
        _AddDistributedParameterSync(
            devices,
            model_helper_obj,
            model_helper_obj.param_init_net,
            model_helper_obj.param_init_net,
            rendezvous,
            sync_names,
        )

    _SyncParams(
        devices, model_helper_obj, model_helper_obj.param_init_net, sync_names
    )

    # Handle any operations that need to be done after parameter sync
    # i.e. making sure multi-precision copies of parameters are up-to-date
    if post_sync_builder_fun is not None:
        for device in devices:
            device_opt = core.DeviceOption(caffe2_pb2.CUDA, device)
            with core.DeviceScope(device_opt):
                with core.NameScope("gpu_{}".format(device)):
                    post_sync_builder_fun(model_helper_obj)

    if optimize_gradient_memory:
        _OptimizeGradientMemorySimple(model_helper_obj, losses_by_gpu, devices)

    model_helper_obj._data_parallel_model_init_nets = [
        model_helper_obj.param_init_net,
    ]
    model_helper_obj._data_parallel_model_nets = [model_helper_obj.net]
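
The docstring above introduces optimizer_builder_fun as an alternative to the per-GPU param_update_builder_fun. A hedged sketch of that path using caffe2.python.optimizer; the hyperparameters are illustrative:

from caffe2.python import optimizer


def add_optimizer(model):
    # Called once for the whole model, outside any name or device scope.
    return optimizer.build_sgd(
        model,
        base_learning_rate=0.1,
        policy="step",
        stepsize=10000,
        gamma=0.1,
        momentum=0.9,
    )

# Passed as:
#   Parallelize_GPU(model, ..., optimizer_builder_fun=add_optimizer)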
Example 29
def Train(args):
    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # Verify valid batch size
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus
    assert \
        total_batch_size % num_gpus == 0, \
        "Number of GPUs must divide batch size"

    # Round down epoch size to closest multiple of batch size across machines
    global_batch_size = total_batch_size * args.num_shards
    epoch_iters = int(args.epoch_size / global_batch_size)

    assert \
        epoch_iters > 0, \
        "Epoch size must be larger than batch size times shard count"

    args.epoch_size = epoch_iters * global_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    # Create ModelHelper object
    train_arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        'cudnn_exhaustive_search': True,
        'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024),
    }
    train_model = model_helper.ModelHelper(
        name="ban-pc-resnet50", arg_scope=train_arg_scope
    )

    num_shards = args.num_shards
    shard_id = args.shard_id

    # Expect interfaces to be comma separated.
    # Use of multiple network interfaces is not yet complete,
    # so simply use the first one in the list.
    interfaces = args.distributed_interfaces.split(",")

    # Rendezvous using MPI when run with mpirun
    if os.getenv("OMPI_COMM_WORLD_SIZE") is not None:
        num_shards = int(os.getenv("OMPI_COMM_WORLD_SIZE", 1))
        shard_id = int(os.getenv("OMPI_COMM_WORLD_RANK", 0))
        if num_shards > 1:
            rendezvous = dict(
                kv_handler=None,
                num_shards=num_shards,
                shard_id=shard_id,
                engine="GLOO",
                transport=args.distributed_transport,
                interface=interfaces[0],
                mpi_rendezvous=True,
                exit_nets=None)
        else:
            # Single MPI process: no rendezvous needed
            rendezvous = None

    elif num_shards > 1:
        # Create rendezvous for distributed computation
        store_handler = "store_handler"
        if args.redis_host is not None:
            # Use Redis for rendezvous if Redis host is specified
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate", [], [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                )
            )
        else:
            # Use filesystem for rendezvous otherwise
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate", [], [store_handler],
                    path=args.file_store_path,
                    prefix=args.run_id,
                )
            )

        rendezvous = dict(
            kv_handler=store_handler,
            shard_id=shard_id,
            num_shards=num_shards,
            engine="GLOO",
            transport=args.distributed_transport,
            interface=interfaces[0],
            exit_nets=None)

    else:
        rendezvous = None

    # Model configs for constructing model
    with open(args.model_config) as f:
        model_config = yaml.safe_load(f)

    # Model building functions
    def create_target_model_ops(model, loss_scale):
        initializer = (PseudoFP16Initializer if args.dtype == 'float16'
                       else Initializer)
        with brew.arg_scope([brew.conv, brew.fc],
                            WeightInitializer=initializer,
                            BiasInitializer=initializer,
                            enable_tensor_core=args.enable_tensor_core,
                            float16_compute=args.float16_compute):
            pred = add_se_model(model, model_config, "data", is_test=False)

        if args.dtype == 'float16':
            pred = model.net.HalfToFloat(pred, pred + '_fp32')

        loss = add_softmax_loss(model, pred, 'label')
        brew.accuracy(model, ['softmax', 'label'], 'accuracy')
        return [loss]

    def add_optimizer(model):
        '''
        stepsz = int(30 * args.epoch_size / total_batch_size / num_shards)
        optimizer.add_weight_decay(model, args.weight_decay)
        opt = optimizer.build_multi_precision_sgd(
            model,
            args.base_learning_rate,
            momentum=0.9,
            nesterov=1,
            policy="step",
            stepsize=stepsz,
            gamma=0.1
        )
        '''

        optimizer.add_weight_decay(model, args.weight_decay)
        opt = optimizer.build_multi_precision_sgd(
            model,
            base_learning_rate = args.base_learning_rate,
            momentum = model_config['solver']['momentum'],
            nesterov = model_config['solver']['nesterov'],
            policy = model_config['solver']['lr_policy'],
            power = model_config['solver']['power'],
            max_iter = model_config['solver']['max_iter'],
        )
        return opt

    # Define add_image_input function.
    # Depends on the "train_data" argument.
    # Note that the reader will be shared between all GPUs.
    reader = train_model.CreateDB(
        "reader",
        db=args.train_data,
        db_type=args.db_type,
        num_shards=num_shards,
        shard_id=shard_id,
    )

    def add_image_input(model):
        AddImageInput(
            model,
            reader,
            batch_size=batch_per_device,
            img_size=args.image_size,
            dtype=args.dtype,
            is_test=False,
        )

    def add_post_sync_ops(model):
        """Add ops applied after initial parameter sync."""
        for param_info in model.GetOptimizationParamInfo(model.GetParams()):
            if param_info.blob_copy is not None:
                model.param_init_net.HalfToFloat(
                    param_info.blob,
                    param_info.blob_copy[core.DataType.FLOAT]
                )

    # Create parallelized model
    data_parallel_model.Parallelize(
        train_model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=create_target_model_ops,
        optimizer_builder_fun=add_optimizer,
        post_sync_builder_fun=add_post_sync_ops,
        devices=gpus,
        rendezvous=rendezvous,
        optimize_gradient_memory=False,
        cpu_device=args.use_cpu,
        shared_model=args.use_cpu,
        combine_spatial_bn=args.use_cpu,
    )

    if args.model_parallel:
        # Shift half of the activations to another GPU
        assert workspace.NumCudaDevices() >= 2 * args.num_gpus
        activations = data_parallel_model_utils.GetActivationBlobs(train_model)
        data_parallel_model_utils.ShiftActivationDevices(
            train_model,
            activations=activations[len(activations) // 2:],
            shifts={g: args.num_gpus + g for g in range(args.num_gpus)},
        )

    data_parallel_model.OptimizeGradientMemory(train_model, {}, set(), False)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    # Add test model, if specified
    test_model = None
    if (args.test_data is not None):
        log.info("----- Create test net ----")
        test_arg_scope = {
            'order': "NCHW",
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
        }
        test_model = model_helper.ModelHelper(
            name="ban-pc-resnet50_test", arg_scope=test_arg_scope, init_params=False
        )

        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=True,
            )

        data_parallel_model.Parallelize(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_target_model_ops,
            post_sync_builder_fun=add_post_sync_ops,
            param_update_builder_fun=None,
            devices=gpus,
            cpu_device=args.use_cpu,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

    epoch = 0
    # load the pre-trained model and reset epoch
    if args.load_model_path is not None:
        LoadModel(args.load_model_path, train_model)

        # Sync the model params
        data_parallel_model.FinalizeAfterCheckpoint(train_model)

        # reset epoch. load_model_path should end with *_X.mdl,
        # where X is the epoch number
        last_str = args.load_model_path.split('_')[-1]
        if last_str.endswith('.mdl'):
            epoch = int(last_str[:-4])
            log.info("Reset epoch to {}".format(epoch))
        else:
            log.warning("The format of load_model_path doesn't match!")

    expname = "log/{}/resnet50_gpu{}_b{}_L{}_lr{:.2f}_v2".format(
        args.dataset_name,
        args.num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )
    explog = experiment_util.ModelTrainerLog(expname, args)

    # Load pretrained param_init_net
    load_init_net_multigpu(args)

    # Run the training one epoch a time
    best_accuracy = 0
    while epoch < args.num_epochs:
        epoch, best_accuracy = RunEpoch(
            args,
            epoch,
            train_model,
            test_model,
            total_batch_size,
            num_shards,
            expname,
            explog,
            best_accuracy,
        )

        # Save the model for each epoch
        SaveModel(args, train_model, epoch)

        model_path = "%s/%s_" % (
            args.file_store_path,
            args.save_model_name
        )
        # remove the saved model from the previous epoch if it exists
        if os.path.isfile(model_path + str(epoch - 1) + ".mdl"):
            os.remove(model_path + str(epoch - 1) + ".mdl")
Example 30
def Parallelize_GPU_BMUF(
    model_helper_obj,
    input_builder_fun,
    forward_pass_builder_fun,
    param_update_builder_fun,
    block_learning_rate=1.0,
    block_momentum=None,
    devices=range(0, workspace.NumCudaDevices()),
    net_type='dag',
    master_gpu=None,
    optimize_gradient_memory=False,
    reset_momentum_sgd=False
):
    '''
    Function to create a model that runs on many GPUs and to create a net for
    parameter updates that can be run independently for a number of iterations,
    followed by another net that runs once to compute the final parameter
    updates according to the blockwise model-update filtering rule described
    in: Scalable Training of Deep Learning Machines by Incremental Block
    Training with Intra-block Parallel Optimization and Blockwise Model-Update
    Filtering (ICASSP 2016).
    '''
    assert isinstance(model_helper_obj, model_helper.ModelHelper)

    if master_gpu is None:
        master_gpu = devices[0]

    model_helper_obj._devices = devices
    master_gpu_opt = core.DeviceOption(caffe2_pb2.CUDA, master_gpu)

    num_workers = len(devices)
    num_worker_threads = 4 * len(devices)
    loss_scale = 1.0 / num_workers
    if block_momentum is None:
        block_momentum = 1.0 - 1.0 / num_workers

    model_helper_obj.net.Proto().num_workers = num_worker_threads
    model_helper_obj.net.Proto().type = net_type

    # A net for initializing global model parameters. It is called once in the
    # same step as net parameter initialization.
    model_helper_obj._global_model_init_net = core.Net('global_model_init')
    model_helper_obj._global_model_init_net.Proto().type = net_type
    model_helper_obj._global_model_init_net.Proto().num_workers = \
        num_worker_threads

    # A net for computing final parameter updates. It will run once after
    # running the net (local model updates) for `num_local_iterations` iterations.
    model_helper_obj._global_model_param_updates_net = core.Net('global_model')
    model_helper_obj._global_model_param_updates_net.Proto().type = net_type
    model_helper_obj._global_model_param_updates_net.Proto().num_workers = \
        num_worker_threads

    def _v(param):
        return "{}_v".format(param)

    def _g(param):
        return "{}_g".format(param)

    # Keep track of params that were in the model before: they are not
    # data parallel, so we need to handle them separately
    non_datapar_params = copy.copy(model_helper_obj.params)
    model_helper_obj._losses_by_gpu = {}

    def _InitializeModels(gpu_id):
        input_builder_fun(model_helper_obj)
        loss = forward_pass_builder_fun(model_helper_obj, loss_scale)
        model_helper_obj._losses_by_gpu[gpu_id] = loss
    _ForEachGPU(devices, _InitializeModels, scoped=True)

    model_helper_obj._device_grouped_blobs =\
        _GroupByDevice(devices, model_helper_obj.params, non_datapar_params)

    model_helper_obj._param_names =\
        model_helper_obj._device_grouped_blobs.keys()

    _AddGradientOperators(
        devices, model_helper_obj, model_helper_obj._losses_by_gpu
    )

    _InferBlobDevice(model_helper_obj)

    def _InitializeParamUpdate(gpu_id):
        param_update_builder_fun(model_helper_obj)
    _ForEachGPU(devices, _InitializeParamUpdate, scoped=True)

    # (Step-0) Initialize momentum parameters on master GPU.
    for param_name in model_helper_obj._device_grouped_blobs.keys():
        param = model_helper_obj._device_grouped_blobs[param_name][master_gpu]
        with core.DeviceScope(master_gpu_opt):
            model_helper_obj._global_model_init_net.ConstantFill(
                param, _v(param), value=0.0
            )
            model_helper_obj._global_model_init_net.Copy(param, _g(param))

    # (Step-1) Update models for num_local_iterations.

    # (Step-2) Compute the post-local-updates average of the params.
    # Sum model params across GPUs and store the results in the param_avg blob.
    for param_name in model_helper_obj._device_grouped_blobs.keys():
        with core.DeviceScope(master_gpu_opt):
            _AllReduce(
                devices, model_helper_obj,
                model_helper_obj._global_model_param_updates_net,
                param_name
            )

    # (Step-3) Update momentum params:
    # param_v = block_momentum * param_v
    #           + block_learning_rate * (param_avg - param)
    # param = param + param_v
    for param_name in model_helper_obj._device_grouped_blobs.keys():
        param = model_helper_obj._device_grouped_blobs[param_name][master_gpu]
        with core.DeviceScope(master_gpu_opt):
            # TODO(ataei) : Stop building the graph here to get model average ?
            model_helper_obj._global_model_param_updates_net.Scale(
                param, param, scale=1.0 / num_workers
            )
            model_helper_obj._global_model_param_updates_net.Sub(
                [param, _g(param)], param
            )
            model_helper_obj._global_model_param_updates_net.Scale(
                param, param, scale=block_learning_rate
            )
            model_helper_obj._global_model_param_updates_net.Scale(
                _v(param), _v(param), scale=block_momentum
            )
            model_helper_obj._global_model_param_updates_net.Add(
                [_v(param), param], _v(param)
            )
            model_helper_obj._global_model_param_updates_net.Add(
                [_g(param), _v(param)], _g(param)
            )
            model_helper_obj._global_model_param_updates_net.Copy(
                _g(param), param
            )
            _Broadcast(
                devices, model_helper_obj,
                model_helper_obj._global_model_param_updates_net,
                param_name
            )

    # Reset momentum-SGD parameters
    if reset_momentum_sgd:
        momentum_ops = [op for op in model_helper_obj.net.Proto().op
                        if op.type == 'MomentumSGDUpdate']
        for op in momentum_ops:
            momentum_blob = op.input[1]
            with core.DeviceScope(op.device_option):
                model_helper_obj._global_model_param_updates_net.ConstantFill(
                    [momentum_blob], momentum_blob, value=0.0
                )

    if optimize_gradient_memory:
        _OptimizeGradientMemorySimple(
            model_helper_obj, model_helper_obj._losses_by_gpu, devices
        )

    model_helper_obj._data_parallel_model_init_nets = [
        model_helper_obj.param_init_net,
        model_helper_obj._global_model_init_net
    ]
    model_helper_obj._data_parallel_model_nets = [
        model_helper_obj.net,
        (model_helper_obj._global_model_param_updates_net, 1)
    ]
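
The step comments above imply the execution order for BMUF training. A hedged sketch of the outer loop, assuming `model` is the model_helper_obj passed to Parallelize_GPU_BMUF; the iteration counts are illustrative:

from caffe2.python import workspace

num_outer_iterations = 100  # illustrative
num_local_iterations = 10   # illustrative: local SGD steps between block updates

# One-time initialization of local params and the global _g/_v copies.
workspace.RunNetOnce(model.param_init_net)
workspace.RunNetOnce(model._global_model_init_net)
workspace.CreateNet(model.net)
workspace.CreateNet(model._global_model_param_updates_net)

for _ in range(num_outer_iterations):
    for _ in range(num_local_iterations):  # (Step-1) local model updates
        workspace.RunNet(model.net.Proto().name)
    # (Step-2/3) average params across GPUs and apply blockwise filtering
    workspace.RunNet(model._global_model_param_updates_net.Proto().name)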