Example 1
def _test_reshape(old_shape, new_shape, expected_shape=None, arg_shape=True,
                  in_place=False):
    devices = [core.DeviceOption(caffe2_pb2.CPU, 0)]
    if workspace.NumGpuDevices() > 0:
        devices.append(core.DeviceOption(workspace.GpuDeviceType, 0))

    for device_opt in devices:
        with core.DeviceScope(device_opt):
            if expected_shape is None:
                expected_shape = new_shape
            X = np.random.rand(*old_shape).astype(np.float32)

            blob_in = 'X'
            blob_out = blob_in if in_place else blob_in + '_out'

            if arg_shape:
                op = core.CreateOperator('Reshape',
                                         [blob_in],
                                         [blob_out, 'old_shape'],
                                         shape=new_shape)
            else:
                op = core.CreateOperator('Reshape',
                                         [blob_in, 'new_shape'],
                                         [blob_out, 'old_shape'])
                workspace.FeedBlob('new_shape', np.asarray(new_shape))

            workspace.FeedBlob(blob_in, X)
            workspace.RunOperatorOnce(op)

            Y = workspace.FetchBlob(blob_out)
            np.testing.assert_allclose(Y, X.reshape(expected_shape))
Example 2
    def test_prepend_dim(self):
        devices = [core.DeviceOption(caffe2_pb2.CPU, 0)]
        if workspace.NumGpuDevices() > 0:
            devices.append(core.DeviceOption(workspace.GpuDeviceType, 0))

        for device_opt in devices:
            with core.DeviceScope(device_opt):
                self._test_fwd_bwd()
Example 3
    def test_executor(self, executor, num_workers):
        model = build_resnet50_dataparallel_model(
            num_gpus=workspace.NumGpuDevices(), batch_size=8, epoch_size=8)
        model.Proto().num_workers = num_workers

        def run_model():
            run_resnet50_epoch(model, batch_size=8, epoch_size=8)

        self.compare_executors(
            model,
            ref_executor="simple",
            test_executor=executor,
            model_run_func=run_model,
        )
Example 4
def _test_reshape_output_and_gradient(old_shape,
                                      new_shape,
                                      expected_shape=None,
                                      arg_shape=True,
                                      in_place=False,
                                      expected_gradient=None):
    devices = [core.DeviceOption(caffe2_pb2.CPU, 0)]
    if workspace.NumGpuDevices() > 0:
        devices.append(core.DeviceOption(workspace.GpuDeviceType, 0))

    for device_opt in devices:
        with core.DeviceScope(device_opt):
            if expected_shape is None:
                expected_shape = new_shape
            net = core.Net('net')

            if len(old_shape) == 0:
                # scalar, convert to tensor before feeding to blob
                X = np.atleast_1d(np.random.rand(*old_shape))
            else:
                X = np.random.rand(*old_shape).astype(np.float32)
            blob_in = 'X'
            blob_out = blob_in if in_place else blob_in + '_out'

            if arg_shape:
                out, _ = net.Reshape([blob_in], [blob_out, 'old_shape'],
                                     shape=new_shape)
            else:
                out, _ = net.Reshape([blob_in, 'new_shape'],
                                     [blob_out, 'old_shape'])
                workspace.FeedBlob('new_shape', np.asarray(new_shape))

            workspace.FeedBlob(blob_in, X)
            if expected_gradient is not None:
                net.AddGradientOperators([out])
            workspace.CreateNet(net)
            workspace.RunNetOnce(net)

            Y = workspace.FetchBlob(blob_out)
            np.testing.assert_allclose(Y, X.reshape(expected_shape))
            if expected_gradient is not None:
                data_grad = workspace.FetchBlob(blob_in + '_grad')
                np.testing.assert_array_equal(data_grad, expected_gradient)
Example 5
    def test_timings(self):
        for n in range(2, workspace.NumGpuDevices()):
            for in_place in [False, True]:
                xs = [
                    np.random.randn(int(1e7)).astype(np.float32)
                    for i in range(n)
                ]
                inputs = [str("x_{}".format(i)) for i in range(n)]
                prefix = "" if in_place else "o"
                outputs = [str("{}x_{}".format(prefix, i)) for i in range(n)]

                net = core.Net("test")
                net.NCCLAllreduce(inputs, outputs)
                net.RunAllOnGPU()
                for i in range(n):
                    self.ws.create_blob(inputs[i]).feed(xs[i], gpu_device(i))
                self.ws.run(net)
                net_time = benchmark(self.ws, net)
                vanilla = core.Net("vanilla")
                muji.Allreduce(vanilla, inputs)
                vanilla_time = benchmark(self.ws, vanilla)
                print("Speedup for NCCL: {:.2f}".format(vanilla_time /
                                                        net_time))
Example 6
def bmuf_process(filestore_dir,
                 process_id,
                 shared_results,
                 cpu_device=False,
                 nesterov=False):
    # We need to import caffe2 in every process to initialize CUDA independently.
    from caffe2.python import core, cnn, data_parallel_model, dyndep, workspace
    from caffe2.proto import caffe2_pb2
    dyndep.InitOpsLibrary("@/caffe2/caffe2/distributed:file_store_handler_ops")

    if not cpu_device:
        if not workspace.has_gpu_support and not workspace.has_hip_support:
            log.info('No GPU support; test is ignored.')
            return
        if workspace.NumGpuDevices() < 4:
            log.info('Fewer than 4 GPUs available; test is ignored.')
            return

    model = cnn.CNNModelHelper(order="NHWC", name="test")
    if not cpu_device:
        device_type = workspace.GpuDeviceType
        device_prefix = "gpu"
    else:
        device_type = caffe2_pb2.CPU
        device_prefix = "cpu"

    devices = [0, 1] if process_id == 0 else [2, 3]

    def _model_build_fun(model, loss_scale):
        fc = model.FC("data", "fc", 16, 1, ("ConstantFill", {}),
                      ("ConstantFill", {}))
        fc_fl = model.FlattenToVec(fc, "fc_fl")
        sigm = model.Sigmoid(fc_fl, "sigm")
        sq = model.SquaredL2Distance([sigm, "label"], "sq")
        loss = model.AveragedLoss(sq, "loss")
        loss = model.Scale(loss, scale=loss_scale)

        # For testing explicit sync
        model.param_init_net.UniformFill([], ["sync_num"], shape=[1])
        return [loss]

    def _input_builder_fun(model):
        return None

    def _param_update_fun(model):
        ITER = model.Iter("ITER")
        LR = model.net.LearningRate(
            [ITER],
            "LR",
            base_lr=(-0.1),
            policy="fixed",
        )
        ONE = model.param_init_net.ConstantFill(
            [],
            "ONE",
            shape=[1],
            value=1.0,
        )
        for param in model.GetParams():
            grad = model.param_to_grad[param]
            model.WeightedSum([param, ONE, grad, LR], param)

    def _generate_data(devices, process_id, device_type, device_prefix):
        np.random.seed(26 + process_id * 10)
        # Each run has same input, independent of number of gpus
        batch_size = 64
        for _ in range(0, 10):
            full_data = np.random.rand(batch_size, 16)
            full_labels = np.round(full_data[:, 0])
            batch_per_device = batch_size // len(devices)

            for (j, g) in enumerate(devices):
                st = j * batch_per_device
                en = st + batch_per_device
                data = full_data[st:en, :].astype(np.float32)
                labels = full_labels[st:en].astype(np.float32)
                with core.DeviceScope(core.DeviceOption(device_type, g)):
                    workspace.FeedBlob("{}_{}/data".format(device_prefix, g),
                                       data)
                    workspace.FeedBlob("{}_{}/label".format(device_prefix, g),
                                       labels)

    _generate_data(devices, process_id, device_type, device_prefix)

    workspace.RunOperatorOnce(
        core.CreateOperator("FileStoreHandlerCreate", [], ["store_handler"],
                            path=filestore_dir))
    rendezvous = dict(kv_handler="store_handler",
                      shard_id=process_id,
                      num_shards=2,
                      engine="GLOO",
                      exit_nets=None)

    data_parallel_model.Parallelize_BMUF(model,
                                         _input_builder_fun,
                                         _model_build_fun,
                                         _param_update_fun,
                                         devices=devices,
                                         rendezvous=rendezvous,
                                         nesterov=nesterov,
                                         add_blobs_to_sync=["sync_num"],
                                         cpu_device=cpu_device)

    data_parallel_model.RunInitNet(model)

    def _device_pid(device, pid):
        if pid == 1:
            return device + 2
        return device

    np.testing.assert_equal(
        workspace.FetchBlob("{}_{}/fc_w_v".format(device_prefix,
                                                  _device_pid(0, process_id))),
        np.zeros(16).astype(np.float32).reshape(1, 16))

    # Run the algorithm for one iteration to have non-zero params.
    data_parallel_model.RunNet(model, 1)

    # Save iteration momentum and post local update params
    results = {}
    v_b_ = workspace.FetchBlob("{}_{}/fc_b_v".format(
        device_prefix, _device_pid(0, process_id)))
    v_w_ = workspace.FetchBlob("{}_{}/fc_w_v".format(
        device_prefix, _device_pid(0, process_id)))

    results['v_b_'] = v_b_
    results['v_w_'] = v_w_

    workspace.RunNetOnce(model.net)

    b_0_ = workspace.FetchBlob("{}_{}/fc_b".format(device_prefix,
                                                   _device_pid(0, process_id)))
    w_0_ = workspace.FetchBlob("{}_{}/fc_w".format(device_prefix,
                                                   _device_pid(0, process_id)))
    b_1_ = workspace.FetchBlob("{}_{}/fc_b".format(device_prefix,
                                                   _device_pid(1, process_id)))
    w_1_ = workspace.FetchBlob("{}_{}/fc_w".format(device_prefix,
                                                   _device_pid(1, process_id)))

    results['b_0_'] = b_0_
    results['w_0_'] = w_0_
    results['b_1_'] = b_1_
    results['w_1_'] = w_1_

    # Test sync
    if process_id == 0:
        workspace.FeedBlob(device_prefix + "_0/sync_num",
                           np.array([2603]).astype(np.float32),
                           device_option=core.DeviceOption(device_type, 0))

    # Compute block gradients.
    b_g_ = workspace.FetchBlob("{}_{}/fc_b_g".format(
        device_prefix, _device_pid(0, process_id)))
    w_g_ = workspace.FetchBlob("{}_{}/fc_w_g".format(
        device_prefix, _device_pid(0, process_id)))
    results['b_g_'] = b_g_
    results['w_g_'] = w_g_
    workspace.RunNetOnce(model._global_model_param_updates_net)

    #  g_b = (b_0_ + b_1_) / 2 - b_g_
    #  g_w = (w_0_ + w_1_) / 2 - w_g_
    v_b = workspace.FetchBlob("{}_{}/fc_b_v".format(device_prefix,
                                                    _device_pid(0,
                                                                process_id)))
    v_w = workspace.FetchBlob("{}_{}/fc_w_v".format(device_prefix,
                                                    _device_pid(0,
                                                                process_id)))
    w_g = workspace.FetchBlob("{}_{}/fc_w_g".format(device_prefix,
                                                    _device_pid(0,
                                                                process_id)))
    b_g = workspace.FetchBlob("{}_{}/fc_b_g".format(device_prefix,
                                                    _device_pid(0,
                                                                process_id)))
    w_0 = workspace.FetchBlob("{}_{}/fc_w".format(device_prefix,
                                                  _device_pid(0, process_id)))
    b_0 = workspace.FetchBlob("{}_{}/fc_b".format(device_prefix,
                                                  _device_pid(0, process_id)))
    w_1 = workspace.FetchBlob("{}_{}/fc_w".format(device_prefix,
                                                  _device_pid(1, process_id)))
    b_1 = workspace.FetchBlob("{}_{}/fc_b".format(device_prefix,
                                                  _device_pid(1, process_id)))
    results['v_b'] = v_b
    results['v_w'] = v_w
    results['w_g'] = w_g
    results['b_g'] = b_g
    results['w_0'] = w_0
    results['b_0'] = b_0
    results['w_1'] = w_1
    results['b_1'] = b_1

    # Test add_blobs_to_sync
    for j in devices:
        sync = workspace.FetchBlob(device_prefix + "_{}/sync_num".format(j))[0]
        results['sync_{}'.format(j)] = sync

    shared_results[process_id] = results
Example 7
    brew,
    core,
    device_checker,
    gradient_checker,
    model_helper,
    test_util,
    workspace,
)
from caffe2.python.gradient_checker import NetGradientChecker
from caffe2.python.net_builder import ops, NetBuilder
from caffe2.proto import caffe2_pb2

import unittest


if workspace.has_gpu_support and workspace.NumGpuDevices() > 0:
    gpu_device_option = caffe2_pb2.DeviceOption()
    gpu_device_option.device_type = workspace.GpuDeviceType
    cpu_device_option = caffe2_pb2.DeviceOption()
    gpu_device_checker = device_checker.DeviceChecker(
        0.01, [gpu_device_option]
    )
    device_checker = device_checker.DeviceChecker(
        0.01, [gpu_device_option, cpu_device_option]
    )
    gpu_gradient_checkers = [
        gradient_checker.GradientChecker(
            0.005, 0.05, gpu_device_option, "gpu_checker_ws"
        ),
    ]
    gradient_checkers = [
Example 8
    def testGetGpuPeerAccessPattern(self):
        pattern = workspace.GetGpuPeerAccessPattern()
        self.assertEqual(type(pattern), np.ndarray)
        self.assertEqual(pattern.ndim, 2)
        self.assertEqual(pattern.shape[0], pattern.shape[1])
        self.assertEqual(pattern.shape[0], workspace.NumGpuDevices())
Example 9
import errno
import hypothesis.strategies as st
from hypothesis import given, assume, settings
import numpy as np
import os
import shutil
import unittest
from pathlib import Path
from typing import List, Optional, Tuple, Type

from caffe2.proto import caffe2_pb2
from caffe2.python import core, test_util, workspace

if workspace.has_gpu_support:
    DEVICES = [caffe2_pb2.CPU, workspace.GpuDeviceType]
    max_gpuid = workspace.NumGpuDevices() - 1
else:
    DEVICES = [caffe2_pb2.CPU]
    max_gpuid = 0


# Utility class for other loading tests; don't add test functions here.
# Inherit from this class instead. If you add a test here, each derived
# class will inherit it as well and cause test duplication.
class TestLoadSaveBase(test_util.TestCase):

    def __init__(self, methodName, db_type='minidb'):
        super(TestLoadSaveBase, self).__init__(methodName)
        self._db_type = db_type

    @settings(deadline=None)
Example 10
    brew,
    core,
    device_checker,
    gradient_checker,
    model_helper,
    test_util,
    workspace,
)
from caffe2.python.gradient_checker import NetGradientChecker
from caffe2.python.net_builder import ops, NetBuilder
from caffe2.proto import caffe2_pb2

import unittest

if (workspace.has_gpu_support
        or workspace.has_hip_support) and workspace.NumGpuDevices() > 0:
    gpu_device_option = caffe2_pb2.DeviceOption()
    gpu_device_option.device_type = caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA
    cpu_device_option = caffe2_pb2.DeviceOption()
    gpu_device_checker = device_checker.DeviceChecker(0.01,
                                                      [gpu_device_option])
    device_checker = device_checker.DeviceChecker(
        0.01, [gpu_device_option, cpu_device_option])
    gpu_gradient_checkers = [
        gradient_checker.GradientChecker(0.005, 0.05, gpu_device_option,
                                         "gpu_checker_ws"),
    ]
    gradient_checkers = [
        gradient_checker.GradientChecker(0.005, 0.05, gpu_device_option,
                                         "gpu_checker_ws"),
        gradient_checker.GradientChecker(0.01, 0.05, cpu_device_option,
Example 11
    sys.path.append("..")
    # data generation
    from data_generator.dlrm_data_caffe2 import DLRMDataGenerator

    from utils.utils import cli

    args = cli()

    ### some basic setup ###
    np.random.seed(args.numpy_rand_seed)
    np.set_printoptions(precision=args.print_precision)

    use_gpu = args.use_gpu
    if use_gpu:
        device_opt = core.DeviceOption(workspace.GpuDeviceType, 0)
        ngpus = workspace.NumGpuDevices()  # 1
        print("Using {} GPU(s)...".format(ngpus))
    else:
        device_opt = core.DeviceOption(caffe2_pb2.CPU)
        print("Using CPU...")

    ### prepare training data ###
    ln_bot = np.fromstring(args.arch_mlp_bot, dtype=int, sep="-")
    dc = DLRMDataGenerator(args)
    if args.data_generation == "dataset":
        print("Error we have disabled this function currently....")
        sys.exit()
        # input and target data
        #(nbatches, lX, lS_l, lS_i, lT,
        # nbatches_test, lX_test, lS_l_test, lS_i_test, lT_test,
        # ln_emb, m_den) = dc.read_dataset(
Example 12
class CopyOpsTest(test_util.TestCase):

    def tearDown(self):
        # Reset workspace after each test
        # Otherwise, the multi-GPU test will use previously created tensors,
        #   which may have been placed on the wrong device
        workspace.ResetWorkspace()

    def run_test_copy_gradient(self, device_opt):
        model = model_helper.ModelHelper(name="copy_test")
        with core.DeviceScope(device_opt):
            x = model.net.AddExternalInputs("x")
            y = model.Copy(x, "y")
            loss = model.AveragedLoss(y, "loss")
            gradient_map = model.AddGradientOperators([loss])
            workspace.FeedBlob(x, np.random.rand(32).astype(np.float32))
            workspace.RunNetOnce(model.param_init_net)
            workspace.RunNetOnce(model.net)
            self.assertTrue(np.array_equal(
                workspace.FetchBlob(x),
                workspace.FetchBlob(y),
            ))
            self.assertTrue(np.array_equal(
                workspace.FetchBlob(gradient_map[x]),
                workspace.FetchBlob(gradient_map[y]),
            ))

    def test_copy_gradient_cpu(self):
        self.run_test_copy_gradient(core.DeviceOption(caffe2_pb2.CPU, 0))

    @unittest.skipIf(workspace.NumGpuDevices() < 1, "Need at least 1 GPU.")
    def test_copy_gradient_gpu(self):
        self.run_test_copy_gradient(core.DeviceOption(workspace.GpuDeviceType, 0))

    @unittest.skipIf(workspace.NumGpuDevices() < 2, "Need at least 2 GPU.")
    def test_copy_gradient_multiple_gpus(self):
        model = model_helper.ModelHelper(name="copy_test")

        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)):
            x_cpu = model.net.AddExternalInputs("x_cpu")

        with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, 0)):
            x_gpu_1 = model.CopyCPUToGPU(x_cpu, "x_gpu_1")

        with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, 1)):
            x_gpu_2 = model.Copy(x_gpu_1, "x_gpu_2")
            loss = model.AveragedLoss(x_gpu_2, "loss")
            gradient_map = model.AddGradientOperators([loss])

        workspace.FeedBlob("x_cpu", np.random.rand(32).astype(np.float32))
        workspace.RunNetOnce(model.param_init_net)
        workspace.RunNetOnce(model.net)

        self.assertTrue(np.array_equal(
            workspace.FetchBlob("x_gpu_1"),
            workspace.FetchBlob("x_gpu_2"),
        ))
        self.assertTrue(np.array_equal(
            workspace.FetchBlob(gradient_map["x_gpu_1"]),
            workspace.FetchBlob(gradient_map["x_gpu_2"]),
        ))

        def get_op_with_output(model, output_blob_name):
            for op in model.net.Proto().op:
                if len(op.output) == 1 and op.output[0] == output_blob_name:
                    return op
            return None

        self.assertEqual(
            get_op_with_output(model, "x_gpu_2_grad").device_option,
            core.DeviceOption(workspace.GpuDeviceType, 1),
        )
        self.assertEqual(
            get_op_with_output(model, "x_cpu_grad").device_option,
            core.DeviceOption(workspace.GpuDeviceType, 0),
        )

    @unittest.skipIf(workspace.NumGpuDevices() < 1, "Need at least 1 GPU.")
    def test_cpu2gpu_gpu2cpu_sparse_gradients(self):
        model = model_helper.ModelHelper(name="copy_test")
        v = model.param_init_net.UniformFill([], ["v"], shape=[16, 4])
        indices = model.param_init_net.UniformFill([], ["v"], shape=[16, 4])
        cpu_opt = core.DeviceOption(caffe2_pb2.CPU, 0)
        gpu_opt = core.DeviceOption(workspace.GpuDeviceType, 0)

        with core.DeviceScope(gpu_opt):
            vcpu = model.CopyGPUToCPU(v, "vcpu")

        with core.DeviceScope(cpu_opt):
            g = model.Gather([vcpu, indices], "g")

        with core.DeviceScope(gpu_opt):
            ggpu = model.CopyCPUToGPU(g, "ggpu")
            f = brew.fc(model, ggpu, "out", dim_in=4, dim_out=6)
            (softmax, loss) = model.SoftmaxWithLoss(
                [f, "label"],
                ["softmax", "loss"],
            )
        gradient_map = model.AddGradientOperators([loss])
        self.assertTrue("v" in gradient_map)
        self.assertTrue(isinstance(gradient_map['v'], core.GradientSlice))

    @unittest.skipIf(workspace.NumGpuDevices() < 1, "Need at least 1 GPU.")
    def test_cpu2gpu_gpu2cpu_gradients(self):
        model = model_helper.ModelHelper(name="copy_test")

        batch = 32
        cpu_opt = core.DeviceOption(caffe2_pb2.CPU, 0)
        gpu_opt = core.DeviceOption(workspace.GpuDeviceType, 0)

        with core.NameScope("cpu"):
            with core.DeviceScope(cpu_opt):
                x_cpu = brew.fc(model, 'data', 'x_cpu', 16, 8)

        with core.NameScope("gpu_0"):
            with core.DeviceScope(gpu_opt):
                x_gpu = model.CopyCPUToGPU(x_cpu, "x_gpu")
                pred_gpu = brew.fc(model, x_gpu, "pred_gpu", 8, 4)
                pred_cpu = model.CopyGPUToCPU(pred_gpu, "pred_cpu")

        with core.DeviceScope(cpu_opt):
            with core.NameScope("cpu"):
                (softmax, loss) = model.SoftmaxWithLoss(
                    [pred_cpu, "label"],
                    ["softmax", "loss"],
                )

        gradient_map = model.AddGradientOperators([loss])

        # Add param updates (for cpu and gpu)
        init_net = model.param_init_net
        with core.DeviceScope(cpu_opt):
            with core.NameScope("cpu"):
                ONE = init_net.ConstantFill([], "ONE", shape=[1], value=1.)
                LR = init_net.ConstantFill([], "LR", shape=[1], value=-2.0)
                for param in model.GetParams():
                    model.WeightedSum(
                        [param, ONE, gradient_map[param], LR],
                        param,
                    )

        with core.NameScope("gpu_0"):
            with core.DeviceScope(gpu_opt):
                ONE = init_net.ConstantFill([], "ONE", shape=[1], value=1.)
                LR = init_net.ConstantFill([], "LR", shape=[1], value=-2.0)
                for param in model.GetParams():
                    model.WeightedSum(
                        [param, ONE, gradient_map[param], LR],
                        param,
                    )

        with core.DeviceScope(cpu_opt):
            workspace.FeedBlob(
                'cpu/data',
                np.random.rand(batch, 16).astype(np.float32),
            )
            workspace.FeedBlob(
                'cpu/label',
                np.random.randint(4, size=batch).astype(np.int32),
            )

        workspace.RunNetOnce(model.param_init_net)
        workspace.CreateNet(model.net)

        initial_params = {p: workspace.FetchBlob(p) for p in model.GetParams()}
        workspace.RunNet(model.net.Proto().name)
        updated_params = {p: workspace.FetchBlob(p) for p in model.GetParams()}

        for p in model.GetParams():
            g = gradient_map[p]
            expected = initial_params[p] - 2.0 * workspace.FetchBlob(g)
            actual = updated_params[p]
            self.assertTrue(
                np.array_equal(expected, updated_params[p]),
                "Mismatch: {}: {}, {}".format(p, expected, actual),
            )
Example 13
    )


cpu_do = caffe2_pb2.DeviceOption()
cuda_do = caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA)
hip_do = caffe2_pb2.DeviceOption(device_type=caffe2_pb2.HIP)
gpu_do = caffe2_pb2.DeviceOption(device_type=workspace.GpuDeviceType)  # CUDA or ROCm
# (bddppq) Do not rely on this no_hip option! It is only used to
# temporarily skip some flaky tests on ROCm until its support matures.
_device_options_no_hip = [cpu_do] + ([cuda_do] if workspace.has_cuda_support else [])
device_options = _device_options_no_hip + ([hip_do] if workspace.has_hip_support else [])

# Include device option for each GPU
expanded_device_options = [cpu_do] + [
    caffe2_pb2.DeviceOption(device_type=workspace.GpuDeviceType, device_id=i)
    for i in range(workspace.NumGpuDevices())]


def device_checker_device_options():
    return st.just(device_options)


def gradient_checker_device_option():
    return st.sampled_from(device_options)


gcs = dict(
    gc=gradient_checker_device_option(),
    dc=device_checker_device_options()
)
Example 14
class NCCLOpsTest(hu.HypothesisTestCase):
    @given(n=st.integers(min_value=2, max_value=workspace.NumGpuDevices()),
           m=st.integers(min_value=1, max_value=1000),
           in_place=st.booleans())
    def test_nccl_allreduce(self, n, m, in_place):
        xs = [np.random.randn(m).astype(np.float32) for i in range(n)]
        inputs = [str("x_{}".format(i)) for i in range(n)]
        prefix = "" if in_place else "o"
        outputs = [str("{}x_{}".format(prefix, i)) for i in range(n)]
        op = core.CreateOperator("NCCLAllreduce", inputs, outputs)
        input_device_options = {n: gpu_device(i) for i, n in enumerate(inputs)}

        def allreduce(*args):
            assert len(args) == n
            output = np.sum(args, axis=0)
            return [output for _ in range(n)]

        outputs = self.assertReferenceChecks(
            hu.gpu_do, op, [xs[i] for i, _ in enumerate(inputs)], allreduce,
            input_device_options)
        for output in outputs:
            np.testing.assert_array_equal(outputs[0], output)
            self.assertEqual(outputs[0].tobytes(), output.tobytes())

    @given(n=st.integers(min_value=2, max_value=workspace.NumGpuDevices()),
           m=st.integers(min_value=1, max_value=1000),
           root=st.integers(min_value=0,
                            max_value=workspace.NumGpuDevices() - 1))
    def test_nccl_broadcast(self, n, m, root):
        assume(root < n)
        xs = [np.random.randn(m).astype(np.float32) for i in range(n)]
        inputs = [str("x_{}".format(i)) for i in range(n)]
        op = core.CreateOperator("NCCLBroadcast", inputs, inputs, root=root)
        input_device_options = {n: gpu_device(i) for i, n in enumerate(inputs)}

        def broadcast(*args):
            assert len(args) == n
            return [args[root] for _ in range(n)]

        self.assertReferenceChecks(hu.gpu_do, op,
                                   [xs[i] for i, _ in enumerate(inputs)],
                                   broadcast, input_device_options)

    @given(
        n=st.integers(min_value=2, max_value=workspace.NumGpuDevices()),
        m=st.integers(min_value=1, max_value=1000),
        # NCCL Reduce seems to deadlock for non-zero roots.
        root=st.integers(min_value=0, max_value=0),
        in_place=st.booleans())
    def test_nccl_reduce(self, n, m, root, in_place):
        assume(in_place is False or root == 0)
        xs = [np.random.randn(m).astype(np.float32) for i in range(n)]
        inputs = [str("x_{}".format(i)) for i in range(n)]
        op = core.CreateOperator("NCCLReduce",
                                 inputs,
                                 inputs[root] if in_place else b"o",
                                 root=root)
        input_device_options = {n: gpu_device(i) for i, n in enumerate(inputs)}

        def reduce(*args):
            assert len(args) == n
            return [np.sum(args, axis=0)]

        self.assertReferenceChecks(hu.gpu_do, op,
                                   [xs[i] for i, _ in enumerate(inputs)],
                                   reduce, input_device_options)

    @given(n=st.integers(min_value=2, max_value=workspace.NumGpuDevices()),
           m=st.integers(min_value=1, max_value=1000))
    def test_nccl_allgather(self, n, m):
        xs = [np.random.randn(m).astype(np.float32) for i in range(n)]
        inputs = [str("x_{}".format(i)) for i in range(n)]
        outputs = [str("o_{}".format(i)) for i in range(n)]
        op = core.CreateOperator("NCCLAllGather", inputs, outputs)
        input_device_options = {n: gpu_device(i) for i, n in enumerate(inputs)}

        def allgather(*args):
            assert len(args) == n
            return [np.stack(args, axis=0) for _ in range(n)]

        outputs = self.assertReferenceChecks(
            hu.gpu_do, op, [xs[i] for i, _ in enumerate(inputs)], allgather,
            input_device_options)
        for output in outputs:
            np.testing.assert_array_equal(outputs[0], output)
            self.assertEqual(outputs[0].tobytes(), output.tobytes())

    @given(n=st.integers(min_value=2, max_value=workspace.NumGpuDevices()),
           m=st.integers(min_value=1, max_value=1000))
    def test_nccl_reduce_scatter(self, n, m):
        xs = [np.random.randn(n, m).astype(np.float32) for i in range(n)]
        inputs = [str("x_{}".format(i)) for i in range(n)]
        outputs = [str("o_{}".format(i)) for i in range(n)]
        op = core.CreateOperator("NCCLReduceScatter", inputs, outputs)
        input_device_options = {n: gpu_device(i) for i, n in enumerate(inputs)}

        def reduce_scatter(*args):
            assert len(args) == n
            reduced = sum(args)
            assert len(reduced.shape) > 1
            ref = [reduced[i, :] for i in range(n)]
            return ref

        self.assertReferenceChecks(hu.gpu_do, op,
                                   [xs[i] for i, _ in enumerate(inputs)],
                                   reduce_scatter, input_device_options)

    @given(n=st.integers(min_value=2, max_value=workspace.NumGpuDevices()),
           m=st.integers(min_value=100000, max_value=100000),
           iters=st.integers(min_value=1, max_value=100),
           net_type=st.sampled_from(["dag", "async_dag", "simple"]))
    def _test_nccl_sync(self, n, m, iters, net_type):
        inputs = [str("x_{}".format(i)) for i in range(n)]
        extra_inputs = [str("xe_{}".format(i)) for i in range(n)]
        net = core.Net("asdf")
        net.Proto().type = net_type
        net.Proto().num_workers = n
        for i in range(n):
            net.ConstantFill([],
                             inputs[i],
                             shape=[m],
                             value=0.0,
                             device_option=gpu_device(i))
            net.ConstantFill([],
                             extra_inputs[i],
                             shape=[m],
                             value=1.0,
                             device_option=gpu_device(i))
            for _ in range(iters):
                net.Sum([inputs[i], extra_inputs[i]], [inputs[i]],
                        device_option=gpu_device(i))
        net.NCCLReduce(inputs, [inputs[0]], device_option=gpu_device(0))
        self.ws.run(net)
        np.testing.assert_array_equal(
            self.ws.blobs[inputs[0]].fetch(),
            np.full(shape=(m, ), fill_value=iters * n, dtype=np.float32))

    @unittest.skipIf(not os.environ.get("CAFFE2_BENCHMARK"), "Benchmark")
    def test_timings(self):
        for n in range(2, workspace.NumGpuDevices()):
            for in_place in [False, True]:
                xs = [
                    np.random.randn(int(1e7)).astype(np.float32)
                    for i in range(n)
                ]
                inputs = [str("x_{}".format(i)) for i in range(n)]
                prefix = "" if in_place else "o"
                outputs = [str("{}x_{}".format(prefix, i)) for i in range(n)]

                net = core.Net("test")
                net.NCCLAllreduce(inputs, outputs)
                net.RunAllOnGPU()
                for i in range(n):
                    self.ws.create_blob(inputs[i]).feed(xs[i], gpu_device(i))
                self.ws.run(net)
                net_time = benchmark(self.ws, net)
                vanilla = core.Net("vanilla")
                muji.Allreduce(vanilla, inputs)
                vanilla_time = benchmark(self.ws, vanilla)
                print("Speedup for NCCL: {:.2f}".format(vanilla_time /
                                                        net_time))
Example 15
    def testAllreduceSingleGPU(self):
        for i in range(workspace.NumGpuDevices()):
            self.RunningAllreduceWithGPUs([i], muji.Allreduce)
Example 16
    def testAllreduceFallback(self):
        self.RunningAllreduceWithGPUs(list(range(workspace.NumGpuDevices())),
                                      muji.AllreduceFallback)