def _test_reshape(old_shape, new_shape, expected_shape=None, arg_shape=True,
                  in_place=False):
    """Run one Reshape operator per available device and verify its output.

    Args:
        old_shape: shape of the random float32 tensor fed as input.
        new_shape: target shape, supplied either as the ``shape`` operator
            argument or through the ``new_shape`` input blob.
        expected_shape: shape the fetched output is compared against;
            defaults to ``new_shape`` when None.
        arg_shape: True to pass the shape as an operator argument, False to
            feed it as a second input blob.
        in_place: True to write the output to the input blob itself.
    """
    # The default only depends on the arguments, so resolve it once up front.
    if expected_shape is None:
        expected_shape = new_shape

    device_options = [core.DeviceOption(caffe2_pb2.CPU, 0)]
    if workspace.NumGpuDevices() > 0:
        device_options.append(core.DeviceOption(workspace.GpuDeviceType, 0))

    input_name = 'X'
    output_name = input_name if in_place else input_name + '_out'

    for option in device_options:
        with core.DeviceScope(option):
            data = np.random.rand(*old_shape).astype(np.float32)

            if arg_shape:
                reshape_op = core.CreateOperator(
                    'Reshape',
                    [input_name],
                    [output_name, 'old_shape'],
                    shape=new_shape,
                )
            else:
                reshape_op = core.CreateOperator(
                    'Reshape',
                    [input_name, 'new_shape'],
                    [output_name, 'old_shape'],
                )
                # The shape travels as a blob in this variant.
                workspace.FeedBlob('new_shape', np.asarray(new_shape))

            workspace.FeedBlob(input_name, data)
            workspace.RunOperatorOnce(reshape_op)

            result = workspace.FetchBlob(output_name)
            np.testing.assert_allclose(result, data.reshape(expected_shape))
def test_prepend_dim(self):
    # Runs self._test_fwd_bwd() under a CPU device scope and, when at least
    # one GPU is reported, again under a scope for GPU 0.
    device_options = [core.DeviceOption(caffe2_pb2.CPU, 0)]
    if workspace.NumGpuDevices() > 0:
        device_options.append(core.DeviceOption(workspace.GpuDeviceType, 0))

    for option in device_options:
        with core.DeviceScope(option):
            self._test_fwd_bwd()
def test_executor(self, executor, num_workers):
    # Builds a data-parallel ResNet-50 model over every reported GPU and
    # hands it to compare_executors with "simple" as the reference executor.
    batch_size = 8
    epoch_size = 8

    model = build_resnet50_dataparallel_model(
        num_gpus=workspace.NumGpuDevices(),
        batch_size=batch_size,
        epoch_size=epoch_size,
    )
    model.Proto().num_workers = num_workers

    def run_one_epoch():
        run_resnet50_epoch(model, batch_size=batch_size, epoch_size=epoch_size)

    self.compare_executors(
        model,
        ref_executor="simple",
        test_executor=executor,
        model_run_func=run_one_epoch,
    )
def _test_reshape_output_and_gradient(old_shape, new_shape, expected_shape=None,
                                      arg_shape=True, in_place=False,
                                      expected_gradient=None):
    """Run Reshape inside a net on each device; check output and gradient.

    Args:
        old_shape: shape of the random input tensor; an empty shape yields a
            one-element array instead of a scalar.
        new_shape: target shape, given as the ``shape`` argument or through
            the ``new_shape`` input blob.
        expected_shape: shape the output is compared against; defaults to
            ``new_shape`` when None.
        arg_shape: True to pass the shape as an argument, False to feed it
            as a blob.
        in_place: True to write the output to the input blob itself.
        expected_gradient: when not None, gradient operators are added and
            the ``X_grad`` blob must equal this value exactly.
    """
    if expected_shape is None:
        expected_shape = new_shape

    check_gradient = expected_gradient is not None

    device_options = [core.DeviceOption(caffe2_pb2.CPU, 0)]
    if workspace.NumGpuDevices() > 0:
        device_options.append(core.DeviceOption(workspace.GpuDeviceType, 0))

    input_name = 'X'
    output_name = input_name if in_place else input_name + '_out'

    for option in device_options:
        with core.DeviceScope(option):
            net = core.Net('net')

            if len(old_shape) == 0:
                # scalar, convert to tensor before feeding to blob
                data = np.atleast_1d(np.random.rand(*old_shape))
            else:
                data = np.random.rand(*old_shape).astype(np.float32)

            if arg_shape:
                reshaped, _ = net.Reshape(
                    [input_name],
                    [output_name, 'old_shape'],
                    shape=new_shape,
                )
            else:
                reshaped, _ = net.Reshape(
                    [input_name, 'new_shape'],
                    [output_name, 'old_shape'],
                )
                workspace.FeedBlob('new_shape', np.asarray(new_shape))

            workspace.FeedBlob(input_name, data)
            if check_gradient:
                net.AddGradientOperators([reshaped])

            workspace.CreateNet(net)
            workspace.RunNetOnce(net)

            result = workspace.FetchBlob(output_name)
            np.testing.assert_allclose(result, data.reshape(expected_shape))

            if check_gradient:
                input_grad = workspace.FetchBlob(input_name + '_grad')
                np.testing.assert_array_equal(input_grad, expected_gradient)
def test_timings(self):
    # Compares NCCLAllreduce with muji.Allreduce on the same inputs and
    # prints the ratio of the two benchmark() results, once per GPU count
    # and per in-place/out-of-place variant.
    #
    # np.random.randn only accepts integer dimensions; the float literal
    # 1e7 raises TypeError, so the element count is made an int here.
    num_elements = int(1e7)

    for n in range(2, workspace.NumGpuDevices()):
        for in_place in [False, True]:
            xs = [
                np.random.randn(num_elements).astype(np.float32)
                for _ in range(n)
            ]
            inputs = [str("x_{}".format(i)) for i in range(n)]
            prefix = "" if in_place else "o"
            outputs = [str("{}x_{}".format(prefix, i)) for i in range(n)]

            net = core.Net("test")
            net.NCCLAllreduce(inputs, outputs)
            net.RunAllOnGPU()
            # Input i is fed with the device option returned by gpu_device(i).
            for i in range(n):
                self.ws.create_blob(inputs[i]).feed(xs[i], gpu_device(i))
            self.ws.run(net)
            net_time = benchmark(self.ws, net)

            vanilla = core.Net("vanilla")
            muji.Allreduce(vanilla, inputs)
            vanilla_time = benchmark(self.ws, vanilla)

            print("Speedup for NCCL: {:.2f}".format(vanilla_time / net_time))
def bmuf_process(filestore_dir, process_id, shared_results,
                 cpu_device=False, nesterov=False):
    """Run one shard of a two-shard BMUF training check and record blobs.

    Builds a small FC model, parallelizes it with
    ``data_parallel_model.Parallelize_BMUF`` over two devices, runs one
    iteration and the global parameter update net, and stores the fetched
    parameter, momentum, block-gradient and ``sync_num`` blobs in
    ``shared_results[process_id]``.

    Args:
        filestore_dir: path handed to the ``FileStoreHandlerCreate`` operator.
        process_id: shard id; 0 uses devices [0, 1], anything else [2, 3].
        shared_results: mapping that receives this process's results dict.
        cpu_device: run on CPU devices instead of GPUs.
        nesterov: forwarded to ``Parallelize_BMUF``.

    Returns:
        None. On a GPU run it returns early, storing nothing, when no GPU/HIP
        support is present or fewer than 4 GPUs are reported.
    """
    # We need to import caffe2 in every process to initialize CUDA independently.
    from caffe2.python import core, cnn, data_parallel_model, dyndep, workspace
    from caffe2.proto import caffe2_pb2
    dyndep.InitOpsLibrary("@/caffe2/caffe2/distributed:file_store_handler_ops")

    if not cpu_device:
        if not (workspace.has_gpu_support or workspace.has_hip_support):
            log.info('No GPU support test is Ignored.')
            return
        if workspace.NumGpuDevices() < 4:
            log.info('Not enough GPU support, test IGNORED')
            return

    model = cnn.CNNModelHelper(order="NHWC", name="test")

    if cpu_device:
        device_type = caffe2_pb2.CPU
        device_prefix = "cpu"
    else:
        device_type = workspace.GpuDeviceType
        device_prefix = "gpu"

    if process_id == 0:
        devices = [0, 1]
    else:
        devices = [2, 3]

    def _model_build_fun(model, loss_scale):
        fc = model.FC(
            "data", "fc", 16, 1, ("ConstantFill", {}), ("ConstantFill", {})
        )
        flattened = model.FlattenToVec(fc, "fc_fl")
        activated = model.Sigmoid(flattened, "sigm")
        distance = model.SquaredL2Distance([activated, "label"], "sq")
        loss = model.Scale(model.AveragedLoss(distance, "loss"), scale=loss_scale)

        # For testing explicit sync
        model.param_init_net.UniformFill([], ["sync_num"], shape=[1])
        return [loss]

    def _input_builder_fun(model):
        return None

    def _param_update_fun(model):
        iteration = model.Iter("ITER")
        learning_rate = model.net.LearningRate(
            [iteration],
            "LR",
            base_lr=(-0.1),
            policy="fixed",
        )
        one = model.param_init_net.ConstantFill(
            [], "ONE", shape=[1], value=1.0,
        )
        for param in model.GetParams():
            model.WeightedSum(
                [param, one, model.param_to_grad[param], learning_rate], param)

    def _generate_data(devices, process_id, device_type, device_prefix):
        np.random.seed(26 + process_id * 10)
        # Each run has same input, independent of number of gpus
        batch_size = 64
        per_device = batch_size // len(devices)
        for _ in range(0, 10):
            full_data = np.random.rand(batch_size, 16)
            full_labels = np.round(full_data[:, 0])

            for slot, device_id in enumerate(devices):
                start = slot * per_device
                stop = start + per_device
                data = full_data[start:stop, :].astype(np.float32)
                labels = full_labels[start:stop].astype(np.float32)
                with core.DeviceScope(core.DeviceOption(device_type, device_id)):
                    workspace.FeedBlob(
                        "{}_{}/data".format(device_prefix, device_id), data)
                    workspace.FeedBlob(
                        "{}_{}/label".format(device_prefix, device_id), labels)

    _generate_data(devices, process_id, device_type, device_prefix)

    store_op = core.CreateOperator(
        "FileStoreHandlerCreate", [], ["store_handler"], path=filestore_dir)
    workspace.RunOperatorOnce(store_op)

    rendezvous = dict(
        kv_handler="store_handler",
        shard_id=process_id,
        num_shards=2,
        engine="GLOO",
        exit_nets=None,
    )

    data_parallel_model.Parallelize_BMUF(
        model,
        _input_builder_fun,
        _model_build_fun,
        _param_update_fun,
        devices=devices,
        rendezvous=rendezvous,
        nesterov=nesterov,
        add_blobs_to_sync=["sync_num"],
        cpu_device=cpu_device,
    )

    data_parallel_model.RunInitNet(model)

    def _device_pid(device, pid):
        # Process 1 addresses devices shifted by two.
        if pid == 1:
            return device + 2
        return device

    def _fetch(template, local_device):
        # Fill the blob-name template with the device prefix and the device
        # id this process uses for `local_device`, then fetch that blob.
        blob_name = template.format(
            device_prefix, _device_pid(local_device, process_id))
        return workspace.FetchBlob(blob_name)

    np.testing.assert_equal(
        _fetch("{}_{}/fc_w_v", 0),
        np.zeros(16).astype(np.float32).reshape(1, 16),
    )

    # Run the algorithm for one iteration to have non-zero params.
    data_parallel_model.RunNet(model, 1)

    # Save iteration momentum and post local update params
    results = {}
    for key, template in (
        ('v_b_', "{}_{}/fc_b_v"),
        ('v_w_', "{}_{}/fc_w_v"),
    ):
        results[key] = _fetch(template, 0)

    workspace.RunNetOnce(model.net)

    for key, template, local_device in (
        ('b_0_', "{}_{}/fc_b", 0),
        ('w_0_', "{}_{}/fc_w", 0),
        ('b_1_', "{}_{}/fc_b", 1),
        ('w_1_', "{}_{}/fc_w", 1),
    ):
        results[key] = _fetch(template, local_device)

    # Test sync
    if process_id == 0:
        workspace.FeedBlob(
            device_prefix + "_0/sync_num",
            np.array([2603]).astype(np.float32),
            device_option=core.DeviceOption(device_type, 0),
        )

    # Compute block gradients.
    for key, template in (
        ('b_g_', "{}_{}/fc_b_g"),
        ('w_g_', "{}_{}/fc_w_g"),
    ):
        results[key] = _fetch(template, 0)

    workspace.RunNetOnce(model._global_model_param_updates_net)

    #  g_b = (b_0_ + b_1_) / 2 - b_g_
    #  g_w = (w_0_ + w_1_) / 2 - w_g_
    for key, template, local_device in (
        ('v_b', "{}_{}/fc_b_v", 0),
        ('v_w', "{}_{}/fc_w_v", 0),
        ('w_g', "{}_{}/fc_w_g", 0),
        ('b_g', "{}_{}/fc_b_g", 0),
        ('w_0', "{}_{}/fc_w", 0),
        ('b_0', "{}_{}/fc_b", 0),
        ('w_1', "{}_{}/fc_w", 1),
        ('b_1', "{}_{}/fc_b", 1),
    ):
        results[key] = _fetch(template, local_device)

    # Test add_blobs_to_sync
    for device_id in devices:
        sync_blob = workspace.FetchBlob(
            device_prefix + "_{}/sync_num".format(device_id))
        results['sync_{}'.format(device_id)] = sync_blob[0]

    shared_results[process_id] = results
brew, core, device_checker, gradient_checker, model_helper, test_util, workspace, ) from caffe2.python.gradient_checker import NetGradientChecker from caffe2.python.net_builder import ops, NetBuilder from caffe2.proto import caffe2_pb2 import unittest if workspace.has_gpu_support and workspace.NumGpuDevices() > 0: gpu_device_option = caffe2_pb2.DeviceOption() gpu_device_option.device_type = workspace.GpuDeviceType cpu_device_option = caffe2_pb2.DeviceOption() gpu_device_checker = device_checker.DeviceChecker( 0.01, [gpu_device_option] ) device_checker = device_checker.DeviceChecker( 0.01, [gpu_device_option, cpu_device_option] ) gpu_gradient_checkers = [ gradient_checker.GradientChecker( 0.005, 0.05, gpu_device_option, "gpu_checker_ws" ), ] gradient_checkers = [
def testGetGpuPeerAccessPattern(self):
    # The peer access pattern must be a square 2-D ndarray with one row and
    # one column per reported GPU.
    access = workspace.GetGpuPeerAccessPattern()
    self.assertEqual(type(access), np.ndarray)
    self.assertEqual(access.ndim, 2)

    num_rows = access.shape[0]
    self.assertEqual(num_rows, access.shape[1])
    self.assertEqual(num_rows, workspace.NumGpuDevices())
import errno import hypothesis.strategies as st from hypothesis import given, assume, settings import numpy as np import os import shutil import unittest from pathlib import Path from typing import List, Optional, Tuple, Type from caffe2.proto import caffe2_pb2 from caffe2.python import core, test_util, workspace if workspace.has_gpu_support: DEVICES = [caffe2_pb2.CPU, workspace.GpuDeviceType] max_gpuid = workspace.NumGpuDevices() - 1 else: DEVICES = [caffe2_pb2.CPU] max_gpuid = 0 # Utility class for other loading tests, don't add test functions here # Inherit from this test instead. If you add a test here, # each derived class will inherit it as well and cause test duplication class TestLoadSaveBase(test_util.TestCase): def __init__(self, methodName, db_type='minidb'): super(TestLoadSaveBase, self).__init__(methodName) self._db_type = db_type @settings(deadline=None)
brew, core, device_checker, gradient_checker, model_helper, test_util, workspace, ) from caffe2.python.gradient_checker import NetGradientChecker from caffe2.python.net_builder import ops, NetBuilder from caffe2.proto import caffe2_pb2 import unittest if (workspace.has_gpu_support or workspace.has_hip_support) and workspace.NumGpuDevices() > 0: gpu_device_option = caffe2_pb2.DeviceOption() gpu_device_option.device_type = caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA cpu_device_option = caffe2_pb2.DeviceOption() gpu_device_checker = device_checker.DeviceChecker(0.01, [gpu_device_option]) device_checker = device_checker.DeviceChecker( 0.01, [gpu_device_option, cpu_device_option]) gpu_gradient_checkers = [ gradient_checker.GradientChecker(0.005, 0.05, gpu_device_option, "gpu_checker_ws"), ] gradient_checkers = [ gradient_checker.GradientChecker(0.005, 0.05, gpu_device_option, "gpu_checker_ws"), gradient_checker.GradientChecker(0.01, 0.05, cpu_device_option,
sys.path.append("..") # data generation from data_generator.dlrm_data_caffe2 import DLRMDataGenerator from utils.utils import cli args = cli() ### some basic setup ### np.random.seed(args.numpy_rand_seed) np.set_printoptions(precision=args.print_precision) use_gpu = args.use_gpu if use_gpu: device_opt = core.DeviceOption(workspace.GpuDeviceType, 0) ngpus = workspace.NumGpuDevices() # 1 print("Using {} GPU(s)...".format(ngpus)) else: device_opt = core.DeviceOption(caffe2_pb2.CPU) print("Using CPU...") ### prepare training data ### ln_bot = np.fromstring(args.arch_mlp_bot, dtype=int, sep="-") dc = DLRMDataGenerator (args) if args.data_generation == "dataset": print("Error we have disabled this function currently....") sys.exit() # input and target data #(nbatches, lX, lS_l, lS_i, lT, # nbatches_test, lX_test, lS_l_test, lS_i_test, lT_test, # ln_emb, m_den) = dc.read_dataset(
class CopyOpsTest(test_util.TestCase):
    # Tests for the Copy / CopyCPUToGPU / CopyGPUToCPU operators and the
    # gradient operators generated for them.

    def tearDown(self):
        # Reset workspace after each test
        # Otherwise, the multi-GPU test will use previously created tensors,
        #   which may have been placed on the wrong device
        workspace.ResetWorkspace()

    def run_test_copy_gradient(self, device_opt):
        # Shared body for the single-device tests: copy x to y under
        # `device_opt`, then check that the values and the gradients of the
        # two blobs are equal.
        model = model_helper.ModelHelper(name="copy_test")
        with core.DeviceScope(device_opt):
            x = model.net.AddExternalInputs("x")
            y = model.Copy(x, "y")
            loss = model.AveragedLoss(y, "loss")
            gradient_map = model.AddGradientOperators([loss])
            workspace.FeedBlob(x, np.random.rand(32).astype(np.float32))
            workspace.RunNetOnce(model.param_init_net)
            workspace.RunNetOnce(model.net)
            self.assertTrue(np.array_equal(
                workspace.FetchBlob(x),
                workspace.FetchBlob(y),
            ))
            self.assertTrue(np.array_equal(
                workspace.FetchBlob(gradient_map[x]),
                workspace.FetchBlob(gradient_map[y]),
            ))

    def test_copy_gradient_cpu(self):
        self.run_test_copy_gradient(core.DeviceOption(caffe2_pb2.CPU, 0))

    @unittest.skipIf(workspace.NumGpuDevices() < 1, "Need at least 1 GPU.")
    def test_copy_gradient_gpu(self):
        self.run_test_copy_gradient(core.DeviceOption(workspace.GpuDeviceType, 0))

    @unittest.skipIf(workspace.NumGpuDevices() < 2, "Need at least 2 GPU.")
    def test_copy_gradient_multiple_gpus(self):
        # CPU -> GPU 0 -> GPU 1 copy chain; checks values, gradients and the
        # device option recorded on the generated gradient operators.
        model = model_helper.ModelHelper(name="copy_test")

        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)):
            x_cpu = model.net.AddExternalInputs("x_cpu")

        with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, 0)):
            x_gpu_1 = model.CopyCPUToGPU(x_cpu, "x_gpu_1")

        with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, 1)):
            x_gpu_2 = model.Copy(x_gpu_1, "x_gpu_2")
            loss = model.AveragedLoss(x_gpu_2, "loss")
            gradient_map = model.AddGradientOperators([loss])

        workspace.FeedBlob("x_cpu", np.random.rand(32).astype(np.float32))
        workspace.RunNetOnce(model.param_init_net)
        workspace.RunNetOnce(model.net)

        self.assertTrue(np.array_equal(
            workspace.FetchBlob("x_gpu_1"),
            workspace.FetchBlob("x_gpu_2"),
        ))
        self.assertTrue(np.array_equal(
            workspace.FetchBlob(gradient_map["x_gpu_1"]),
            workspace.FetchBlob(gradient_map["x_gpu_2"]),
        ))

        def get_op_with_output(model, output_blob_name):
            # Return the first op whose single output is `output_blob_name`,
            # or None when the net has no such op.
            for op in model.net.Proto().op:
                if len(op.output) == 1 and op.output[0] == output_blob_name:
                    return op
            return None

        self.assertEqual(
            get_op_with_output(model, "x_gpu_2_grad").device_option,
            core.DeviceOption(workspace.GpuDeviceType, 1),
        )
        self.assertEqual(
            get_op_with_output(model, "x_cpu_grad").device_option,
            core.DeviceOption(workspace.GpuDeviceType, 0),
        )

    @unittest.skipIf(workspace.NumGpuDevices() < 1, "Need at least 1 GPU.")
    def test_cpu2gpu_gpu2cpu_sparse_gradients(self):
        # Only builds the gradient graph (no net is run) and checks that the
        # gradient registered for "v" is a GradientSlice.
        model = model_helper.ModelHelper(name="copy_test")
        v = model.param_init_net.UniformFill([], ["v"], shape=[16, 4])
        # The indices get their own blob. Previously this fill also wrote to
        # "v", so Gather used the data blob itself as its index input.
        indices = model.param_init_net.UniformFill(
            [], ["indices"], shape=[16, 4])
        cpu_opt = core.DeviceOption(caffe2_pb2.CPU, 0)
        gpu_opt = core.DeviceOption(workspace.GpuDeviceType, 0)

        with core.DeviceScope(gpu_opt):
            vcpu = model.CopyGPUToCPU(v, "vcpu")

        with core.DeviceScope(cpu_opt):
            g = model.Gather([vcpu, indices], "g")

        with core.DeviceScope(gpu_opt):
            ggpu = model.CopyCPUToGPU(g, "ggpu")
            f = brew.fc(model, ggpu, "out", dim_in=4, dim_out=6)
            (softmax, loss) = model.SoftmaxWithLoss(
                [f, "label"],
                ["softmax", "loss"],
            )
        gradient_map = model.AddGradientOperators([loss])
        self.assertTrue("v" in gradient_map)
        self.assertTrue(isinstance(gradient_map['v'], core.GradientSlice))

    @unittest.skipIf(workspace.NumGpuDevices() < 1, "Need at least 1 GPU.")
    def test_cpu2gpu_gpu2cpu_gradients(self):
        # CPU fc -> GPU fc -> CPU loss. After one run every parameter must
        # equal its initial value minus 2.0 times its gradient (LR is -2.0).
        model = model_helper.ModelHelper(name="copy_test")

        batch = 32
        cpu_opt = core.DeviceOption(caffe2_pb2.CPU, 0)
        gpu_opt = core.DeviceOption(workspace.GpuDeviceType, 0)

        with core.NameScope("cpu"):
            with core.DeviceScope(cpu_opt):
                x_cpu = brew.fc(model, 'data', 'x_cpu', 16, 8)

        with core.NameScope("gpu_0"):
            with core.DeviceScope(gpu_opt):
                x_gpu = model.CopyCPUToGPU(x_cpu, "x_gpu")
                pred_gpu = brew.fc(model, x_gpu, "pred_gpu", 8, 4)
                pred_cpu = model.CopyGPUToCPU(pred_gpu, "pred_cpu")

        with core.DeviceScope(cpu_opt):
            with core.NameScope("cpu"):
                (softmax, loss) = model.SoftmaxWithLoss(
                    [pred_cpu, "label"],
                    ["softmax", "loss"],
                )

        gradient_map = model.AddGradientOperators([loss])

        # Add param updates (for cpu and gpu)
        init_net = model.param_init_net
        with core.DeviceScope(cpu_opt):
            with core.NameScope("cpu"):
                ONE = init_net.ConstantFill([], "ONE", shape=[1], value=1.)
                LR = init_net.ConstantFill([], "LR", shape=[1], value=-2.0)
                for param in model.GetParams():
                    model.WeightedSum(
                        [param, ONE, gradient_map[param], LR],
                        param,
                    )

        with core.NameScope("gpu_0"):
            with core.DeviceScope(gpu_opt):
                ONE = init_net.ConstantFill([], "ONE", shape=[1], value=1.)
                LR = init_net.ConstantFill([], "LR", shape=[1], value=-2.0)
                for param in model.GetParams():
                    model.WeightedSum(
                        [param, ONE, gradient_map[param], LR],
                        param,
                    )

        with core.DeviceScope(cpu_opt):
            workspace.FeedBlob(
                'cpu/data',
                np.random.rand(batch, 16).astype(np.float32),
            )
            workspace.FeedBlob(
                'cpu/label',
                np.random.randint(4, size=batch).astype(np.int32),
            )

        workspace.RunNetOnce(model.param_init_net)
        workspace.CreateNet(model.net)

        initial_params = {p: workspace.FetchBlob(p) for p in model.GetParams()}
        workspace.RunNet(model.net.Proto().name)
        updated_params = {p: workspace.FetchBlob(p) for p in model.GetParams()}

        for p in model.GetParams():
            g = gradient_map[p]
            expected = initial_params[p] - 2.0 * workspace.FetchBlob(g)
            actual = updated_params[p]
            self.assertTrue(
                np.array_equal(expected, actual),
                "Mismatch: {}: {}, {}".format(p, expected, actual),
            )
) cpu_do = caffe2_pb2.DeviceOption() cuda_do = caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA) hip_do = caffe2_pb2.DeviceOption(device_type=caffe2_pb2.HIP) gpu_do = caffe2_pb2.DeviceOption(device_type=workspace.GpuDeviceType) # CUDA or ROCm # (bddppq) Do not rely on this no_hip option! It's just used to # temporarily skip some flaky tests on ROCM before it's getting more mature. _device_options_no_hip = [cpu_do] + ([cuda_do] if workspace.has_cuda_support else []) device_options = _device_options_no_hip + ([hip_do] if workspace.has_hip_support else []) # Include device option for each GPU expanded_device_options = [cpu_do] + [ caffe2_pb2.DeviceOption(device_type=workspace.GpuDeviceType, device_id=i) for i in range(workspace.NumGpuDevices())] def device_checker_device_options(): return st.just(device_options) def gradient_checker_device_option(): return st.sampled_from(device_options) gcs = dict( gc=gradient_checker_device_option(), dc=device_checker_device_options() )
class NCCLOpsTest(hu.HypothesisTestCase):
    # Reference checks for the NCCL collective operators. Input blob i is
    # given the device option returned by gpu_device(i).

    @given(n=st.integers(min_value=2, max_value=workspace.NumGpuDevices()),
           m=st.integers(min_value=1, max_value=1000),
           in_place=st.booleans())
    def test_nccl_allreduce(self, n, m, in_place):
        xs = [np.random.randn(m).astype(np.float32) for _ in range(n)]
        inputs = [str("x_{}".format(i)) for i in range(n)]
        prefix = "" if in_place else "o"
        outputs = [str("{}x_{}".format(prefix, i)) for i in range(n)]
        op = core.CreateOperator("NCCLAllreduce", inputs, outputs)
        input_device_options = {
            name: gpu_device(i) for i, name in enumerate(inputs)}

        def allreduce(*args):
            assert len(args) == n
            output = np.sum(args, axis=0)
            return [output for _ in range(n)]

        outputs = self.assertReferenceChecks(
            hu.gpu_do, op, [xs[i] for i, _ in enumerate(inputs)],
            allreduce, input_device_options)
        # Every output must match the first one, value-wise and byte-wise.
        for output in outputs:
            np.testing.assert_array_equal(outputs[0], output)
            self.assertEqual(outputs[0].tobytes(), output.tobytes())

    @given(n=st.integers(min_value=2, max_value=workspace.NumGpuDevices()),
           m=st.integers(min_value=1, max_value=1000),
           root=st.integers(min_value=0,
                            max_value=workspace.NumGpuDevices() - 1))
    def test_nccl_broadcast(self, n, m, root):
        # root is drawn independently of n, so discard out-of-range draws.
        assume(root < n)
        xs = [np.random.randn(m).astype(np.float32) for _ in range(n)]
        inputs = [str("x_{}".format(i)) for i in range(n)]
        op = core.CreateOperator("NCCLBroadcast", inputs, inputs, root=root)
        input_device_options = {
            name: gpu_device(i) for i, name in enumerate(inputs)}

        def broadcast(*args):
            assert len(args) == n
            return [args[root] for _ in range(n)]

        self.assertReferenceChecks(
            hu.gpu_do, op, [xs[i] for i, _ in enumerate(inputs)],
            broadcast, input_device_options)

    @given(n=st.integers(min_value=2, max_value=workspace.NumGpuDevices()),
           m=st.integers(min_value=1, max_value=1000),
           # NCCL Reduce seems to deadlock for non-zero roots.
           root=st.integers(min_value=0, max_value=0),
           in_place=st.booleans())
    def test_nccl_reduce(self, n, m, root, in_place):
        assume(in_place is False or root == 0)
        xs = [np.random.randn(m).astype(np.float32) for _ in range(n)]
        inputs = [str("x_{}".format(i)) for i in range(n)]
        op = core.CreateOperator(
            "NCCLReduce", inputs,
            inputs[root] if in_place else b"o",
            root=root)
        input_device_options = {
            name: gpu_device(i) for i, name in enumerate(inputs)}

        def reduce(*args):
            assert len(args) == n
            return [np.sum(args, axis=0)]

        self.assertReferenceChecks(
            hu.gpu_do, op, [xs[i] for i, _ in enumerate(inputs)],
            reduce, input_device_options)

    @given(n=st.integers(min_value=2, max_value=workspace.NumGpuDevices()),
           m=st.integers(min_value=1, max_value=1000))
    def test_nccl_allgather(self, n, m):
        xs = [np.random.randn(m).astype(np.float32) for _ in range(n)]
        inputs = [str("x_{}".format(i)) for i in range(n)]
        outputs = [str("o_{}".format(i)) for i in range(n)]
        op = core.CreateOperator("NCCLAllGather", inputs, outputs)
        input_device_options = {
            name: gpu_device(i) for i, name in enumerate(inputs)}

        def allgather(*args):
            assert len(args) == n
            return [np.stack(args, axis=0) for _ in range(n)]

        outputs = self.assertReferenceChecks(
            hu.gpu_do, op, [xs[i] for i, _ in enumerate(inputs)],
            allgather, input_device_options)
        for output in outputs:
            np.testing.assert_array_equal(outputs[0], output)
            self.assertEqual(outputs[0].tobytes(), output.tobytes())

    @given(n=st.integers(min_value=2, max_value=workspace.NumGpuDevices()),
           m=st.integers(min_value=1, max_value=1000))
    def test_nccl_reduce_scatter(self, n, m):
        xs = [np.random.randn(n, m).astype(np.float32) for _ in range(n)]
        inputs = [str("x_{}".format(i)) for i in range(n)]
        outputs = [str("o_{}".format(i)) for i in range(n)]
        op = core.CreateOperator("NCCLReduceScatter", inputs, outputs)
        input_device_options = {
            name: gpu_device(i) for i, name in enumerate(inputs)}

        def reduce_scatter(*args):
            assert len(args) == n
            reduced = sum(args)
            assert len(reduced.shape) > 1
            # Output i is row i of the element-wise sum of all inputs.
            ref = [reduced[i, :] for i in range(n)]
            return ref

        self.assertReferenceChecks(
            hu.gpu_do, op, [xs[i] for i, _ in enumerate(inputs)],
            reduce_scatter, input_device_options)

    @given(n=st.integers(min_value=2, max_value=workspace.NumGpuDevices()),
           m=st.integers(min_value=100000, max_value=100000),
           iters=st.integers(min_value=1, max_value=100),
           net_type=st.sampled_from(["dag", "async_dag", "simple"]))
    def _test_nccl_sync(self, n, m, iters, net_type):
        # The leading underscore keeps this out of default test discovery.
        inputs = [str("x_{}".format(i)) for i in range(n)]
        extra_inputs = [str("xe_{}".format(i)) for i in range(n)]
        net = core.Net("asdf")
        net.Proto().type = net_type
        net.Proto().num_workers = n
        for i in range(n):
            net.ConstantFill([], inputs[i], shape=[m], value=0.0,
                             device_option=gpu_device(i))
            net.ConstantFill([], extra_inputs[i], shape=[m], value=1.0,
                             device_option=gpu_device(i))
            # Add 1.0 `iters` times, so each input ends up equal to iters.
            for _ in range(iters):
                net.Sum([inputs[i], extra_inputs[i]], [inputs[i]],
                        device_option=gpu_device(i))
        net.NCCLReduce(inputs, [inputs[0]], device_option=gpu_device(0))
        self.ws.run(net)
        np.testing.assert_array_equal(
            self.ws.blobs[inputs[0]].fetch(),
            np.full(shape=(m, ), fill_value=iters * n, dtype=np.float32))

    @unittest.skipIf(not os.environ.get("CAFFE2_BENCHMARK"), "Benchmark")
    def test_timings(self):
        # np.random.randn only accepts integer dimensions; the float literal
        # 1e7 raises TypeError, so the element count is made an int here.
        num_elements = int(1e7)

        for n in range(2, workspace.NumGpuDevices()):
            for in_place in [False, True]:
                xs = [
                    np.random.randn(num_elements).astype(np.float32)
                    for _ in range(n)
                ]
                inputs = [str("x_{}".format(i)) for i in range(n)]
                prefix = "" if in_place else "o"
                outputs = [str("{}x_{}".format(prefix, i)) for i in range(n)]

                net = core.Net("test")
                net.NCCLAllreduce(inputs, outputs)
                net.RunAllOnGPU()
                for i in range(n):
                    self.ws.create_blob(inputs[i]).feed(xs[i], gpu_device(i))
                self.ws.run(net)
                net_time = benchmark(self.ws, net)

                vanilla = core.Net("vanilla")
                muji.Allreduce(vanilla, inputs)
                vanilla_time = benchmark(self.ws, vanilla)

                print("Speedup for NCCL: {:.2f}".format(
                    vanilla_time / net_time))
def testAllreduceSingleGPU(self):
    # Runs muji.Allreduce once per reported GPU, each time with a
    # single-element device list.
    gpu_count = workspace.NumGpuDevices()
    for gpu_id in range(gpu_count):
        self.RunningAllreduceWithGPUs([gpu_id], muji.Allreduce)
def testAllreduceFallback(self):
    # Runs muji.AllreduceFallback over the ids of every reported GPU.
    all_gpu_ids = list(range(workspace.NumGpuDevices()))
    self.RunningAllreduceWithGPUs(all_gpu_ids, muji.AllreduceFallback)