Example #1
    def test_timings(self):
        for n in range(2, workspace.NumberOfGPUs()):
            for in_place in [False, True]:
                xs = [
                    np.random.randn(10**7).astype(np.float32) for i in range(n)
                ]
                inputs = [str("x_{}".format(i)) for i in range(n)]
                prefix = "" if in_place else "o"
                outputs = [str("{}x_{}".format(prefix, i)) for i in range(n)]

                net = core.Net("test")
                net.NCCLAllreduce(inputs, outputs)
                net.RunAllOnGPU()
                for i in range(n):
                    workspace.FeedBlob(inputs[i], xs[i],
                                       gpu_device(i).SerializeToString())
                workspace.RunNetOnce(net.Proto().SerializeToString())
                net_time = benchmark(net)
                vanilla = core.Net("vanilla")
                muji.Allreduce(vanilla, inputs)
                vanilla_time = benchmark(vanilla)
                print("Speedup for NCCL: {:.2f}".format(vanilla_time /
                                                        net_time))
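
Both this snippet and Example #5 below assume a handful of module-level imports and two helpers, gpu_device and benchmark, that the excerpts omit. The following is a minimal sketch of plausible definitions (assumptions, not the original implementations):

import os
import unittest

import numpy as np
from hypothesis import assume, given
import hypothesis.strategies as st

from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace, muji
import caffe2.python.hypothesis_test_util as hu


def gpu_device(i):
    # Assumed helper: a DeviceOption pinned to CUDA device i.
    # (cuda_gpu_id is a guess; the proto field name varies across Caffe2 versions.)
    device_option = caffe2_pb2.DeviceOption()
    device_option.device_type = caffe2_pb2.CUDA
    device_option.cuda_gpu_id = i
    return device_option


def benchmark(net):
    # Assumed helper: time the net with Caffe2's built-in benchmark runner
    # and return the average per-iteration time in milliseconds.
    workspace.CreateNet(net)
    results = workspace.BenchmarkNet(net.Proto().name, 1, 100, True)
    return results[0]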
Example #2
import numpy as np
from caffe2.python import core, device_checker, gradient_checker, workspace
from caffe2.proto import caffe2_pb2, caffe2_legacy_pb2

import sys
import unittest

if workspace.has_gpu_support and workspace.NumberOfGPUs() > 0:
  gpu_device_option = caffe2_pb2.DeviceOption()
  gpu_device_option.device_type = caffe2_pb2.CUDA
  cpu_device_option = caffe2_pb2.DeviceOption()
  device_checker = device_checker.DeviceChecker(
      0.01, [gpu_device_option, cpu_device_option])
  gradient_checkers = [
      gradient_checker.GradientChecker(
          0.005, 0.05, gpu_device_option, "gpu_checker_ws"),
      gradient_checker.GradientChecker(
          0.01, 0.05, cpu_device_option, "cpu_checker_ws"),
  ]
else:
  cpu_device_option = caffe2_pb2.DeviceOption()
  device_checker = device_checker.DeviceChecker(
      0.01, [cpu_device_option])
  gradient_checkers = [
      gradient_checker.GradientChecker(
          0.01, 0.05, cpu_device_option, "cpu_checker_ws")
  ]


class TestConvLegacyPooling(unittest.TestCase):
  def setUp(self):
Example #3
    def testAllreduceSingleGPU(self):
        for i in range(workspace.NumberOfGPUs()):
            self.RunningAllreduceWithGPUs([i], muji.Allreduce)
Example #4
    def testAllreduceFallback(self):
        self.RunningAllreduceWithGPUs(range(workspace.NumberOfGPUs()),
                                      muji.AllreduceFallback)
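
For context, muji.Allreduce operates on an existing Caffe2 net whose input blobs already live on separate GPUs, as Example #1 also shows. A small hedged usage sketch, reusing the gpu_device helper sketched under Example #1 (blob names and sizes are illustrative):

blobs = ["x_{}".format(i) for i in range(workspace.NumberOfGPUs())]
for i, name in enumerate(blobs):
    # Place one input blob on each GPU before building the allreduce net.
    workspace.FeedBlob(name, np.random.randn(16).astype(np.float32),
                       gpu_device(i).SerializeToString())

net = core.Net("muji_allreduce_example")
muji.Allreduce(net, blobs)
workspace.RunNetOnce(net.Proto().SerializeToString())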
Example #5
class NCCLOpsTest(hu.HypothesisTestCase):
    @given(n=st.integers(min_value=2, max_value=workspace.NumberOfGPUs()),
           m=st.integers(min_value=1, max_value=1000),
           in_place=st.booleans())
    def test_nccl_allreduce(self, n, m, in_place):
        xs = [np.random.randn(m).astype(np.float32) for i in range(n)]
        inputs = [str("x_{}".format(i)) for i in range(n)]
        prefix = "" if in_place else "o"
        outputs = [str("{}x_{}".format(prefix, i)) for i in range(n)]
        op = core.CreateOperator("NCCLAllreduce", inputs, outputs)
        input_device_options = {n: gpu_device(i) for i, n in enumerate(inputs)}

        def allreduce(*args):
            assert len(args) == n
            output = np.sum(args, axis=0)
            return [output for _ in range(n)]

        self.assertReferenceChecks(hu.gpu_do, op,
                                   [xs[i] for i, _ in enumerate(inputs)],
                                   allreduce, input_device_options)

    @given(n=st.integers(min_value=2, max_value=workspace.NumberOfGPUs()),
           m=st.integers(min_value=1, max_value=1000),
           root=st.integers(min_value=0,
                            max_value=workspace.NumberOfGPUs() - 1))
    def test_nccl_broadcast(self, n, m, root):
        assume(root < n)
        xs = [np.random.randn(m).astype(np.float32) for i in range(n)]
        inputs = [str("x_{}".format(i)) for i in range(n)]
        op = core.CreateOperator("NCCLBroadcast", inputs, inputs, root=root)
        input_device_options = {n: gpu_device(i) for i, n in enumerate(inputs)}

        def broadcast(*args):
            assert len(args) == n
            return [args[root] for _ in range(n)]

        self.assertReferenceChecks(hu.gpu_do, op,
                                   [xs[i] for i, _ in enumerate(inputs)],
                                   broadcast, input_device_options)

    @given(n=st.integers(min_value=2, max_value=workspace.NumberOfGPUs()),
           m=st.integers(min_value=1, max_value=1000),
           root=st.integers(min_value=0,
                            max_value=workspace.NumberOfGPUs() - 1),
           in_place=st.booleans())
    def test_nccl_reduce(self, n, m, root, in_place):
        assume(root < n)
        assume(in_place is False or root == 0)
        xs = [np.random.randn(m).astype(np.float32) for i in range(n)]
        inputs = [str("x_{}".format(i)) for i in range(n)]
        op = core.CreateOperator("NCCLReduce",
                                 inputs,
                                 inputs[root] if in_place else b"o",
                                 root=root)
        input_device_options = {n: gpu_device(i) for i, n in enumerate(inputs)}

        def reduce(*args):
            assert len(args) == n
            return [np.sum(args, axis=0)]

        self.assertReferenceChecks(hu.gpu_do, op,
                                   [xs[i] for i, _ in enumerate(inputs)],
                                   reduce, input_device_options)

    @given(n=st.integers(min_value=2, max_value=workspace.NumberOfGPUs()),
           m=st.integers(min_value=1, max_value=1000))
    def test_nccl_allgather(self, n, m):
        xs = [np.random.randn(m).astype(np.float32) for i in range(n)]
        inputs = [str("x_{}".format(i)) for i in range(n)]
        outputs = [str("o_{}".format(i)) for i in range(n)]
        op = core.CreateOperator("NCCLAllGather", inputs, outputs)
        input_device_options = {n: gpu_device(i) for i, n in enumerate(inputs)}

        def allgather(*args):
            assert len(args) == n
            return [np.stack(args, axis=0) for _ in range(n)]

        self.assertReferenceChecks(hu.gpu_do, op,
                                   [xs[i] for i, _ in enumerate(inputs)],
                                   allgather, input_device_options)

    @unittest.skipIf(not os.environ.get("CAFFE2_BENCHMARK"), "Benchmark")
    def test_timings(self):
        for n in range(2, workspace.NumberOfGPUs()):
            for in_place in [False, True]:
                xs = [
                    np.random.randn(10**7).astype(np.float32) for i in range(n)
                ]
                inputs = [str("x_{}".format(i)) for i in range(n)]
                prefix = "" if in_place else "o"
                outputs = [str("{}x_{}".format(prefix, i)) for i in range(n)]

                net = core.Net("test")
                net.NCCLAllreduce(inputs, outputs)
                net.RunAllOnGPU()
                for i in range(n):
                    workspace.FeedBlob(inputs[i], xs[i],
                                       gpu_device(i).SerializeToString())
                workspace.RunNetOnce(net.Proto().SerializeToString())
                net_time = benchmark(net)
                vanilla = core.Net("vanilla")
                muji.Allreduce(vanilla, inputs)
                vanilla_time = benchmark(vanilla)
                print("Speedup for NCCL: {:.2f}".format(vanilla_time /
                                                        net_time))
Example #6
    def testGetCudaPeerAccessPattern(self):
        pattern = workspace.GetCudaPeerAccessPattern()
        self.assertEqual(type(pattern), np.ndarray)
        self.assertEqual(pattern.ndim, 2)
        self.assertEqual(pattern.shape[0], pattern.shape[1])
        self.assertEqual(pattern.shape[0], workspace.NumberOfGPUs())
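
The matrix checked here encodes pairwise peer access. A hedged reading, consistent with the assertions above though the exact semantics are assumed, is that pattern[i][j] reports whether GPU i can directly access memory on GPU j. A small inspection sketch:

pattern = workspace.GetCudaPeerAccessPattern()
for i in range(pattern.shape[0]):
    for j in range(pattern.shape[1]):
        if i != j and not pattern[i][j]:
            print("No peer access from GPU {} to GPU {}".format(i, j))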
Example #7
import numpy as np
import unittest

from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace, test_util


@unittest.skipIf(not workspace.has_gpu_support
                 or workspace.NumberOfGPUs() == 0, "No gpu support.")
class TestWorkspaceGPU(test_util.TestCase):
    def setUp(self):
        workspace.ResetWorkspace()
        self.net = core.Net("test-net")
        self.net.ConstantFill([], "testblob", shape=[1, 2, 3, 4], value=1.0)
        self.net.RunAllOnGPU()

    def testFetchBlobGPU(self):
        self.assertEqual(
            workspace.RunNetOnce(self.net.Proto().SerializeToString()), True)
        fetched = workspace.FetchBlob("testblob")
        # check if fetched is correct.
        self.assertEqual(fetched.shape, (1, 2, 3, 4))
        np.testing.assert_array_equal(fetched, 1.0)
        fetched[:] = 2.0
        self.assertEqual(workspace.FeedBlob("testblob", fetched), True)
        fetched_again = workspace.FetchBlob("testblob")
        self.assertEqual(fetched_again.shape, (1, 2, 3, 4))
        np.testing.assert_array_equal(fetched_again, 2.0)

    def testDefaultGPUID(self):
        self.assertEqual(workspace.SetDefaultGPUID(0), True)
Example #8
            inputs["data"] = np.random.rand(4, 227, 227, 3).astype(np.float32)
        inputs["label"] = np.array([1, 2, 3, 4]).astype(np.int32)

        cpu_device = caffe2_pb2.DeviceOption()
        cpu_device.device_type = caffe2_pb2.CPU
        gpu_device = caffe2_pb2.DeviceOption()
        gpu_device.device_type = caffe2_pb2.CUDA

        checker = device_checker.DeviceChecker(1e-2, [cpu_device, gpu_device])
        ret = checker.CheckNet(
            model.net.Proto(),
            inputs,
            # The indices sometimes may be sensitive to small numerical
            # differences in the input, so we ignore checking them.
            ignore=['_pool1_idx', '_pool2_idx', '_pool5_idx']
        )
        self.assertEqual(ret, True)

    def testMiniAlexNet(self):
        self._testMiniAlexNet("NCHW")
        self._testMiniAlexNet("NHWC")


if __name__ == '__main__':
    if not workspace.has_gpu_support:
        print('No GPU support. Skipping gpu test.')
    elif workspace.NumberOfGPUs() == 0:
        print('No GPU device. Skipping gpu test.')
    else:
        unittest.main()
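
DeviceChecker can also be pointed at a single operator instead of a whole net. A minimal, self-contained sketch, assuming a CheckSimple(op, inputs, outputs_to_check) method on caffe2.python.device_checker.DeviceChecker and CPU/CUDA DeviceOptions built as in the test above:

import numpy as np
from caffe2.proto import caffe2_pb2
from caffe2.python import core, device_checker

cpu_device = caffe2_pb2.DeviceOption()
cpu_device.device_type = caffe2_pb2.CPU
gpu_device = caffe2_pb2.DeviceOption()
gpu_device.device_type = caffe2_pb2.CUDA

op = core.CreateOperator("Relu", ["X"], ["Y"])
X = np.random.randn(2, 3).astype(np.float32)
checker = device_checker.DeviceChecker(1e-3, [cpu_device, gpu_device])
# True when the CPU and CUDA results for output 0 agree within the threshold.
print(checker.CheckSimple(op, [X], [0]))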