Example #1
0
    def test_padded_kernel(self):
        """ Run a kernel created with padding and check the sum it produces.

        For every entry of self.cases a random grid and a one-element constant
        are built.  The kernel multiplies the grid in place by the constant
        and accumulates a * x into the output; the value read back from the
        output is compared with the same sum computed by numpy on the host
        copies of the data.
        """
        single_precision = (np.float32, np.complex64)
        for case in self.cases:
            dtype = case['dtype']
            shape = case['shape']

            # Inputs: the host arrays are kept around for the reference sum.
            space.initialize_space(shape)
            grid_host = comm.allreduce(np.random.randn(*shape).astype(dtype))
            x = Grid(grid_host, x_overlap=1)
            scale_host = comm.allreduce(np.random.randn(1).astype(dtype))
            s = Const(scale_host)
            z = Out(dtype)

            # Kernel source and its argument descriptions.
            kernel_src = Template("""
                            if (_in_local && _in_global) {
                                x(0,0,0) = s(0) * x(0,0,0);
                                z += a * x(0,0,0);
                            }
                            """).render()
            fun = Kernel(kernel_src,
                         ('a', 'number', dtype),
                         ('x', 'grid', x.dtype),
                         ('s', 'const', s.dtype, s.data.size),
                         ('z', 'out', z.dtype),
                         padding=(1,1,1,1))

            # Run once with a = 2 and compare against the host-side result.
            fun(dtype(2), x, s, z)
            gpu_sum = z.get()
            cpu_sum = np.sum(2.0 * scale_host * grid_host)
            err = abs(gpu_sum - cpu_sum) / abs(cpu_sum)

            # Single precision cases are checked against a looser bound.
            tol = 1e-2 if dtype in single_precision else 1e-6
            self.assertTrue(err < tol, (case, err))
Example #2
0
    def test_batch_sum(self):
        """ Make sure batch summing works.

        For every entry of self.cases, num_outs 'sum' outputs are filled with
        random data (scaled by 1 + 1j for complex dtypes) and reduced together
        with batch_reduce.  Each value read back is compared with the
        allreduce of the numpy sum of the same data.

        A failing assertion reports the case, the index of the offending
        output and the relative error, matching the diagnostics given by the
        kernel tests.
        """
        num_outs = 3
        for case in self.cases:
            space.initialize_space(case['shape'])
            x = [Out(case['dtype'], op='sum') for _ in range(num_outs)]
            x_cpu_data = [np.random.randn(*case['shape'][1:])
                          .astype(case['dtype']) for _ in range(num_outs)]

            # Give complex cases a non-zero imaginary part.
            if case['dtype'] in (np.complex64, np.complex128):
                x_cpu_data = [(1 + 1j) * data for data in x_cpu_data]

            # Load each output and record the reference value for it.
            res_gold = []
            for out, data in zip(x, x_cpu_data):
                out.data.set(data)
                res_gold.append(comm.allreduce(np.sum(data.flatten())))

            batch_reduce(*x)
            res_gpu = [out.get() for out in x]

            # Single precision cases are checked against a looser bound.
            if case['dtype'] in (np.float32, np.complex64):
                tol = 1e-3
            else:
                tol = 1e-10

            for k, (gold, got) in enumerate(zip(res_gold, res_gpu)):
                err = abs(gold - got) / abs(gold)
                self.assertTrue(err < tol, (case, k, err))
Example #3
0
    def test_simple_kernel(self):
        """ Run a simple kernel and check the sum it produces.

        For every entry of self.cases a random grid and a constant of shape
        (shape[0], 1, 1) are built.  The kernel accumulates a * s(_X) * x into
        the output.  The call is repeated for as long as fun.exec_configs is
        truthy (presumably each call uses up one execution configuration --
        TODO confirm), and after every call the value read back from the
        output is compared with the numpy reference.
        """
        single_precision = (np.float32, np.complex64)
        for case in self.cases:
            dtype, shape = case['dtype'], case['shape']

            # Inputs: the host arrays are kept around for the reference sum.
            space.initialize_space(shape)
            grid_host = comm.allreduce(np.random.randn(*shape).astype(dtype))
            x = Grid(grid_host, x_overlap=2)
            scale_host = comm.allreduce(
                np.random.randn(shape[0], 1, 1).astype(dtype))
            s = Const(scale_host)
            z = Out(dtype)

            # Kernel source and its argument descriptions.
            kernel_src = Template("""
                            if (_in_local && _in_global) {
                                z += a * s(_X) * x(0,0,0);
                                // z += a * x(0,0,0);
                            }
                            """).render()
            fun = Kernel(kernel_src,
                         ('a', 'number', dtype),
                         ('x', 'grid', x.dtype),
                         ('s', 'const', s.dtype),
                         ('z', 'out', z.dtype),
                         shape_filter='all')

            # Single precision cases are checked against a looser bound.
            tol = 1e-2 if dtype in single_precision else 1e-6

            # Execute and check the result after every call.
            while fun.exec_configs:
                fun(dtype(2.0), x, s, z)
                gpu_sum = z.get()
                cpu_sum = np.sum(2 * scale_host * grid_host)
                err = abs(gpu_sum - cpu_sum) / abs(cpu_sum)
                self.assertTrue(err < tol, (case, err))
Example #4
0
    def test_sum(self):
        """ Make sure summing works.

        For every entry of self.cases a 'sum' output is filled with random
        data (scaled by 1 + 1j for complex dtypes) and reduced with
        x.reduce().  The value read back is compared with the allreduce of
        the numpy sum of the same data.

        A failing assertion reports the case and the relative error, matching
        the diagnostics given by the kernel tests.
        """
        for case in self.cases:
            dtype = case['dtype']

            space.initialize_space(case['shape'])
            x = Out(dtype, op='sum')
            x_cpu_data = np.random.randn(*case['shape'][1:]).astype(dtype)

            # Give complex cases a non-zero imaginary part.
            if dtype in (np.complex64, np.complex128):
                x_cpu_data = (1 + 1j) * x_cpu_data

            x.data.set(x_cpu_data)
            res_gold = comm.allreduce(np.sum(x_cpu_data.flatten()))

            x.reduce()
            err = abs(res_gold - x.get()) / abs(res_gold)

            # Single precision cases are checked against a looser bound.
            if dtype in (np.float32, np.complex64):
                tol = 1e-3
            else:
                tol = 1e-10
            self.assertTrue(err < tol, (case, err))
Example #5
0
    def test_sum(self):
        """ Make sure summing works.

        Each entry of self.cases gets its own 'sum' output, loaded with random
        data (multiplied by 1 + 1j when the dtype is complex).  After
        x.reduce() the stored value is compared with the allreduce of the
        numpy sum of that data.

        The case and the relative error are attached to the assertion so a
        failure identifies what went wrong.
        """
        complex_types = (np.complex64, np.complex128)
        single_precision = (np.float32, np.complex64)

        for case in self.cases:
            space.initialize_space(case['shape'])
            x = Out(case['dtype'], op='sum')

            x_cpu_data = np.random.randn(*case['shape'][1:]).astype(
                case['dtype'])
            if case['dtype'] in complex_types:
                # Make sure the imaginary part takes part in the sum.
                x_cpu_data = (1 + 1j) * x_cpu_data

            x.data.set(x_cpu_data)
            res_gold = comm.allreduce(np.sum(x_cpu_data.flatten()))

            x.reduce()
            err = abs(res_gold - x.get()) / abs(res_gold)

            # Looser bound for single precision.
            tol = 1e-3 if case['dtype'] in single_precision else 1e-10
            self.assertTrue(err < tol, (case, err))
Example #6
0
def batch_reduce(*outs):
    """ Reduce several outputs with a single allreduce call.

    The sum of every out.data is taken, the sums are packed into one numpy
    array that is handed to comm.allreduce, and entry k of the reduced array
    is stored as the result attribute of outs[k].

    Optimal (compared to self.reduce) when communication cost is latency bound.
    """
    local_sums = [ga.sum(out.data).get() for out in outs]
    results = comm.allreduce(np.array(local_sums))
    for k, out in enumerate(outs):
        out.result = results[k]
Example #7
0
 def reduce(self):
     """ Sum self.data, combine the sum via comm.allreduce and store it in self.result. """
     local_sum = ga.sum(self.data).get()
     self.result = comm.allreduce(local_sum)
Example #8
0
        """ Make sure batch summing works. """
        num_outs = 3
        for case in self.cases:
            space.initialize_space(case['shape'])
            x = [Out(case['dtype'], op='sum') for k in range(num_outs)]
            x_cpu_data = [np.random.randn(*case['shape'][1:])\
                            .astype(case['dtype']) for k in range(num_outs)]

            if case['dtype'] in (np.complex64, np.complex128):
                for k in range(num_outs):
                    x_cpu_data[k] = (1 + 1j) * x_cpu_data[k]

            res_gold = []
            for k in range(num_outs):
                x[k].data.set(x_cpu_data[k])
                res_gold.append(comm.allreduce(np.sum(
                    x_cpu_data[k].flatten())))

            batch_reduce(*x)
            res_gpu = [x_indiv.get() for x_indiv in x]

            for k in range(num_outs):
                err = abs(res_gold[k] - res_gpu[k]) / abs(res_gold[k])

                if case['dtype'] in (np.float32, np.complex64):
                    self.assertTrue(err < 1e-3)
                else:
                    self.assertTrue(err < 1e-10)


# Run the tests through unittest's runner when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()
Example #9
0
    def _create_tempdir(request, mode=None):
        """
        Prepare a directory for the requesting test function and return its path.

        Adapted from DOLFIN's dolfin_utils/test/fixtures.py.

        The returned path has the form
        <dir of test file>/<test file name>_tempdir[_<slaveid>]/<function>__<N>,
        where the slaveid part is only present when request.config has a
        "slaveinput" attribute, and N is a per-path counter kept on rank 0 and
        shared with the other ranks through an allreduce.

        With mode "save" or "load" the "_save" / "_load" part of the function
        name is replaced by "_io".  With mode "load" the directories are
        asserted to exist already; otherwise any existing directory is removed
        and created afresh.  All filesystem work is done on rank 0, and every
        rank waits on a barrier before returning.
        """
        # Directory and file name of the requesting test module.
        module_file = request.module.__file__
        module_dir = os.path.dirname(os.path.abspath(module_file))

        # test_foo.py -> test_foo_tempdir, with the slaveid appended if present.
        suffix = "_tempdir"
        if hasattr(request.config, "slaveinput"):
            suffix = "_tempdir_{}".format(request.config.slaveinput["slaveid"])
        outputname = os.path.basename(module_file).replace(".py", suffix)

        # Name of the test function, with save/load variants mapped to "_io".
        function = request.function.__name__
        if mode == "save":
            function = function.replace("_save", "_io")
        elif mode == "load":
            function = function.replace("_load", "_io")

        basepath = os.path.join(module_dir, outputname)
        path = os.path.join(basepath, function)

        is_root = COMM_WORLD.rank == 0

        # Add a sequence number to avoid collisions when tests are otherwise
        # parameterized.  Only rank 0 counts; the other ranks contribute 0 so
        # the sum delivers rank 0's counter to everyone.
        if is_root:
            _create_tempdir._sequencenumber[path] += 1
            contribution = _create_tempdir._sequencenumber[path]
        else:
            contribution = 0
        sequencenumber = COMM_WORLD.allreduce(contribution, op=SUM)
        path = path + "__" + str(sequencenumber)

        if is_root:
            loading = mode == "load"

            # First visit of this basepath: it must already exist when
            # loading, otherwise leftovers from an old run are removed.
            if basepath not in _create_tempdir._basepaths:
                _create_tempdir._basepaths.add(basepath)
                if loading:
                    assert os.path.exists(basepath)
                elif os.path.exists(basepath):
                    shutil.rmtree(basepath)
                if not os.path.exists(basepath):
                    os.mkdir(basepath)

            # Same treatment for the directory of this test execution,
            # e.g. test_foo_tempdir/test_something__3
            if loading:
                assert os.path.exists(path)
            elif os.path.exists(path):
                shutil.rmtree(path)
            if not os.path.exists(path):
                os.mkdir(path)

        COMM_WORLD.barrier()

        return path
Example #10
0
def batch_reduce(*outs):
    """ Combine the sums of all given outputs in one allreduce.

    One sum is computed per output from its data attribute; the sums travel
    together as a single numpy array through comm.allreduce, and the k-th
    entry of what comes back becomes the result attribute of the k-th output.

    Optimal (compared to self.reduce) when communication cost is latency bound.
    """
    partial_sums = np.array([ga.sum(item.data).get() for item in outs])
    results = comm.allreduce(partial_sums)
    for index, item in enumerate(outs):
        item.result = results[index]
Example #11
0
 def reduce(self):
     """ Store in self.result the comm.allreduce of the sum of self.data. """
     summed = ga.sum(self.data)
     self.result = comm.allreduce(summed.get())