Example #1
0
 def test_realign_numpy_array(self):
     """Check that realign_array returns data starting on the boundary."""
     import numpy
     # (element count, requested alignment) pairs, checked in this order.
     cases = ((1000, 1056), (1024, 4096))
     for count, boundary in cases:
         buf = numpy.empty(count, dtype=numpy.float32)
         buf = cl.realign_array(buf, boundary, numpy)
         # First item of the "data" entry is the address of the array data.
         address = buf.__array_interface__["data"][0]
         self.assertEqual(address % boundary, 0)
Example #2
0
 def test_realign_numpy_array(self):
     """Verify the data address of realigned arrays for two alignments."""
     import numpy

     # 1000 float32 items, realigned to a 1056 boundary.
     first = cl.realign_array(
         numpy.empty(1000, dtype=numpy.float32), 1056, numpy)
     first_address = first.__array_interface__["data"][0]
     self.assertEqual(first_address % 1056, 0)

     # 1024 float32 items, realigned to a 4096 boundary.
     second = cl.realign_array(
         numpy.empty(1024, dtype=numpy.float32), 4096, numpy)
     second_address = second.__array_interface__["data"][0]
     self.assertEqual(second_address % 4096, 0)
Example #3
0
File: memory.py Project: 2php/veles
 def ocl_realign_mem(self):
     """Realign the host array before it backs an OpenCL buffer.

     We are using CL_MEM_USE_HOST_PTR, so memory should be PAGE-aligned:
     the alignment is 4096 unless the device reports a larger ``memalign``.
     """
     alignment = 4096
     device = self.device
     # A NumpyDevice is never asked for device_info.
     if not isinstance(device, NumpyDevice):
         reported = device.device_info.memalign
         if reported > 4096:
             alignment = reported
     self.mem = cl.realign_array(self._mem, alignment, numpy)
Example #4
0
 def ocl_realign_mem(self):
     """Store a realigned version of ``self._mem`` in ``self.mem``.

     CL_MEM_USE_HOST_PTR is in use, so memory should be PAGE-aligned.
     """
     page = 4096
     if isinstance(self.device, NumpyDevice):
         boundary = page
     else:
         # Use the device's own requirement, but never less than a page.
         boundary = self.device.device_info.memalign
         if boundary <= page:
             boundary = page
     self.mem = cl.realign_array(self._mem, boundary, numpy)
Example #5
0
 def test_realign_numpy_array(self):
     """Check realign_array alignment, falling back to numpypy for numpy."""
     try:
         import numpy
     except ImportError:  # for pypy
         try:
             import numpypy as numpy
         except ImportError:
             raise ImportError("Could not import numpy")
     boundary = 1056
     source = numpy.empty(1000, dtype=numpy.float32)
     aligned = cl.realign_array(source, boundary, numpy)
     # The data address must be an exact multiple of the boundary.
     address = aligned.__array_interface__["data"][0]
     self.assertEqual(address % boundary, 0)
Example #6
0
    def test():
        """Run the "test" kernel from test.cl and print result differences.

        Exercises map_buffer, read_buffer and write_buffer on buffers created
        over realigned host arrays.  Nothing is asserted: each step prints
        ``result - d`` where ``d`` is the host-side reference value.
        """
        # Show the current value, then overwrite it.
        # NOTE(review): "0:0" presumably selects platform 0 / device 0 - confirm.
        print(os.environ.get("PYOPENCL_CTX"))
        os.environ["PYOPENCL_CTX"] = "0:0"
        # Create platform, context, program, kernel and queue
        platforms = cl.Platforms()
        print("OpenCL devices:\n%s" % platforms.dump_devices())
        ctx = platforms.create_some_context()
        queue = ctx.create_queue(ctx.devices[0], cl.CL_QUEUE_PROFILING_ENABLE)
        # NOTE: the string literal below is never used; it keeps an inline
        # variant of program creation for reference.  The program is actually
        # built from test.cl just after it (presumably the same kernel - verify).
        '''
        prg = ctx.create_program(
        """
        __kernel void test(
                __global float *a, 
                __global float *b, 
                const float c) 
        {
          size_t i = get_global_id(0);
          a[i] = (a[i] + b[i]) * c;
        }
        """)
        '''
        prg = ctx.create_program(testopencl.readoclfile("test.cl"))
        krn = prg.get_kernel("test")

        # Create arrays with some values for testing
        a = np.arange(100000, dtype=np.float32)
        b = np.cos(a)
        a = np.sin(a)
        # Keep the original input so the buffer can be refilled later.
        a_copy = a.copy()
        # Prepare arrays for use with map_buffer
        a = cl.realign_array(a, queue.device.memalign, np)
        b = cl.realign_array(b, queue.device.memalign, np)
        c = np.array([0.1], dtype=np.float32)
        # Host-side reference value, same formula as the inline kernel text.
        d = (a + b) * c[0]
        # Create buffers
        a_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_USE_HOST_PTR,
                               a)
        b_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_USE_HOST_PTR,
                               b)
        # Set kernel arguments
        krn.set_args(a_, b_, c[0:1])
        # Execute kernel
        global_size = [a.size]
        local_size = None
        queue.execute_kernel(krn, global_size, local_size, need_event=False)
        # Get results back from the device by map_buffer
        ev, ptr = queue.map_buffer(a_, cl.CL_MAP_READ, a.nbytes)
        del ev
        queue.unmap_buffer(a_, ptr).wait()
        # a is the host array behind a_ (CL_MEM_USE_HOST_PTR).
        print(a - d)
        # Get results back from the device by read_buffer into a fresh array
        aa = np.zeros(a.shape, dtype=a.dtype)
        queue.read_buffer(a_, aa)
        print(aa - d)
        # Refill buffer with stored copy by write_buffer
        ev = queue.write_buffer(a_, a_copy, blocking=False, need_event=True)
        # Execute kernel
        # The kernel is passed the write event via wait_for.
        ev = queue.execute_kernel(krn,
                                  global_size,
                                  local_size,
                                  wait_for=(ev, ))
        # Get results back from the device by map_buffer
        ev, ptr = queue.map_buffer(a_,
                                   cl.CL_MAP_READ,
                                   a.nbytes,
                                   wait_for=(ev, ),
                                   need_event=True)
        ev.wait()
        queue.unmap_buffer(a_, ptr).wait()
        print(a - d)
        bb = np.zeros(a.shape, dtype=a.dtype)
        queue.read_buffer(a_, bb)
        print(bb - d)
        # Drop the local references to the OpenCL objects and force a
        # garbage-collection pass.
        del queue
        del ctx
        del krn
        del prg
        gc.collect()
Example #7
0
File: memory.py Project: 2php/veles
 def cuda_realign_mem(self):
     # A numpy array with contiguous memory layout is expected, so the
     # array is realigned. Aligning on a PAGE boundary may also help speed.
     page_boundary = 4096
     self.mem = cl.realign_array(self._mem, page_boundary, numpy)
Example #8
0
    def test_create_sub_buffer(self):
        """Run the "test" kernel on sub-buffers and check results and refs.

        Creates parent buffers over realigned host arrays, carves one
        sub-buffer out of each, runs the kernel on the sub-buffers and
        compares the output with a host-side reference.  ``_n_refs`` of the
        parents and sub-buffers is checked along the way.
        """
        import numpy
        # Create platform, context, program, kernel and queue
        platforms = cl.Platforms()
        ctx = platforms.create_some_context()
        prg = ctx.create_program(self.src_test, self.include_dirs)
        krn = prg.get_kernel("test")
        queue = ctx.create_queue(ctx.devices[0])

        # Create arrays with some values for testing
        a = numpy.arange(100000, dtype=numpy.float32)
        b = numpy.cos(a)
        a = numpy.sin(a)

        # Prepare arrays for use with map_buffer
        a = cl.realign_array(a, queue.device.memalign, numpy)
        b = cl.realign_array(b, queue.device.memalign, numpy)
        c = numpy.array([1.2345], dtype=numpy.float32)
        # Host-side reference value for the regions covered by the sub-buffers.
        # NOTE(review): presumably mirrors the kernel in self.src_test - confirm.
        d = a[1024:1024 + 4096] + b[2048:2048 + 4096] * c[0]

        # Create buffers
        # NOTE(review): create_sub_buffer arguments look like (origin, size) in
        # bytes: 4096 bytes == float32 item 1024, 8192 bytes == item 2048 and
        # 16384 bytes == 4096 items, matching the slices used for d - confirm.
        a_parent_ = ctx.create_buffer(
            cl.CL_MEM_READ_WRITE | cl.CL_MEM_USE_HOST_PTR, a)
        self.assertEqual(a_parent_._n_refs, 1)
        a_ = a_parent_.create_sub_buffer(4096, 16384)
        # Creating a sub-buffer is expected to raise the parent's count to 2.
        self.assertEqual(a_parent_._n_refs, 2)
        self.assertEqual(a_._n_refs, 1)
        b_parent_ = ctx.create_buffer(
            cl.CL_MEM_READ_WRITE | cl.CL_MEM_USE_HOST_PTR, b)
        self.assertEqual(b_parent_._n_refs, 1)
        b_ = b_parent_.create_sub_buffer(8192, 16384)
        self.assertEqual(b_parent_._n_refs, 2)
        self.assertEqual(b_._n_refs, 1)

        # Set kernel arguments
        krn.set_args(a_, b_, c[0:1])

        # Execute kernel
        # One work item per float32 element of the sub-buffer.
        global_size = [4096]
        local_size = None
        queue.execute_kernel(krn, global_size, local_size, need_event=False)

        # Get results back from the device by map_buffer
        ev, ptr = queue.map_buffer(a_, cl.CL_MAP_READ, a_.size)
        del ev
        queue.unmap_buffer(a_, ptr).wait()
        # a is the host array behind a_parent_ (CL_MEM_USE_HOST_PTR), so the
        # sub-buffer's region is compared through the matching slice of a.
        self.assertLess(
            numpy.fabs(a[1024:1024 + 4096] - d).max(), 0.0001,
            "Incorrect result after map_buffer")

        # Get results back from the device by read_buffer
        aa = numpy.zeros(4096, dtype=numpy.float32)
        queue.read_buffer(a_, aa)
        self.assertLess(
            numpy.fabs(aa - d).max(), 0.0001,
            "Incorrect result after read_buffer")

        # After dropping a sub-buffer the parent's count may be 1 or 2; both
        # are accepted and the observed value is logged.
        # NOTE(review): presumably depends on when the sub-buffer object is
        # actually released - confirm.
        del b_
        self.assertIn(b_parent_._n_refs, (1, 2))
        logging.info(
            "test_create_sub_buffer: "
            "b_parent_._n_refs = %d (expected 1 or 2)", b_parent_._n_refs)
        del a_
        self.assertIn(a_parent_._n_refs, (1, 2))
        logging.info(
            "test_create_sub_buffer: "
            "a_parent_._n_refs = %d (expected 1 or 2)", a_parent_._n_refs)
Example #9
0
    def test_api_numpy(self):
        """Run the "test" kernel three times and check each result.

        The buffer over ``a`` is first used as created, then refilled through
        a write mapping, then refilled through write_buffer.  After every run
        the output is compared with the host-side reference ``d``.
        """
        import numpy
        # Create platform, context, program, kernel and queue
        platforms = cl.Platforms()
        ctx = platforms.create_some_context()
        prg = ctx.create_program(self.src_test, self.include_dirs)
        krn = prg.get_kernel("test")
        queue = ctx.create_queue(ctx.devices[0])

        # Create arrays with some values for testing
        a = numpy.arange(100000, dtype=numpy.float32)
        b = numpy.cos(a)
        a = numpy.sin(a)
        # Keep the original input so the buffer can be refilled later.
        a_copy = a.copy()

        # Prepare arrays for use with map_buffer
        a = cl.realign_array(a, queue.device.memalign, numpy)
        b = cl.realign_array(b, queue.device.memalign, numpy)
        c = numpy.array([1.2345], dtype=numpy.float32)
        # Host-side reference value.
        # NOTE(review): presumably mirrors the kernel in self.src_test - confirm.
        d = a + b * c[0]

        # Create buffers
        a_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_USE_HOST_PTR,
                               a)
        b_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_USE_HOST_PTR,
                               b)

        # Set kernel arguments
        krn.set_args(a_, b_, c[0:1])

        # Execute kernel
        global_size = [a.size]
        local_size = None
        queue.execute_kernel(krn, global_size, local_size, need_event=False)

        # Get results back from the device by map_buffer
        ev, ptr = queue.map_buffer(a_, cl.CL_MAP_READ, a.nbytes)
        del ev
        queue.unmap_buffer(a_, ptr).wait()
        # a is the host array behind a_ (CL_MEM_USE_HOST_PTR).
        self.assertLess(
            numpy.fabs(a - d).max(), 0.0001,
            "Incorrect result after map_buffer")

        # Get results back from the device by read_buffer
        aa = numpy.zeros(a.shape, dtype=a.dtype)
        queue.read_buffer(a_, aa)
        self.assertLess(
            numpy.fabs(aa - d).max(), 0.0001,
            "Incorrect result after read_buffer")

        # Refill buffer with stored copy by map_buffer with event
        # Devices reporting a version below the 1.1999 threshold get
        # CL_MAP_WRITE; the others get CL_MAP_WRITE_INVALIDATE_REGION.
        ev, ptr = queue.map_buffer(
            a_,
            cl.CL_MAP_WRITE if queue.device.version < 1.1999 else
            cl.CL_MAP_WRITE_INVALIDATE_REGION,
            a.nbytes,
            blocking=False,
            need_event=True)
        # The map is non-blocking, so wait for it before writing to a.
        ev.wait()
        a[:] = a_copy[:]
        ev = queue.unmap_buffer(a_, ptr)

        # Execute kernel
        # The kernel is passed the unmap event via wait_for.
        ev = queue.execute_kernel(krn,
                                  global_size,
                                  local_size,
                                  wait_for=(ev, ))
        # Get results back from the device by map_buffer
        ev, ptr = queue.map_buffer(a_,
                                   cl.CL_MAP_READ,
                                   a.nbytes,
                                   wait_for=(ev, ),
                                   need_event=True)
        ev.wait()
        queue.unmap_buffer(a_, ptr).wait()
        self.assertLess(
            numpy.fabs(a - d).max(), 0.0001,
            "Incorrect result after map_buffer")

        # Refill buffer with stored copy by write_buffer
        ev = queue.write_buffer(a_, a_copy, blocking=False, need_event=True)

        # Execute kernel
        # The kernel is passed the write event via wait_for.
        ev = queue.execute_kernel(krn,
                                  global_size,
                                  local_size,
                                  wait_for=(ev, ))
        # Get results back from the device by map_buffer
        ev, ptr = queue.map_buffer(a_,
                                   cl.CL_MAP_READ,
                                   a.nbytes,
                                   wait_for=(ev, ),
                                   need_event=True)
        ev.wait()
        queue.unmap_buffer(a_, ptr).wait()
        self.assertLess(
            numpy.fabs(a - d).max(), 0.0001,
            "Incorrect result after map_buffer")
Example #10
0
    def test_create_sub_buffer(self):
        """Check kernel results and ``_n_refs`` counts when using sub-buffers.

        One sub-buffer is created from each parent buffer, the "test" kernel
        is run on the two sub-buffers, and the output is compared with a
        reference computed on the host from the matching array slices.
        """
        import numpy
        # Create platform, context, program, kernel and queue
        platforms = cl.Platforms()
        ctx = platforms.create_some_context()
        prg = ctx.create_program(self.src_test, self.include_dirs)
        krn = prg.get_kernel("test")
        queue = ctx.create_queue(ctx.devices[0])

        # Create arrays with some values for testing
        a = numpy.arange(100000, dtype=numpy.float32)
        b = numpy.cos(a)
        a = numpy.sin(a)

        # Prepare arrays for use with map_buffer
        a = cl.realign_array(a, queue.device.memalign, numpy)
        b = cl.realign_array(b, queue.device.memalign, numpy)
        c = numpy.array([1.2345], dtype=numpy.float32)
        # Reference value for the slices that the sub-buffers cover.
        # NOTE(review): presumably mirrors the kernel in self.src_test - confirm.
        d = a[1024:1024 + 4096] + b[2048:2048 + 4096] * c[0]

        # Create buffers
        # NOTE(review): the sub-buffer arguments appear to be byte offset and
        # byte size (4096 B -> item 1024, 8192 B -> item 2048, 16384 B -> 4096
        # float32 items), consistent with the slices above - confirm.
        a_parent_ = ctx.create_buffer(
            cl.CL_MEM_READ_WRITE | cl.CL_MEM_USE_HOST_PTR, a)
        self.assertEqual(a_parent_._n_refs, 1)
        a_ = a_parent_.create_sub_buffer(4096, 16384)
        # The parent's count is expected to be 2 once a sub-buffer exists.
        self.assertEqual(a_parent_._n_refs, 2)
        self.assertEqual(a_._n_refs, 1)
        b_parent_ = ctx.create_buffer(
            cl.CL_MEM_READ_WRITE | cl.CL_MEM_USE_HOST_PTR, b)
        self.assertEqual(b_parent_._n_refs, 1)
        b_ = b_parent_.create_sub_buffer(8192, 16384)
        self.assertEqual(b_parent_._n_refs, 2)
        self.assertEqual(b_._n_refs, 1)

        # Set kernel arguments
        krn.set_args(a_, b_, c[0:1])

        # Execute kernel
        # One work item per float32 element of the sub-buffer.
        global_size = [4096]
        local_size = None
        queue.execute_kernel(krn, global_size, local_size, need_event=False)

        # Get results back from the device by map_buffer
        ev, ptr = queue.map_buffer(a_, cl.CL_MAP_READ, a_.size)
        del ev
        queue.unmap_buffer(a_, ptr).wait()
        # The parent buffer uses a as host memory (CL_MEM_USE_HOST_PTR), so
        # the sub-buffer's region is checked through the slice of a.
        self.assertLess(numpy.fabs(a[1024:1024 + 4096] - d).max(), 0.0001,
                        "Incorrect result after map_buffer")

        # Get results back from the device by read_buffer
        aa = numpy.zeros(4096, dtype=numpy.float32)
        queue.read_buffer(a_, aa)
        self.assertLess(numpy.fabs(aa - d).max(), 0.0001,
                        "Incorrect result after read_buffer")

        # Once a sub-buffer name is deleted the parent's count may be 1 or 2;
        # either is accepted and the observed value is logged.
        # NOTE(review): presumably depends on when the sub-buffer object is
        # actually released - confirm.
        del b_
        self.assertIn(b_parent_._n_refs, (1, 2))
        logging.info("test_create_sub_buffer: "
                     "b_parent_._n_refs = %d (expected 1 or 2)",
                     b_parent_._n_refs)
        del a_
        self.assertIn(a_parent_._n_refs, (1, 2))
        logging.info("test_create_sub_buffer: "
                     "a_parent_._n_refs = %d (expected 1 or 2)",
                     a_parent_._n_refs)
Example #11
0
    def test_api_numpy(self):
        """Check kernel output after three ways of filling the input buffer.

        Runs the "test" kernel on the buffer as created, after refilling it
        through a write mapping, and after refilling it with write_buffer.
        Each result is compared with the host-side reference ``d``.
        """
        import numpy
        # Create platform, context, program, kernel and queue
        platforms = cl.Platforms()
        ctx = platforms.create_some_context()
        prg = ctx.create_program(self.src_test, self.include_dirs)
        krn = prg.get_kernel("test")
        queue = ctx.create_queue(ctx.devices[0])

        # Create arrays with some values for testing
        a = numpy.arange(100000, dtype=numpy.float32)
        b = numpy.cos(a)
        a = numpy.sin(a)
        # Saved input, used further down to restore the buffer contents.
        a_copy = a.copy()

        # Prepare arrays for use with map_buffer
        a = cl.realign_array(a, queue.device.memalign, numpy)
        b = cl.realign_array(b, queue.device.memalign, numpy)
        c = numpy.array([1.2345], dtype=numpy.float32)
        # Host-side reference value.
        # NOTE(review): presumably mirrors the kernel in self.src_test - confirm.
        d = a + b * c[0]

        # Create buffers
        a_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_USE_HOST_PTR,
                               a)
        b_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_USE_HOST_PTR,
                               b)

        # Set kernel arguments
        krn.set_args(a_, b_, c[0:1])

        # Execute kernel
        global_size = [a.size]
        local_size = None
        queue.execute_kernel(krn, global_size, local_size, need_event=False)

        # Get results back from the device by map_buffer
        ev, ptr = queue.map_buffer(a_, cl.CL_MAP_READ, a.nbytes)
        del ev
        queue.unmap_buffer(a_, ptr).wait()
        # a is the host array behind a_ (CL_MEM_USE_HOST_PTR).
        self.assertLess(numpy.fabs(a - d).max(), 0.0001,
                        "Incorrect result after map_buffer")

        # Get results back from the device by read_buffer
        aa = numpy.zeros(a.shape, dtype=a.dtype)
        queue.read_buffer(a_, aa)
        self.assertLess(numpy.fabs(aa - d).max(), 0.0001,
                        "Incorrect result after read_buffer")

        # Refill buffer with stored copy by map_buffer with event
        # CL_MAP_WRITE is used when the device version is below the 1.1999
        # threshold, CL_MAP_WRITE_INVALIDATE_REGION otherwise.
        ev, ptr = queue.map_buffer(
            a_, cl.CL_MAP_WRITE if queue.device.version < 1.1999
            else cl.CL_MAP_WRITE_INVALIDATE_REGION, a.nbytes,
            blocking=False, need_event=True)
        # The map is non-blocking, so wait for it before writing to a.
        ev.wait()
        a[:] = a_copy[:]
        ev = queue.unmap_buffer(a_, ptr)

        # Execute kernel
        # The kernel is passed the unmap event via wait_for.
        ev = queue.execute_kernel(krn, global_size, local_size, wait_for=(ev,))
        # Get results back from the device by map_buffer
        ev, ptr = queue.map_buffer(a_, cl.CL_MAP_READ, a.nbytes,
                                   wait_for=(ev,), need_event=True)
        ev.wait()
        queue.unmap_buffer(a_, ptr).wait()
        self.assertLess(numpy.fabs(a - d).max(), 0.0001,
                        "Incorrect result after map_buffer")

        # Refill buffer with stored copy by write_buffer
        ev = queue.write_buffer(a_, a_copy, blocking=False, need_event=True)

        # Execute kernel
        # The kernel is passed the write event via wait_for.
        ev = queue.execute_kernel(krn, global_size, local_size, wait_for=(ev,))
        # Get results back from the device by map_buffer
        ev, ptr = queue.map_buffer(a_, cl.CL_MAP_READ, a.nbytes,
                                   wait_for=(ev,), need_event=True)
        ev.wait()
        queue.unmap_buffer(a_, ptr).wait()
        self.assertLess(numpy.fabs(a - d).max(), 0.0001,
                        "Incorrect result after map_buffer")
Example #12
0
 def cuda_realign_mem(self):
     """Store a copy of ``self._mem`` realigned to 4096 in ``self.mem``.

     A numpy array with contiguous memory layout is expected, hence the
     realignment; PAGE-boundary alignment may also increase speed.
     """
     realigned = cl.realign_array(self._mem, 4096, numpy)
     self.mem = realigned