Example #1
0
    def __call__(self, input, u, v):
        """Run ``self.kernel`` on ``input``, ``u`` and ``v`` and wrap the result.

        The ``.data`` arrays of the three operands are uploaded and bound as
        kernel arguments 0-2; a zero-filled array shaped like ``input.data``
        is uploaded and bound as argument 3. After all uploads have finished
        the kernel is enqueued over ``self.global_size`` and the argument-3
        buffer is read back with ``buffer_to_ndarray``.

        :param input: object whose ``.data`` array becomes kernel argument 0.
        :param u: object whose ``.data`` array becomes kernel argument 1.
        :param v: object whose ``.data`` array becomes kernel argument 2.
        :return: ``Array(unique_name(), result)`` built from the result array.
        """
        result = zeros_like(input.data)
        uploads = []
        # Keep every device buffer referenced until the method returns, so
        # none of them can be released while the kernel still needs it.
        device_bufs = []
        for position, operand in enumerate((input, u, v)):
            device_buf, upload_evt = buffer_from_ndarray(
                self.queue, operand.data, blocking=False)
            uploads.append(upload_evt)
            self.kernel.setarg(position, device_buf, sizeof(cl_mem))
            device_bufs.append(device_buf)

        result_buf, upload_evt = buffer_from_ndarray(
            self.queue, result, blocking=False)
        uploads.append(upload_evt)
        self.kernel.setarg(3, result_buf, sizeof(cl_mem))

        # All four transfers were enqueued non-blocking; finish them first.
        clWaitForEvents(*uploads)
        clEnqueueNDRangeKernel(
            self.queue, self.kernel, self.global_size).wait()

        _, read_evt = buffer_to_ndarray(self.queue, result_buf, result)
        read_evt.wait()
        return Array(unique_name(), result)
Example #2
0
 def __call__(self, A):
     """Run the compiled function on ``A`` and return one value per work group.

     An ``int32`` output array with ``ceil(len(A) / WORK_GROUP_SIZE)``
     elements is allocated, both arrays are uploaded, ``self._c_function``
     is invoked on the two device buffers, and the output buffer is read
     back.

     :param A: input array; only ``len(A)`` and its contents are used here.
     :return: the array returned by ``cl.buffer_to_ndarray`` for the output
         buffer.
     """
     # Integer ceiling division: exact, never yields a float size, and does
     # not depend on whether ``/`` is true or floor division.
     # Assumes WORK_GROUP_SIZE is a positive integer -- TODO confirm.
     num_groups = (len(A) + WORK_GROUP_SIZE - 1) // WORK_GROUP_SIZE
     output_array = np.empty(num_groups, np.int32)
     buf, evt = cl.buffer_from_ndarray(self.queue, A, blocking=False)
     output_buffer, output_evt = cl.buffer_from_ndarray(self.queue, output_array, blocking=False)
     self._c_function(self.queue, self.kernel, buf, output_buffer)
     B, read_evt = cl.buffer_to_ndarray(self.queue, output_buffer, like=output_array)
     # Make sure the read has completed before handing the array back;
     # this returns immediately if the read already finished.
     read_evt.wait()
     return B
Example #3
0
 def process_output(self, output, out_like=None):
     """Turn a raw result into a host-side value.

     :param output: either a ``cl.cl_mem`` buffer or an object exposing a
         ``.value`` attribute.
     :param out_like: forwarded as ``like=`` to ``cl.buffer_to_ndarray``
         when ``output`` is a buffer; ignored otherwise.
     :return: the array read from the buffer (after its read event has
         completed), or ``output.value`` for non-buffer results.
     """
     # Anything that is not a device buffer just carries its value.
     if not isinstance(output, cl.cl_mem):
         return output.value

     host_array, read_evt = cl.buffer_to_ndarray(
         self.queue, output, like=out_like)
     read_evt.wait()
     return host_array
Example #4
0
    def __call__(self, *args):
        """Execute the compiled stencil function on ``args``.

        An output container matching ``args[0]`` is allocated
        (``empty_like`` for an ``hmarray``, ``np.zeros_like`` otherwise).
        Every argument plus the output is turned into a device buffer:
        ``hmarray`` instances contribute their existing ``ocl_buf``, anything
        else is uploaded with ``buffer_from_ndarray``. ``self._c_function``
        is then called with the queue, the kernel(s) and all buffers.

        :param *args: the stencil inputs; ``args[0]`` determines the type and
            shape of the output, so at least one argument is required.
        :return: the output ``hmarray`` itself, or, for non-``hmarray``
            outputs, the array returned by ``buffer_to_ndarray`` after the
            read event has completed.
        :raises StencilException: if the compiled function returns anything
            other than ``CL_SUCCESS``.
        """
        if isinstance(args[0], hmarray):
            output = empty_like(args[0])
        else:
            output = np.zeros_like(args[0])

        buffers = []
        events = []
        for arg in args + (output, ):
            if isinstance(arg, hmarray):
                buffers.append(arg.ocl_buf)
            else:
                buf, evt = buffer_from_ndarray(self.queue, arg, blocking=True)
                events.append(evt)
                buffers.append(buf)

        # When every argument is an hmarray nothing was uploaded; OpenCL
        # rejects a wait on an empty event list, so skip it in that case.
        if events:
            cl.clWaitForEvents(*events)

        # Forward however many kernels there are, so no kernel count can
        # silently skip execution and return an untouched output.
        if isinstance(self.kernel, list):
            kernels = list(self.kernel)
        else:
            kernels = [self.kernel]
        cl_error = self._c_function(self.queue, *(kernels + buffers))

        if cl.cl_errnum(cl_error) != cl.cl_errnum.CL_SUCCESS:
            raise StencilException(
                "Error executing stencil kernel: opencl {} {}".format(
                    cl_error,
                    cl.cl_errnum(cl_error)
                )
            )
        if isinstance(output, hmarray):
            return output

        # The output buffer is always the last one appended above.
        buf, evt = buffer_to_ndarray(
            self.queue, buffers[-1], output
        )
        evt.wait()

        return buf
Example #5
0
    def gpu_to_device(self, device=get_gpu(), wait=True, force=False):
        """Read this object's buffer for ``device`` back into ``self``.

        The transfer is skipped unless the ``"host"`` dirty check is true or
        ``force`` is set. Afterwards the ``'host'`` dirty flag is cleared.

        :param device: device whose queue and buffer (looked up by
            ``device.value``) are used. The default is the result of
            ``get_gpu()`` evaluated once, when the method is defined.
        :param wait: when true, block on the read event; otherwise the event
            is appended to the pending list instead.
        :param force: perform the read even if the dirty check is false.
        """
        if not (self.__is_dirty("host") or force):
            return
        print("GPU to DEVICE")

        queue = self.get_queue(device)
        device_buffer = self.__buffers[device.value]
        _, read_evt = pycl.buffer_to_ndarray(queue, device_buffer, out=self)

        if not wait:
            # Caller chose not to block; keep the event for later.
            self.__waiting.append(read_evt)
        else:
            read_evt.wait()

        self.set_dirty('host', False)
Example #6
0
    def __call__(self, im):
        """Run ``self.kernel`` on ``im`` and wrap the result in an ``Array``.

        ``im.data`` is uploaded and bound as kernel argument 0; a device
        buffer of ``output.nbytes`` bytes is created on ``self.context`` and
        bound as argument 1. The kernel is enqueued over ``self.global_size``
        and the second buffer is read back with ``buffer_to_ndarray``.

        :param im: object whose ``.data`` array is the kernel input.
        :return: ``Array(unique_name(), result)`` built from the result array.
        """
        result = zeros_like(im.data)

        src_buf, upload_evt = buffer_from_ndarray(
            self.queue, im.data, blocking=False)
        upload_evt.wait()
        self.kernel.setarg(0, src_buf, sizeof(cl_mem))

        dst_buf = clCreateBuffer(self.context, result.nbytes)
        self.kernel.setarg(1, dst_buf, sizeof(cl_mem))

        clEnqueueNDRangeKernel(
            self.queue, self.kernel, self.global_size).wait()

        _, read_evt = buffer_to_ndarray(self.queue, dst_buf, result)
        read_evt.wait()

        # Drop the local references to both device buffers explicitly.
        del src_buf, dst_buf
        return Array(unique_name(), result)
Example #7
0
    def __call__(self, im, num_powers, border):
        """Run ``self.kernel`` once, then ``self.kernel2`` once per power.

        The result is a ``float32`` array of shape
        ``[num_powers] + list(im.shape)``. ``self.kernel`` receives the
        uploaded ``im.data`` (argument 0) and the output buffer (argument 1).
        ``self.kernel2`` receives the output buffer (argument 0) and the
        current power index (argument 1), and is enqueued and waited on for
        every index in ``range(num_powers)``.

        :param im: object providing ``.shape`` and the ``.data`` array to
            upload.
        :param num_powers: leading dimension of the result and number of
            ``kernel2`` launches.
        :param border: accepted but not used by this method.
        :return: ``Array(unique_name(), result)`` built from the result array.
        """
        result_shape = [num_powers] + list(im.shape)
        result = np.empty(result_shape, dtype=np.float32)

        src_buf, upload_evt = buffer_from_ndarray(
            self.queue, im.data, blocking=False)
        upload_evt.wait()
        self.kernel.setarg(0, src_buf, sizeof(cl_mem))

        dst_buf = clCreateBuffer(self.queue.context, result.nbytes)
        self.kernel.setarg(1, dst_buf, sizeof(cl_mem))

        clEnqueueNDRangeKernel(
            self.queue, self.kernel, self.global_size).wait()

        # Second stage operates on the first stage's output buffer.
        self.kernel2.setarg(0, dst_buf, sizeof(cl_mem))
        for exponent in range(num_powers):
            self.kernel2.setarg(1, exponent, sizeof(cl_int))
            clEnqueueNDRangeKernel(
                self.queue, self.kernel2, self.global_size).wait()

        _, read_evt = buffer_to_ndarray(self.queue, dst_buf, result)
        read_evt.wait()
        return Array(unique_name(), result)
Example #8
0
    def __call__(self, A):
        """Run the compiled function on ``A``, print its timing, return a scalar.

        A one-element output array of ``A.dtype`` is allocated, both arrays
        are uploaded, and ``self._c_function`` is called on the two device
        buffers. The wall-clock time spent inside that call is printed, the
        output buffer is read back, and its first element is returned.

        :param A: input array; ``A.dtype`` determines the output dtype.
        :return: element 0 of the array read back from the output buffer.
        """
        # Initialization and copy from CPU to GPU.
        output_array = np.empty(1, A.dtype)
        buf, evt = cl.buffer_from_ndarray(self.queue, A, blocking=False)
        output_buffer, output_evt = cl.buffer_from_ndarray(self.queue, output_array, blocking=False)
        # The uploads were enqueued non-blocking; finish them here so their
        # cost cannot leak into the interval reported as excluding copy time.
        evt.wait()
        output_evt.wait()

        # Time only the call into the compiled function.
        start = time.time()
        self._c_function(self.queue, self.kernel, buf, output_buffer)
        stop = time.time()

        # Copy the result back from the GPU to the CPU.
        B, read_evt = cl.buffer_to_ndarray(self.queue, output_buffer, like=output_array)
        # Make sure the read has completed before indexing the array; this
        # returns immediately if the read already finished.
        read_evt.wait()

        print("True SEJITS Time (excluding copy time): {0} seconds".format(stop - start))
        return B[0]
Example #9
0
 def process_output(self, out_buf, output):
     """Read ``out_buf`` back via ``output.data`` and return ``output``.

     :param out_buf: device buffer passed to ``buffer_to_ndarray``.
     :param output: object whose ``.data`` is passed as the third argument
         of ``buffer_to_ndarray``; returned unchanged once the read event
         has completed.
     :return: the same ``output`` object that was passed in.
     """
     host_target = output.data
     _unused, read_evt = buffer_to_ndarray(self.queue, out_buf, host_target)
     read_evt.wait()
     return output
 def sync_host(self):
     """Read ``self.ocl_buf`` back into ``self`` when the OpenCL backend is active.

     Nothing is done unless ``backend`` is one of ``"ocl"``, ``"opencl"`` or
     ``"OCL"``, and nothing is done when the ``HM_BACKEND`` environment
     variable is ``"omp"`` or ``"openmp"``. Otherwise the buffer is read
     with ``cl.buffer_to_ndarray`` and the read event is waited on.
     """
     if backend not in {"ocl", "opencl", "OCL"}:
         return
     # The environment variable can veto the transfer.
     if os.environ.get("HM_BACKEND") in {"omp", "openmp"}:
         return
     _, read_evt = cl.buffer_to_ndarray(queue, self.ocl_buf, self)
     read_evt.wait()
Example #11
0
 def __call__(self, A):
     """Run the compiled function on ``A`` and return the read-back buffer.

     ``A`` is uploaded to a device buffer, ``self._c_function`` is called
     with the queue, the kernel and that buffer, and the same buffer is then
     read back into an array created with ``like=A``.

     :param A: input array to upload; also the template for the result.
     :return: the array returned by ``cl.buffer_to_ndarray``.
     """
     buf, evt = cl.buffer_from_ndarray(self.queue, A, blocking=False)
     self._c_function(self.queue, self.kernel, buf)
     B, read_evt = cl.buffer_to_ndarray(self.queue, buf, like=A)
     # Make sure the read has completed before handing the array back;
     # this returns immediately if the read already finished.
     read_evt.wait()
     return B