Ejemplo n.º 1
0
    def __call__(self, input, u, v):
        """Run the kernel on *input*, *u* and *v*; return a new Array.

        Each host array (plus a zeroed output placeholder) is uploaded
        asynchronously and bound to its kernel argument slot; the result
        is read back into the placeholder once the kernel finishes.
        """
        result = zeros_like(input.data)
        pending = []
        device_bufs = []
        # Upload every operand without blocking; slot order matches the
        # kernel's argument order (input, u, v, output).
        for slot, host_array in enumerate((input.data, u.data, v.data,
                                           result)):
            dev_buf, copy_evt = buffer_from_ndarray(self.queue, host_array,
                                                    blocking=False)
            pending.append(copy_evt)
            device_bufs.append(dev_buf)
            self.kernel.setarg(slot, dev_buf, sizeof(cl_mem))
        # All async uploads must land before the kernel may run.
        clWaitForEvents(*pending)
        run_evt = clEnqueueNDRangeKernel(self.queue, self.kernel,
                                         self.global_size)
        run_evt.wait()
        _, read_evt = buffer_to_ndarray(self.queue, device_bufs[-1], result)
        read_evt.wait()
        return Array(unique_name(), result)
Ejemplo n.º 2
0
 def __call__(self, A):
     """Reduce *A* on the device.

     :param A: input ndarray; one int32 partial result is produced per
         work-group of WORK_GROUP_SIZE elements.
     :return: int32 ndarray of per-work-group results.
     """
     output_array = np.empty(ceil(len(A) / WORK_GROUP_SIZE), np.int32)
     buf, in_evt = cl.buffer_from_ndarray(self.queue, A, blocking=False)
     output_buffer, out_evt = cl.buffer_from_ndarray(self.queue, output_array, blocking=False)
     # Both uploads were enqueued non-blocking: fence them before the
     # kernel reads the buffers (the original never waited on them).
     cl.clWaitForEvents(in_evt, out_evt)
     self._c_function(self.queue, self.kernel, buf, output_buffer)
     B, evt = cl.buffer_to_ndarray(self.queue, output_buffer, like=output_array)
     # Ensure the device-to-host copy completed before B is used.
     evt.wait()
     return B
Ejemplo n.º 3
0
 def process_output(self, output, out_like=None):
     """Return *output* as a host value.

     Device buffers are copied back into an ndarray shaped like
     *out_like*; anything else is assumed to be a ctypes-style scalar
     and unwrapped via ``.value``.
     """
     if not isinstance(output, cl.cl_mem):
         return output.value
     host_array, copy_evt = cl.buffer_to_ndarray(self.queue, output,
                                                 like=out_like)
     copy_evt.wait()
     return host_array
Ejemplo n.º 4
0
    def __call__(self, *args):
        """Execute the stencil kernel(s) over *args*.

        Each positional input (plus a freshly allocated output) is bound
        to a device buffer; hmarray arguments already own a device buffer
        and are reused without copying.

        :param *args: input arrays (hmarray or ndarray); args[0] defines
            the output's shape/dtype.
        :return: the output hmarray (still device-resident) or a plain
            ndarray copied back to the host.
        :raises StencilException: if the OpenCL launch reports an error.
        """
        if isinstance(args[0], hmarray):
            output = empty_like(args[0])
        else:
            output = np.zeros_like(args[0])
        buffers = []
        events = []
        for arg in args + (output, ):
            if isinstance(arg, hmarray):
                # hmarrays keep a live device-side buffer; reuse it.
                buffers.append(arg.ocl_buf)
            else:
                buf, evt = buffer_from_ndarray(self.queue, arg, blocking=True)
                events.append(evt)
                buffers.append(buf)
        cl.clWaitForEvents(*events)
        # self.kernel is either one kernel or a list of cooperating
        # kernels.  Unpack uniformly so any kernel count works — the old
        # explicit 2/3/4 branches silently skipped execution (leaving
        # cl_error = 0, i.e. reporting success) for any other length.
        if isinstance(self.kernel, list):
            cl_error = self._c_function(
                self.queue, *(list(self.kernel) + list(buffers)))
        else:
            cl_error = self._c_function(self.queue, self.kernel, *buffers)

        if cl.cl_errnum(cl_error) != cl.cl_errnum.CL_SUCCESS:
            raise StencilException(
                "Error executing stencil kernel: opencl {} {}".format(
                    cl_error,
                    cl.cl_errnum(cl_error)
                )
            )
        if isinstance(output, hmarray):
            return output
        buf, evt = buffer_to_ndarray(
            self.queue, buffers[-1], output
        )
        evt.wait()

        return buf
Ejemplo n.º 5
0
    def gpu_to_device(self, device=None, wait=True, force=False):
        """Copy this array's device buffer back into host memory.

        :param device: target device; resolved via get_gpu() at call
            time when None.  (The old ``device=get_gpu()`` default was
            evaluated once at import, freezing the device choice — and
            failing at import time if no GPU was available.)
        :param wait: block until the copy completes; otherwise the copy
            event is queued on the waiting list for a later sync.
        :param force: copy even when the host side is not marked dirty.
        """
        if device is None:
            device = get_gpu()
        if not self.__is_dirty("host") and not force:
            return
        print("GPU to DEVICE")
        _, evt = pycl.buffer_to_ndarray(self.get_queue(device),
                                        self.__buffers[device.value],
                                        out=self)
        if wait:
            evt.wait()
        else:
            self.__waiting.append(evt)

        self.set_dirty('host', False)
Ejemplo n.º 6
0
    def __call__(self, im):
        """Apply the kernel to *im*; return the result as a new Array."""
        result = zeros_like(im.data)
        # Stage the input on the device; the upload must complete before
        # the kernel argument is bound and the kernel launched.
        src_buf, copy_evt = buffer_from_ndarray(self.queue, im.data,
                                                blocking=False)
        copy_evt.wait()
        self.kernel.setarg(0, src_buf, sizeof(cl_mem))

        # The output lives only on the device until read back below.
        dst_buf = clCreateBuffer(self.context, result.nbytes)
        self.kernel.setarg(1, dst_buf, sizeof(cl_mem))
        run_evt = clEnqueueNDRangeKernel(self.queue, self.kernel,
                                         self.global_size)
        run_evt.wait()
        _, read_evt = buffer_to_ndarray(self.queue, dst_buf, result)
        read_evt.wait()
        # Drop the device buffers promptly rather than waiting for GC.
        del src_buf
        del dst_buf
        return Array(unique_name(), result)
Ejemplo n.º 7
0
        def get():
            # Return the host-side view of `field`, syncing from the
            # device and undoing tiling/padding as needed.
            # NOTE(review): `cl_buffer`, `buffer` and `field` are closure
            # variables from the enclosing scope (not visible here).
            if cl_buffer is not None:
                # Sync device data into the host `buffer` before reading.
                _, evt = cl.buffer_to_ndarray(latte.config.cl_queue, cl_buffer, out=buffer)
                evt.wait()
            if field in self.tiling_info:
                untiled = buffer
                if field in self.private_info:
                    # Private fields carry an extra leading axis
                    # (presumably one copy per worker) — take the first.
                    untiled = untiled[0]
                shape = untiled.shape
                tiled_shape = list(shape)
                # Rebuild the tiled shape: each tiled dim is split into
                # (dim // factor, factor).  Batch fields have a leading
                # batch axis, hence the dim += 1 shift.
                if not isinstance(self, ActivationEnsemble) or field not in ["value", "grad"]:
                    for dim, factor in self.tiling_info[field]:
                        if field in self.batch_fields:
                            dim += 1
                        tiled_shape[dim] //= factor
                        tiled_shape.append(factor)
                untiled = untiled.reshape(tiled_shape)
                # Undo tiling in reverse registration order so earlier
                # dim indices remain valid while later tiles are merged.
                for dim, _ in reversed(self.tiling_info[field]):
                    if field in self.batch_fields:
                        dim += 1
                    untiled = util.untile(untiled, dim)
                to_return = untiled
            else:
                to_return = buffer
                if "grad_" in field and "grad_inputs" not in field:
                    # Gradients (except grad_inputs) have an extra leading
                    # axis — presumably per-thread copies; expose the first.
                    to_return = to_return[0]
            # Trim spatial padding: slice(p0, -p1) strips the pad added on
            # each side of a padded axis; the leading axis is untouched.
            if field in ["value", "grad"] and any(p != (0, 0) for p in self.pad):
                _slice = [slice(None)]
                for p in self.pad:
                    if p != (0, 0):
                        _slice.append(slice(p[0], -p[1]))
                    else:
                        _slice.append(slice(None))
                to_return = to_return[tuple(_slice)]
            # Same trimming for filter padding, when present.
            if field in ["value", "grad"] and any(p != (0, 0) for p in self.filter_pad):
                _slice = [slice(None)]
                for p in self.filter_pad:
                    if p != (0, 0):
                        _slice.append(slice(p[0], -p[1]))
                    else:
                        _slice.append(slice(None))
                to_return = to_return[tuple(_slice)]

            return to_return
Ejemplo n.º 8
0
    def __call__(self, *args):
        """Run the stencil kernel over *args* and return the result.

        Every argument plus an output array is uploaded to the device and
        bound to a kernel argument; a local-memory scratch buffer is bound
        last.  The output buffer is read back and returned as an ndarray.

        :param *args: input ndarrays; args[0] defines the output shape.
        """
        if self.output is not None:
            # A caller-provided output array is consumed exactly once.
            output = self.output
            self.output = None
        else:
            output = np.zeros_like(args[0])
        self.kernel.argtypes = tuple(cl_mem
                                     for _ in args + (output, )) + (localmem, )
        bufs = []
        events = []
        for index, arg in enumerate(args + (output, )):
            buf, evt = buffer_from_ndarray(self.queue, arg, blocking=False)
            events.append(evt)
            bufs.append(buf)
            self.kernel.setarg(index, buf, sizeof(cl_mem))
        # All async uploads must complete before the kernel launches.
        cl.clWaitForEvents(*events)
        if self.device.type == cl.cl_device_type.CL_DEVICE_TYPE_GPU:
            local = 8
        else:
            local = 1
        # Local scratch size: one float per point of the work-group tile
        # plus the ghost-zone halo on both sides, in every dimension.
        localmem_size = reduce(operator.mul, (local + (self.ghost_depth * 2)
                                              for _ in range(args[0].ndim)),
                               sizeof(c_float))
        self.kernel.setarg(
            len(args) + 1, localmem(localmem_size), localmem_size)
        evt = clEnqueueNDRangeKernel(self.queue, self.kernel, self.global_size,
                                     tuple(local for _ in range(args[0].ndim)))
        evt.wait()
        buf, evt = buffer_to_ndarray(self.queue, bufs[-1], output)
        evt.wait()
        # Drop our references so the device buffers can actually be
        # released.  (The old ``for mem in bufs: del mem`` only unbound
        # the loop variable each iteration and freed nothing.)
        del bufs[:]

        return buf
Ejemplo n.º 9
0
    def __call__(self, im, num_powers, border):
        """Run the two-kernel pipeline on *im* and return an Array of
        shape [num_powers] + im.shape (float32): kernel fills the shared
        output buffer, then kernel2 runs once per power index.
        """
        result = np.empty([num_powers] + list(im.shape), dtype=np.float32)

        src_buf, copy_evt = buffer_from_ndarray(self.queue, im.data,
                                                blocking=False)
        copy_evt.wait()
        self.kernel.setarg(0, src_buf, sizeof(cl_mem))

        # Device-only output buffer, read back after the second pass.
        dst_buf = clCreateBuffer(self.queue.context, result.nbytes)
        self.kernel.setarg(1, dst_buf, sizeof(cl_mem))

        clEnqueueNDRangeKernel(self.queue, self.kernel,
                               self.global_size).wait()

        # Second kernel: one launch per power, selecting its slice via
        # the integer argument.
        self.kernel2.setarg(0, dst_buf, sizeof(cl_mem))
        for power in range(num_powers):
            self.kernel2.setarg(1, power, sizeof(cl_int))
            clEnqueueNDRangeKernel(self.queue, self.kernel2,
                                   self.global_size).wait()

        _, read_evt = buffer_to_ndarray(self.queue, dst_buf, result)
        read_evt.wait()
        return Array(unique_name(), result)
Ejemplo n.º 10
0
    def __call__(self, A):
        """Reduce *A* on the device and return the scalar result.

        Timestamps a..d bracket each phase; only the kernel-only span
        (b..c) is printed.
        """
        a = time.time()

        # Initialization and copy from CPU to GPU
        output_array = np.empty(1, A.dtype)
        buf, evt = cl.buffer_from_ndarray(self.queue, A, blocking=False)
        output_buffer, output_evt = cl.buffer_from_ndarray(self.queue, output_array, blocking=False)
        # Fence the non-blocking uploads before the kernel reads them
        # (the original never waited on these events).
        cl.clWaitForEvents(evt, output_evt)

        b = time.time()

        # Actual execution of the reduction.
        self._c_function(self.queue, self.kernel, buf, output_buffer)
        c = time.time()

        # Copying the result back from the GPU to the CPU
        B, evt = cl.buffer_to_ndarray(self.queue, output_buffer, like=output_array)
        # Without this wait, B[0] could be read before the copy lands.
        evt.wait()
        d = time.time()

        # The true time of execution, excluding copy time, is between b and c.
        print ("True SEJITS Time (excluding copy time): {0} seconds".format(c - b))

        # print("overall execution:", d-a, "Initial Copy:", b-a, "Kernel execution:", c-b, "Final Copy:", d-c)
        return B[0]
Ejemplo n.º 11
0
 def __call__(self, A):
     """Run the kernel in-place on a device copy of *A*.

     :param A: input ndarray.
     :return: ndarray shaped like *A* holding the kernel's result.
     """
     buf, evt = cl.buffer_from_ndarray(self.queue, A, blocking=False)
     # The non-blocking upload must finish before the kernel reads it
     # (the original discarded this event without waiting).
     evt.wait()
     self._c_function(self.queue, self.kernel, buf)
     B, evt = cl.buffer_to_ndarray(self.queue, buf, like=A)
     # Ensure the device-to-host copy completed before returning B.
     evt.wait()
     return B
Ejemplo n.º 12
0
 def process_output(self, out_buf, output):
     """Copy *out_buf* back from the device into output.data, then
     return *output*."""
     copy_evt = buffer_to_ndarray(self.queue, out_buf, output.data)[1]
     copy_evt.wait()
     return output
Ejemplo n.º 13
0
 def sync_host(self):
     """Copy the device buffer back into this array when the OpenCL
     backend is active; no-op for other backends or under OpenMP."""
     if backend not in {"ocl", "opencl", "OCL"}:
         return
     if os.environ.get("HM_BACKEND") in {"omp", "openmp"}:
         return
     _, evt = cl.buffer_to_ndarray(queue, self.ocl_buf, self)
     evt.wait()
Ejemplo n.º 14
0
 def sync_host(self):
     """Synchronize host memory with the device buffer under the OpenCL
     backend; does nothing otherwise (or when HM_BACKEND forces OpenMP)."""
     if backend in {"ocl", "opencl", "OCL"}:
         if os.environ.get("HM_BACKEND") not in {'omp', 'openmp'}:
             _, evt = cl.buffer_to_ndarray(queue, self.ocl_buf, self)
             evt.wait()
Ejemplo n.º 15
0
 def __call__(self, A):
     """Run the kernel in-place on a device copy of *A*.

     :param A: input ndarray.
     :return: ndarray shaped like *A* holding the kernel's result.
     """
     buf, evt = cl.buffer_from_ndarray(self.queue, A, blocking=False)
     # The non-blocking upload must finish before the kernel reads it
     # (the original discarded this event without waiting).
     evt.wait()
     self._c_function(self.queue, self.kernel, buf)
     B, evt = cl.buffer_to_ndarray(self.queue, buf, like=A)
     # Ensure the device-to-host copy completed before returning B.
     evt.wait()
     return B