Beispiel #1
0
 def process_args(self, *args):
     processed = []
     events = []
     output = ct.c_int()
     out_like = None
     for arg in args:
         if isinstance(arg, np.ndarray):
             buf, evt = cl.buffer_from_ndarray(self.queue, arg,
                                               blocking=False)
             processed.append(buf)
             events.append(evt)
             output = buf.empty_like_this()
             out_like = arg
         else:
             if isinstance(arg, int):
                 processed.append(arg)
             elif isinstance(arg, float) and isinstance(output, ct.c_int):
                 processed.append(arg)
                 output = ct.c_float()
             else:
                 raise NotImplementedError(
                     "UnsupportedType: %s" % type(arg)
                 )
     if self.output is not None:
         output, evt = cl.buffer_from_ndarray(self.queue, self.output,
                                              blocking=False)
         out_like = self.output
         evt.wait()
     if isinstance(output, cl.cl_mem):
         processed.append(output)
     else:
         processed.append(output.byref)
     cl.clWaitForEvents(*events)
     return processed, output, out_like
Beispiel #2
0
    def __call__(self, input, u, v):
        output = zeros_like(input.data)
        events = []
        in_buf, in_evt = buffer_from_ndarray(self.queue, input.data,
                                             blocking=False)
        events.append(in_evt)
        self.kernel.setarg(0, in_buf, sizeof(cl_mem))

        u_buf, u_evt = buffer_from_ndarray(self.queue, u.data, blocking=False)
        events.append(u_evt)
        self.kernel.setarg(1, u_buf, sizeof(cl_mem))

        v_buf, v_evt = buffer_from_ndarray(self.queue, v.data, blocking=False)
        events.append(v_evt)
        self.kernel.setarg(2, v_buf, sizeof(cl_mem))

        out_buf, out_evt = buffer_from_ndarray(self.queue, output,
                                               blocking=False)
        events.append(out_evt)
        self.kernel.setarg(3, out_buf, sizeof(cl_mem))
        clWaitForEvents(*events)
        evt = clEnqueueNDRangeKernel(self.queue, self.kernel, self.global_size)
        evt.wait()
        _, evt = buffer_to_ndarray(self.queue, out_buf, output)
        evt.wait()
        return Array(unique_name(), output)
Beispiel #3
0
 def __call__(self, A):
     output_array = np.empty(ceil(len(A) / WORK_GROUP_SIZE), np.int32)
     buf, evt = cl.buffer_from_ndarray(self.queue, A, blocking=False)
     output_buffer, output_evt = cl.buffer_from_ndarray(self.queue, output_array, blocking=False)
     self._c_function(self.queue, self.kernel, buf, output_buffer)
     B, evt = cl.buffer_to_ndarray(self.queue, output_buffer, like=output_array)
     return B
    def __call__(self, *args):
        """__call__

        :param *args:
        """
        if isinstance(args[0], hmarray):
            output = empty_like(args[0])
        else:
            output = np.zeros_like(args[0])
        # self.kernel.argtypes = tuple(
        #     cl_mem for _ in args + (output, )
        # ) + (localmem, )
        buffers = []
        events = []
        for index, arg in enumerate(args + (output, )):
            if isinstance(arg, hmarray):
                buffers.append(arg.ocl_buf)
            else:
                buf, evt = buffer_from_ndarray(self.queue, arg, blocking=True)
                # evt.wait()
                events.append(evt)
                buffers.append(buf)
                # self.kernel.setarg(index, buf, sizeof(cl_mem))
        cl.clWaitForEvents(*events)
        cl_error = 0
        if isinstance(self.kernel, list):
            kernels = len(self.kernel)
            if kernels == 2:
                cl_error = self._c_function(self.queue, self.kernel[0],
                                            self.kernel[1], *buffers)
            elif kernels == 3:
                cl_error = self._c_function(self.queue, self.kernel[0],
                                            self.kernel[1], self.kernel[2],
                                            *buffers)
            elif kernels == 4:
                cl_error = self._c_function(
                    self.queue, self.kernel[0], self.kernel[1], self.kernel[2],
                    self.kernel[3], *buffers
                )
        else:
            cl_error = self._c_function(self.queue, self.kernel, *buffers)

        if cl.cl_errnum(cl_error) != cl.cl_errnum.CL_SUCCESS:
            raise StencilException(
                "Error executing stencil kernel: opencl {} {}".format(
                    cl_error,
                    cl.cl_errnum(cl_error)
                )
            )
        if isinstance(output, hmarray):
            return output
        buf, evt = buffer_to_ndarray(
            self.queue, buffers[-1], output
        )
        evt.wait()

        return buf
    def __array_finalize__(self, obj):
        if obj is None:
            return

        if backend in {"ocl", "opencl", "OCL"}:
            buf, evt = cl.buffer_from_ndarray(queue, obj)
            evt.wait()
            self.ocl_buf = buf
            self.host_dirty = False
            self.ocl_dirty = False
        self.register = None
    def __array_finalize__(self, obj):
        if obj is None:
            return

        if backend in {"ocl", "opencl", "OCL"}:
            buf, evt = cl.buffer_from_ndarray(queue, obj)
            evt.wait()
            self.ocl_buf = buf
            self.host_dirty = False
            self.ocl_dirty = False
        self.register = None
Beispiel #7
0
 def process_inputs(self, *args):
     events = []
     processed = []
     self.kernel.argtypes = tuple(cl_mem for _ in args)
     for index, arg in enumerate(args):
         if isinstance(arg, types.common.Array):
             arg = arg.data
         buf, evt = buffer_from_ndarray(self.queue, arg, blocking=False)
         processed.append(buf)
         events.append(evt)
         self.kernel.setarg(index, buf, sizeof(cl_mem))
     clWaitForEvents(*events)
     return processed
Beispiel #8
0
        def set(value):
            dest = buffer
            if field in ["value", "grad"] and any(p != (0, 0) for p in self.pad):
                _slice = [slice(None)]
                for i, p in enumerate(self.pad):
                    if p != (0, 0):
                        _slice.append(slice(p[0], -p[1]))
                    else:
                        _slice.append(slice(None))
                # dest = dest[tuple(_slice)]
            else:
                _slice = [slice(None) for _ in dest.shape]

            if field in self.tiling_info:
                tiled = value
                # Handle padding of weights and bias if set using set_value
                if field in ["weights", "bias"]:
                    padding = list()
                    # generate padding tuple
                    for dim in tiled.shape:
                        padding += ((0,0),)
                    for dim, factor in self.tiling_info[field]:
                        if tiled.shape[dim] % factor != 0:
                            padding[dim] = (0, factor - tiled.shape[dim] % factor)
                    tiled = np.lib.pad(tiled, padding, 'constant')
                for dim, _ in self.tiling_info[field]:
                    if field in self.batch_fields:
                        dim += 1
                    if not isinstance(self, ActivationEnsemble):
                        _slice.append(_slice[dim])
                        _slice[dim] = slice(None)
                    tiled = util.tile(tiled, dim)
                tiled_shape = list(dest.shape)
                if not isinstance(self, ActivationEnsemble) or field not in ["value", "grad"]:
                    for dim, factor in self.tiling_info[field]:
                        if field in self.batch_fields:
                            dim += 1
                        if tiled_shape[dim] < factor:
                            factor = tiled_shape[dim]
                        elif tiled_shape[dim] % factor != 0:
                            raise NotImplementedError()
                        tiled_shape[dim] //= factor
                        tiled_shape.append(factor)
                dest = dest.reshape(tiled_shape)
                dest[_slice] = tiled
            else:
                dest[_slice] = value
            if cl_buffer is not None:
                _, evt = cl.buffer_from_ndarray(latte.config.cl_queue, dest, buf=cl_buffer)
                evt.wait()
    def __call__(self, A):
        
        a = time.time()

        # Initialization and copy from CPU to GPU
        output_array = np.empty(1, A.dtype)
        buf, evt = cl.buffer_from_ndarray(self.queue, A, blocking=False)
        output_buffer, output_evt = cl.buffer_from_ndarray(self.queue, output_array, blocking=False)
        
        b = time.time()

        # Actual execution of the reduction.
        self._c_function(self.queue, self.kernel, buf, output_buffer)
        c = time.time()

        # Copying the result back from the GPU to the CPU
        B, evt = cl.buffer_to_ndarray(self.queue, output_buffer, like=output_array)
        d = time.time()

        # The true time of execution, exluding copy time is between b and c.
        print ("True SEJITS Time (excluding copy time): {0} seconds".format(c - b))

        # print("overall execution:", d-a, "Initial Copy:", b-a, "Kernel execution:", c-b, "Final Copy:", d-c)
        return B[0]
Beispiel #10
0
    def __call__(self, im):
        output = zeros_like(im.data)
        in_buf, evt = buffer_from_ndarray(self.queue, im.data, blocking=False)
        evt.wait()
        self.kernel.setarg(0, in_buf, sizeof(cl_mem))

        out_buf = clCreateBuffer(self.context, output.nbytes)
        self.kernel.setarg(1, out_buf, sizeof(cl_mem))
        evt = clEnqueueNDRangeKernel(self.queue, self.kernel, self.global_size)
        evt.wait()
        _, evt = buffer_to_ndarray(self.queue, out_buf, output)
        evt.wait()
        del in_buf
        del out_buf
        return Array(unique_name(), output)
Beispiel #11
0
    def __call__(self, *args):
        """__call__

        :param *args:
        """
        if self.output is not None:
            output = self.output
            self.output = None
        else:
            output = np.zeros_like(args[0])
        self.kernel.argtypes = tuple(cl_mem
                                     for _ in args + (output, )) + (localmem, )
        bufs = []
        events = []
        for index, arg in enumerate(args + (output, )):
            buf, evt = buffer_from_ndarray(self.queue, arg, blocking=False)
            # evt.wait()
            events.append(evt)
            bufs.append(buf)
            self.kernel.setarg(index, buf, sizeof(cl_mem))
        cl.clWaitForEvents(*events)
        if self.device.type == cl.cl_device_type.CL_DEVICE_TYPE_GPU:
            local = 8
        else:
            local = 1
        localmem_size = reduce(operator.mul, (local + (self.ghost_depth * 2)
                                              for _ in range(args[0].ndim)),
                               sizeof(c_float))
        self.kernel.setarg(
            len(args) + 1, localmem(localmem_size), localmem_size)
        evt = clEnqueueNDRangeKernel(self.queue, self.kernel, self.global_size,
                                     tuple(local for _ in range(args[0].ndim)))
        evt.wait()
        buf, evt = buffer_to_ndarray(self.queue, bufs[-1], output)
        evt.wait()
        for mem in bufs:
            del mem

        return buf
Beispiel #12
0
    def __call__(self, im, num_powers, border):
        out_shape = [num_powers] + list(im.shape)
        output = np.empty(out_shape, dtype=np.float32)

        in_buf, evt = buffer_from_ndarray(self.queue, im.data, blocking=False)
        evt.wait()
        self.kernel.setarg(0, in_buf, sizeof(cl_mem))

        out_buf = clCreateBuffer(self.queue.context, output.nbytes)
        self.kernel.setarg(1, out_buf, sizeof(cl_mem))

        evt = clEnqueueNDRangeKernel(self.queue, self.kernel, self.global_size)
        evt.wait()

        self.kernel2.setarg(0, out_buf, sizeof(cl_mem))

        for power in range(num_powers):
            self.kernel2.setarg(1, power, sizeof(cl_int))
            evt = clEnqueueNDRangeKernel(self.queue, self.kernel2, self.global_size)
            evt.wait()

        _, evt = buffer_to_ndarray(self.queue, out_buf, output)
        evt.wait()
        return Array(unique_name(), output)
Beispiel #13
0
 def __call__(self, A):
     buf, evt = cl.buffer_from_ndarray(self.queue, A, blocking=False)
     self._c_function(self.queue, self.kernel, buf)
     B, evt = cl.buffer_to_ndarray(self.queue, buf, like=A)
     return B
 def sync_ocl(self):
     if backend in {"ocl", "opencl", "OCL"}:
         _, evt = cl.buffer_from_ndarray(queue, self, self.ocl_buf)
         evt.wait()
Beispiel #15
0
 def __call__(self, A):
     buf, evt = cl.buffer_from_ndarray(self.queue, A, blocking=False)
     self._c_function(self.queue, self.kernel, buf)
     B, evt = cl.buffer_to_ndarray(self.queue, buf, like=A)
     return B
Beispiel #16
0
 def __allocate_buffer(self, device=get_gpu()):
     queue = self.get_queue(device)
     self.__buffers[device.value], evt = pycl.buffer_from_ndarray(queue, self, self.get_buffer(device))
     return evt
Beispiel #17
0
    def __call__(self, *args):
        cacheSize = args[1]
        cacheSizeInFloats = (int)(cacheSize / sizeof(c_float))
        args = args[2:]
        nPoints, dFeatures = args[0].shape

        localInitSize = self.init.work_group_size(self.device)
        numGroups_init = nPoints / localInitSize if (
            nPoints % localInitSize == 0) else nPoints / localInitSize + 1
        globalInitSize = numGroups_init * localInitSize

        localFoph1Size = self.foph1.work_group_size(self.device)
        numGroups_foph1 = nPoints / localFoph1Size if (
            nPoints % localFoph1Size == 0) else nPoints / localFoph1Size + 1
        globalFoph1Size = numGroups_foph1 * localFoph1Size

        localFoph2Size = self.foph2.work_group_size(self.device)
        globalFoph2Size = localFoph2Size

        localSoph1Size = self.soph1.work_group_size(self.device)
        numGroups_soph1 = nPoints / localSoph1Size if (
            nPoints % localSoph1Size == 0) else nPoints / localSoph1Size + 1
        globalSoph1Size = numGroups_soph1 * localSoph1Size

        localSoph2Size = self.soph2.work_group_size(self.device)
        globalSoph2Size = localSoph2Size

        localSoph3Size = self.soph3.work_group_size(self.device)
        numGroups_soph3 = nPoints / localSoph3Size if (
            nPoints % localSoph3Size == 0) else nPoints / localSoph3Size + 3
        globalSoph3Size = numGroups_soph3 * localSoph3Size

        localSoph4Size = self.soph4.work_group_size(self.device)
        globalSoph4Size = localSoph4Size
        # print "Init .... Local: %d, Num Groups: %d, Global: %d" % (localInitSize, numGroups_init, globalInitSize)
        # print "Foph1 ... Local: %d, Num Groups: %d, Global: %d" % (localFoph1Size, numGroups_foph1, globalFoph1Size)
        # print "Foph2 ... Local & Global: %d" % (localFoph2Size)
        # print "Soph1 ... Local: %d, Num Groups: %d, Global: %d" % (localSoph1Size, numGroups_soph1, globalSoph1Size)
        # print "Soph2 ... Local & Global: %d" % (localSoph2Size)
        # print "Soph3 ... Local: %d, Num Groups: %d, Global: %d" % (localSoph3Size, numGroups_soph3, globalSoph3Size)
        # print "Soph4 ... Local & Global: %d" % (localSoph4Size)

        #create buffers from input
        d_input_data, evt = cl.buffer_from_ndarray(self.queue,
                                                   args[0],
                                                   blocking=False)
        d_input_data_colmajor, evt = cl.buffer_from_ndarray(self.queue,
                                                            args[0].T,
                                                            blocking=False)
        d_labels, evt = cl.buffer_from_ndarray(self.queue,
                                               args[1],
                                               blocking=False)
        args = (args[0].ctypes.data_as(POINTER(c_float)),
                args[1].ctypes.data_as(POINTER(c_int))) + args[2:]

        # temporary numpy arrays
        iArray = np.zeros(nPoints, dtype=np.float32)

        reduceIntsFO = np.zeros(numGroups_foph1, dtype=np.int32)
        reduceFloatsFO = np.zeros(numGroups_foph1, dtype=np.float32)

        reduceIntsSO1 = np.zeros(numGroups_soph1, dtype=np.int32)
        reduceFloatsSO1 = np.zeros(numGroups_soph1, dtype=np.float32)

        reduceIntsSO3 = np.zeros(numGroups_soph3, dtype=np.int32)
        reduceFloatsSO3 = np.zeros(numGroups_soph3, dtype=np.float32)

        results = np.zeros(8, dtype=np.float32)
        cache = np.zeros(cacheSizeInFloats, dtype=np.float32)

        # new buffers from scratch
        d_trainingAlpha, evt = cl.buffer_from_ndarray(self.queue,
                                                      iArray,
                                                      blocking=False)
        d_kernelDiag, evt = cl.buffer_from_ndarray(self.queue,
                                                   iArray,
                                                   blocking=False)
        d_F, evt = cl.buffer_from_ndarray(self.queue, iArray, blocking=False)
        d_cache, evt = cl.buffer_from_ndarray(self.queue,
                                              cache,
                                              blocking=False)

        d_highFsFO, evt = cl.buffer_from_ndarray(self.queue,
                                                 reduceFloatsFO,
                                                 blocking=False)
        d_highIndicesFO, evt = cl.buffer_from_ndarray(self.queue,
                                                      reduceIntsFO,
                                                      blocking=False)

        d_lowFsFO, evt = cl.buffer_from_ndarray(self.queue,
                                                reduceFloatsFO,
                                                blocking=False)
        d_lowIndicesFO, evt = cl.buffer_from_ndarray(self.queue,
                                                     reduceIntsFO,
                                                     blocking=False)

        d_highFsSO1, evt = cl.buffer_from_ndarray(self.queue,
                                                  reduceFloatsSO1,
                                                  blocking=False)
        d_highIndicesSO1, evt = cl.buffer_from_ndarray(self.queue,
                                                       reduceIntsSO1,
                                                       blocking=False)

        d_lowFsSO3, evt = cl.buffer_from_ndarray(self.queue,
                                                 reduceFloatsSO3,
                                                 blocking=False)
        d_lowIndicesSO3, evt = cl.buffer_from_ndarray(self.queue,
                                                      reduceIntsSO3,
                                                      blocking=False)
        d_deltaFsSO3, evt = cl.buffer_from_ndarray(self.queue,
                                                   reduceFloatsSO3,
                                                   blocking=False)

        d_results, evt = cl.buffer_from_ndarray(self.queue,
                                                results,
                                                blocking=False)

        #return values
        rho = c_float()
        nSV = c_int()
        iterations = c_int()
        signedAlpha = pointer(c_float())
        supportVectors = pointer(c_float())

        args = (cacheSizeInFloats, ) + args
        #add args
        args += (
            # sizes
            localInitSize,
            globalInitSize,
            numGroups_foph1,
            localFoph1Size,
            globalFoph1Size,
            localFoph2Size,
            globalFoph2Size,
            numGroups_soph1,
            localSoph1Size,
            globalSoph1Size,
            localSoph2Size,
            globalSoph2Size,
            numGroups_soph3,
            localSoph3Size,
            globalSoph3Size,
            localSoph4Size,
            globalSoph4Size,
            # buffers
            d_input_data,
            d_input_data_colmajor,
            d_labels,
            d_trainingAlpha,
            d_kernelDiag,
            d_F,
            d_highFsFO,
            d_highIndicesFO,
            d_lowFsFO,
            d_lowIndicesFO,
            d_highFsSO1,
            d_highIndicesSO1,
            d_lowFsSO3,
            d_lowIndicesSO3,
            d_deltaFsSO3,
            d_results,
            d_cache,
            # Ocl queue and kernels
            self.queue,
            self.init,
            self.step1,
            self.foph1,
            self.foph2,
            self.soph1,
            self.soph2,
            self.soph3,
            self.soph4,
            # return pointers
            byref(rho),
            byref(nSV),
            byref(iterations),
            byref(signedAlpha),
            byref(supportVectors))

        with Timer() as cfunc:
            err = self._c_function(*args)
        print "Actual function took %.6f" % cfunc.interval
        return rho.value, nSV.value, iterations.value, \
               as_array(supportVectors,shape=(nSV.value,dFeatures)),\
               as_array(signedAlpha,shape=(nSV.value,))
 def sync_ocl(self):
     if backend in {"ocl", "opencl", "OCL"}:
         _, evt = cl.buffer_from_ndarray(queue, self, self.ocl_buf)
         evt.wait()