Example 1
    def gaussian_gpu_v2(cls, sigma, size=None):
        """
        Calculate a 1D gaussian using pyopencl.
        This is the same as scipy.signal.gaussian.
        Only one kernel is launched, using a single work-group of 64 threads.

        :param sigma: width of the gaussian
        :param size: size of the output array; defaults to int(1 + 8 * sigma)
        """
        if not size:
            size = int(1 + 8 * sigma)
        g_gpu = pyopencl.array.empty(cls.queue,
                                     size,
                                     dtype=numpy.float32,
                                     order="C")
        t0 = time.time()
        evt = cls.kernels["gaussian"].gaussian(
            cls.queue,
            (64, ),
            (64, ),
            g_gpu.data,  # __global     float     *data,
            numpy.float32(sigma),  # const        float     sigma,
            numpy.int32(size),  # const        int     SIZE
            pyopencl.LocalMemory(64 * 4),
            pyopencl.LocalMemory(64 * 4),
        )
        g = g_gpu.get()
        if cls.PROFILE:
            logger.info("execution time: %.3fms; Kernel took %.3fms",
                        1e3 * (time.time() - t0),
                        1e-6 * (evt.profile.end - evt.profile.start))
        return g
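A minimal CPU reference (not part of the source) for what gaussian_gpu_v2 is expected to return, assuming it matches scipy.signal.gaussian as the docstring states; the name gaussian_cpu is ours:

import numpy

def gaussian_cpu(sigma, size=None):
    # Unnormalized gaussian window with peak value 1.0, like scipy.signal.gaussian
    if not size:
        size = int(1 + 8 * sigma)
    x = numpy.arange(size) - (size - 1) / 2.0
    return numpy.exp(-0.5 * (x / sigma) ** 2).astype(numpy.float32)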
Example 2
    def process(self):
        """
        process() for InputLayer does nothing itself; it simply invokes process() for the next layers.
        """
        self.opencl.kernel_process_layer.set_arg(0, self.context._inputs_buf)
        self.opencl.kernel_process_layer.set_arg(1, self.context._weights_buf)
        self.opencl.kernel_process_layer.set_arg(7,
                                                 pyopencl.LocalMemory(64 * 4))
        self.opencl.kernel_process_layer.set_arg(8, self.context._outputs_buf)

        if self.context.training_allowed:
            self.opencl.kernel_calc_layer_gradient.set_arg(
                0, self.context._inputs_buf)
            self.opencl.kernel_calc_layer_gradient.set_arg(
                1, self.context._errors_backpropagation_buf)
            self.opencl.kernel_calc_layer_gradient.set_arg(
                6, self.context._gradient_buf)

            self.opencl.kernel_propagate_errors.set_arg(
                0, self.context._errors_backpropagation_buf)
            self.opencl.kernel_propagate_errors.set_arg(
                1, self.context._weights_buf)
            self.opencl.kernel_propagate_errors.set_arg(
                8, pyopencl.LocalMemory(256))
            self.opencl.kernel_propagate_errors.set_arg(
                9, self.context._outputs_buf)

        super(InputLayer, self).process()

        self.reset_processed()
Example 3
    def count_violations(self, queue, restraints, rotmat, access_interspace,
                         viol_counter, weight):

        WORKGROUPSIZE = 32
        kernel = self.kernels.count_violations

        rotmat16 = np.zeros(16, dtype=np.float32)
        rotmat16[:9] = rotmat.flatten()[:]
        shape = np.asarray(list(access_interspace.shape) +
                           [access_interspace.size],
                           dtype=np.int32)
        loc_viol = cl.LocalMemory(4 * restraints.shape[0]**2 * WORKGROUPSIZE)
        # float4
        restraints_center = cl.LocalMemory(4 * restraints.shape[0] * 4)
        mindist2 = cl.LocalMemory(4 * restraints.shape[0])
        maxdist2 = cl.LocalMemory(4 * restraints.shape[0])

        kernel.set_args(restraints.data, rotmat16, access_interspace.data,
                        viol_counter.data, loc_viol,
                        restraints_center, mindist2, maxdist2,
                        np.int32(restraints.shape[0]), shape,
                        np.float32(weight))

        gws = (8 * WORKGROUPSIZE * 8 * 4, )
        lws = (WORKGROUPSIZE, )
        status = cl.enqueue_nd_range_kernel(queue, kernel, gws, lws)

        return status
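To make the LocalMemory byte sizes above concrete, a small sketch of the arithmetic; nrestraints = 8 is a hypothetical value chosen only for illustration:

import pyopencl as cl

WORKGROUPSIZE = 32
nrestraints = 8  # hypothetical restraint count

# presumably one 32-bit value per restraint pair per work item
loc_viol = cl.LocalMemory(4 * nrestraints ** 2 * WORKGROUPSIZE)  # 8192 bytes
restraints_center = cl.LocalMemory(4 * nrestraints * 4)          # one float4 per restraint: 128 bytes
mindist2 = cl.LocalMemory(4 * nrestraints)                       # one float per restraint: 32 bytes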
Example 4
def test_scatter(cl_env, radix_kernels, key_dtype, ngroups, group_size):
    ctx, cq = cl_env

    radix_bits = 4
    histogram_len = 2 ** radix_bits
    keys = np.random.randint(0, 64, size=(ngroups, group_size * 2), dtype=key_dtype)
    keys_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, keys.nbytes)
    out_keys_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, keys.nbytes)
    histogram_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_ONLY, histogram_len * ngroups * np.dtype('uint32').itemsize
    )
    offset_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_ONLY, histogram_len * ngroups * np.dtype('uint32').itemsize
    )

    for radix_pass in range(keys.dtype.itemsize * 8 // radix_bits):
        radix_keys = radix_key(keys, radix_bits, radix_pass).astype('uint16')
        order = np.argsort(radix_keys, kind='mergesort')
        grid = np.ogrid[tuple(slice(0, s) for s in keys.shape)]
        block_keys = keys[grid[:-1] + [order]] # Partially sort

        (keys_map, _) = cl.enqueue_map_buffer(
            cq, keys_buf, cl.map_flags.WRITE_INVALIDATE_REGION, 0,
            keys.shape, keys.dtype, wait_for=[], is_blocking=True
        )
        keys_map[...] = block_keys
        del keys_map

        radix_keys = radix_key(block_keys, radix_bits, radix_pass).astype('uint16')

        (histogram_map, _) = cl.enqueue_map_buffer(
            cq, histogram_buf, cl.map_flags.WRITE_INVALIDATE_REGION, 0,
            (histogram_len, ngroups), np.dtype('uint32'), wait_for=[], is_blocking=True
        )
        (offset_map, _) = cl.enqueue_map_buffer(
            cq, offset_buf, cl.map_flags.WRITE_INVALIDATE_REGION, 0,
            (histogram_len, ngroups), np.dtype('uint32'), wait_for=[], is_blocking=True
        )
        histogram_map[...] = np.array([np.bincount(group_keys, minlength=16)
                                       for group_keys in radix_keys], dtype='uint32').T
        offset_map[...] = prefix_sum(histogram_map.flat).reshape(histogram_len, ngroups)
        del histogram_map, offset_map

        local_offset = cl.LocalMemory(histogram_len * np.dtype('uint32').itemsize)
        local_histogram = cl.LocalMemory(histogram_len * np.dtype('uint32').itemsize)

        e = radix_kernels['scatter'](
            cq, (ngroups,), (group_size,),
            keys_buf, out_keys_buf, None, None,
            offset_buf, local_offset, histogram_buf, local_histogram,
            radix_bits, radix_pass, g_times_l=True,
        )

        (keys_map, _) = cl.enqueue_map_buffer(
            cq, out_keys_buf, cl.map_flags.READ, 0,
            (ngroups, group_size * 2), keys.dtype, wait_for=[e], is_blocking=True
        )

        expected = block_keys.flat[np.argsort(radix_keys, axis=None, kind='mergesort')]
        np.testing.assert_equal(keys_map, expected.reshape(ngroups, 2 * group_size))
Example 5
def test_ternary(context, q, float_data, float_data_gpu):
    kernelSource = """
__global__ void setValue(float *data, int idx, float value) {
    if(threadIdx.x == 0) {
        data[idx] = value;
    }
}
__global__ void testTernary(float *data) {
    data[0] = data[1] > 0 ? data[2] : data[3];
}
"""
    setValueKernelName = test_common.mangle('setValue',
                                            ['float *', 'int', 'float'])
    setValueProg = compile_code(cl,
                                context,
                                kernelSource,
                                setValueKernelName,
                                num_clmems=1)

    testTernaryName = test_common.mangle('testTernary', ['float *'])
    testTernaryProg = compile_code(cl,
                                   context,
                                   kernelSource,
                                   testTernaryName,
                                   num_clmems=1)

    float_data_orig = np.copy(float_data)

    def set_float_value(gpu_buffer, idx, value):
        setValueProg.__getattr__(setValueKernelName)(q, (32, ), (32, ),
                                                     float_data_gpu,
                                                     offset_type(0),
                                                     np.int32(idx),
                                                     np.float32(value),
                                                     cl.LocalMemory(4))

    cl.enqueue_copy(q, float_data_gpu, float_data)
    print('float_data[:8]', float_data[:8])
    set_float_value(float_data_gpu, 1, 10)
    testTernaryProg.__getattr__(testTernaryName)(q, (32, ),
                                                 (32, ), float_data_gpu,
                                                 offset_type(0),
                                                 cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    print('float_data[:8]', float_data[:8])
    assert float_data[0] == float_data_orig[2]

    set_float_value(float_data_gpu, 1, -2)
    testTernaryProg.__getattr__(testTernaryName)(q, (32, ),
                                                 (32, ), float_data_gpu,
                                                 offset_type(0),
                                                 cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    print('float_data[:8]', float_data[:8])
    assert float_data[0] == float_data_orig[3]
Example 6
def hist_op_6_time(BS, D, left_buf, right_buf, H, W, out_buf):
    start = time.time()
    # 7x9 work-group shape, i.e. 63 work items per group
    global_size = np.zeros(shape=(7 * (H - 6), 9 * (W - 6 - D))).astype(np.float32)
    group_size = np.zeros(shape=(7, 9)).astype(np.float32)
    lclLeft = cl.LocalMemory(np.int32().nbytes * 630)
    lclRight = cl.LocalMemory(np.int32().nbytes * 182)
    func6(queue, global_size.shape, group_size.shape, lclRight, lclLeft,
          np.int32(BS // 2), np.int32(D), left_buf, right_buf,
          np.int32(H), np.int32(W), out_buf)
    return time.time() - start
Example 7
 def sort(self, queue, N, a_buf, o_buf):
     loc_aux = cl.LocalMemory(16 * self.n_threads)
     loc_idx = cl.LocalMemory(16 * self.n_threads)
     #print("N==", N, "n_threads==", self.n_threads)
     minnt = min(N, self.n_threads)
     evt = self.prgsrt.ParallelBitonic_Local(queue, (minnt, ), (minnt, ),
                                             a_buf, o_buf, loc_aux, loc_idx)
     evt.wait()
Example 8
def test_block_sort_random(cl_env, radix_kernels, key_dtype, ngroups, group_size):
    ctx, cq = cl_env

    radix_bits = 4
    histogram_len = 2 ** radix_bits

    keys = np.random.randint(0, 64, size=(ngroups, group_size * 2), dtype=key_dtype)

    keys_buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, keys.nbytes)
    histogram_buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE,
                              ngroups * histogram_len * np.dtype('uint32').itemsize)

    local_keys = cl.LocalMemory(group_size * 2 * keys.dtype.itemsize)
    local_values = cl.LocalMemory(group_size * 2 * keys.dtype.itemsize)
    count = cl.LocalMemory(group_size * 2 * np.dtype('uint32').itemsize)
    local_histogram = cl.LocalMemory(histogram_len * np.dtype('uint32').itemsize)

    for radix_pass in range(keys.dtype.itemsize * 8 // radix_bits):
        (keys_map, _) = cl.enqueue_map_buffer(
            cq, keys_buf, cl.map_flags.WRITE_INVALIDATE_REGION, 0,
            (ngroups, group_size * 2), keys.dtype, wait_for=[], is_blocking=True
        )
        keys_map[...] = keys
        del keys_map

        e = radix_kernels['block_sort'](
            cq, (ngroups,), (group_size,),
            keys_buf, local_keys, local_keys, None, local_values, local_values,
            histogram_buf, local_histogram, count,
            radix_bits, radix_pass, g_times_l=True,
        )

        keys = keys.reshape(ngroups, group_size * 2)
        order = np.argsort(radix_key(keys, radix_bits, radix_pass), kind='mergesort')
        grid = np.ogrid[tuple(slice(0, s) for s in keys.shape)]

        (histogram_map, _) = cl.enqueue_map_buffer(
            cq, histogram_buf, cl.map_flags.READ, 0,
            (histogram_len, ngroups), np.dtype('uint32'), wait_for=[e], is_blocking=True
        )
        for i, (group_keys, histogram) in enumerate(zip(keys, histogram_map.T)):
            group_keys = radix_key(group_keys, radix_bits, radix_pass).astype('uint16')
            expected = np.bincount(group_keys, minlength=16)
            try:
                np.testing.assert_equal(histogram, expected)
            except AssertionError:
                print((radix_pass, i))
                raise

        expected = keys[grid[:-1] + [order]]
        (keys_map, _) = cl.enqueue_map_buffer(
            cq, keys_buf, cl.map_flags.READ, 0,
            (ngroups, group_size * 2), keys.dtype, wait_for=[e], is_blocking=True
        )
        np.testing.assert_equal(keys_map, expected)
Example 9
def opencl_dtw_run(SrcS, TrgS, ctx, queue, prg, dev_Param):
    # MAX_MEM_ALLOC_SIZE
    cot1 = int(dev_Param["LOCAL_MEM_SIZE"] / (TrgS.shape[1] * 4 * 3))
    cot2 = int(dev_Param["MAX_WORK_ITEM_SIZES"][0] / TrgS.shape[1])
    TRG_COT = min(cot1, cot2)
    Grp_Cot = int(dev_Param["MAX_MEM_ALLOC_SIZE"] / (TrgS.shape[1] * 4 * TRG_COT))

    T0 = TrgS.shape[0]
    T1 = TrgS.shape[1]

    TrgS_Alignment = TRG_COT - T0 % TRG_COT
    if TrgS_Alignment != TRG_COT:
        TrgS = numpy.concatenate((TrgS, numpy.ones((TrgS_Alignment, T1), dtype=numpy.float32)))
    T0 = TrgS.shape[0]
    #print("TrgS_Alignment, TRG_COT", TrgS_Alignment, TRG_COT)

    Splits = list(range(0, T0, Grp_Cot * TRG_COT))
    Splits.append(T0)
    allret = numpy.empty((SrcS.shape[0], TrgS.shape[0]), dtype=numpy.float32)

    for j in range(len(Splits) - 1):
        TrgS_sub = TrgS[Splits[j]:Splits[j + 1], :]
        Ts0 = TrgS_sub.shape[0]
        Ts1 = TrgS_sub.shape[1]
        local_size = TRG_COT * Ts1
        global_size = Ts0 * Ts1

        t = numpy.reshape(TrgS_sub, (Ts0 * Ts1))
        t_dev = cl_array.to_device(queue, t)
        #print("local_size, global_size", local_size, global_size, t.nbytes / 1024 / 1024)

        SRC_LEN = SrcS.shape[1]
        TRG_LEN = TrgS.shape[1]

        for i in range(SrcS.shape[0]):
            s = SrcS[i, :]
            s_dev = cl_array.to_device(queue, s)
            r_dev = cl_array.empty(queue, (Ts0,), dtype=numpy.float32)
            shared_mem_size = Ts1 * TRG_COT * 4

            prg.opencl_dtw(queue, (global_size,), (local_size,),
                           numpy.uint32(SRC_LEN), numpy.uint32(TRG_LEN), numpy.uint32(TRG_COT),
                           s_dev.data, t_dev.data, r_dev.data,
                           cl.LocalMemory(shared_mem_size),
                           cl.LocalMemory(shared_mem_size),
                           cl.LocalMemory(shared_mem_size))
            r = r_dev.get()
            allret[i, Splits[j]:Splits[j + 1]] = r
            #print(la.norm((dest_dev - (a_dev + b_dev)).get()))

    if TrgS_Alignment != TRG_COT:
        allret = allret[:, 0:-TrgS_Alignment]
    return allret
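A worked example of the padding logic above; T0 = 100 and TRG_COT = 32 are hypothetical numbers:

T0, TRG_COT = 100, 32
TrgS_Alignment = TRG_COT - T0 % TRG_COT   # 32 - 4 = 28 padding rows
# TrgS grows from 100 to 128 rows so each work-group processes exactly
# TRG_COT targets; the matching 28 extra columns of allret are trimmed
# at the end via allret[:, 0:-TrgS_Alignment].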
Example 10
File: main.py Project: silgon/dev
    def runAlgo(self):
        """
        The program implementation
        """
        #initialize client side (CPU) arrays
        N = 100
        A_VAL = .5
        B_VAL = 1
        size = N * N
        h_A = np.empty(size).astype(np.float32)
        h_B = np.empty(size).astype(np.float32)
        h_A.fill(A_VAL)
        h_B.fill(B_VAL)

        #create OpenCL buffers
        d_A = self.vectToBuffer(h_A)
        d_B = self.vectToBuffer(h_B)
        d_C = self.outBuffer(h_A.nbytes)

        np.set_printoptions(threshold=np.inf)
        # execute program
        mmul = self.program.mmul
        mmul.set_scalar_arg_dtypes([np.int32, None, None, None])
        mmul(self.queue, (N, N), None, N, d_A, d_B, d_C)
        print("First problem solved")

        h_C = np.empty_like(h_A)
        self.bufferToVect(d_C, h_C)
        print("{}".format(h_C))

        localmem = cl.LocalMemory(np.dtype(np.float32).itemsize * N)
        n_blocks = 10
        mmul2 = self.program.mmul2
        mmul2.set_scalar_arg_dtypes([np.int32, None, None, None, None])
        mmul2(self.queue, (N, ), (N // n_blocks, ), N, d_A, d_B, d_C, localmem)
        print("Second problem solved")

        h_C = np.empty_like(h_A)
        self.bufferToVect(d_C, h_C)
        print("{}".format(h_C))

        blocksize = 10
        A_block = cl.LocalMemory(np.dtype(np.float32).itemsize * blocksize**2)
        B_block = cl.LocalMemory(np.dtype(np.float32).itemsize * blocksize**2)
        mmul3 = self.program.mmul3
        mmul3.set_scalar_arg_dtypes([np.int32, None, None, None, None, None])
        mmul3(self.queue, (N, N), (blocksize, blocksize), N, d_A, d_B, d_C,
              A_block, B_block)

        print("Third problem solved")

        h_C = np.empty_like(h_A)
        self.bufferToVect(d_C, h_C)
        print("{}".format(h_C))
Example 11
 def reorder(self, startbit, num):
     totalBlocks = num // 2 // self.cta_size
     global_size = (self.cta_size * totalBlocks, )
     local_size = (self.cta_size, )
     reorder_args = (self.keys, self.values, self.d_tempKeys,
                     self.d_tempValues, self.mBlockOffsets,
                     self.mCountersSum, self.mCounters, np.uint32(startbit),
                     np.uint32(num), np.uint32(totalBlocks),
                     cl.LocalMemory(2 * self.cta_size * self.uintsz),
                     cl.LocalMemory(2 * self.cta_size * self.uintsz))
     self.radix_prg.reorderDataKeysValues(self.queue, global_size,
                                          local_size, *(reorder_args))
Example 12
    def execute(self, n_it=1, **kwargs):

        # n_out defines how often results are copied back from the compute device (GPU);
        # e.g. 10 means that every 10th iteration is copied back to Python
        n_out = kwargs.get('n_out', 10)

        queue = self.queue
        prg = self.program
        local_size = self.local_size  #(n_local,) #self.local_size
        n_local = 512
        ng = self.ng

        # initialize the next step
        i_out = 0
        total_out = (n_it // n_out + 1)
        time_axis = np.arange(total_out, dtype=np.float32) * self.t_step
        n_excited = np.zeros(total_out, dtype=np.float32)
        n_excited[0] = 1.0

        tmp_1 = cl_array.zeros(queue, (n_local * total_out, ),
                               dtype=np.float32)
        tmp_2 = cl_array.zeros(queue, (n_local * total_out, ),
                               dtype=np.float32)

        p = self.p_gp
        n = self.n_gp

        b = self.b_gp
        d = self.d_gp
        k = self.k_gp

        #prg.copy3d(queue, self.global_size, None,
        #           n.data, p.data, b).wait()
        for time_i in range(n_it):
            if time_i % 2 > 0:
                p, n = n, p
            prg.iterate(queue, self.global_size_3d, local_size, n, p, d, k, b)
            if time_i % n_out == 0:
                prg.reduce_decay(queue, self.global_size, self.local_size, p,
                                 k, cl.LocalMemory(n_local * 32),
                                 cl.LocalMemory(n_local * 32),
                                 np.int32(self.global_size[0]),
                                 np.int32(n_local), np.int32(i_out),
                                 np.float32(time_i), tmp_1.data, tmp_2.data)
                i_out += 1
            self.it += 1

        dc = (tmp_1.map_to_host()).reshape((total_out, n_local)).sum(axis=1)
        ds = (tmp_2.map_to_host()).reshape((total_out, n_local)).sum(axis=1)
        n_ex = dc / ds
        cl.enqueue_copy(queue, self.p_np, self.p_gp)
        self.p = self.p_np.reshape((ng, ng, ng), order='C')
        return time_axis, n_ex, self.p
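A quick worked example of the output sizing implied by n_out; the numbers are hypothetical:

n_it, n_out, n_local = 100, 10, 512
total_out = n_it // n_out + 1      # 11 output slots
tmp_floats = n_local * total_out   # 5632 float32 values in each of tmp_1 and tmp_2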
Example 13
    def allocate_constants(self):
        """
        Allocates constants and local memory to be used by OpenCL.
        """

        self.w = cl.Buffer(self.context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=w)
        self.cx = cl.Buffer(self.context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=cx)
        self.cy = cl.Buffer(self.context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=cy)

        self.local_u = cl.LocalMemory(float_size * self.two_d_local_size[0]*self.two_d_local_size[1])
        self.local_v = cl.LocalMemory(float_size * self.two_d_local_size[0]*self.two_d_local_size[1])
        self.local_rho = cl.LocalMemory(float_size * self.two_d_local_size[0]*self.two_d_local_size[1])
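As a rough illustration of the local-memory sizes allocated above; two_d_local_size = (32, 32) and float_size = 4 are assumptions, not values from the source:

float_size = 4                  # assumed sizeof(cl_float)
two_d_local_size = (32, 32)     # assumed work-group shape
nbytes = float_size * two_d_local_size[0] * two_d_local_size[1]  # 4096 bytes
# each of local_u, local_v and local_rho would then take 4 KiB of local memory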
Example 14
 def reorder(self, d_key, d_val, startbit, num):
     totalBlocks = num // 2 // self.cta_size
     global_size = (self.cta_size * totalBlocks, )
     local_size = (self.cta_size, )
     reorder_args = (d_key, d_val, self.d_temp_keys, self.d_temp_values,
                     self.d_block_offsets, self.d_counters_sum,
                     self.d_counters, np.uint32(startbit), np.uint32(num),
                     np.uint32(totalBlocks),
                     cl.LocalMemory(2 * self.cta_size * self.dtype_size),
                     cl.LocalMemory(2 * self.cta_size * self.dtype_size))
     self.radix_prg.reorderDataKeysValues(self.queue, global_size,
                                          local_size, *reorder_args)
Example 15
    def test_rgb(self):
        """
        tests the int64 kernel
        """
        max_wg = kernel_workgroup_size(self.reduction, "max_min_global_stage1")
        if max_wg < self.red_size:
            logger.warning(
                "test_uint16: Skipping test of WG=%s when maximum is %s (%s)",
                self.red_size, max_wg, self.max_wg)
            return

        lint = numpy.empty((self.input.shape[0], self.input.shape[1], 3),
                           dtype=numpy.uint8)
        lint[:, :, 0] = self.input.astype(numpy.uint8)
        lint[:, :, 1] = self.input.astype(numpy.uint8)
        lint[:, :, 2] = self.input.astype(numpy.uint8)
        t0 = time.time()
        au8 = pyopencl.array.to_device(self.queue, lint)
        k1 = self.program.rgb_to_float(self.queue, self.shape, self.wg,
                                       au8.data, self.gpudata.data,
                                       self.IMAGE_W, self.IMAGE_H)
        k2 = self.reduction.max_min_global_stage1(
            self.queue, (self.red_size * self.red_size, ), (self.red_size, ),
            self.gpudata.data, self.buffers_max_min.data,
            (self.IMAGE_W * self.IMAGE_H),
            pyopencl.LocalMemory(8 * self.red_size))
        k3 = self.reduction.max_min_global_stage2(
            self.queue, (self.red_size, ), (self.red_size, ),
            self.buffers_max_min.data, self.buffers_max.data,
            self.buffers_min.data, pyopencl.LocalMemory(8 * self.red_size))
        k4 = self.program.normalizes(self.queue, self.shape, self.wg,
                                     self.gpudata.data, self.buffers_min.data,
                                     self.buffers_max.data,
                                     self.twofivefive.data, self.IMAGE_W,
                                     self.IMAGE_H)
        res = self.gpudata.get()
        t1 = time.time()
        ref = normalize(lint.max(axis=-1))
        t2 = time.time()
        delta = abs(ref - res).max()
        if self.PROFILE:
            logger.info("Global execution time: CPU %.3fms, GPU: %.3fms." %
                        (1000.0 * (t2 - t1), 1000.0 * (t1 - t0)))
            logger.info("Conversion  RGB ->float took %.3fms" %
                        (1e-6 * (k1.profile.end - k1.profile.start)))
            logger.info("Reduction stage1 took        %.3fms" %
                        (1e-6 * (k2.profile.end - k2.profile.start)))
            logger.info("Reduction stage2 took        %.3fms" %
                        (1e-6 * (k3.profile.end - k3.profile.start)))
            logger.info("Normalization                %.3fms" %
                        (1e-6 * (k4.profile.end - k4.profile.start)))
            logger.info("--------------------------------------")
        self.assertLess(delta, 1e-4, "delta=%s" % delta)
Example 16
 def blocks(self, nbits, startbit, num):
     totalBlocks = num // 4 // self.cta_size
     global_size = (self.cta_size * totalBlocks, )
     local_size = (self.cta_size, )
     blocks_args = (self.keys, self.values,
                    self.d_tempKeys, self.d_tempValues, np.uint32(nbits),
                    np.uint32(startbit), np.uint32(num),
                    np.uint32(totalBlocks),
                    cl.LocalMemory(4 * self.cta_size * self.uintsz),
                    cl.LocalMemory(4 * self.cta_size * self.uintsz))
     self.radix_prg.radixSortBlocksKeysValues(self.queue,
                                              global_size, local_size,
                                              *(blocks_args)).wait()
Example 17
def test_ieeefloats(context, q, float_data, float_data_gpu):
    cu_code = """
__global__ void mykernel(double *data) {
    double d_neginfinity = -INFINITY;
    double d_posinfinity = INFINITY;
    float f_neginfinity = -INFINITY;
    float f_posinfinity = INFINITY;
    data[0] = INFINITY;
    data[1] = -INFINITY;
    data[2] = f_neginfinity;
    data[3] = f_posinfinity;
}
"""
    kernel_name = test_common.mangle('mykernel', ['double*'])
    cl_code = test_common.cu_to_cl(cu_code, kernel_name, num_clmems=1)
    kernel = test_common.build_kernel(context, cl_code, kernel_name)
    kernel(
        q, (32,), (32,),
        float_data_gpu, offset_type(0), cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    print(float_data[:4])
    assert float_data[0] == np.inf
    assert float_data[1] == - np.inf
    assert float_data[2] == - np.inf
    assert float_data[3] == np.inf
Example 18
def test_sitofp(context, q, float_data, float_data_gpu, int_data, int_data_gpu):

    code = """
__global__ void myKernel(float *float_data, int *int_data) {
    float_data[0] = (float)int_data[0];
}
"""
    kernel = test_common.compile_code_v3(cl, context, code, test_common.mangle('myKernel', ['float *', 'int *']), num_clmems=2)['kernel']
    int_data[0] = 5
    int_data[1] = 2
    int_data[2] = 4
    cl.enqueue_copy(q, int_data_gpu, int_data)
    kernel(
        q, (32,), (32,),
        float_data_gpu,
        int_data_gpu,
        offset_type(0),
        offset_type(0),
        cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    cl.enqueue_copy(q, int_data, int_data_gpu)
    q.finish()
    print('float_data[0]', float_data[0])
    # expected = pow(float_data[1], float_data[2])
    assert float_data[0] == 5
Example 19
def test_sqrt(context, q, float_data, float_data_gpu):

    code = """
__global__ void myKernel(float *data) {
    data[threadIdx.x] = sqrt(data[threadIdx.x]);
}
"""
    kernel = test_common.compile_code_v3(cl, context, code, test_common.mangle('myKernel', ['float *']), num_clmems=1)['kernel']
    float_data[0] = 1.5
    float_data[1] = 4.6
    float_data[2] = -1.5
    float_data[3] = 0
    float_data_orig = np.copy(float_data)
    cl.enqueue_copy(q, float_data_gpu, float_data)
    kernel(
        q, (32,), (32,),
        float_data_gpu, offset_type(0), cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    print('float_data[:4]', float_data[:4])
    for i in range(4):
        if float_data_orig[i] >= 0:
            assert abs(float_data[i] - math.sqrt(float_data_orig[i])) <= 1e-4
        else:
            assert math.isnan(float_data[i])
Example 20
def test_pow(context, q, float_data, float_data_gpu):

    code = """
__global__ void myKernel(float *data) {
    data[0] = pow(data[1], data[2]);
    data[3] = pow(data[4], data[5]);
    data[6] = pow(data[7], data[8]);
}
"""
    kernel = test_common.compile_code_v3(cl, context, code, test_common.mangle('myKernel', ['float *']), num_clmems=1)['kernel']
    float_data[1] = 1.5
    float_data[2] = 4.6
    float_data[4] = -1.5
    float_data[5] = 4.6
    float_data[7] = 1.5
    float_data[8] = -4.6
    cl.enqueue_copy(q, float_data_gpu, float_data)
    kernel(
        q, (32,), (32,),
        float_data_gpu, offset_type(0), cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    print('float_data[0]', float_data[0])
    print('float_data[3]', float_data[3])
    print('float_data[6]', float_data[6])
    expected = pow(float_data[1], float_data[2])
    assert abs(float_data[0] - expected) <= 1e-4
Example 21
    def setUp(self):
        if not test_options.opencl:
            self.skipTest("User request to skip OpenCL tests")
        if pyopencl is None or ocl is None:
            self.skipTest("OpenCL module (pyopencl) is not present or no device available")

        self.h_data = numpy.random.random(self.N).astype("float32")
        self.h2_data = numpy.random.random((self.N, self.N)).astype("float32").reshape((self.N, self.N))

        self.ctx = ocl.create_context(devicetype="GPU")
        device = self.ctx.devices[0]
        try:
            devtype = pyopencl.device_type.to_string(device.type).upper()
        except ValueError:
            # pocl does not describe itself as a CPU !
            devtype = "CPU"
        workgroup = device.max_work_group_size
        if (devtype == "CPU") and (device.platform.vendor == "Apple"):
            logger.info("For Apple's OpenCL on CPU: enforce max_work_goup_size=1")
            workgroup = 1

        self.ws = min(workgroup, self.ws)
        self.queue = pyopencl.CommandQueue(self.ctx, properties=pyopencl.command_queue_properties.PROFILING_ENABLE)
        self.local_mem = pyopencl.LocalMemory(self.ws * 32)  # 2 float4 = 2*4*4 = 32 bytes per work item
        src = read_cl_file("pyfai:openCL/bitonic.cl")
        self.prg = pyopencl.Program(self.ctx, src).build()
Example 22
def test_inlining(context, q, float_data, float_data_gpu):

    cu_source = """
__global__ void myKernel(float *data) {
    data[0] = (data[3] * (data[1] + data[2])) / data[4];
    data[7] = (data[3] / (data[1] - data[2])) * data[4];
}
"""
    kernelName = test_common.mangle('myKernel', ['float *'])
    cl_sourcecode = test_common.cu_to_cl(cu_source, kernelName, num_clmems=1)
    print('cl_sourcecode', cl_sourcecode)
    kernel = test_common.build_kernel(context, cl_sourcecode, kernelName)

    for i in range(10):
        float_data[i] = i + 3
    cl.enqueue_copy(q, float_data_gpu, float_data)
    q.finish()
    # prog = cl.Program(context, sourcecode).build()
    # prog.__getattr__(kernelName)(
    kernel(q, (32, ), (32, ), float_data_gpu, offset_type(0),
           cl.LocalMemory(4))
    q.finish()
    float_data2 = np.zeros((1024, ), dtype=np.float32)
    cl.enqueue_copy(q, float_data2, float_data_gpu)
    q.finish()
    print('float_data2[0]', float_data2[0])
    d = float_data
    d2 = float_data2
    expect = (d[3] * (d[1] + d[2])) / d[4]
    assert abs(d2[0] - expect) < 1e-5
Example 23
def test_use_template1(context, q, int_data, int_data_gpu, float_data,
                       float_data_gpu):
    code = """
template< typename T >
__device__ T addNumbers(T one, T two) {
    return one + two;
}

__global__ void use_template1(float *data, int *intdata) {
    if(threadIdx.x == 0 && blockIdx.x == 0) {
        data[0] = addNumbers(data[1], data[2]);
        intdata[0] = addNumbers(intdata[1], intdata[2]);
    }
}
"""
    kernelName = test_common.mangle('use_template1', ['float *', 'int *'])
    prog = compile_code(cl, context, code, kernelName, num_clmems=2)
    float_data_orig = np.copy(float_data)
    int_data_orig = np.copy(int_data)

    prog.__getattr__(kernelName)(q, (32, ), (32, ), float_data_gpu,
                                 int_data_gpu, offset_type(0), offset_type(0),
                                 cl.LocalMemory(4))
    cl.enqueue_copy(q, float_data, float_data_gpu)
    cl.enqueue_copy(q, int_data, int_data_gpu)
    q.finish()
    assert float_data[0] == float_data_orig[1] + float_data_orig[2]
    assert int_data[0] == int_data_orig[1] + int_data_orig[2]
Example 24
def test_sincos(context, q, float_data, float_data_gpu):

    cu_code = """
__global__ void mykernel(float *data) {
    sincosf(0.1, &data[0], &data[1]);
    sincosf(data[2], &data[3], &data[4]);
}
"""
    kernel_name = test_common.mangle('mykernel', ['float*'])
    cl_code = test_common.cu_to_cl(cu_code, kernel_name, num_clmems=1)
    print('cl_code', cl_code)

    float_data[2] = -0.3
    float_data_orig = np.copy(float_data)
    cl.enqueue_copy(q, float_data_gpu, float_data)

    kernel = test_common.build_kernel(context, cl_code, kernel_name)
    kernel(
        q, (32,), (32,),
        float_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    print(float_data[:5])
    assert abs(float_data[0] - math.sin(0.1)) < 1e-4
    assert abs(float_data[1] - math.cos(0.1)) < 1e-4
    assert abs(float_data[3] - math.sin(float_data_orig[2])) < 1e-4
    assert abs(float_data[4] - math.cos(float_data_orig[2])) < 1e-4
Example 25
 def reduce_min(self, queue, a_buf, N, o_buf, o_lid):
     r = np.empty(self.n_threads).astype(np.float32)
     r_buf = cl.Buffer(self.ctx, mf.READ_WRITE, size=r.nbytes)
     q_buf = cl.Buffer(self.ctx, mf.READ_WRITE, size=r.nbytes)
     loc_buf = cl.LocalMemory(4 * self.n_threads)
     loc_lid = cl.LocalMemory(4 * self.n_threads)
     #print("N==", N, "n_threads==", self.n_threads)
     minnt = min(N, self.n_threads)
     evt = self.prgmna.reduce(queue, (N, ), (minnt, ), a_buf, r_buf, q_buf,
                              o_lid, loc_buf, loc_lid)
     evt.wait()
     #print(evt.profile.end - evt.profile.start)
     n_threads = N // minnt
     evt = self.prgmnb.reduce(queue, (n_threads, ), (n_threads, ), r_buf,
                              o_buf, q_buf, o_lid, loc_buf, loc_lid)
     evt.wait()
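Sketch of the two-stage reduction sizing above; N = 1024 and n_threads = 256 are hypothetical:

N, n_threads = 1024, 256
minnt = min(N, n_threads)   # 256 work items per work-group in stage 1
stage2 = N // minnt         # 4 partial minima left for stage 2
# stage 1: global (N,), local (minnt,)  -> one partial minimum per group in r_buf
# stage 2: global and local (stage2,)   -> final minimum written to o_buf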
Example 26
def test_umulhi(context, q, int_data, int_data_gpu):
    ll_code = """
declare i32 @_Z8__umulhiii(i32, i32)

define void @test_umulhi(i32* %data) {
  %1 = load i32, i32* %data

  %2 = getelementptr i32, i32* %data, i32 1
  %3 = load i32, i32* %2

  %4 = getelementptr i32, i32* %data, i32 2
  %5 = load i32, i32* %4

  %6 = call i32 @_Z8__umulhiii(i32 %3, i32 %5)
  store i32 %6, i32* %data
  ret void
}
"""
    cl_code = test_common.ll_to_cl(ll_code, 'test_umulhi', 1)
    print('cl_code', cl_code)
    int_data[0] = 0
    int_data[1] = -50
    int_data[2] = 2523123
    cl.enqueue_copy(q, int_data_gpu, int_data)
    kernel = test_common.build_kernel(context, cl_code, 'test_umulhi')
    kernel(q, (32,), (32,), int_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(32))
    from_gpu = np.copy(int_data)
    cl.enqueue_copy(q, from_gpu, int_data_gpu)
    q.finish()
    expected = (np.uint64(np.uint32(2523123)) * np.uint64(np.uint32(-50))) // 2**32
    print('expected', expected)
    print('from_gpu[0]', from_gpu[0])
    assert expected == from_gpu[0].item()
Example 27
 def allocate_constants(self):
     super(Clumpy_Surfactant_Nutrient_Wave, self).allocate_constants()
     # Allocate local memory for the finite difference code
     self.halo = np.int32(1)  # As we are doing D2Q9, we have a halo of one
     self.buf_nx = np.int32(self.two_d_local_size[0] + 2 * self.halo)
     self.buf_ny = np.int32(self.two_d_local_size[1] + 2 * self.halo)
     self.psi_local = cl.LocalMemory(float_size * self.buf_nx * self.buf_ny)
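A worked example of the halo arithmetic above; the (16, 16) work-group shape and float_size = 4 are hypothetical:

import numpy as np

two_d_local_size = (16, 16)
float_size = 4
halo = np.int32(1)                                  # one ghost cell per side for D2Q9
buf_nx = np.int32(two_d_local_size[0] + 2 * halo)   # 18
buf_ny = np.int32(two_d_local_size[1] + 2 * halo)   # 18
psi_local_bytes = float_size * buf_nx * buf_ny      # 1296 bytes of local memory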
Example 28
    def setUp(self):
        self.h_data = numpy.random.random(self.N).astype("float32")
        self.h2_data = numpy.random.random(
            (self.N, self.N)).astype("float32").reshape((self.N, self.N))

        self.ctx = ocl.create_context(devicetype="GPU")
        device = self.ctx.devices[0]
        try:
            devtype = pyopencl.device_type.to_string(device.type).upper()
        except ValueError:
            # pocl does not describe itself as a CPU !
            devtype = "CPU"
        workgroup = device.max_work_group_size
        if (devtype == "CPU") and (device.platform.vendor == "Apple"):
            logger.info(
                "For Apple's OpenCL on CPU: enforce max_work_goup_size=1")
            workgroup = 1

        self.ws = min(workgroup, self.ws)
        self.queue = pyopencl.CommandQueue(
            self.ctx,
            properties=pyopencl.command_queue_properties.PROFILING_ENABLE)
        self.local_mem = pyopencl.LocalMemory(
            self.ws * 32)  # 2 float4 = 2*4*4 = 32 bytes per work item
        src = pyFAI.utils.read_cl_file("bitonic.cl")
        self.prg = pyopencl.Program(self.ctx, src).build()
Example 29
def test_sext(context, q, int_data, int_data_gpu):
    ll_code = """
define void @mykernel(i32* %data) {
  %1 = load i32, i32* %data
  %2 = sext i32 %1 to i64
  %3 = lshr i64 %2, 32
  %4 = trunc i64 %3 to i32
  store i32 %4, i32* %data
  ret void
}
"""
    cl_code = test_common.ll_to_cl(ll_code, 'mykernel', 1)
    print('cl_code', cl_code)
    for experiment in [{'in': 23, 'out': 0}, {'in': -1, 'out': -1}]:
        int_data[0] = experiment['in']
        cl.enqueue_copy(q, int_data_gpu, int_data)
        kernel = test_common.build_kernel(context, cl_code, 'mykernel')
        kernel(q, (32, ), (32, ), int_data_gpu, offset_type(0), offset_type(0),
               cl.LocalMemory(32))
        from_gpu = np.copy(int_data)
        cl.enqueue_copy(q, from_gpu, int_data_gpu)
        q.finish()
        # expected = (np.uint32(int_data[1]) * np.uint32(int_data[2])) >> 32
        expected = experiment['out']
        print('expected', expected)
        print('from_gpu[0]', from_gpu[0])
        assert expected == from_gpu[0].item()
        split_cl = cl_code.split('\n')
        found_long_cast = False
        for line in split_cl:
            if ' >> 32' in line and '(long)' in line:
                found_long_cast = True
        assert found_long_cast
Example 30
 def set_float_value(gpu_buffer, idx, value):
     setValueProg.__getattr__(setValueKernelName)(q, (32, ), (32, ),
                                                  float_data_gpu,
                                                  offset_type(0),
                                                  np.int32(idx),
                                                  np.float32(value),
                                                  cl.LocalMemory(4))