Ejemplo n.º 1
0
def test_scatter(cl_env, radix_kernels, key_dtype, ngroups, group_size):
    """Check the radix-sort ``scatter`` kernel against a NumPy reference.

    For every radix pass the random keys are stably sorted per row on the
    host, uploaded together with per-row digit histograms and the prefix sum
    of those histograms, and the kernel output is compared with a stable
    sort of all keys by their current digit.

    Args:
        cl_env: ``(context, command_queue)`` pair.
        radix_kernels: mapping that provides the ``'scatter'`` kernel.
        key_dtype: integer dtype of the generated keys.
        ngroups: number of rows of keys; also the number of work-groups
            (the kernel is launched with ``g_times_l=True``).
        group_size: work-group size; every row holds ``2 * group_size`` keys.
    """
    ctx, cq = cl_env

    radix_bits = 4
    histogram_len = 2 ** radix_bits
    uint32 = np.dtype('uint32')
    keys = np.random.randint(0, 64, size=(ngroups, group_size * 2), dtype=key_dtype)
    keys_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, keys.nbytes)
    out_keys_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, keys.nbytes)
    histogram_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_ONLY, histogram_len * ngroups * uint32.itemsize
    )
    offset_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_ONLY, histogram_len * ngroups * uint32.itemsize
    )

    for radix_pass in range(keys.dtype.itemsize * 8 // radix_bits):
        # presumably radix_key extracts the digit for this pass -- helper is
        # defined outside this block
        radix_keys = radix_key(keys, radix_bits, radix_pass).astype('uint16')
        order = np.argsort(radix_keys, kind='mergesort')
        grid = np.ogrid[tuple(slice(0, s) for s in keys.shape)]
        # Partially sort: reorder every row by its own stable sort order.
        # The index must be a tuple; a list of index arrays is not accepted
        # as a multidimensional index by current NumPy, and np.ogrid may
        # return either a list or a tuple depending on the NumPy version.
        block_keys = keys[tuple(grid[:-1]) + (order,)]

        (keys_map, _) = cl.enqueue_map_buffer(
            cq, keys_buf, cl.map_flags.WRITE_INVALIDATE_REGION, 0,
            keys.shape, keys.dtype, wait_for=[], is_blocking=True
        )
        keys_map[...] = block_keys
        del keys_map  # dropping the array releases the mapping

        radix_keys = radix_key(block_keys, radix_bits, radix_pass).astype('uint16')

        (histogram_map, _) = cl.enqueue_map_buffer(
            cq, histogram_buf, cl.map_flags.WRITE_INVALIDATE_REGION, 0,
            (histogram_len, ngroups), uint32, wait_for=[], is_blocking=True
        )
        (offset_map, _) = cl.enqueue_map_buffer(
            cq, offset_buf, cl.map_flags.WRITE_INVALIDATE_REGION, 0,
            (histogram_len, ngroups), uint32, wait_for=[], is_blocking=True
        )
        # One histogram per row, stored digit-major: shape (histogram_len, ngroups).
        histogram_map[...] = np.array(
            [np.bincount(group_keys, minlength=histogram_len)
             for group_keys in radix_keys],
            dtype='uint32').T
        offset_map[...] = prefix_sum(histogram_map.flat).reshape(histogram_len, ngroups)
        del histogram_map, offset_map

        local_offset = cl.LocalMemory(histogram_len * uint32.itemsize)
        local_histogram = cl.LocalMemory(histogram_len * uint32.itemsize)

        e = radix_kernels['scatter'](
            cq, (ngroups,), (group_size,),
            keys_buf, out_keys_buf, None, None,
            offset_buf, local_offset, histogram_buf, local_histogram,
            radix_bits, radix_pass, g_times_l=True,
        )

        (keys_map, _) = cl.enqueue_map_buffer(
            cq, out_keys_buf, cl.map_flags.READ, 0,
            (ngroups, group_size * 2), keys.dtype, wait_for=[e], is_blocking=True
        )

        # Reference: stable sort of all keys (flattened) by the current digit.
        expected = block_keys.flat[np.argsort(radix_keys, axis=None, kind='mergesort')]
        np.testing.assert_equal(keys_map, expected.reshape(ngroups, 2 * group_size))
        # Release the read mapping before the next pass writes the buffer again.
        del keys_map
Ejemplo n.º 2
0
    def initBuffers(self, puzzle):
        """Allocate the OpenCL buffers for one solver run.

        Creates ``lengthsBuffer`` and ``puzzlesBuffer`` (host data copied in)
        and ``groupLengthsBuffer`` and ``solutionsBuffer`` (backed by host
        arrays via ``USE_HOST_PTR``), storing the host arrays and buffers on
        ``self``.

        Args:
            puzzle: mapping whose ``'puzzle'`` entry holds the grid values.
        """
        mf = cl.mem_flags
        copied = mf.READ_WRITE | mf.COPY_HOST_PTR
        host_backed = mf.READ_WRITE | mf.USE_HOST_PTR
        worst_length = np.iinfo(np.int16).max

        # per-simulation lengths; copied to the device and not read back,
        # so no mapping is needed
        self.lengths = np.full(self.simulations, worst_length, dtype=np.int16)
        self.lengthsBuffer = cl.Buffer(self.context, copied,
                                       hostbuf=self.lengths)

        # aggregated lengths, one entry per work-group
        self.groupLengths = np.full(self.workGroups, worst_length,
                                    dtype=np.int16)
        self.groupLengthsBuffer = cl.Buffer(self.context, host_backed,
                                            hostbuf=self.groupLengths)

        # map the group lengths buffer
        # NOTE(review): the returned (array, event) pair is discarded here
        cl.enqueue_map_buffer(
            self.queue,
            self.groupLengthsBuffer,
            cl.map_flags.READ,
            0,
            self.groupLengths.shape,
            self.groupLengths.dtype,
        )

        # convert the input puzzle to 8 bit ints (char) and subtract 1 so
        # that -1 denotes a gap and 0 denotes a square to be filled
        cells = np.array(puzzle['puzzle']).astype(np.int8)
        cells = cells - np.ones_like(cells, dtype=cells.dtype)

        # one copy of the puzzle for each simulation
        grid_shape = (self.simulations, self.height, self.width)
        self.puzzles = np.zeros(grid_shape, dtype=cells.dtype)
        self.puzzles[:, 0:self.height, 0:self.width] = cells

        # input puzzles, one per invocation (too large for local or shared
        # memory); nothing is read back from this buffer, so it is not mapped
        self.puzzlesFlattened = self.puzzles.ravel()
        self.puzzlesBuffer = cl.Buffer(self.context, copied,
                                       hostbuf=self.puzzlesFlattened)

        # output buffer for the best solutions aggregated across work-groups
        self.solutions = self.puzzles[0:self.workGroups]
        self.solutionsFlattened = self.solutions.ravel()
        self.solutionsBuffer = cl.Buffer(self.context, host_backed,
                                         hostbuf=self.solutionsFlattened)

        # map the solutions buffer
        cl.enqueue_map_buffer(
            self.queue,
            self.solutionsBuffer,
            cl.map_flags.READ,
            0,
            self.solutionsFlattened.shape,
            self.solutions.dtype,
        )
Ejemplo n.º 3
0
def estimate_niter(N):
    """Time allocating a host-allocated buffer and filling it through a map.

    Creates ``N`` float32 ones, allocates an ``ALLOC_HOST_PTR`` buffer of the
    same size, maps it for writing, copies the data in and prints the elapsed
    wall-clock time in seconds.

    NOTE(review): the original summary said this "returns niter s.t. the time
    spent on kernel is same as for memory transfer", but no such value is
    computed in the visible code; the function prints the timing and returns
    ``None``.

    Args:
        N: number of float32 elements to transfer.
    """
    a = np.ones(N, np.float32)

    dev = get_device()
    context, queue = dev.context, dev.queue
    mf = cl.mem_flags

    t = time()
    copy_g = cl.Buffer(context, mf.ALLOC_HOST_PTR, size=a.nbytes)

    # enqueue_map_buffer takes (queue, buf, flags, offset, shape, dtype, ...);
    # the previous call passed enqueue_copy-style arguments (the host array as
    # flags and a `device_offset` keyword) and could not succeed.
    mapped, map_event = cl.enqueue_map_buffer(
        queue, copy_g, cl.map_flags.WRITE, 0, a.shape, a.dtype,
        is_blocking=False)

    queue.flush()

    # The map is non-blocking, so wait for it before touching the host view.
    map_event.wait()
    mapped[...] = a
    del mapped  # dropping the array releases the mapping

    # single-argument call form prints the same under Python 2 and 3
    print(time() - t)
Ejemplo n.º 4
0
 def execute(self):
     """Run the ``tthetaf4`` kernel and map both result buffers for reading.

     Returns:
         Tuple ``(tthl, etal)``: float32 arrays of shape ``(npx, npx)``
         mapped from ``tth_buf`` and ``eta_buf``. Both are also stored on
         ``self``.
     """
     # Floor division keeps the global work size integral; true division
     # would produce a float under Python 3.
     # NOTE(review): assumes self.npx is an int divisible by 4 -- confirm.
     evtcompute = self.program.tthetaf4(self.queue,
                                        (self.npx // 4, self.npx), None,
                                        self.tth_buf, self.eta_buf,
                                        self.par_buf)
     # Both maps are non-blocking and ordered after the kernel via wait_for.
     self.tthl, evtt = cl.enqueue_map_buffer(self.queue,
                                             self.tth_buf,
                                             cl.map_flags.READ,
                                             0, (self.npx, self.npx),
                                             numpy.float32,
                                             'C',
                                             wait_for=[evtcompute],
                                             is_blocking=False)
     self.etal, evte = cl.enqueue_map_buffer(self.queue,
                                             self.eta_buf,
                                             cl.map_flags.READ,
                                             0, (self.npx, self.npx),
                                             numpy.float32,
                                             'C',
                                             wait_for=[evtcompute],
                                             is_blocking=False)
     # The mapped arrays may only be read once their map events completed.
     evtcompute.wait()
     evtt.wait()
     evte.wait()
     return self.tthl, self.etal
Ejemplo n.º 5
0
 def enqueue_readouts(self, queue, buffers, range_start, range_end):
     """Enqueue a non-blocking read mapping of rows ``range_start:range_end``.

     Nothing is enqueued unless ``self._is_writable`` is true. The mapping
     covers ``buffers[0]`` starting at the byte offset of row
     ``range_start`` of ``self._data``; the result of the map call is not
     kept and the method returns ``None``.
     """
     if not self._is_writable:
         return

     row_count = range_end - range_start
     # byte offset of the first requested row
     byte_offset = range_start * self._data.strides[0]
     mapped_shape = (row_count,) + self._data.shape[1:]

     cl.enqueue_map_buffer(
         queue, buffers[0], cl.map_flags.READ,
         byte_offset, mapped_shape, self._data.dtype,
         order="C", wait_for=None, is_blocking=False)
Ejemplo n.º 6
0
def test_random_collision_resized(cl_env, coord_dtype, collision_programs,
                                  old_shape, new_shape):
    """Check collision detection on random spheres after resizing a Collider.

    A ``Collider`` is built with ``old_shape`` and resized to ``new_shape``;
    the pairs it reports for seeded random coordinates and radii must equal
    the result of the host-side ``find_collisions``.
    """
    ctx, cq = cl_env
    mem = cl.mem_flags

    collider = Collider(ctx, *old_shape, coord_dtype, *collision_programs)
    collider.resize(*new_shape)

    np.random.seed(4)
    n_points = new_shape[0] or old_shape[0]
    coords = np.random.random((n_points, 3)).astype(coord_dtype)
    # Keep number of collisions under control
    max_radius = 1 / (n_points**0.5)
    radii = np.random.uniform(0, max_radius, len(coords)).astype(coord_dtype)
    expected = find_collisions(coords, radii)
    n_expected = len(expected)

    # Coordinates are uploaded as rows of four values; only the first three
    # columns are filled in.
    coords_buf = cl.Buffer(ctx, mem.READ_ONLY,
                           len(coords) * 4 * coord_dtype.itemsize)
    coords_map, _ = cl.enqueue_map_buffer(
        cq, coords_buf, cl.map_flags.WRITE_INVALIDATE_REGION,
        0, (len(coords), 4), coord_dtype,
        is_blocking=True)
    coords_map[..., :3] = coords
    del coords_map

    radii_buf = cl.Buffer(ctx, mem.READ_ONLY | mem.COPY_HOST_PTR,
                          hostbuf=radii)
    collisions_buf = cl.Buffer(ctx, mem.WRITE_ONLY,
                               n_expected * 2 * collider.id_dtype.itemsize)
    n_collisions_buf = cl.Buffer(ctx, mem.READ_WRITE,
                                 collider.counter_dtype.itemsize)

    e = collider.get_collisions(cq, coords_buf, radii_buf, n_collisions_buf,
                                collisions_buf, n_expected)

    n_collisions_map, _ = cl.enqueue_map_buffer(
        cq, n_collisions_buf, cl.map_flags.READ,
        0, 1, collider.counter_dtype,
        wait_for=[e], is_blocking=True)
    assert n_collisions_map[0] == n_expected

    collisions_map, _ = cl.enqueue_map_buffer(
        cq, collisions_buf, cl.map_flags.READ,
        0, (n_collisions_map[0], 2), collider.id_dtype,
        wait_for=[e], is_blocking=True)

    # Need to sort, order is undefined
    sorted_pairs = np.sort(collisions_map, axis=1)
    collisions = {tuple(pair) for pair in sorted_pairs}
    assert collisions == expected
Ejemplo n.º 7
0
def test_block_sort_random(cl_env, radix_kernels, key_dtype, ngroups, group_size):
    """Check the ``block_sort`` kernel against a NumPy reference.

    For every radix pass the same random keys are uploaded, the kernel is
    run, and both outputs are verified: the per-row digit histograms against
    ``np.bincount`` and the keys against a stable per-row sort by the
    current digit.

    Args:
        cl_env: ``(context, command_queue)`` pair.
        radix_kernels: mapping that provides the ``'block_sort'`` kernel.
        key_dtype: integer dtype of the generated keys.
        ngroups: number of rows of keys; also the number of work-groups
            (the kernel is launched with ``g_times_l=True``).
        group_size: work-group size; every row holds ``2 * group_size`` keys.
    """
    ctx, cq = cl_env

    radix_bits = 4
    histogram_len = 2 ** radix_bits
    uint32 = np.dtype('uint32')

    keys = np.random.randint(0, 64, size=(ngroups, group_size * 2), dtype=key_dtype)

    keys_buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, keys.nbytes)
    histogram_buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE,
                              ngroups * histogram_len * uint32.itemsize)

    local_keys = cl.LocalMemory(group_size * 2 * keys.dtype.itemsize)
    local_values = cl.LocalMemory(group_size * 2 * keys.dtype.itemsize)
    count = cl.LocalMemory(group_size * 2 * uint32.itemsize)
    local_histogram = cl.LocalMemory(histogram_len * uint32.itemsize)

    for radix_pass in range(keys.dtype.itemsize * 8 // radix_bits):
        (keys_map, _) = cl.enqueue_map_buffer(
            cq, keys_buf, cl.map_flags.WRITE_INVALIDATE_REGION, 0,
            (ngroups, group_size * 2), keys.dtype, wait_for=[], is_blocking=True
        )
        keys_map[...] = keys
        del keys_map  # dropping the array releases the mapping

        e = radix_kernels['block_sort'](
            cq, (ngroups,), (group_size,),
            keys_buf, local_keys, local_keys, None, local_values, local_values,
            histogram_buf, local_histogram, count,
            radix_bits, radix_pass, g_times_l=True,
        )

        order = np.argsort(radix_key(keys, radix_bits, radix_pass), kind='mergesort')
        grid = np.ogrid[tuple(slice(0, s) for s in keys.shape)]

        (histogram_map, _) = cl.enqueue_map_buffer(
            cq, histogram_buf, cl.map_flags.READ, 0,
            (histogram_len, ngroups), uint32, wait_for=[e], is_blocking=True
        )
        for i, (group_keys, histogram) in enumerate(zip(keys, histogram_map.T)):
            group_keys = radix_key(group_keys, radix_bits, radix_pass).astype('uint16')
            expected = np.bincount(group_keys, minlength=histogram_len)
            try:
                np.testing.assert_equal(histogram, expected)
            except AssertionError:
                # Report which pass and row failed before re-raising.
                print((radix_pass, i))
                raise
        # Unmap explicitly: the loop variable still holds a view of the
        # mapped array, so `del histogram_map` alone would keep the mapping
        # alive while the next pass's kernel writes histogram_buf.
        histogram_map.base.release(cq)
        del histogram_map

        # The index must be a tuple; a list of index arrays is not accepted
        # as a multidimensional index by current NumPy, and np.ogrid may
        # return either a list or a tuple depending on the NumPy version.
        expected = keys[tuple(grid[:-1]) + (order,)]
        (keys_map, _) = cl.enqueue_map_buffer(
            cq, keys_buf, cl.map_flags.READ, 0,
            (ngroups, group_size * 2), keys.dtype, wait_for=[e], is_blocking=True
        )
        np.testing.assert_equal(keys_map, expected)
        # Release the read mapping before keys_buf is mapped for writing again.
        del keys_map
Ejemplo n.º 8
0
    def initBuffers(self, puzzle):
        """Allocate the OpenCL buffers for one solver run.

        Creates ``lengthsBuffer`` and ``puzzlesBuffer`` (host data copied in)
        and ``groupLengthsBuffer`` and ``solutionsBuffer`` (backed by host
        arrays via ``USE_HOST_PTR``), storing the host arrays and buffers on
        ``self``.

        Args:
            puzzle: mapping whose ``'puzzle'`` entry holds the grid values.
        """
        flags = cl.mem_flags
        copy_in = flags.READ_WRITE | flags.COPY_HOST_PTR
        share_host = flags.READ_WRITE | flags.USE_HOST_PTR
        int16_max = np.iinfo(np.int16).max

        # lengths buffer: copied to the GPU and never read back, so mapping
        # is not required
        self.lengths = np.full(self.simulations, int16_max, dtype=np.int16)
        self.lengthsBuffer = cl.Buffer(self.context, copy_in, hostbuf=self.lengths)

        # aggregated lengths for each workgroup
        self.groupLengths = np.full(self.workGroups, int16_max, dtype=np.int16)
        self.groupLengthsBuffer = cl.Buffer(self.context, share_host, hostbuf=self.groupLengths)

        # map group lengths buffer
        # NOTE(review): the returned (array, event) pair is discarded here
        group_shape = self.groupLengths.shape
        cl.enqueue_map_buffer(self.queue, self.groupLengthsBuffer, cl.map_flags.READ, 0,
                              group_shape, self.groupLengths.dtype)

        # input puzzle as 8 bit ints (char), shifted down by one so that -1
        # denotes a gap and 0 denotes a square to be filled
        board = np.array(puzzle['puzzle']).astype(np.int8)
        board = board - np.ones_like(board, dtype=board.dtype)

        # replicate the puzzle, one copy for each simulation
        self.puzzles = np.zeros((self.simulations, self.height, self.width), dtype=board.dtype)
        self.puzzles[:, 0:self.height, 0:self.width] = board

        # puzzles buffer: one input puzzle per invocation (too large to hold
        # in local or shared memory); data is only copied in, so no mapping
        self.puzzlesFlattened = self.puzzles.ravel()
        self.puzzlesBuffer = cl.Buffer(self.context, copy_in, hostbuf=self.puzzlesFlattened)

        # output buffer for best solutions aggregated across workgroups
        self.solutions = self.puzzles[0:self.workGroups]
        self.solutionsFlattened = self.solutions.ravel()
        self.solutionsBuffer = cl.Buffer(self.context, share_host, hostbuf=self.solutionsFlattened)

        # map solutions buffer
        flat_shape = self.solutionsFlattened.shape
        cl.enqueue_map_buffer(self.queue, self.solutionsBuffer, cl.map_flags.READ, 0,
                              flat_shape, self.solutions.dtype)
Ejemplo n.º 9
0
def test_collision(cl_env, coord_dtype, collision_programs):
    """Check collision detection on six fixed unit spheres.

    The ``Collider`` must report exactly the index pairs ``(0, 1)`` and
    ``(4, 5)``.

    Args:
        cl_env: ``(context, command_queue)`` pair.
        coord_dtype: NumPy dtype of coordinates and radii.
        collision_programs: extra positional arguments for ``Collider``.
    """
    ctx, cq = cl_env

    coords = np.array(
        [[0.0, 1.0, 3.0], [0.0, 1.0, 3.0], [4.0, 1.0, 8.0], [-4.0, -6.0, 3.0],
         [-5.0, 0.0, -1.0], [-5.0, 0.5, -0.5]],
        dtype=coord_dtype)
    radii = np.ones(len(coords), dtype=coord_dtype)
    expected = {(0, 1), (4, 5)}

    collider = Collider(ctx, len(coords), 3, 8, coord_dtype,
                        *collision_programs)

    # Coordinates are uploaded as rows of four values; only the first three
    # columns are filled in.
    coords_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY,
                           len(coords) * 4 * coord_dtype.itemsize)
    (coords_map,
     _) = cl.enqueue_map_buffer(cq,
                                coords_buf,
                                cl.map_flags.WRITE_INVALIDATE_REGION,
                                0, (len(coords), 4),
                                coord_dtype,
                                is_blocking=True)
    coords_map[..., :3] = coords
    del coords_map
    radii_buf = cl.Buffer(ctx,
                          cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                          hostbuf=radii)
    collisions_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY,
                               len(expected) * 2 * collider.id_dtype.itemsize)
    n_collisions_buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE,
                                 collider.counter_dtype.itemsize)

    e = collider.get_collisions(cq, coords_buf, radii_buf, n_collisions_buf,
                                collisions_buf, len(expected))

    (n_collisions_map, _) = cl.enqueue_map_buffer(cq,
                                                  n_collisions_buf,
                                                  cl.map_flags.READ,
                                                  0,
                                                  1,
                                                  collider.counter_dtype,
                                                  wait_for=[e],
                                                  is_blocking=True)
    assert n_collisions_map[0] == len(expected)

    (collisions_map, _) = cl.enqueue_map_buffer(cq,
                                                collisions_buf,
                                                cl.map_flags.READ,
                                                0, (n_collisions_map[0], 2),
                                                collider.id_dtype,
                                                wait_for=[e],
                                                is_blocking=True)
    # Normalise each pair to (low, high) before comparing, as the random
    # collision test does: the order in which the kernel reports the two
    # indices is undefined.
    collisions = set(map(tuple, np.sort(collisions_map, axis=1)))
    assert collisions == expected
Ejemplo n.º 10
0
def test_scan(cl_env, scan_kernels):
    """Run ``local_scan`` over 16 fixed values and verify both outputs.

    The values buffer must hold the expected scanned values afterwards and
    the block-sums buffer the two expected block totals.
    """
    ctx, cq = cl_env
    mem = cl.mem_flags

    values = np.array(
        [17, 6, 24, 28, 18, 22, 2, 1, 25, 17, 7, 17, 3, 19, 8, 23],
        dtype='uint32')
    item_bytes = values.dtype.itemsize

    block_size = 4
    # each block of block_size work-items covers 2 * block_size values
    nblocks = len(values) // 2 // block_size

    values_buf = cl.Buffer(ctx, mem.READ_WRITE | mem.COPY_HOST_PTR,
                           hostbuf=values)
    block_sums_buf = cl.Buffer(ctx, mem.READ_WRITE, nblocks * item_bytes)

    calc_scan = scan_kernels['local_scan'](
        cq, (len(values) // 2, ), (block_size, ),
        values_buf,
        cl.LocalMemory(block_size * 2 * item_bytes),
        block_sums_buf)

    def read_back(buf, shape):
        # blocking read mapping that is ordered after the scan kernel
        mapped, _ = cl.enqueue_map_buffer(cq, buf, cl.map_flags.READ, 0,
                                          shape, values.dtype,
                                          wait_for=[calc_scan],
                                          is_blocking=True)
        return mapped

    values_map = read_back(values_buf, values.shape)
    block_sums_map = read_back(block_sums_buf, (nblocks, ))

    expected_values = np.array(
        [0, 17, 23, 47, 75, 93, 115, 117, 0, 25, 42, 49, 66, 69, 88, 96],
        dtype=values.dtype)
    np.testing.assert_equal(values_map, expected_values)

    expected_sums = np.array([118, 119], dtype=values.dtype)
    np.testing.assert_equal(block_sums_map, expected_sums)
Ejemplo n.º 11
0
def test_block_scan(cl_env, scan_kernels):
    """Check the two-stage scan: scan the block sums, then apply them.

    ``local_scan`` is first run in place on the block sums (expected result
    ``[0, 118]``); ``block_scan`` is then run on the already block-scanned
    values together with those sums, and the values are compared with the
    expected full scan.

    Args:
        cl_env: ``(context, command_queue)`` pair.
        scan_kernels: mapping that provides the ``'local_scan'`` and
            ``'block_scan'`` kernels.
    """
    ctx, cq = cl_env

    values = np.array(
        [0, 17, 23, 47, 75, 93, 115, 117, 0, 25, 42, 49, 66, 69, 88, 96],
        dtype='uint32')
    block_sums = np.array([118, 119], dtype=values.dtype)

    values_buf = cl.Buffer(ctx,
                           cl.mem_flags.READ_WRITE
                           | cl.mem_flags.COPY_HOST_PTR,
                           hostbuf=values)
    # READ_WRITE, not READ_ONLY: local_scan rewrites this buffer in place
    # (its contents are expected to change to [0, 118] below), and a kernel
    # writing to a buffer created read-only is undefined behaviour.
    block_sums_buf = cl.Buffer(ctx,
                               cl.mem_flags.READ_WRITE
                               | cl.mem_flags.COPY_HOST_PTR,
                               hostbuf=block_sums)

    calc_block_scan = scan_kernels['local_scan'](
        cq, (1, ), (len(block_sums), ),
        block_sums_buf,
        cl.LocalMemory(len(block_sums) * 2 * values.dtype.itemsize),
        None,
        g_times_l=True)

    (block_sums_map, _) = cl.enqueue_map_buffer(cq,
                                                block_sums_buf,
                                                cl.map_flags.READ,
                                                0,
                                                block_sums.shape,
                                                block_sums.dtype,
                                                wait_for=[calc_block_scan],
                                                is_blocking=True)
    expected = np.array([0, 118], dtype=values.dtype)
    np.testing.assert_equal(block_sums_map, expected)

    calc_scan = scan_kernels['block_scan'](cq, (len(values) // 2, ), (4, ),
                                           values_buf,
                                           block_sums_buf,
                                           wait_for=[calc_block_scan])
    (values_map, _) = cl.enqueue_map_buffer(cq,
                                            values_buf,
                                            cl.map_flags.READ,
                                            0,
                                            values.shape,
                                            values.dtype,
                                            wait_for=[calc_scan],
                                            is_blocking=True)
    expected = np.array(
        [0, 17, 23, 47, 75, 93, 115, 117,
         118, 143, 160, 167, 184, 187, 206, 214],
        dtype=values.dtype)
    np.testing.assert_equal(values_map, expected)
Ejemplo n.º 12
0
    def solve(self, puzzle, simulations=16384, iterations=35, workGroupSize=128):
        """Run the ``montecarlo`` kernel and return the best solution found.

        Args:
            puzzle: mapping with ``'width'``, ``'height'`` and ``'puzzle'``.
            simulations: global work size (number of simulations).
            iterations: iteration count passed to the kernel as an int32.
            workGroupSize: local work size.

        Returns:
            A list of ``{'X': col, 'Y': row, 'Size': size}`` dicts read from
            the solution of the work-group with the smallest group length.
        """
        self.simulations = simulations
        self.iterations = iterations
        self.workGroupSize = workGroupSize
        self.workGroups = int(self.simulations / self.workGroupSize)
        self.width = np.int8(puzzle['width'])
        self.height = np.int8(puzzle['height'])

        # initialise buffers
        self.initBuffers(puzzle)

        # create kernel and bind its arguments
        self.kernel = cl.Kernel(self.program, "montecarlo")
        self.kernel.set_args(
            self.lengthsBuffer,
            self.groupLengthsBuffer,
            self.puzzlesBuffer,
            self.solutionsBuffer,
            self.height,
            self.width,
            np.int32(self.iterations),
        )

        # execute program for a number of iterations
        global_size = (self.simulations,)
        local_size = (self.workGroupSize,)
        cl.enqueue_nd_range_kernel(self.queue, self.kernel, global_size, local_size)

        # map the group lengths buffer again (result discarded) and fetch
        # its host array
        lengths_shape = self.groupLengths.shape
        lengths_dtype = self.groupLengths.dtype
        cl.enqueue_map_buffer(self.queue, self.groupLengthsBuffer,
                              cl.map_flags.WRITE, 0, lengths_shape, lengths_dtype)
        self.groupLengths = self.groupLengthsBuffer.get_host_array(
            lengths_shape, dtype=lengths_dtype)

        # same for the solutions buffer
        solutions_dtype = self.solutions.dtype
        cl.enqueue_map_buffer(self.queue, self.solutionsBuffer,
                              cl.map_flags.WRITE, 0,
                              self.solutionsFlattened.shape, solutions_dtype)
        self.solutions = self.solutionsBuffer.get_host_array(
            self.solutions.shape, dtype=solutions_dtype)

        # release buffers
        for device_buffer in (self.lengthsBuffer, self.groupLengthsBuffer,
                              self.puzzlesBuffer, self.solutionsBuffer):
            device_buffer.release()

        # get the best solution: the work-group with the smallest length
        best_group = self.groupLengths.argmin()
        bestSolution = np.array(self.solutions[best_group])

        # convert solution to list format used by challenge
        solution = []
        for row in range(puzzle['height']):
            for col in range(puzzle['width']):
                size = bestSolution[row][col]
                if size == -1:
                    continue

                # add to solution list
                solution.append({'X': int(col), 'Y': int(row), 'Size': int(size)})

                # clear the size x size cells covered by this entry
                for d_row in range(size):
                    for d_col in range(size):
                        bestSolution[row + d_row][col + d_col] = -1

        return solution
Ejemplo n.º 13
0
def test_codes(cl_env, kernels, coord_dtype):
    """Run ``calculateCodes`` on six fixed points and compare the codes.

    The points and their per-axis min/max range are uploaded as rows of four
    values, and the kernel output is compared with the hard-coded expected
    codes.
    """
    ctx, cq = cl_env

    coords = np.array([[ 0.0, 1.0, 3.0],
                       [ 0.0, 1.0, 3.0],
                       [ 4.0, 1.0, 8.0],
                       [-4.0,-6.0, 3.0],
                       [-5.0, 0.0,-1.0],
                       [-5.0, 0.5,-0.5]], dtype=coord_dtype)
    coord_range = np.array([coords.min(axis=0),
                            coords.max(axis=0)], dtype=coords.dtype)
    expected = np.array([862940378, 862940378, 1073741823,
                         20332620, 302580864, 306295426], dtype='int32')

    def upload_padded(rows):
        # Read-only buffer of len(rows) x 4 values; the first three columns
        # receive `rows`, the fourth is left unwritten.
        buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY,
                        len(rows) * 4 * coord_dtype.itemsize)
        mapped, _ = cl.enqueue_map_buffer(
            cq, buf, cl.map_flags.WRITE_INVALIDATE_REGION,
            0, (len(rows), 4), coord_dtype,
            is_blocking=True)
        mapped[..., :3] = rows
        del mapped
        return buf

    coords_buf = upload_padded(coords)
    range_buf = upload_padded(coord_range)

    n_coords = len(coords)
    code_dtype = np.dtype('uint32')
    codes_buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE,
                          n_coords * code_dtype.itemsize)
    calc_codes = kernels['calculateCodes'](
        cq, (roundUp(n_coords, 32),), None,
        codes_buf, coords_buf, range_buf, n_coords,
    )

    codes_map, _ = cl.enqueue_map_buffer(
        cq, codes_buf, cl.map_flags.READ | cl.map_flags.WRITE,
        0, (n_coords,), code_dtype,
        wait_for=[calc_codes], is_blocking=True)
    np.testing.assert_equal(codes_map, expected)
    del codes_map
Ejemplo n.º 14
0
def use_naive_kernel(ctx, queue, dev, A, B):
    """Multiply ``A`` by ``B`` with the ``naiveMatMul`` OpenCL kernel.

    Both operands are padded with ``pad``, flattened to float32 and copied
    into device buffers; the kernel result is read back and cropped to
    ``(A.shape[0], B.shape[1])``.

    Args:
        ctx: OpenCL context used for the buffers and the program.
        queue: command queue the kernel and the read-back are enqueued on.
        dev: device queried for ``MAX_WORK_GROUP_SIZE``.
        A: left operand (2-D array).
        B: right operand (2-D array).

    Returns:
        float32 array of shape ``(A.shape[0], B.shape[1])``.
    """
    newA, A_shape = pad(A.copy())
    newB, _ = pad(B.copy())

    C_shape = (A.shape[0], B.shape[1])
    newC_shape = (newA.shape[0], newB.shape[1])
    newC = np.zeros(newC_shape, dtype=np.float32)

    A_cache = np.array(newA.flatten(), dtype=np.float32)
    B_cache = np.array(newB.flatten(), dtype=np.float32)
    C_cache = np.array(newC.flatten(), dtype=np.float32)

    max_wg_size = dev.get_info(cl.device_info.MAX_WORK_GROUP_SIZE)
    kernel = naive_kernel()

    mf = cl.mem_flags
    flags = mf.READ_WRITE | mf.COPY_HOST_PTR | mf.ALLOC_HOST_PTR
    A_buffer = cl.Buffer(ctx, flags, hostbuf=A_cache)
    B_buffer = cl.Buffer(ctx, flags, hostbuf=B_cache)
    C_buffer = cl.Buffer(ctx, flags, hostbuf=C_cache)

    # One work-item per output element, rounded up to the work-group limit.
    global_size = (round_up(C_cache.shape[0], max_wg_size), )
    local_size = None

    print("Local Size: ", local_size)
    print("Global Size: ", global_size)

    prg = cl.Program(ctx, kernel).build()

    # Global-memory kernel arguments must be the buffer objects themselves.
    # The previous version mapped the buffers and passed the raw `.data` of
    # the host-side arrays, which is not a valid memory-object argument.
    # NOTE(review): assumes naiveMatMul takes three __global float* buffers
    # followed by four ints -- the kernel source is outside this block.
    event = prg.naiveMatMul(
        queue,
        global_size,
        local_size,
        A_buffer,
        B_buffer,
        C_buffer,
        np.int32(A_shape[1]),
        np.int32(newC.shape[1]),
        np.int32(C_shape[0]),  # row boundary
        np.int32(C_shape[1]))  # col boundary

    # Read the result back from the device buffer (blocking by default);
    # copying from a host-mapped array would be a host-to-host transfer.
    cl.enqueue_copy(queue, C_cache, C_buffer, wait_for=[event])
    return C_cache.reshape(newC_shape)[:C_shape[0], :C_shape[1]]
def pairwise_pyopencl_cpu(data):
    """Run the cached ``lower``/``upper`` pairwise kernels over ``data``.

    Kernels are looked up in ``_cache`` by ``(shape, dtype)`` and built with
    ``pairwise_pyopencl_cpu_prepare`` on a miss.

    Args:
        data: 2-D array-like of shape ``(N, D)``.

    Returns:
        ``(N, N)`` array with the dtype of ``data``: either a host mapping
        of the destination buffer or, if mapping raised ``TypeError``, a
        fresh array the buffer was copied into.
    """
    data = np.asarray(data, order='C')
    N, D = data.shape
    cache_key = (data.shape, data.dtype)
    try:
        lower, upper = _cache[cache_key]
    except KeyError:
        # Only a cache miss should trigger a rebuild; a bare `except` would
        # also hide unrelated errors (including KeyboardInterrupt).
        lower, upper = pairwise_pyopencl_cpu_prepare(data.shape, data.dtype)
        _cache[cache_key] = lower, upper
    data_buf = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=data)
    dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, N * N * data.dtype.itemsize)
    try:
        # Map the destination before the kernels run so the result can be
        # returned without an extra copy.
        rval, _ = cl.enqueue_map_buffer(queue,
                                        dest_buf,
                                        cl.map_flags.READ,
                                        offset=0,
                                        shape=(N, N),
                                        dtype=data.dtype)
        need_copy = False
    except TypeError:  #OSX's OCL needs this?
        rval = np.empty((N, N), dtype=data.dtype)
        need_copy = True
    lower(queue, (N, 1), (1, 1), data_buf, dest_buf)
    upper(queue, (4, 4), (1, 1), data_buf, dest_buf)
    if need_copy:
        cl.enqueue_copy(queue, rval, dest_buf)
    else:
        queue.finish()
    if PROFILING:
        # NOTE(review): `ev` is not assigned anywhere in this function; unless
        # it is a module-level name this branch raises NameError -- confirm
        # which kernel event is meant to be timed.
        comptimes.append(1e-9 * (ev.profile.end - ev.profile.start))
        # %-formatting prints the same text under Python 2 and Python 3
        print('computation time %s' % min(comptimes))
    return rval
Ejemplo n.º 16
0
def test_count_err(cl_env, coord_dtype, collision_programs, size, ngroups,
                   group_size):
    """Expect ``ValueError`` when no collisions buffer is passed.

    ``get_collisions`` is called with ``None`` in place of the collisions
    buffer while a non-zero pair count (``len(expected)``) is requested.
    """
    ctx, cq = cl_env
    mem = cl.mem_flags
    collider = Collider(ctx, size, ngroups, group_size, coord_dtype,
                        *collision_programs)

    np.random.seed(4)
    coords = np.random.random((size, 3)).astype(coord_dtype)
    # Keep number of collisions under control
    max_radius = 1 / (size**0.5)
    radii = np.random.uniform(0, max_radius, len(coords)).astype(coord_dtype)
    expected = find_collisions(coords, radii)

    # Coordinates are uploaded as rows of four values; only the first three
    # columns are filled in.
    n_coords = len(coords)
    coords_buf = cl.Buffer(ctx, mem.READ_ONLY,
                           n_coords * 4 * coord_dtype.itemsize)
    coords_map, _ = cl.enqueue_map_buffer(
        cq, coords_buf, cl.map_flags.WRITE_INVALIDATE_REGION,
        0, (n_coords, 4), coord_dtype,
        is_blocking=True)
    coords_map[..., :3] = coords
    del coords_map

    radii_buf = cl.Buffer(ctx, mem.READ_ONLY | mem.COPY_HOST_PTR,
                          hostbuf=radii)
    n_collisions_buf = cl.Buffer(ctx, mem.READ_WRITE,
                                 collider.counter_dtype.itemsize)

    with pytest.raises(ValueError):
        collider.get_collisions(cq, coords_buf, radii_buf,
                                n_collisions_buf, None, len(expected))
Ejemplo n.º 17
0
def pairwise_pyopencl_cpu(data):
    """Run the cached ``lower``/``upper`` OpenCL kernels over ``data``.

    Args:
        data: 2-D array-like; converted to a C-ordered ndarray of shape
            ``(N, D)``.

    Returns:
        An ``(N, N)`` array with ``data``'s dtype holding the contents of
        the destination buffer after both kernels have run.
    """
    data = np.asarray(data, order='C')
    N, _ = data.shape  # also rejects inputs that are not 2-D

    # Kernels are built once per (shape, dtype) and memoised in ``_cache``.
    cache_key = (data.shape, data.dtype)
    try:
        lower, upper = _cache[cache_key]
    except KeyError:
        lower, upper = pairwise_pyopencl_cpu_prepare(data.shape, data.dtype)
        _cache[cache_key] = lower, upper

    data_buf = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=data)
    dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, N * N * data.dtype.itemsize)

    # Prefer mapping the destination buffer into host memory; when the
    # mapping call is rejected with TypeError, fall back to an explicit copy.
    try:
        rval, _ = cl.enqueue_map_buffer(queue, dest_buf, cl.map_flags.READ,
                offset=0, shape=(N, N), dtype=data.dtype)
        need_copy = False
    except TypeError:  # OSX's OCL needs this?
        rval = np.empty((N, N), dtype=data.dtype)
        need_copy = True

    lower(queue, (N, 1), (1, 1), data_buf, dest_buf)
    upper(queue, (4, 4), (1, 1), data_buf, dest_buf)

    if need_copy:
        cl.enqueue_copy(queue, rval, dest_buf)
    else:
        queue.finish()

    if PROFILING:
        # NOTE(review): ``ev`` is not assigned anywhere in this function, so
        # it has to come from module scope -- confirm which event is timed.
        comptimes.append(1e-9 * (ev.profile.end - ev.profile.start))
        print('computation time', min(comptimes))
    return rval
Ejemplo n.º 18
0
def test_bounds_resized(cl_env, program, coord_dtype, size, old_shape,
                        new_shape):
    """Check a ``Bounds`` reducer that is resized after construction.

    The reducer is built with ``old_shape``, resized to ``new_shape`` and
    then used to reduce random values; the mapped output must equal the
    per-column minimum and maximum computed with NumPy.
    """
    ctx, cq = cl_env

    reducer = Bounds(ctx, *old_shape, coord_dtype, program=program)
    reducer.resize(*new_shape)

    # 3-component coordinates are laid out as 4-wide rows on the host side;
    # the extra column is dropped again before comparing.
    is_vec3 = coord_dtype.shape == (3, )
    value_dtype = dtype((coord_dtype.base, 4)) if is_vec3 else coord_dtype

    host_shape = (size, ) + value_dtype.shape
    values = np.random.normal(size=host_shape).astype(value_dtype.base)

    mem = cl.mem_flags
    values_buf = cl.Buffer(
        ctx,
        mem.HOST_READ_ONLY | mem.READ_ONLY | mem.COPY_HOST_PTR,
        hostbuf=values,
    )
    out_buf = cl.Buffer(
        ctx,
        mem.HOST_READ_ONLY | mem.WRITE_ONLY,
        2 * dtype_sizeof(coord_dtype),
    )

    calc_reduce = reducer.reduce(cq, len(values), values_buf, out_buf)
    actual, _ = cl.enqueue_map_buffer(
        cq, out_buf, cl.map_flags.READ,
        0, (2, ) + value_dtype.shape, value_dtype.base,
        wait_for=[calc_reduce], is_blocking=True,
    )

    expected = np.stack([values.min(axis=0), values.max(axis=0)])
    if is_vec3:
        actual = actual[..., :3]
        expected = expected[..., :3]
    np.testing.assert_equal(actual, expected)
Ejemplo n.º 19
0
def test_reducer(cl_env, reduce_program, size, ngroups, group_size, rounds,
                 benchmark):
    """Benchmark the ``Bounds`` reduction and verify its min/max output.

    Only the first three columns of the 4-wide float32 rows are compared.
    """
    ctx, cq = cl_env
    reducer = Bounds(ctx, ngroups, group_size, program=reduce_program)

    values = np.random.uniform(0.0, 1.0, size=(size, 4)).astype('float32')
    expected = np.array([np.min(values, axis=0), np.max(values, axis=0)])

    mem = cl.mem_flags
    values_buf = cl.Buffer(ctx, mem.READ_ONLY | mem.COPY_HOST_PTR,
                           hostbuf=values)
    output_buf = cl.Buffer(ctx, mem.WRITE_ONLY, expected.nbytes)

    bench_args = (cq, reducer, size, values_buf, output_buf)
    benchmark.pedantic(reduce, bench_args, rounds=rounds, warmup_rounds=10)

    output_map, _ = cl.enqueue_map_buffer(
        cq, output_buf, cl.map_flags.READ,
        0, expected.shape, expected.dtype,
        wait_for=[], is_blocking=True,
    )
    np.testing.assert_equal(output_map[..., :3], expected[..., :3])
Ejemplo n.º 20
0
def test_scanner(cl_env, scan_program, size, group_size, rounds, benchmark):
    """Benchmark ``PrefixScanner`` and verify it yields an exclusive scan.

    After the run the buffer must start with 0 and otherwise hold the
    cumulative sum of the input shifted right by one element.
    """
    ctx, cq = cl_env
    scanner = PrefixScanner(ctx, size, group_size, program=scan_program)

    values = np.random.randint(0, 128, size=size, dtype='uint32')
    expected = np.cumsum(values)

    values_buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, values.nbytes)

    # The setup callable runs before each round (it receives the queue, the
    # buffer and the host values).
    upload = partial(prefix_sum_setup, cq, values_buf, values)
    benchmark.pedantic(prefix_sum, (cq, scanner, values_buf),
                       setup=upload,
                       rounds=rounds,
                       warmup_rounds=10)

    values_map, _ = cl.enqueue_map_buffer(
        cq, values_buf, cl.map_flags.READ,
        0, values.shape, values.dtype,
        wait_for=[], is_blocking=True,
    )
    assert values_map[0] == 0
    np.testing.assert_equal(values_map[1:], expected[:-1])
Ejemplo n.º 21
0
def test_scatter(cl_env, value_dtype, index_dtype):
    """Scatter 30 random values into a 240-slot buffer pre-filled with 1.0.

    Slots named by ``indices`` must receive the matching value; every other
    slot must still hold the fill value.
    """
    ctx, cq = cl_env

    size = 240
    nindices = 30
    indexer = Indexer(ctx, value_dtype, index_dtype)

    raw_values = np.random.uniform(0, 1000, (nindices,) + value_dtype.shape)
    values = raw_values.astype(value_dtype.base)
    # replace=False guarantees that no slot is written twice
    indices = np.random.choice(size, size=nindices, replace=False).astype(index_dtype)

    mem = cl.mem_flags
    host_input = mem.READ_ONLY | mem.COPY_HOST_PTR
    out_nbytes = size * value_dtype.itemsize

    values_buf = cl.Buffer(ctx, host_input, hostbuf=values)
    values_out_buf = cl.Buffer(ctx, mem.WRITE_ONLY, out_nbytes)
    index_buf = cl.Buffer(ctx, host_input, hostbuf=indices)

    fill_done = cl.enqueue_fill_buffer(
        cq, values_out_buf, np.full(1, 1.0, value_dtype), 0, out_nbytes
    )
    scatter_done = indexer.scatter(
        cq, nindices, values_buf, index_buf, values_out_buf, wait_for=[fill_done]
    )
    values_map, _ = cl.enqueue_map_buffer(
        cq, values_out_buf, cl.map_flags.READ,
        0, (size,) + value_dtype.shape, value_dtype.base,
        wait_for=[scatter_done], is_blocking=True
    )

    touched = np.zeros(size, dtype='bool')
    touched[indices] = True
    np.testing.assert_equal(values_map[indices], values)
    np.testing.assert_equal(values_map[~touched], 1.0)
Ejemplo n.º 22
0
def test_gather(cl_env, value_dtype, index_dtype):
    """Gather 30 entries out of 240 random values and compare with NumPy
    fancy indexing on the host copy."""
    ctx, cq = cl_env

    size = 240
    nindices = 30
    indexer = Indexer(ctx, value_dtype, index_dtype)

    raw_values = np.random.uniform(0, 1000, (size,) + value_dtype.shape)
    values = raw_values.astype(value_dtype.base)
    indices = np.random.choice(size, size=nindices, replace=False).astype(index_dtype)

    mem = cl.mem_flags
    host_input = mem.READ_ONLY | mem.COPY_HOST_PTR

    values_buf = cl.Buffer(ctx, host_input, hostbuf=values)
    values_out_buf = cl.Buffer(ctx, mem.WRITE_ONLY, nindices * value_dtype.itemsize)
    index_buf = cl.Buffer(ctx, host_input, hostbuf=indices)

    gather_done = indexer.gather(cq, nindices, values_buf, index_buf, values_out_buf)
    values_map, _ = cl.enqueue_map_buffer(
        cq, values_out_buf, cl.map_flags.READ,
        0, (nindices,) + value_dtype.shape, value_dtype.base,
        wait_for=[gather_done], is_blocking=True
    )
    np.testing.assert_equal(values_map, values[indices])
Ejemplo n.º 23
0
def test_fill_internal(cl_env, kernels):
    """Run the ``fillInternal`` kernel for 8 ids and inspect the last ``n``
    nodes of the ``2 * n - 1`` node buffer.

    Those nodes must carry the ids in ``data[:, 0]`` and ``0..n-1`` in
    ``right_edge``.
    """
    ctx, cq = cl_env

    n = 8
    ids = np.random.permutation(n).astype('uint32')

    mem = cl.mem_flags
    nodes_buf = cl.Buffer(ctx, mem.READ_WRITE, (2 * n - 1) * Node.itemsize)
    ids_buf = cl.Buffer(ctx, mem.READ_ONLY | mem.COPY_HOST_PTR, hostbuf=ids)

    global_size = (roundUp(n, 32),)
    fill_internal = kernels['fillInternal'](
        cq, global_size, None,
        nodes_buf, ids_buf, n,
    )

    # Map only the trailing n nodes: skip the first n - 1 entries.
    byte_offset = (n - 1) * Node.itemsize
    nodes_map, _ = cl.enqueue_map_buffer(
        cq, nodes_buf, cl.map_flags.READ,
        byte_offset, n, Node,
        wait_for=[fill_internal], is_blocking=True
    )
    nodes_map.dtype = Node

    np.testing.assert_equal(nodes_map['data'][:, 0], ids)
    np.testing.assert_equal(nodes_map['right_edge'], np.arange(n))
Ejemplo n.º 24
0
def test_radix_sort(cl_env, radix_program, scan_program, key_dtype, size, gen,
                    group_size, rounds, benchmark):
    """Benchmark ``RadixSorter`` on generated keys and check the output
    buffer against ``np.sort``. The round count is halved for uint64 keys."""
    ctx, cq = cl_env
    sorter = RadixSorter(
        ctx, size, group_size,
        key_dtype=key_dtype,
        program=radix_program,
        scan_program=scan_program,
    )

    keys = gen(size, dtype=key_dtype)
    expected = np.sort(keys)

    keys_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, keys.nbytes)
    out_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, keys.nbytes)

    n_rounds = rounds // 2 if key_dtype == np.dtype('uint64') else rounds
    # The setup callable is handed the key buffer and host keys for each round.
    upload = partial(radix_sort_setup, cq, [keys_buf], [keys])
    benchmark.pedantic(radix_sort, (cq, sorter, keys_buf, out_buf),
                       setup=upload,
                       rounds=n_rounds,
                       warmup_rounds=10)

    out_map, _ = cl.enqueue_map_buffer(
        cq, out_buf, cl.map_flags.READ,
        0, keys.shape, keys.dtype,
        wait_for=[], is_blocking=True,
    )
    np.testing.assert_equal(out_map, expected)
Ejemplo n.º 25
0
 def _read_buffer(self):
     """Refresh ``self._value`` from the device buffer.

     Only acts when the address space is ``'global'``: the buffer is mapped
     for reading (blocking), the queue is drained, and the mapped data is
     copied into a fresh ndarray stored on ``self._value``.
     """
     if self._addspc == 'global':
         clqueue = self._ctrl.clqueue
         mapped, _ = cl.enqueue_map_buffer(
             clqueue,
             self._buffer,
             cl.map_flags.READ,
             offset=0,
             shape=self._value.shape,
             dtype=self._value.dtype,
             order="C",
             strides=None,
             wait_for=None,
             is_blocking=True,
         )
         self._ctrl.clqueue.finish()
         # np.array() copies, so the result does not alias the mapping.
         self._value = np.array(mapped)
         del mapped
Ejemplo n.º 26
0
def test_offset_missing(cl_env, offset_dtype, value_dtype, offset_program):
    """Check ``OffsetFinder.find_offsets`` on input made only of 1s and 3s.

    The input holds seven 1s followed by six 3s and the expected offsets
    are ``[0, 0, 7, 7, 13, 13, 13]``.
    """
    ctx, cq = cl_env
    finder = OffsetFinder(ctx, value_dtype, offset_dtype, offset_program)

    values = np.array([1] * 7 + [3] * 6, dtype=value_dtype)
    expected = np.array([0, 0, 7, 7, 13, 13, 13], dtype=offset_dtype)
    n_offsets = len(expected)

    mem = cl.mem_flags
    values_buf = cl.Buffer(
        ctx,
        mem.READ_ONLY | mem.HOST_NO_ACCESS | mem.COPY_HOST_PTR,
        hostbuf=values,
    )
    offset_buf = cl.Buffer(
        ctx,
        mem.WRITE_ONLY | mem.HOST_READ_ONLY,
        n_offsets * offset_dtype.itemsize,
    )

    found = finder.find_offsets(cq, values_buf, len(values), offset_buf, 7)
    offset_map, _ = cl.enqueue_map_buffer(
        cq, offset_buf, cl.map_flags.READ,
        0, n_offsets, offset_dtype,
        wait_for=[found], is_blocking=True,
    )

    np.testing.assert_equal(offset_map, expected)
Ejemplo n.º 27
0
def test_sorter(cl_env, sort_program, scan_program, key_dtype, size,
                group_size):
    """Sort random keys below 500 with ``RadixSorter`` and compare the
    mapped output with ``np.sort``."""
    ctx, cq = cl_env
    sorter = RadixSorter(
        ctx, size, group_size,
        key_dtype=key_dtype,
        program=sort_program,
        scan_program=scan_program,
    )
    data = np.random.randint(500, size=size, dtype=key_dtype)

    mem = cl.mem_flags
    data_buf = cl.Buffer(ctx, mem.READ_ONLY | mem.COPY_HOST_PTR, hostbuf=data)
    out_buf = cl.Buffer(ctx, mem.WRITE_ONLY, data.nbytes)

    calc_sort = sorter.sort(cq, data_buf, out_buf)

    # Blocking map that waits for the sort event before exposing the result.
    out_map, _ = cl.enqueue_map_buffer(
        cq, out_buf, cl.map_flags.READ,
        0, data.shape, data.dtype,
        wait_for=[calc_sort], is_blocking=True,
    )
    np.testing.assert_equal(out_map, np.sort(data))
Ejemplo n.º 28
0
    def _enqueue_readout(self,
                         buffer,
                         host_array,
                         range_start,
                         range_end,
                         wait_for=None):
        """Enqueue a non-blocking read mapping for part of a device buffer.

        The mapped region starts ``range_start`` rows into the buffer (byte
        offset derived from the host array's first-dimension stride) and
        spans ``range_end - range_start`` rows.

        Args:
            buffer: the buffer on the device
            host_array (ndarray): the host side array of the given buffer
            range_start (int): first index (in the first dimension) to read out
            range_end (int): end index (in the first dimension) of the readout
            wait_for (list of event): the list of events to wait for

        Returns:
            event; the event of the readout
        """
        row_count = range_end - range_start
        byte_offset = range_start * host_array.strides[0]
        region_shape = (row_count, ) + host_array.shape[1:]

        # Only the event is of interest here; the mapped array is discarded.
        _, readout_event = cl.enqueue_map_buffer(
            self._cl_run_context.queue,
            buffer,
            cl.map_flags.READ,
            byte_offset,
            region_shape,
            host_array.dtype,
            order="C",
            wait_for=wait_for,
            is_blocking=False,
        )
        return readout_event
Ejemplo n.º 29
0
    def _init_zero_copy_memory(self):
        """Allocate the zero-copy buffers and return the bytes accounted for.

        For the starting points and for the global directions this creates a
        host-allocated (``ALLOC_HOST_PTR``) buffer, a write-only device
        buffer of the same size, and a byte-typed host mapping of the
        host-allocated buffer. The size of the max-possible-score storage is
        added to the total although no buffer is created for it here.

        Returns:
            Twice the summed byte sizes of the three regions.
        """
        self.logger.debug('Initializing NVIDIA zero-copy memory.')
        mem = cl.mem_flags

        # Starting points host memory allocation and device copy
        nbytes = (self.size_of_startingpoint *
                  self.maximum_number_starting_points *
                  self.number_of_sequences * self.number_targets)
        self.pinned_starting_points_zero_copy = cl.Buffer(
            self.ctx, mem.ALLOC_HOST_PTR, size=nbytes)
        self.d_starting_points_zero_copy = cl.Buffer(
            self.ctx, mem.WRITE_ONLY, size=nbytes)
        self.h_starting_points_zero_copy = cl.enqueue_map_buffer(
            self.queue,
            self.pinned_starting_points_zero_copy,
            cl.map_flags.READ,
            0, (nbytes, 1),
            dtype=numpy.byte)[0]
        total = nbytes

        # Global directions host memory allocation and device copy
        nbytes = (self.length_of_x_sequences * self.number_of_sequences *
                  self.length_of_y_sequences * self.number_targets)
        self.pinned_global_direction_zero_copy = cl.Buffer(
            self.ctx, mem.ALLOC_HOST_PTR, size=nbytes)
        self.d_global_direction_zero_copy = cl.Buffer(
            self.ctx, mem.WRITE_ONLY, size=nbytes)
        self.h_global_direction_zero_copy = cl.enqueue_map_buffer(
            self.queue,
            self.pinned_global_direction_zero_copy,
            cl.map_flags.READ,
            0, (nbytes, 1),
            dtype=numpy.byte)[0]
        total += nbytes

        # Maximum zero copy memory: the allocation of the pinned/device/host
        # max-possible-score buffers is disabled, only its size is counted.
        # NOTE(review): this uses ``number_of_targets`` while the sizes above
        # use ``number_targets`` -- confirm both attributes exist.
        nbytes = (self.number_of_sequences * self.number_of_targets *
                  SmithWaterman.float_size)
        total += nbytes

        # Zero copy buffers are allocated twice in NVIDIA
        return 2 * total
Ejemplo n.º 30
0
def use_naive_kernel(ctx, queue, dev, A, B):
    """Multiply ``A`` and ``B`` with the ``naiveMatMul`` OpenCL kernel.

    Both operands are padded, flattened to float32 and placed in
    host-pointer buffers that are mapped back into host arrays. The kernel
    is launched with one work-item per padded output element, rounded up to
    a multiple of the device's maximum work-group size.

    Returns:
        The flattened result buffer reshaped to the padded output shape and
        cropped to ``(A.shape[0], B.shape[1])``.
    """
    padded_a, a_shape = pad(A.copy())
    padded_b, _b_shape = pad(B.copy())

    out_shape = (A.shape[0], B.shape[1])
    padded_out_shape = (padded_a.shape[0], padded_b.shape[1])
    padded_out = np.zeros(padded_out_shape, dtype=np.float32)

    a_flat = np.array(padded_a.flatten(), dtype=np.float32)
    b_flat = np.array(padded_b.flatten(), dtype=np.float32)
    c_flat = np.array(padded_out.flatten(), dtype=np.float32)

    max_wg_size = dev.get_info(cl.device_info.MAX_WORK_GROUP_SIZE)
    kernel_source = naive_kernel()

    mem = cl.mem_flags
    buf_flags = mem.READ_WRITE | mem.COPY_HOST_PTR | mem.ALLOC_HOST_PTR
    a_buf = cl.Buffer(ctx, buf_flags, hostbuf=a_flat)
    b_buf = cl.Buffer(ctx, buf_flags, hostbuf=b_flat)
    c_buf = cl.Buffer(ctx, buf_flags, hostbuf=c_flat)

    a_map, _ = cl.enqueue_map_buffer(
        queue, a_buf, cl.map_flags.READ, 0, a_flat.shape, a_flat.dtype, "C")
    b_map, _ = cl.enqueue_map_buffer(
        queue, b_buf, cl.map_flags.READ, 0, b_flat.shape, b_flat.dtype, "C")
    c_map, _ = cl.enqueue_map_buffer(
        queue, c_buf, cl.map_flags.WRITE, 0, c_flat.shape, c_flat.dtype, "C")

    global_size = (round_up(c_flat.shape[0], max_wg_size),)
    local_size = None  # let the OpenCL runtime pick the work-group size

    print("Local Size: ", local_size)
    print("Global Size: ", global_size)

    prg = cl.Program(ctx, kernel_source).build()

    # NOTE(review): the kernel receives the ``.data`` of the mapped host
    # arrays rather than the cl.Buffer objects -- confirm this is intended.
    event = prg.naiveMatMul(
        queue,
        global_size,
        local_size,
        a_map.data,
        b_map.data,
        c_map.data,
        np.int32(a_shape[1]),
        np.int32(padded_out.shape[1]),
        np.int32(out_shape[0]),  # row boundary
        np.int32(out_shape[1]),  # col boundary
    )
    event.wait()
    # NOTE(review): the source here is the mapped host array, not c_buf --
    # verify that enqueue_copy accepts this pairing.
    cl.enqueue_copy(queue, c_flat, c_map)
    return c_flat.reshape(padded_out_shape)[: out_shape[0], : out_shape[1]]
Ejemplo n.º 31
0
def kernel_test():
    """Run ``array1d_add`` 100 times; after each launch map ``buffer_a``
    (write) and ``buffer_c`` (read) and copy ``buffer_c`` into ``buffer_a``
    element by element on the host."""
    for _ in range(100):
        program.array1d_add(queue, (N, ), None, buffer_a, buffer_b, buffer_c)
        array_a, event_a = cl.enqueue_map_buffer(
            queue, buffer_a, cl.map_flags.WRITE, 0,
            shape=N, dtype=np.float32)
        array_c, event_c = cl.enqueue_map_buffer(
            queue, buffer_c, cl.map_flags.READ, 0,
            shape=N, dtype=np.float32)
        # Entering the mappings' context managers releases both mappings
        # when the block exits.
        with array_a.base, array_c.base:
            for idx in range(N):
                array_a[idx] = array_c[idx]
Ejemplo n.º 32
0
def test_problem_codes(cl_env, kernels, coord_dtype):
    """Regression test: build a BVH from a fixed list of 32-bit codes that
    contains duplicates and check the parent links.

    The parents of all nodes except node 0 must cover exactly the internal
    node indices ``0 .. len(codes) - 2``.
    """
    from .test_collision_py import find_collisions
    ctx, cq = cl_env

    codes = np.array([
        0b00000000000000000000000000000000,
        0b00000000000000000000000000000000,
        0b00000110110000110100000100000010,
        0b00001001001001001001001001001001,
        0b00001001001001001001001001001001,
        0b00010010010010010010010010010010,
        0b00010010010010010010010010010010,
        0b00010010011010010010011011011010,
        0b00011001001011001001011001001011,
        0b00011011011011011011011011011011,
        0b00100100010000100010110100010110,
        0b00100100100100100100100100100100,
        0b00100100100101101101100101100100,
        0b00101001101001101101101101101001,
        0b00101101101101101101101101101101,
        0b00110110110110110110110110110110,  # This node had no parent
        0b00110110110110110110110110110110,
        0b00110110110110110110110110110110,
        0b00111111111111111111111111111111,
        0b00111111111111111111111111111111,
        0b00111111111111111111111111111111,
    ], dtype='uint32')
    n_leaves = len(codes)
    ids = np.arange(n_leaves, dtype='uint32')
    n_nodes = 2 * n_leaves - 1

    mem = cl.mem_flags
    codes_buf = cl.Buffer(ctx, mem.READ_WRITE | mem.COPY_HOST_PTR, hostbuf=codes)
    ids_buf = cl.Buffer(ctx, mem.READ_ONLY | mem.COPY_HOST_PTR, hostbuf=ids)
    nodes_buf = cl.Buffer(ctx, mem.READ_WRITE, n_nodes * Node.itemsize)

    fill_internal = kernels['fillInternal'](
        cq, (roundUp(n_leaves, 32),), None,
        nodes_buf, ids_buf, n_leaves,
    )

    # generateBVH is chained after fillInternal through wait_for.
    generate_bvh = kernels['generateBVH'](
        cq, (roundUp(n_leaves - 1, 32),), None,
        codes_buf, nodes_buf, n_leaves,
        wait_for=[fill_internal]
    )

    nodes_map, _ = cl.enqueue_map_buffer(
        cq, nodes_buf, cl.map_flags.READ,
        0, n_nodes, Node,
        wait_for=[generate_bvh], is_blocking=True
    )
    nodes_map.dtype = Node

    internal_indices = set(range(n_leaves - 1))
    assert set(nodes_map['parent'][1:]) == internal_indices
Ejemplo n.º 33
0
 def map_write(self) -> np.ndarray:
     """Return the host array of the buffer mapping, creating the mapping
     with ``WRITE_INVALIDATE_REGION`` when none exists yet.

     Every call increments ``self.map_count``; an existing mapping is
     reused whatever flags it was created with.
     """
     if self.mapping is None:
         write_flags = cl.map_flags.WRITE_INVALIDATE_REGION
         self.mapping = cl.enqueue_map_buffer(
             self.queue, self.buffer, write_flags, 0, self.shape, self.dtype)
     self.map_count += 1
     # The mapping is stored as returned by enqueue_map_buffer; element 0
     # is the array.
     host_array = self.mapping[0]
     return host_array
Ejemplo n.º 34
0
 def map_read(self) -> np.ndarray:
     """Return the host array of the buffer mapping, creating the mapping
     with ``READ`` access when none exists yet.

     Every call increments ``self.map_count``; an existing mapping is
     reused whatever flags it was created with.
     """
     if self.mapping is None:
         read_flags = cl.map_flags.READ
         self.mapping = cl.enqueue_map_buffer(
             self.queue, self.buffer, read_flags, 0, self.shape, self.dtype)
     self.map_count += 1
     # The mapping is stored as returned by enqueue_map_buffer; element 0
     # is the array.
     host_array = self.mapping[0]
     return host_array
Ejemplo n.º 35
0
    def _get_direction_byte_array(self):
        """Map the device-side global direction buffer for reading as a
        6-dimensional byte array, store it on
        ``self.h_global_direction_zero_copy`` and return it."""
        direction_shape = (
            self.number_of_sequences,
            self.number_targets,
            self.x_div_shared_x,
            self.y_div_shared_y,
            self.shared_x,
            self.shared_y,
        )
        mapped, _ = cl.enqueue_map_buffer(
            self.queue,
            self.d_global_direction_zero_copy,
            cl.map_flags.READ,
            0,
            direction_shape,
            dtype=numpy.byte,
        )
        self.h_global_direction_zero_copy = mapped
        return self.h_global_direction_zero_copy
Ejemplo n.º 36
0
 def get_mem_map(self):
     """Return the host copy of this variable's data.

     For the ``'__global'`` address space the device buffer is mapped for
     reading (blocking), copied into ``self._array`` and the queue is
     drained. ``'__local'`` variables yield ``None``; any other address
     space returns ``self._array`` unchanged.
     """
     if self._addspc == '__global':
         mapped, _ = cl.enqueue_map_buffer(
             self._solverobj.clqueue,
             self._buffer,
             cl.map_flags.READ,
             offset=0,
             shape=self._array.shape,
             dtype=self._array.dtype,
             order="C",
             strides=None,
             wait_for=None,
             is_blocking=True,
         )
         # np.array() copies, so the result does not alias the mapping.
         self._array = np.array(mapped)
         del mapped
         self._solverobj.clqueue.finish()

     # local variables have nothing to hand back
     if self._addspc == '__local':
         return None
     return self._array
Ejemplo n.º 37
0
    def map(self, map_flags, offset=None, shape=None, wait_for=None):
        """Map the buffer data as a numpy array and yield it.

        Written as a generator, to be driven as a context manager: the
        mapping is created with a blocking call and the array is yielded
        from inside the mapping's own ``with`` block.

        `offset` defaults to 0 and `shape` to ``self.shape``.
        `wait_for` can be either None or list of opencl.Event.
        """
        start = 0 if offset is None else offset
        region_shape = self.shape if shape is None else shape

        array, _event = pyopencl.enqueue_map_buffer(
            self.queue,
            self,
            map_flags,
            start,
            region_shape,
            self.dtype,
            wait_for=wait_for,
            is_blocking=True,
        )
        with array.base:
            yield array
Ejemplo n.º 38
0
    # Enqueue command to copy from buffers to host memory
    # Store data transfer event (return value)
    prof_event = cl.enqueue_copy(queue, dest=c, src=c_buff, is_blocking=True)

    read_time += prof_event.profile.end - prof_event.profile.start


# Execute the kernel repeatedly using enqueue_map_buffer and accumulate the
# time each mapping command took according to its profiling counters.
map_time = 0.0
for i in range(NUM_ITERATIONS):
    # __call__(queue, global_size, local_size, *args, global_offset=None, wait_for=None, g_times_l=False)
    # Store kernel execution event (return value)
    kernel_event = kernel(queue, global_size, local_size, c_buff, np.int32(NUM_VECTORS))

    # Enqueue command to map the result buffer into host memory
    (result_array, prof_event) = cl.enqueue_map_buffer(queue,
                                                       buf=c_buff,
                                                       flags=cl.map_flags.READ,
                                                       offset=0,
                                                       shape=(NUM_VECTORS,),
                                                       dtype=cl.array.vec.char16)

    map_time += prof_event.profile.end - prof_event.profile.start

    # Release the mapping explicitly so each iteration starts from an
    # unmapped buffer.
    result_array.base.release(queue)

# Print averaged results.
# OpenCL profiling counters are expressed in nanoseconds, so the per-iteration
# average has to be divided by 1e6 (not 1e3) to be reported in milliseconds.
print('Average read time (ms): {}'.format(read_time / (NUM_ITERATIONS * 1e6)))
print('Average map time (ms): {}'.format(map_time / (NUM_ITERATIONS * 1e6)))
Ejemplo n.º 39
0
# Create two read/write device buffers, each initialised with a copy of the
# corresponding host array (COPY_HOST_PTR).
flags = cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR
buffer_one = cl.Buffer(context, flags, hostbuf=data_one)
buffer_two = cl.Buffer(context, flags, hostbuf=data_two)

# Set buffers as arguments to the kernel, one positional slot at a time.
# The arguments can also be specified by calling kernel(....) directly instead
kernel = prog.blank  # Note: Every call like this produces a new object
kernel.set_arg(0, buffer_one)
kernel.set_arg(1, buffer_two)

# Enqueue kernel (with arguments): one work-item per element of data_one,
# work-group size left to the runtime (None).
n_globals = data_one.shape
n_locals = None
cl.enqueue_nd_range_kernel(queue, kernel, n_globals, n_locals)

# Enqueue command to copy from buffer one to buffer two (device to device)
cl.enqueue_copy(queue, dest=buffer_two, src=buffer_one)

# Enqueue command to map from buffer two to host memory
# Signature for reference:
# enqueue_map_buffer(queue, buf, flags, offset, shape, dtype, order="C", strides=None, wait_for=None, is_blocking=True)
# NOTE(review): shape=(100,) / float32 presumably matches data_two -- confirm.
(result_array, _) = cl.enqueue_map_buffer(
    queue, buf=buffer_two, flags=cl.map_flags.READ, offset=0, shape=(100,), dtype=np.float32
)

# Show the original host array next to the contents mapped back from the
# device.
print("\nSource array:")
print(data_two)

print("\nAfter copy back:")
print(result_array)
Ejemplo n.º 40
0
#!/usr/bin/env python

import pyopencl as cl
import numpy as np
import numpy.linalg as la


# Demo: upload a Fortran-ordered array and read it back twice, once through
# an explicit copy and once through a mapping, printing the difference norm
# and the strides of both arrays each time.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Alternative inputs to try (C order / flat):
#ary = np.arange(3*4*5).reshape(3,4,5, order="C").astype(np.float32)
ary = np.arange(3*4*5).reshape(3,4,5, order="F").astype(np.float32)
#ary = np.arange(3*4*5).astype(np.float32)

mf = cl.mem_flags
flags = mf.READ_WRITE | mf.COPY_HOST_PTR | mf.ALLOC_HOST_PTR
buf = cl.Buffer(ctx, flags, hostbuf = ary)
queue.finish()

# Read back with an explicit device-to-host copy. enqueue_copy replaces the
# deprecated enqueue_read_buffer and is blocking by default.
ar2 = np.empty_like(ary)
cl.enqueue_copy(queue, ar2, buf)
print(la.norm(ary-ar2), ary.strides, ar2.strides)

# Read back through a mapping; the order argument must match the layout the
# data was uploaded with.
#ar3, evt = cl.enqueue_map_buffer(queue, buf, cl.map_flags.READ, 0, ary.shape, ary.dtype, "C")
ar3, evt = cl.enqueue_map_buffer(queue, buf, cl.map_flags.READ, 0, ary.shape, ary.dtype, "F")
print(la.norm(ary-ar3), ary.strides, ar3.strides)