def filter_primes(bit_array, offset):
            """Run the ``pfilter`` OpenCL kernel and return the resulting host array.

            ``self`` is not a parameter here; it is read from the enclosing
            scope.  ``bit_array`` is accepted but never used in this body.

            :param bit_array: unused in this function.
            :param offset: value converted to a ``numpy.uint32`` array and
                handed to the kernel as its third buffer.
            :return: ``empty_bitarray()`` unchanged when ``self.primes`` is
                empty, otherwise the array read back from the device.
            """

            if (not len(self.primes)):
                return empty_bitarray()

            # Host-side copies of the kernel arguments.
            b = numpy.array(self.primes, dtype=numpy.uint32)
            a = empty_bitarray()
            c = numpy.array(offset, dtype=numpy.uint32)

            # Device buffers initialised from the host arrays above.
            b_buf = cl.Buffer(self.ctx,
                              self.mf.READ_ONLY | self.mf.COPY_HOST_PTR,
                              hostbuf=b)
            a_buf = cl.Buffer(self.ctx,
                              self.mf.READ_WRITE | self.mf.COPY_HOST_PTR,
                              hostbuf=a)
            c_buf = cl.Buffer(self.ctx,
                              self.mf.READ_ONLY | self.mf.COPY_HOST_PTR,
                              hostbuf=c)

            # send integers and new bit mask to pfilter
            event2 = self.program.pfilter(self.queue, (self.block_size, ),
                                          None, b_buf, a_buf, c_buf)
            # NOTE(review): the read event is not waited on here; whether the
            # copy has finished on return depends on the blocking default of
            # the installed PyOpenCL version -- confirm.
            cl.enqueue_read_buffer(self.queue, a_buf, a)

            # Profiling timestamps are in nanoseconds; 1e-9 converts to seconds.
            print 'Filter Duration:', 1e-9 * (event2.profile.end -
                                              event2.profile.start)

            return a
Beispiel #2
0
def test_that_python_args_fail(ctx_factory):
    """Kernel calls must reject bare Python scalars and accept NumPy scalars."""
    ctx = ctx_factory()

    kernel_source = """
        __kernel void mult(__global float *a, float b, int c)
        { a[get_global_id(0)] *= (b+c); }
        """
    program = cl.Program(ctx, kernel_source).build()

    host_values = np.random.rand(50000)
    cmd_queue = cl.CommandQueue(ctx)
    flags = cl.mem_flags
    device_values = cl.Buffer(ctx, flags.READ_WRITE | flags.COPY_HOST_PTR,
                              hostbuf=host_values)

    # Path 1: an explicitly constructed Kernel object.
    mult_kernel = cl.Kernel(program, "mult")
    try:
        mult_kernel(cmd_queue, host_values.shape, None, device_values, 2, 3)
    except cl.LogicError:
        pass
    else:
        assert False, "PyOpenCL should not accept bare Python types as arguments"

    # Path 2: the kernel looked up as an attribute of the program.
    try:
        program.mult(cmd_queue, host_values.shape, None, device_values,
                     float(2), 3)
    except cl.LogicError:
        pass
    else:
        assert False, "PyOpenCL should not accept bare Python types as arguments"

    # Sized NumPy scalars are the accepted way to pass by-value arguments.
    program.mult(cmd_queue, host_values.shape, None, device_values,
                 np.float32(2), np.int32(3))

    read_back = np.empty_like(host_values)
    cl.enqueue_read_buffer(cmd_queue, device_values, read_back).wait()
Beispiel #3
0
def test_cl():
    ctx = cl.create_some_context()  # (interactive=False)

    # print 'ctx', ctx
    queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
    f = open("part1.cl", "r")
    fstr = "".join(f.readlines())
    program = cl.Program(ctx, fstr).build()
    mf = cl.mem_flags

    cameraPos = np.array([0, 6, -1, 0])
    invView = la.inv(look_at((0, 6, -1), (0, 1, 1), (0, 1, 0)))
    invProj = la.inv(perspective(60, 1, 1, 1000))
    print "view", invView
    print "proj", invProj
    viewParamsData = (
        cameraPos.flatten().tolist()
        + np.transpose(invView).flatten().tolist()
        + np.transpose(invProj).flatten().tolist()
    )
    # print 'vpd', viewParamsData
    viewParams = struct.pack("4f16f16f", *viewParamsData)
    viewParams_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=viewParams)
    num_pixels = 1000 * 1000
    # setup opencl
    dest = np.ndarray((1000, 1000, 4), dtype=np.float32)
    dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, dest.nbytes)
    local_shape = (8, 8)
    # run kernel
    evt = program.part1(queue, (dest.shape[0], dest.shape[1]), None, viewParams_buf, dest_buf)
    # evt = program.part1(queue, dest.shape, None, dest_buf)
    cl.enqueue_read_buffer(queue, dest_buf, dest).wait()
    print "time", (evt.profile.end - evt.profile.start) * 0.000001, "ms"
    return dest
Beispiel #4
0
    def search(self, midstate):
        """Run the ``sha512_fill`` and ``ksearch`` kernels for one midstate.

        :param midstate: input passed through ``flipendian32``; the result is
            sliced into eight 4-byte pieces used as kernel arguments 0-7.
        :return: ``str(output)`` where ``output`` is the ``OUTPUT_SIZE``-byte
            buffer read back from ``self.output_buf``.
        """
        msg = flipendian32(midstate)

        # Arguments 0..7 are consecutive 4-byte slices of the flipped midstate.
        for i in xrange(8):
            self.sha512_fill.set_arg(i, msg[i * 4:i * 4 + 4])
        self.sha512_fill.set_arg(8, self.hashes_buf)
        self.sha512_fill.set_arg(9, self.keyhash_buf)
        # t1 = time.time()
        cl.enqueue_nd_range_kernel(self.queue, self.sha512_fill,
                                   (HASHES_NUM, ), (self.sha512_fill_ws, ))
        self.queue.finish()
        # print "fill %f" % (time.time() - t1)

        # A fresh bytearray is all zeros, so this write clears the device-side
        # output buffer before the search kernel runs.
        output = bytearray(OUTPUT_SIZE)
        cl.enqueue_write_buffer(self.queue, self.output_buf, output)
        self.queue.finish()

        self.ksearch.set_arg(0, self.hashes_buf)
        self.ksearch.set_arg(1, self.keyhash_buf)
        self.ksearch.set_arg(2, self.output_buf)
        cl.enqueue_nd_range_kernel(self.queue, self.ksearch, (KEYS_NUM, ),
                                   (self.ksearch_ws, ))
        self.queue.finish()
        # Copy the kernel's result back into the same host bytearray.
        cl.enqueue_read_buffer(self.queue, self.output_buf, output)
        self.queue.finish()
        return str(output)
Beispiel #5
0
def test_gpu_aes():
	import pyopencl as cl
	import numpy

	# Prepare context and command queue
	ctx = cl.create_some_context(interactive=False)
	queue = cl.CommandQueue(ctx)

	print "Compiling kernel ..."
	with open_cl("ralink.cl", "r") as fp:
		code = fp.read() % { 'STARTTIME': 0, 'MACADDR1': 0, 'MACADDR2': 0,
				'NONCE1': 0, 'NONCE2': 0, 'NONCE3': 0, 'NONCE4': 0,
				'KEYSTREAM1': 0, 'KEYSTREAM2': 0,}
		program = cl.Program(ctx, code).build(options="-I %s" % get_opencl_path())

	# Prepare memory
	result = numpy.zeros(shape=(8), dtype=numpy.uint32)
	result[0] = 0xffffffff;
	result[1] = 0xffffffff;
	dest_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY| cl.mem_flags.COPY_HOST_PTR, hostbuf=result)

	# Run the program
	print "Running kernel ..."
	program.test_aes(queue, (1,), None, dest_buf)

	# Read the result
	cl.enqueue_read_buffer(queue, dest_buf, result).wait()
	print list2hex(result)
	assert result[0] == 0xD4E415CB
	assert result[1] == 0xD038A82B
	assert result[2] == 0x10A673DE
	assert result[3] == 0xEA25B206
Beispiel #6
0
def get_iterations(context, complex_values, iterations):
    """Run the ``mandelbrot`` kernel from ``kernel.cl`` over ``complex_values``.

    :param context: OpenCL context used for the queue, buffers and program.
    :param complex_values: host array copied to the device; its shape is also
        used as the global work size and as the shape of the result.
    :param iterations: currently unused -- the kernel call does not pass it.
    :return: array of ``clar.vec.ushort4`` with the shape of
        ``complex_values``, read back from the device.

    The module-level name ``gradient`` is read as the third kernel buffer.
    """
    command_queue = cl.CommandQueue(context)
    output_array = np.zeros(complex_values.shape, dtype=clar.vec.ushort4)

    flags = cl.mem_flags
    complex_values_buffer = cl.Buffer(context,
                                      flags.READ_ONLY | flags.COPY_HOST_PTR,
                                      hostbuf=complex_values)
    gradient_array_buffer = cl.Buffer(context,
                                      flags.READ_ONLY | flags.COPY_HOST_PTR,
                                      hostbuf=gradient)
    output_array_buffer = cl.Buffer(context, flags.WRITE_ONLY,
                                    output_array.nbytes)

    # Read the source inside a context manager so the file is closed even
    # when reading or the subsequent build raises.
    with open('kernel.cl', 'r') as kernel_file:
        kernel_source = kernel_file.read()
    program = cl.Program(context, kernel_source).build()

    program.mandelbrot(
        command_queue,
        complex_values.shape,
        None,  # local work size left for the runtime to choose
        complex_values_buffer,
        output_array_buffer,
        gradient_array_buffer,
    )

    cl.enqueue_read_buffer(command_queue, output_array_buffer,
                           output_array).wait()

    return output_array
Beispiel #7
0
	def getData(self):
		"""Run ``GetWorld`` on the buffer selected by ``tickState`` and read the result into ``self.a``."""
		# tickState == False selects a_buf; anything else selects b_buf.
		if self.tickState == False:
			world_buf = self.a_buf
		else:
			world_buf = self.b_buf
		self.kUtil.GetWorld(self.queue, self.a.shape, None, self.ar_ySize, world_buf, self.dest_buf)
		read_event = cl.enqueue_read_buffer(self.queue, self.dest_buf, self.a)
		read_event.wait()
Beispiel #8
0
    def getData(self, n, axis, data_D, data_H, name):
        """Read ``data_D`` from the device and plot one line of it.

        :param n: index of the row/column of ``data_H`` to extract.
        :param axis: ``'x'`` to plot ``data_H[:, n]`` against ``lbm.X``
            midpoints, ``'y'`` to plot ``data_H[n, :]`` against ``lbm.Y``
            midpoints.
        :param data_D: device buffer to read from.
        :param data_H: host array that receives the device data.
        :param name: label stored on the instance for the plot.
        :raises ValueError: if ``axis`` is neither ``'x'`` nor ``'y'``.
        """
        # Reject a bad axis up front: previously this fell through both
        # branches and failed later with an unbound-variable error, after the
        # device read had already been performed.
        if axis != 'x' and axis != 'y':
            raise ValueError("axis must be 'x' or 'y', got %r" % (axis,))

        cl.enqueue_read_buffer(lbm.queue, data_D, data_H).wait()

        # retrieve mid cell points from cell node data
        if axis == 'x':
            X = lbm.X
            y = data_H[:, n]
        else:
            X = lbm.Y
            y = data_H[n, :]

        N = X.size - 1
        x = np.zeros((N))

        # Midpoint between each pair of neighbouring node coordinates.
        for i in range(1, X.size):
            x[i - 1] = (X[i] - X[i - 1]) / 2.0 + X[i - 1]

        # Keep everything plotLine() needs on the instance.
        self.x = x
        self.y = y
        self.n = n
        self.axis = axis
        self.data_D = data_D
        self.data_H = data_H
        self.name = name

        self.plotLine()
Beispiel #9
0
    def getData(self, data_D, data_H, name):
        """
        Read ``data_D`` into ``data_H`` and draw it as a surface over the
        cell midpoints of ``lbm.X`` and ``lbm.Y``.
        """

        def cell_midpoints(nodes):
            # Midpoint of every pair of neighbouring node coordinates.
            mids = np.zeros((nodes.size - 1))
            for k in range(1, nodes.size):
                mids[k - 1] = (nodes[k] - nodes[k - 1]) / 2.0 + nodes[k - 1]
            return mids

        # The figure is created before the device read, as in the original.
        fig = mlab.figure(size=(512, 512))

        cl.enqueue_read_buffer(lbm.queue, data_D, data_H).wait()

        x = cell_midpoints(lbm.X)
        y = cell_midpoints(lbm.Y)

        surface = mlab.surf(x, y, data_H, warp_scale='auto', colormap="jet")
        mlab.axes(surface)
        sb = mlab.scalarbar(surface, title=name)

        self.s = surface
        self.data_D = data_D
        self.data_H = data_H
Beispiel #10
0
 def __call__(self, ctx, x1, y1, x2, y2, rx, ry, sw, sh, ez, ex, ey):
     """Evaluate the ``subtended_angle2_naive`` kernel for every ``x1``/``y1`` pair.

     :param ctx: passed to ``self.build``.
     :param x1: array-like, converted to float32; its shape is the global
         work size and the shape of the result.
     :param y1: array-like, converted to float32.
     :param x2, y2, rx, ry, sw, sh, ez, ex, ey: scalars, each converted to
         ``np.float32`` and passed to the kernel by value.
     :return: float32 array shaped like ``x1`` read back from the device.
     """
     self.build(ctx)
     # NOTE(review): the buffers below use self.ctx, presumably set by
     # build() -- confirm.
     # asarray only copies when a conversion is required, which is what the
     # former np.array(..., copy=False) meant; on NumPy 2 that spelling raises
     # when a copy is needed.
     x1 = np.asarray(x1, dtype=np.float32)
     y1 = np.asarray(y1, dtype=np.float32)
     x2 = np.float32(x2)
     y2 = np.float32(y2)
     ez = np.float32(ez)
     ex = np.float32(ex)
     ey = np.float32(ey)
     rx = np.float32(rx)
     ry = np.float32(ry)
     sw = np.float32(sw)
     sh = np.float32(sh)

     read_only = cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR
     buffers = []
     try:
         x1_buf = cl.Buffer(self.ctx, read_only, hostbuf=x1)
         buffers.append(x1_buf)
         y1_buf = cl.Buffer(self.ctx, read_only, hostbuf=y1)
         buffers.append(y1_buf)
         out_buf = cl.Buffer(self.ctx, cl.mem_flags.WRITE_ONLY, x1.nbytes)
         buffers.append(out_buf)

         queue = cl.CommandQueue(self.ctx)
         self.prg.subtended_angle2_naive(queue, x1.shape, None, x1_buf, y1_buf,
                                         x2, y2, rx, ry, sw, sh, ez, ex, ey,
                                         out_buf)
         out = np.empty_like(x1)
         cl.enqueue_read_buffer(queue, out_buf, out).wait()
     finally:
         # Release device memory even when the kernel launch or read fails.
         for buf in buffers:
             buf.release()
     return out
Beispiel #11
0
    def clFindRoute(self, key, candidateTableList):
        """Run the ``match`` kernel over candidate tables until one reports a hit.

        For each entry of ``candidateTableList`` the entry is assigned to
        ``key.prefixlen``, ``int(key.network)`` is passed to the kernel
        together with table ``elem - 1``, and the kernel's result is read
        back into ``self.pos``.  The loop stops at the first result that is
        not ``-1``.

        :param key: object whose ``prefixlen`` is assigned and whose
            ``network`` is converted to ``int``.
        :param candidateTableList: iterable of 1-based table indices.
        :return: ``[elem, self.pos]`` for the last table tried.  ``elem`` is
            ``None`` when ``candidateTableList`` is empty (``self.pos`` is
            then still ``-1``).
        """
        # Accumulated kernel time in profiling ticks; only consumed by
        # optional debug output, but kept so the profiling access is unchanged.
        timeKernelHash = 0
        # Defined up front so an empty candidate list does not leave it unbound.
        elem = None
        self.pos = np.array(-1, dtype=np.int32)

        # Reset the device-side result to "not found".
        cl.enqueue_write_buffer(self.queue, self.pos_buf, self.pos)

        for elem in candidateTableList:
            key.prefixlen = elem
            ip = int(key.network)

            event = self.program.match(self.queue,
                                       self.tableShape[elem - 1],
                                       None,
                                       self.table_buf[elem - 1],
                                       self.pos_buf,
                                       np.int32(ip))
            event.wait()
            # Wait for the copy so self.pos is valid before it is inspected.
            cl.enqueue_read_buffer(self.queue, self.pos_buf, self.pos).wait()
            timeKernelHash += event.profile.end - event.profile.start
            if self.pos != -1:
                break
        return [elem, self.pos]
Beispiel #12
0
    def compute(self, floatimage, histogram, k):
        """Run the ``IIF`` kernel on an image and its histogram.

        :param floatimage: image data, flattened to ``width * height``
            float32 values.
        :param histogram: 3-D array; its first two dimensions give ``width``
            and ``height``.
        :param k: scalar passed to the kernel as a float32.
        :return: ``(width, height)`` array of Python-float (float64) values
            read back from the device.
        """
        width, height, _ = np.shape(histogram)
        numpixels = width * height

        image_linear = np.reshape(floatimage, (numpixels, )).astype(np.float32)
        histogram_linear = np.reshape(
            histogram, (np.size(histogram), )).astype(np.float32)
        # zeros_like already yields float32 here; no further cast is needed.
        transform = np.zeros_like(image_linear)

        mf = cl.mem_flags
        self.buf_image = cl.Buffer(self.context,
                                   mf.READ_ONLY | mf.COPY_HOST_PTR,
                                   hostbuf=image_linear)
        self.buf_histogram = cl.Buffer(self.context,
                                       mf.READ_ONLY | mf.COPY_HOST_PTR,
                                       hostbuf=histogram_linear)
        self.output_buf = cl.Buffer(self.context, mf.READ_WRITE,
                                    transform.nbytes)

        kernel = self.program.IIF
        # Three by-value scalars followed by three buffer arguments.
        kernel.set_scalar_arg_dtypes([np.uintc, np.uintc, np.float32] +
                                     [None] * 3)
        kernel.set_arg(0, np.uintc(width))
        kernel.set_arg(1, np.uintc(height))
        kernel.set_arg(2, np.float32(k))
        kernel.set_arg(3, self.buf_image)
        kernel.set_arg(4, self.buf_histogram)
        kernel.set_arg(5, self.output_buf)

        # One work item per pixel.
        cl.enqueue_nd_range_kernel(self.queue, kernel, image_linear.shape,
                                   None).wait()

        cl.enqueue_read_buffer(self.queue, self.output_buf, transform).wait()
        # np.float was only an alias of the builtin float and has been removed
        # from NumPy; the builtin gives the same float64 result.
        return np.reshape(transform, (width, height)).astype(float)
Beispiel #13
0
    def compute(self, image, num_bins):
        """Run the ``iif_binid`` kernel and return a ``(width, height, num_bins)`` float32 array."""
        width, height = np.shape(image)
        numpixels = width * height

        # Flatten the image; the kernel gets one work item per pixel.
        image = np.reshape(image, (numpixels, )).astype(np.float32)
        result = np.zeros((numpixels * num_bins, ), dtype=np.float32)

        flags = cl.mem_flags
        self.buf_image = cl.Buffer(self.context,
                                   flags.READ_ONLY | flags.COPY_HOST_PTR,
                                   hostbuf=image)
        self.output_buf = cl.Buffer(self.context, flags.READ_WRITE,
                                    result.nbytes)

        kernel = self.program.iif_binid
        scalar_dtypes = [np.uintc, np.uintc, np.ubyte]
        kernel.set_scalar_arg_dtypes(scalar_dtypes + [None] * 2)

        # Positional kernel arguments: three scalars, then the two buffers.
        kernel_args = (
            np.uintc(width),
            np.uintc(height),
            np.ubyte(num_bins),
            self.buf_image,
            self.output_buf,
        )
        for position, value in enumerate(kernel_args):
            kernel.set_arg(position, value)

        launch = cl.enqueue_nd_range_kernel(self.queue, kernel, image.shape,
                                            None)
        launch.wait()

        cl.enqueue_read_buffer(self.queue, self.output_buf, result).wait()
        shaped = np.reshape(result, (width, height, num_bins))
        return shaped.astype(np.float32)
Beispiel #14
0
    def FuseRGBD_GPU(self, Image, boneDQ, jointDQ):
        """
        Update the TSDF volume with Image.

        Uploads the depth image and both dual-quaternion arrays, runs the
        ``FuseTSDF`` kernel, then reads the TSDF and weight buffers back into
        ``self.TSDF`` and ``self.Weight``.

        :param Image: RGBD image whose ``depth_image`` is uploaded and whose
            ``Size[0]``/``Size[1]`` are passed to the kernel
        :param boneDQ: the dual quaternion of the bone in the new frame
        :param jointDQ: the dual quaternion of the joint in the new frame
        :return: none
        """
        # initialize buffers
        #cl.enqueue_write_buffer(self.GPUManager.queue, self.Pose_GPU, Tg)
        cl.enqueue_write_buffer(self.GPUManager.queue, self.DepthGPU,
                                Image.depth_image)
        cl.enqueue_write_buffer(self.GPUManager.queue, self.boneDQGPU, boneDQ)
        cl.enqueue_write_buffer(self.GPUManager.queue, self.jointDQGPU,
                                jointDQ)

        # fuse data of the RGBD image with the TSDF volume 3D model;
        # the global work size is (self.Size[0], self.Size[1])
        self.GPUManager.programs['FuseTSDF'].FuseTSDF(self.GPUManager.queue, (self.Size[0], self.Size[1]), None, \
                                self.TSDFGPU, self.DepthGPU, self.Param, self.Size_Volume, self.Pose_GPU, \
                                self.boneDQGPU, self.jointDQGPU, self.planeF,\
                                self.Calib_GPU, np.int32(Image.Size[0]), np.int32(Image.Size[1]),self.WeightGPU)

        # update CPU array. Read the buffer to write in the CPU array.
        cl.enqueue_read_buffer(self.GPUManager.queue, self.TSDFGPU,
                               self.TSDF).wait()
        '''
        # TEST if TSDF contains NaN
        TSDFNaN = np.count_nonzero(np.isnan(self.TSDF))
        print "TSDFNaN : %d" %(TSDFNaN)
        '''
        # Also bring the per-voxel weights back to the host copy.
        cl.enqueue_read_buffer(self.GPUManager.queue, self.WeightGPU,
                               self.Weight).wait()
Beispiel #15
0
def do_opencl_pow(hash, target):
    """Run ``kernel_sha512`` in batches until it reports a non-zero result.

    Returns the zero-initialised result straight away when ``enabledGpus`` is
    empty, and raises ``Exception("Interrupted")`` when the module-level
    ``shutdown`` flag becomes non-zero.
    """
    output = numpy.zeros(1, dtype=[('v', numpy.uint64, 1)])
    if len(enabledGpus) == 0:
        return output[0][0]

    # Single-record input: 8 zero bytes followed by the decoded hash, plus target.
    data = numpy.zeros(1, dtype=hash_dt, order='C')
    record = data[0]
    record['v'] = ("0000000000000000" + hash).decode("hex")
    record['target'] = target

    flags = cl.mem_flags
    hash_buf = cl.Buffer(ctx, flags.READ_ONLY | flags.COPY_HOST_PTR, hostbuf=data)
    dest_buf = cl.Buffer(ctx, flags.WRITE_ONLY, output.nbytes)

    kernel = program.kernel_sha512
    group_size = kernel.get_work_group_info(
        cl.kernel_work_group_info.WORK_GROUP_SIZE, enabledGpus[0])

    for position, buf in enumerate((hash_buf, dest_buf)):
        kernel.set_arg(position, buf)

    start = time.time()
    progress = 0
    batch_size = group_size*2000

    while output[0][0] == 0 and shutdown == 0:
        # Third argument is the running offset, packed with format "<Q".
        kernel.set_arg(2, pack("<Q", progress))
        cl.enqueue_nd_range_kernel(queue, kernel, (batch_size,), (group_size,))
        cl.enqueue_read_buffer(queue, dest_buf, output)
        queue.finish()
        progress += batch_size
        # Elapsed time is only consumed by optional debug logging.
        sofar = time.time() - start
    if shutdown != 0:
        raise Exception ("Interrupted")
    taken = time.time() - start
    return output[0][0]
    def execute(self, params):
        ''' This handles the actual execution for the processing, which would
        get executed on each request - this is where we care about the
        performance
        '''

        timing.timings.start('load')
        self.load_program(params)
        timing.timings.stop('load')
        finish = timing.timings.timings['load']['timings'][-1]
        print '<<< Loaded program in %s ms' % (finish)

        timing.timings.start('execute')
        # Start the program
        self.program.worker(self.queue, 
            self.data['income'].shape,
            None,
            self.income_buf,
            self.capGains_buf,
            self.dividendsInterest_buf,
            self.children_buf,
            self.dest_buf,
        )

        # Get an empty numpy array in the shape of the original data
        result = numpy.empty_like(self.data['income'])

        #Wait for result
        cl.enqueue_read_buffer(self.queue, self.dest_buf, result).wait()

        #show timing info
        timing.timings.stop('execute')
        finish = timing.timings.timings['execute']['timings'][-1]
        print '<<< Executed in %s ms' % (finish)
        return result
Beispiel #17
0
    def randomfill(self):
        """Run the ``randomfill`` kernel and store its output in ``self.nff``.

        The elapsed time is added to ``self.times["randomfill"]``.
        """
        started = getTime()
        flags = cl.mem_flags

        # One read-only device buffer per input image.
        image_buffers = []
        for index in [0, 1]:
            image_buffers.append(
                cl.Buffer(self.ctx,
                          flags.READ_ONLY | flags.COPY_HOST_PTR,
                          hostbuf=self.img[index]))
        self.inputBuf = image_buffers
        self.outputBuf = cl.Buffer(self.ctx,
                                   flags.WRITE_ONLY | flags.COPY_HOST_PTR,
                                   hostbuf=self.nff)

        patch_height = numpy.int32(self.patchSize[0])
        patch_width = numpy.int32(self.patchSize[1])
        height = numpy.int32(self.size[0])
        width = numpy.int32(self.size[1])
        self.program.randomfill(self.queue, self.effectiveSize, None,
                                patch_height, patch_width, height, width,
                                self.inputBuf[0], self.inputBuf[1],
                                self.outputBuf)

        # Read into a scratch array, then keep a copy of it on the instance.
        scratch = numpy.empty_like(self.nff)
        cl.enqueue_read_buffer(self.queue, self.outputBuf, scratch).wait()
        self.nff = numpy.copy(scratch)

        self.times["randomfill"] += getTime() - started
Beispiel #18
0
def do_opencl_pow(hash, target):
	"""Run ``kernel_sha512`` in batches until it writes a non-zero result.

	:param hash: hex string; it is prefixed with 16 zero characters and
		hex-decoded (Python 2 ``str.decode("hex")``) into the ``'v'`` field.
	:param target: value stored in the ``'target'`` field of the input record.
	:return: ``output[0][0]`` -- zero when ``ctx == False``, otherwise the
		first non-zero value read back from the device.
	"""
	output = numpy.zeros(1, dtype=[('v', numpy.uint64, 1)])
	# No usable OpenCL context: give back the zero-initialised result.
	if (ctx == False):
		return output[0][0]
	
	# Single input record laid out according to hash_dt.
	data = numpy.zeros(1, dtype=hash_dt, order='C')
	data[0]['v'] = ("0000000000000000" + hash).decode("hex")
	data[0]['target'] = target
	
	hash_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=data)
	dest_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, output.nbytes)
	
	kernel = program.kernel_sha512
	# Work-group size the kernel supports on the first entry of gpus.
	worksize = kernel.get_work_group_info(cl.kernel_work_group_info.WORK_GROUP_SIZE, gpus[0])

	kernel.set_arg(0, hash_buf)
	kernel.set_arg(1, dest_buf)

	start = time.time()
	progress = 0
	# Each launch covers 2000 work-groups.
	globamt = worksize*2000

	# NOTE(review): there is no exit condition other than a non-zero result.
	while output[0][0] == 0:
		# Argument 2 is the running offset, packed with format "<Q".
		kernel.set_arg(2, pack("<Q", progress))
		cl.enqueue_nd_range_kernel(queue, kernel, (globamt,), (worksize,))
		cl.enqueue_read_buffer(queue, dest_buf, output)
		# finish() makes sure both the kernel and the read-back are complete.
		queue.finish()
		progress += globamt
		sofar = time.time() - start
		print sofar, progress / sofar, "hashes/sec"
	taken = time.time() - start
	print progress, taken
	return output[0][0]
Beispiel #19
0
 def execute(self, *args, **kwargs):
     """Load the inputs, run ``program__`` and return the device output."""
     self.load_data(*args, **kwargs)
     # One work item per element of self.a; local size is left to the runtime.
     kernel_buffers = (self.a_buf, self.b_buf, self.dest_buf)
     self.program.program__(self.queue, self.a.shape, None, *kernel_buffers)
     result = np.empty_like(self.a)
     read_event = cl.enqueue_read_buffer(self.queue, self.dest_buf, result)
     read_event.wait()
     return result
Beispiel #20
0
def map_function(data):
    """Add ``data[0]`` and ``data[1]`` element-wise on an OpenCL device.

    The platform and device indices come from the output of
    ``../bin/get-host-platform-device.sh``.  Exactly five work items are
    launched and a five-element float32 array is returned.
    """
    proc = subprocess.Popen(["../bin/get-host-platform-device.sh"], stdout=subprocess.PIPE, shell=True)
    proc_out, err = proc.communicate()
    SPARKCL_PLATFORM, SPARKCL_DEVICE = proc_out.split()

    KERNEL_CODE="""
        __kernel void ArraySum(__global float *A,__global float *B,__global float *C){
       int i = get_global_id(0);
       C[i] = A[i]+B[i];
}

    """

    platform = cl.get_platforms()[int(SPARKCL_PLATFORM)]
    cl_device = platform.get_devices()[int(SPARKCL_DEVICE)]
    ctx = cl.Context([cl_device])
    queue = cl.CommandQueue(ctx)
    kernel = cl.Program(ctx, KERNEL_CODE).build().ArraySum
    mf = cl.mem_flags

    # Host copies of both operands as float32, then matching device buffers.
    np_data = [np.array(data[k]).astype(np.float32) for k in (0, 1)]
    data_buf = [
        cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=operand)
        for operand in np_data
    ]

    # NOTE(review): the size 5 is fixed regardless of the input lengths.
    result = np.zeros((5, )).astype(np.float32)
    result_buf = cl.Buffer(ctx, mf.WRITE_ONLY, result.nbytes)

    kernel(queue, (5,), None, data_buf[0], data_buf[1], result_buf)
    cl.enqueue_read_buffer(queue, result_buf, result).wait()
    return result
Beispiel #21
0
    def transform_uint32(self, data_np,
                         flip_x=False, flip_y=False, swap_xy=False,
                         out=None):
        """Flip and/or transpose ``data_np`` with the ``image_transform_uint32`` kernel.

        When ``out`` is not given, a new array is allocated whose first two
        dimensions are swapped if ``swap_xy`` is set.  The filled array is
        returned.
        """
        height, width = data_np.shape[:2]

        # Output geometry: the leading two axes trade places on swap_xy.
        if swap_xy:
            leading_dims = [width, height]
        else:
            leading_dims = [height, width]
        new_size = leading_dims + list(data_np.shape[2:])

        flags = cl.mem_flags

        # The device copy requires a contiguous host array.
        data_np = np.ascontiguousarray(data_np)
        src_buf = cl.Buffer(self.ctx, flags.READ_ONLY | flags.COPY_HOST_PTR,
                            hostbuf=data_np)
        dst_buf = cl.Buffer(self.ctx, flags.WRITE_ONLY, data_np.nbytes)

        # Dimensions and option flags are passed as int32 scalars.
        evt = self.program.image_transform_uint32(
            self.queue, [height, width], None,
            src_buf, dst_buf,
            np.int32(width), np.int32(height),
            np.int32(flip_x), np.int32(flip_y),
            np.int32(swap_xy))

        if out is None:
            out = np.empty_like(data_np).reshape(new_size)
        read_event = cl.enqueue_read_buffer(self.queue, dst_buf, out)
        read_event.wait()

        return out
Beispiel #22
0
def do_opencl_pow(hash, target):
	"""Run ``kernel_sha512`` in batches until it writes a non-zero result.

	:param hash: hex string; it is prefixed with 16 zero characters and
		hex-decoded (Python 2 ``str.decode("hex")``) into the ``'v'`` field.
	:param target: value stored in the ``'target'`` field of the input record.
	:return: ``output[0][0]`` -- zero when ``ctx == False``, otherwise the
		first non-zero value read back from the device.
	"""
	output = numpy.zeros(1, dtype=[('v', numpy.uint64, 1)])
	# No usable OpenCL context: give back the zero-initialised result.
	if (ctx == False):
		return output[0][0]
	
	# Single input record laid out according to hash_dt.
	data = numpy.zeros(1, dtype=hash_dt, order='C')
	data[0]['v'] = ("0000000000000000" + hash).decode("hex")
	data[0]['target'] = target
	
	hash_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=data)
	dest_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, output.nbytes)
	
	kernel = program.kernel_sha512
	# NOTE(review): the work-group size is queried for the second device of
	# the first platform (hard-coded indices) -- confirm that this is the
	# device `queue` actually runs on.
	worksize = kernel.get_work_group_info(cl.kernel_work_group_info.WORK_GROUP_SIZE, cl.get_platforms()[0].get_devices()[1])

	kernel.set_arg(0, hash_buf)
	kernel.set_arg(1, dest_buf)

	start = time.time()
	progress = 0
	# Each launch covers 2000 work-groups.
	globamt = worksize*2000

	# NOTE(review): there is no exit condition other than a non-zero result.
	while output[0][0] == 0:
		# Argument 2 is the running offset, packed with format "<Q".
		kernel.set_arg(2, pack("<Q", progress))
		cl.enqueue_nd_range_kernel(queue, kernel, (globamt,), (worksize,))
		cl.enqueue_read_buffer(queue, dest_buf, output)
		# finish() makes sure both the kernel and the read-back are complete.
		queue.finish()
		progress += globamt
		sofar = time.time() - start
		print sofar, progress / sofar, "hashes/sec"
	taken = time.time() - start
	print progress, taken
	return output[0][0]
	def exchange_boundary_h(s):
		"""Copy field slots 4 and 5 from each device to the next one via ``s.tmpfs``."""
		# (field index, staging slot) pairs: 4 is hy_gpu, 5 is hz_gpu.
		h_fields = ((4, 0), (5, 1))

		# Every device except the last reads its fields into the host staging arrays.
		senders = list(zip(s.queues, s.eh_fields_gpus, s.tmpfs, s.offsets))[:-1]
		for queue, eh_fields, tmpf, offset in senders:
			for field_index, slot in h_fields:
				cl.enqueue_read_buffer(queue, eh_fields[field_index], tmpf[slot], offset)

		# Every device except the first receives its predecessor's staging arrays.
		receivers = zip(s.queues[1:], s.eh_fields_gpus[1:], s.tmpfs[:-1])
		for queue, eh_fields, tmpf in receivers:
			for field_index, slot in h_fields:
				cl.enqueue_write_buffer(queue, eh_fields[field_index], tmpf[slot])
    def execute(self):
        """ This handles the actual execution for the processing, which would
        get executed on each request - this is where we care about the
        performance
        """
        timing.timings.start("execute")

        # Start the program
        self.program.worker(self.queue, self.data1.shape, None, self.data1_buf, self.data2_buf, self.dest_buf)

        # Get an empty numpy array in the shape of the original data
        result = numpy.empty_like(self.data1)

        # Wait for result
        cl.enqueue_read_buffer(self.queue, self.dest_buf, result).wait()

        # show timing info
        timing.timings.stop("execute")
        finish = timing.timings.timings["execute"]["timings"][-1]
        print "<<< DONE in %s" % (finish)

        # Open data file to append to
        data_file = open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../data.csv"), "a")
        data_file.write("PyOpenCl %s,%s,%s,%s\n" % (process_type, finish, num_records, num_calculations))
        data_file.close()
Beispiel #25
0
    def update(self, sub_pos, angle, min_dist, max_dist, width, in_weight,
               out_weight):
        '''
        Apply one piece of evidence to the probability grid.

        The evidence is: the sub is at sub_pos, the target is seen at an
        absolute heading of `angle`, most likely between min_dist and
        max_dist away.  in_weight is P(this measurement | buoy at point p)
        for points p inside that region; out_weight is the same quantity for
        points outside it.  The grid is renormalized afterwards and written
        back to the device.
        '''

        north, east = sub_pos

        # By-value kernel arguments, in kernel order.  Distances are passed
        # squared and the heading is converted with radians().
        scalar_args = (
            float32(north),
            float32(east),
            float32(radians(angle)),
            float32(min_dist**2),
            float32(max_dist**2),
            float32(width),
            float32(in_weight),
            float32(out_weight),
        )
        cl_program.evidence(cl_queue, self.norths.shape, None,
                            self.norths_buf, self.easts_buf, self.prob_buf,
                            *scalar_args)
        #TODO ?
        read_event = cl.enqueue_read_buffer(cl_queue, self.prob_buf,
                                            self.probabilities)
        read_event.wait()

        # Normalize in place by the grid total.
        normalizer = numpy.sum(self.probabilities)
        self.probabilities /= normalizer

        cl.enqueue_write_buffer(cl_queue, self.prob_buf, self.probabilities)
Beispiel #26
0
	def execute(self, *args, **kwargs):
		"""Load the inputs, launch ``program__`` and return the result array."""
		self.load_data(*args, **kwargs)

		# Global size follows self.a; the local size is chosen by the runtime.
		global_size = self.a.shape
		self.program.program__(
			self.queue, global_size, None,
			self.a_buf, self.b_buf, self.dest_buf)

		output = np.empty_like(self.a)
		cl.enqueue_read_buffer(self.queue, self.dest_buf, output).wait()
		return output
Beispiel #27
0
    def getMultipleRows(self,rowbase,rowlimit): #{{{
        """Computes multiple Tanimoto rows *rowbase:rowlimit* corresponding to comparing every SMILES string
        in the query set with the reference SMILES strings having index *rowbase*, *rowbase+1*, ..., *rowlimit-1*
        in the reference set, and stores this block as the most recent asynchronous result.

        This method is synchronous (it will not return until the block has been completely computed).

        :param rowbase: first reference row to compute; must be >= 0.
        :param rowlimit: one past the last reference row; must be <= ``self.nref``.
        :return: view of the result matrix with ``rowlimit - rowbase`` rows and ``self.nquery`` columns.
        :raises ValueError: if the requested range lies outside the reference set.
        """
        if rowbase < 0 or rowlimit > self.nref:
            # A bare ``raise`` here had no active exception to re-raise;
            # report the actual problem instead.
            raise ValueError("row range [%r, %r) is outside the reference set [0, %r)"
                             % (rowbase, rowlimit, self.nref))

        # Pad rows out to 64 byte pitch (16 float32 values); floor division
        # keeps the pitch an integer.
        rowpitchInFloat = 16*((self.nquery+15)//16)

        # Using pagelocked memory and async copy seems to actually slow us down
        # on large tiled calculations
        self.resultmatrix = numpy.empty((rowlimit-rowbase,rowpitchInFloat),dtype=numpy.float32)
        self.gpu.gpumatrix = cl.Buffer(self.gpu.context,cl.mem_flags.WRITE_ONLY,size=self.resultmatrix.nbytes)

        # With precalculated magnitudes.  Two local-memory blocks are
        # allocated, each sized from the longest reference entry in the range.
        lmem_bytes = int(2*4*max(self.rlengths[rowbase:rowlimit]))
        threads_per_block = 192
        # One work-group of threads_per_block work items per requested row.
        self.gpu.multiRowKernel(self.gpu.queue,(threads_per_block*(rowlimit-rowbase),),
                                               self.gpu.rsmiles,self.gpu.rcounts,self.gpu.rl_gpu,self.gpu.rmag_gpu,
                                               self.refPitchInInt,
                                               self.gpu.qsmiles,self.gpu.qcounts,self.gpu.ql_gpu,self.gpu.qmag_gpu,
                                               self.qPitchTInInt,
                                               self.gpu.gpumatrix, numpy.int32(rowpitchInFloat),
                                               numpy.int32(self.qshape[0]),numpy.int32(self.qshape[1]),numpy.int32(rowbase),
                                               cl.LocalMemory(lmem_bytes),cl.LocalMemory(lmem_bytes),
                                               local_size=(threads_per_block,))

        cl.enqueue_read_buffer(self.gpu.queue,self.gpu.gpumatrix,self.resultmatrix).wait()
        # Drop the padding columns added for the row pitch.
        return self.resultmatrix[:,0:self.nquery]
Beispiel #28
0
    def FuseRGBD_GPU(self, Image, Pose):
        """
        Update the TSDF volume with Image.

        Uploads the pose and the depth image, runs the ``FuseTSDF`` kernel,
        then reads the TSDF and weight buffers back into ``self.TSDF`` and
        ``self.Weight``.

        :param Image: RGBD image whose ``depth_image`` is uploaded and whose
            ``Size[0]``/``Size[1]`` are passed to the kernel
        :param Pose: transform from the first camera pose to the last camera pose
        :return: none
        """
        # initialize buffers
        cl.enqueue_write_buffer(self.GPUManager.queue, self.Pose_GPU, Pose)
        cl.enqueue_write_buffer(self.GPUManager.queue, self.DepthGPU,
                                Image.depth_image)

        # fuse data of the RGBD image with the TSDF volume 3D model;
        # the global work size is (self.Size[0], self.Size[1])
        self.GPUManager.programs['FuseTSDF'].FuseTSDF(self.GPUManager.queue, (self.Size[0], self.Size[1]), None, \
                                self.TSDFGPU, self.DepthGPU, self.Param, self.Size_Volume, self.Pose_GPU, self.Calib_GPU, \
                                np.int32(Image.Size[0]), np.int32(Image.Size[1]),self.WeightGPU)

        # update CPU array. Read the buffer to write in the CPU array.
        cl.enqueue_read_buffer(self.GPUManager.queue, self.TSDFGPU,
                               self.TSDF).wait()
        '''
        # TEST if TSDF contains NaN
        TSDFNaN = np.count_nonzero(np.isnan(self.TSDF))
        print "TSDFNaN : %d" %(TSDFNaN)
        '''
        # Also bring the per-voxel weights back to the host copy.
        cl.enqueue_read_buffer(self.GPUManager.queue, self.WeightGPU,
                               self.Weight).wait()
def test_that_python_args_fail(ctx_factory):
    """Check that kernels reject bare Python scalars but accept numpy scalars.

    Both the ``cl.Kernel`` object and the ``prg.mult`` attribute are called
    with plain Python numbers, which must raise ``cl.LogicError``; the final
    call with ``np.float32``/``np.int32`` arguments must go through.
    """
    context = ctx_factory()

    kernel_source = """
        __kernel void mult(__global float *a, float b, int c)
        { a[get_global_id(0)] *= (b+c); }
        """
    prg = cl.Program(context, kernel_source).build()

    a = np.random.rand(50000)
    queue = cl.CommandQueue(context)
    flags = cl.mem_flags
    a_buf = cl.Buffer(context, flags.READ_WRITE | flags.COPY_HOST_PTR,
                      hostbuf=a)

    knl = cl.Kernel(prg, "mult")

    # bare ints through the explicit Kernel object
    try:
        knl(queue, a.shape, None, a_buf, 2, 3)
    except cl.LogicError:
        pass
    else:
        assert False, "PyOpenCL should not accept bare Python types as arguments"

    # bare float/int through the program attribute
    try:
        prg.mult(queue, a.shape, None, a_buf, float(2), 3)
    except cl.LogicError:
        pass
    else:
        assert False, "PyOpenCL should not accept bare Python types as arguments"

    # correctly typed numpy scalars are accepted
    prg.mult(queue, a.shape, None, a_buf, np.float32(2), np.int32(3))

    a_result = np.empty_like(a)
    read_event = cl.enqueue_read_buffer(queue, a_buf, a_result)
    read_event.wait()
Beispiel #30
0
 def transform(self):
     """Realizes the calculus"""
     # Prepare the input and output memory
     mf = cl.mem_flags
     
     msg = np.char.array(self.Word_buffer)
     len_array = np.array(self.len).astype(np.int32)
     Hexdigest_array = np.char.array(['']*41*(len(len_array)))
     
     print msg
     print msg.nbytes
     print len_array
     print len_array.nbytes
     print Hexdigest_array
     print Hexdigest_array.nbytes
     
     # Allocate device memory
     msg_buf = cl.Buffer(self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, msg.nbytes, msg)
     len_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, len_array.nbytes, len_array)
     Hexdigest_buf = cl.Buffer(self.ctx, mf.WRITE_ONLY, Hexdigest_array.nbytes)
     
     # Start OpenCL operation and wait for it to finish
     time1 = datetime.datetime.now()
     self.prg.sha1(self.queue, (len(len_array),), msg_buf, len_buf, Hexdigest_buf)
     cl.enqueue_read_buffer(self.queue, Hexdigest_buf, Hexdigest_array).wait()
     time2 = datetime.datetime.now()
     print "Execution time OpenCL sha1: " + repr((time2 - time1).microseconds/1000) + "ms"
     
     # Convert the result into strings
     for j in range(0,len(Hexdigest_array)/41):
         self.result.append(''.join(Hexdigest_array[j*41 + 0:j*41 + 41]))
    def dump_batch(self):
        """Flush the pending batch into the sketch matrix ``self.M``.

        The batched keys and counts are uploaded together with ``self.rand``,
        the ``increment`` kernel is run over a ``(len(keys), self.d)`` range,
        and its ``(self.d, self.w)`` int32 output is added to ``self.M``.
        The batch is cleared afterwards.

        An empty batch is a no-op: there is nothing to add, and OpenCL
        rejects zero-sized buffers.
        """
        if not self.batch:
            return

        # list(...) keeps this correct on Python 3, where keys()/values() are
        # views that numpy would otherwise wrap as a single 0-d object.
        keys = np.array(list(self.batch.keys()), dtype='S32')
        counts = np.array(list(self.batch.values()), dtype=np.int32)
        out = np.zeros([self.d, self.w], dtype=np.int32)

        # create the buffers to hold the values of the input
        read_only = cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR
        rand_buf = cl.Buffer(self.ctx, read_only, hostbuf=self.rand)
        keys_buf = cl.Buffer(self.ctx, read_only, hostbuf=keys)
        counts_buf = cl.Buffer(self.ctx, read_only, hostbuf=counts)

        # create output buffer
        out_buf = cl.Buffer(self.ctx, cl.mem_flags.WRITE_ONLY, out.nbytes)

        # launch the kernel and wait for it to complete
        launch = self.bld.increment(self.queue, (len(keys), self.d), None,
                                    rand_buf, keys_buf, counts_buf, out_buf)
        launch.wait()

        cl.enqueue_read_buffer(self.queue, out_buf, out).wait()

        self.M += out
        self.batch.clear()
Beispiel #32
0
 def execute(self):
     self.program.part1(self.queue, self.a.shape, None, self.a_buf, self.b_buf, self.dest_buf)
     c = numpy.array(range(10), dtype=numpy.uint32)
     cl.enqueue_read_buffer(self.queue, self.dest_buf, c).wait()
     print "a", self.a
     print "b", self.b
     print "c", c
Beispiel #33
0
    def execute(self):
        '''
        Run one patchMatch propagation pass.

        Uploads both images and the current ``self.nff``, runs the
        ``propagate`` kernel, stores the result back in ``self.nff`` and adds
        the elapsed time to ``self.times["execute"]``.
        '''
        started = getTime()
        flags = cl.mem_flags

        # one read-only buffer per input image
        image_bufs = []
        for index in [0, 1]:
            image_bufs.append(
                cl.Buffer(self.ctx,
                          flags.READ_ONLY | flags.COPY_HOST_PTR,
                          hostbuf=self.img[index]))
        self.inputBuf = image_bufs

        # read/write buffer seeded with self.nff
        self.outputBuf = cl.Buffer(self.ctx,
                                   flags.READ_WRITE | flags.COPY_HOST_PTR,
                                   hostbuf=self.nff)

        patch_height = numpy.int32(self.patchSize[0])
        patch_width = numpy.int32(self.patchSize[1])
        height = numpy.int32(self.size[0])
        width = numpy.int32(self.size[1])
        iteration = numpy.int32(self.iteration)
        self.program.propagate(self.queue, self.effectiveSize, None,
                               patch_height, patch_width, height, width,
                               iteration,
                               self.inputBuf[0], self.inputBuf[1],
                               self.outputBuf)

        # read the result into a scratch array, then store a copy
        scratch = numpy.empty_like(self.nff)
        cl.enqueue_read_buffer(self.queue, self.outputBuf, scratch).wait()
        self.nff = numpy.copy(scratch)

        self.times["execute"] += getTime() - started
Beispiel #34
0
def gpu_array_sum(a, b):
    """Add ``a`` and ``b`` element-wise on an OpenCL device, printing timings.

    Prints the profiled kernel time and the wall-clock time from kernel
    launch to the end of the read-back, then returns the result array
    (allocated with ``np.empty_like(a)``).
    """
    kernel_source = """
    __kernel void sum(__global const float *a, __global const float *b, __global float *c)
    {
        int i = get_global_id(0);
        int j;
        for(j = 0; j < 1000; j++)
        {
            c[i] = a[i] + b[i];
        }
    }"""

    # context and a queue with profiling (timing) enabled
    context = cl.create_some_context()
    queue = cl.CommandQueue(
        context, properties=cl.command_queue_properties.PROFILING_ENABLE)

    # two input buffers copied from the host, one output buffer
    flags = cl.mem_flags
    a_buffer = cl.Buffer(context, flags.READ_ONLY | flags.COPY_HOST_PTR,
                         hostbuf=a)
    b_buffer = cl.Buffer(context, flags.READ_ONLY | flags.COPY_HOST_PTR,
                         hostbuf=b)
    c_buffer = cl.Buffer(context, flags.WRITE_ONLY, b.nbytes)

    program = cl.Program(context, kernel_source).build()

    gpu_start_time = time()
    event = program.sum(queue, a.shape, None, a_buffer, b_buffer, c_buffer)
    event.wait()
    # profiling counters are in nanoseconds
    kernel_ns = event.profile.end - event.profile.start
    elapsed = 1e-9*kernel_ns
    print("GPU Kernel Time: {0} s".format(elapsed))

    c_gpu = np.empty_like(a)
    cl.enqueue_read_buffer(queue, c_buffer, c_gpu).wait()
    gpu_end_time = time()
    print("GPU Time: {0} s".format(gpu_end_time - gpu_start_time))
    return c_gpu
def gpu_array_sum(a, b):
    """Add ``a`` and ``b`` element-wise on the first device of the first platform.

    Prints the profiled kernel time and the wall-clock time from kernel
    launch to the end of the read-back, then returns the result array
    (allocated with ``np.empty_like(a)``).
    """
    # pick the first device of the first platform explicitly
    first_platform = cl.get_platforms()[0]
    first_device = first_platform.get_devices()[0]
    context = cl.Context([first_device])
    profiling = cl.command_queue_properties.PROFILING_ENABLE
    queue = cl.CommandQueue(context, properties=profiling)

    # inputs are copied from the host; the output is sized like ``b``
    host_input = cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR
    a_buffer = cl.Buffer(context, host_input, hostbuf=a)
    b_buffer = cl.Buffer(context, host_input, hostbuf=b)
    c_buffer = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, b.nbytes)

    program = cl.Program(context, """
    __kernel void sum(__global const float *a, __global const float *b, __global float *c)
    {
        int i = get_global_id(0);
        int j;
        for(j = 0; j < 10000; j++)
        {
            c[i] = a[i] + b[i];
        }
    }""").build()

    gpu_start_time = time()
    event = program.sum(queue, a.shape, None, a_buffer, b_buffer, c_buffer)
    event.wait()
    # profiling counters are in nanoseconds
    elapsed = 1e-9*(event.profile.end - event.profile.start)
    print("GPU Kernel Time: {0} s".format(elapsed))

    c_gpu = np.empty_like(a)
    read_event = cl.enqueue_read_buffer(queue, c_buffer, c_gpu)
    read_event.wait()
    gpu_end_time = time()
    print("GPU Time: {0} s".format(gpu_end_time - gpu_start_time))
    return c_gpu
Beispiel #36
0
    def update(self, sub_pos, angle, min_dist, max_dist, width, in_weight, out_weight):
        '''
        Apply one piece of evidence to the probability grid.

        The evidence is: the sub is at ``sub_pos`` (north, east), the target
        is seen at absolute heading ``angle`` and is most likely between
        ``min_dist`` and ``max_dist`` away.
        ``in_weight`` is P(this measurement | buoy at point p) for points p
        inside that region; ``out_weight`` is the same for points outside it.

        After the ``evidence`` kernel runs, the probabilities are read back,
        normalized on the host and written to the device again.
        '''
        north, east = sub_pos

        # the angle is passed in radians and both distances are squared
        kernel_args = (
            self.norths_buf, self.easts_buf, self.prob_buf,
            float32(north), float32(east),
            float32(radians(angle)),
            float32(min_dist**2),
            float32(max_dist**2),
            float32(width),
            float32(in_weight),
            float32(out_weight),
        )
        cl_program.evidence(cl_queue, self.norths.shape, None, *kernel_args)
        #TODO ?
        read_event = cl.enqueue_read_buffer(cl_queue, self.prob_buf,
                                            self.probabilities)
        read_event.wait()

        # normalize in place
        normalizer = numpy.sum(self.probabilities)
        self.probabilities /= normalizer

        # push the normalized grid back to the device
        cl.enqueue_write_buffer(cl_queue, self.prob_buf, self.probabilities)
Beispiel #37
0
def do_opencl_pow(hash_, target):
    """Search for a proof-of-work value with the ``kernel_sha512`` kernel.

    Returns 0 immediately when no GPU is enabled. Otherwise the kernel is
    launched repeatedly with an increasing start offset until it writes a
    non-zero value, which is returned. Raises ``Exception("Interrupted")``
    when the module-level ``shutdown`` flag is set.
    """
    found = numpy.zeros(1, dtype=[('v', numpy.uint64, 1)])
    if not enabledGpus:
        return found[0][0]

    # single work record: 8 zero bytes followed by the decoded hash, plus target
    record = numpy.zeros(1, dtype=hash_dt, order='C')
    record[0]['v'] = ("0000000000000000" + hash_).decode("hex")
    record[0]['target'] = target

    flags = cl.mem_flags
    hash_buf = cl.Buffer(ctx, flags.READ_ONLY | flags.COPY_HOST_PTR,
                         hostbuf=record)
    dest_buf = cl.Buffer(ctx, flags.WRITE_ONLY, found.nbytes)

    kernel = program.kernel_sha512
    worksize = kernel.get_work_group_info(
        cl.kernel_work_group_info.WORK_GROUP_SIZE, enabledGpus[0])

    kernel.set_arg(0, hash_buf)
    kernel.set_arg(1, dest_buf)

    progress = 0
    # each launch covers 2000 work-groups
    globamt = worksize * 2000

    while found[0][0] == 0 and shutdown == 0:
        # argument 2 is the start offset as a little-endian 64-bit value
        kernel.set_arg(2, pack("<Q", progress))
        cl.enqueue_nd_range_kernel(queue, kernel, (globamt,), (worksize,))
        try:
            cl.enqueue_read_buffer(queue, dest_buf, found)
        except AttributeError:
            # fall back when enqueue_read_buffer is not available
            cl.enqueue_copy(queue, found, dest_buf)
        queue.finish()
        progress = progress + globamt
    if shutdown != 0:
        raise Exception("Interrupted")
    # logger.debug("Took %d tries.", progress)
    return found[0][0]
Beispiel #38
0
 def execute(self, settings):
     """Run the ``mandel`` kernel and return the counts as a square array.

     The result has shape ``(settings.dim, settings.dim)`` and dtype int32.
     """
     side = settings.dim
     global_size = (self.c_real.shape[0], )
     self.program.mandel(self.queue, global_size, None,
                         self.real_buf, self.imag_buf, self.depth_buf,
                         self.dest_buf)
     flat_counts = np.zeros(side**2, dtype=np.int32)
     read_event = cl.enqueue_read_buffer(self.queue, self.dest_buf,
                                         flat_counts)
     read_event.wait()
     return flat_counts.reshape([side, side])
Beispiel #39
0
def reduce_flatrot():
  """Run the ``float4_sum`` kernel and return the column sums of its output.

  Eight work-groups of 64 items are launched; an (8, 4) float32 array is
  read back from ``reduce_buf`` and summed along axis 0.
  """
  partial_sums = np.empty((8, 4), 'f')
  global_size = (64*8,)
  local_size = (64,)
  program.float4_sum(queue, global_size, local_size,
                     reduce_buf, reduce_scratch,
                     qxdyqz_buf, np.int32(length))
  read_event = cl.enqueue_read_buffer(queue, reduce_buf, partial_sums)
  read_event.wait()
  return partial_sums.sum(0)
Beispiel #40
0
def map1(data):

    SPARKCL_PLATFORM = os.environ['CL_PLATFORM']
    SPARKCL_DEVICE = os.environ['CL_DEVICE']
    print str(SPARKCL_PLATFORM)+":"+str(SPARKCL_DEVICE)
    KERNEL_CODE="""
        __kernel void ArraySum(__global float *A,__global float *B,__global float *C){
       int i = get_global_id(0);
       C[i] = A[i]+B[i];
}

    """

    cl_device=cl.get_platforms()[int(SPARKCL_PLATFORM)].get_devices()[int(SPARKCL_DEVICE)]
    ctx = cl.Context([cl_device])
    queue = cl.CommandQueue(ctx)
    prg = cl.Program(ctx, KERNEL_CODE).build()
    kernel = prg.ArraySum
    mf = cl.mem_flags
    print "map" + str(data)

    np_data = []
    data_buf = []
    np_data.append(np.array(data[0]).astype(np.float32))
    data_buf.append(cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np_data[0]))
    np_data.append(np.array(data[1]).astype(np.float32))
    data_buf.append(cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np_data[1]))

    result = np.zeros((5,)).astype(np.float32)
    result_buf = cl.Buffer(ctx, mf.WRITE_ONLY, result.nbytes)

    kernel(queue,(5,),None,data_buf[0],data_buf[1],result_buf)
    cl.enqueue_read_buffer(queue, result_buf, result).wait()
    return [result.astype(np.float32)]
Beispiel #41
0
 def mineThread(self):
     """Consume work items from ``self.qr`` and run the search kernel on them.

     For every iteration of a work item the kernel is launched, the output
     buffer is read back, and if the flag at index ``OUTPUT_SIZE`` is set
     a copy of the output is handed to ``postprocess`` on the reactor thread
     before the buffer is zeroed again.
     """
     for data in self.qr:
         for i in range(data.iterations):
             state, state2, f = data.state, data.state2, data.f
             kernel_args = (
                 [state[k] for k in (0, 1, 2, 3, 4, 5, 6, 7)]
                 + [state2[k] for k in (1, 2, 3, 5, 6, 7)]
                 + [data.base[i]]
                 + [f[k] for k in (0, 1, 2, 3, 4)]
                 + [self.output_buf]
             )
             self.kernel.search(
                 self.commandQueue, (data.size, ), (self.WORKSIZE, ),
                 *kernel_args)
             cl.enqueue_read_buffer(
                 self.commandQueue, self.output_buf, self.output)
             self.commandQueue.finish()

             # The kernel flags the last item of the output buffer when it
             # finds a valid nonce; nothing more to do when it is unset.
             if not self.output[self.OUTPUT_SIZE]:
                 continue

             # Hand a copy to the main thread, then reset the buffer for
             # the next pass.
             reactor.callFromThread(self.postprocess, self.output.copy(),
                                    data.nr)
             self.output.fill(0)
             cl.enqueue_write_buffer(
                 self.commandQueue, self.output_buf, self.output)
Beispiel #42
0
 def plotCurrentMembraneCoordinates(self):
     """Copy the membrane X/Y coordinates to the host and plot Y against X."""
     transfers = (
         (self.dev_membraneCoordinatesX, self.host_membraneCoordinatesX),
         (self.dev_membraneCoordinatesY, self.host_membraneCoordinatesY),
     )
     # X first, then Y; each read is waited on before the next starts
     for device_array, host_array in transfers:
         cl.enqueue_read_buffer(self.queue, device_array.data,
                                host_array).wait()
     plt.plot(self.host_membraneCoordinatesX,
              self.host_membraneCoordinatesY)
Beispiel #43
0
    def resize_uint32(self, data_np, scale_x, scale_y, out=None):
        """Resize an image on the device with the ``image_resize_uint32`` kernel.

        The first two axes of ``data_np`` are taken as (height, width) and
        scaled by ``scale_y`` and ``scale_x`` (truncated to int). The result
        is read into ``out`` when given, otherwise into a new array with the
        dtype of ``data_np``; that array is returned.
        """
        src_ht, src_wd = data_np.shape[:2]

        dst_ht = int(src_ht * scale_y)
        dst_wd = int(src_wd * scale_x)
        trailing_dims = list(data_np.shape[2:])
        dst_shape = [dst_ht, dst_wd] + trailing_dims

        flags = cl.mem_flags

        # the kernel reads from a contiguous copy of the source
        data_np = np.ascontiguousarray(data_np)
        src_buf = cl.Buffer(self.ctx, flags.READ_ONLY | flags.COPY_HOST_PTR,
                            hostbuf=data_np)

        # the destination holds one uint32 per output pixel
        bytes_per_pixel = np.uint32(0).nbytes
        dst_buf = cl.Buffer(self.ctx, flags.WRITE_ONLY,
                            dst_ht * dst_wd * bytes_per_pixel)

        self.program.image_resize_uint32(
            self.queue, [dst_ht, dst_wd], None,
            src_buf, dst_buf,
            np.int32(src_wd), np.int32(dst_wd),
            np.float64(scale_x), np.float64(scale_y))

        if out is None:
            out = np.empty(dst_shape, dtype=data_np.dtype)
        read_event = cl.enqueue_read_buffer(self.queue, dst_buf, out)
        read_event.wait()

        return out
def lombscarge_opencl(x, y, f):
    """Compute a Lomb-Scargle periodogram on an OpenCL device (float64).

    ``x``, ``y`` and ``f`` are converted to float64 and uploaded; the
    ``lombscargle`` kernel from ``lomb_txt`` is run with one work-item per
    entry of ``f``. Returns an array shaped like ``f``.

    If the program fails to build, the build log of the first device is
    printed and the error is re-raised.
    """
    # start up gpu
    x = np.float64(x)
    y = np.float64(y)
    f = np.float64(f)
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags
    # number of samples, passed to the kernel as a 32-bit int
    Nx = np.int32(x.shape[0])
    # send data to card
    x_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=x)
    y_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=y)
    f_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=f)
    # make output
    pgram = np.empty_like(f)
    pgram_g = cl.Buffer(ctx, mf.WRITE_ONLY, pgram.nbytes)
    prg = cl.Program(ctx, lomb_txt)

    try:
        prg.build()
    except Exception:
        print("Error:")
        print(prg.get_build_info(ctx.devices[0], cl.program_build_info.LOG))
        raise

    prg.lombscargle(queue, pgram.shape, None, x_g, y_g, f_g, pgram_g, Nx)
    # Wait for the transfer: on PyOpenCL versions where the read defaults to
    # non-blocking, returning right away would hand back unfilled memory.
    cl.enqueue_read_buffer(queue, pgram_g, pgram).wait()

    return pgram
def test_opencl_0(zz, a, b, c_result):
    """Run a small speed test on the second device of every platform.

    For each platform the device info is printed, a kernel looping ``zz``
    times is built and timed, and the first ``zz`` results are compared
    against ``c_result``.
    """
    kernel_template = """
                __kernel void sum(__global const double *a,
                __global const double *b, __global double *c)
                {
                            int loop;
                            int gid = get_global_id(0);
                            for(loop=0; loop<%s;loop++)
                            {
                                    c[gid] = a[gid] + b[gid];
                                    c[gid] = c[gid] * (a[gid] + b[gid]);
                                    c[gid] = c[gid] * (a[gid] / 2);
                                    c[gid] = log(exp(c[gid]));
                            }
                }
            """

    for platform in cl.get_platforms():
        # only the device at index 1 is exercised
        for device in [platform.get_devices()[1]]:
            print("===============================================================")
            print("Platform name:", platform.name)
            print("Platform profile:", platform.profile)
            print("Platform vendor:", platform.vendor)
            print("Platform version:", platform.version)
            print("---------------------------------------------------------------")
            print("Device name:", device.name)
            print("Device type:", cl.device_type.to_string(device.type))
            print("Device memory: ", device.global_mem_size//1024//1024, 'MB')
            print("Device max clock speed:", device.max_clock_frequency, 'MHz')
            print("Device compute units:", device.max_compute_units)

            # Simple speed test
            ctx = cl.Context([device])
            profiling = cl.command_queue_properties.PROFILING_ENABLE
            queue = cl.CommandQueue(ctx, properties=profiling)

            flags = cl.mem_flags
            host_input = flags.READ_ONLY | flags.COPY_HOST_PTR
            a_buf = cl.Buffer(ctx, host_input, hostbuf=a)
            b_buf = cl.Buffer(ctx, host_input, hostbuf=b)
            dest_buf = cl.Buffer(ctx, flags.WRITE_ONLY, b.nbytes)

            prg = cl.Program(ctx, kernel_template % (zz)).build()

            exec_evt = prg.sum(queue, a.shape, None, a_buf, b_buf, dest_buf)
            exec_evt.wait()
            # profiling counters are in nanoseconds
            elapsed = 1e-9*(exec_evt.profile.end - exec_evt.profile.start)

            print("Execution time of test: %g s" % elapsed)

            c = numpy.empty_like(a)
            cl.enqueue_read_buffer(queue, dest_buf, c).wait()

            # compare the first zz entries against the reference
            mismatch = False
            for i in range(zz):
                got = c[i]
                expected = c_result[i]
                if got != expected:
                    print("c_i: ", got, " c_results_i: ", expected)
                    print("diff: ", numpy.abs(got - expected))
                    mismatch = True
            if mismatch:
                print("Results doesn't match!!")
            else:
                print("Results OK")
Beispiel #46
0
def lomb_scargle32(x, y, f):
    '''Single-precision version of Lomb-Scargle.

    ``x``, ``y`` and ``f`` are converted to float32 and uploaded; the
    ``lombscargle`` kernel from ``lomb_txt32`` is run with one work-item per
    entry of ``f``. Returns an array shaped like ``f``.

    If the program fails to build, the build log of the first device is
    printed and the error is re-raised.
    '''
    x = np.float32(x)
    y = np.float32(y)
    f = np.float32(f)
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags
    # number of samples, passed to the kernel as a 32-bit int
    Nx = np.int32(x.shape[0])
    # send data to card
    x_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=x)
    y_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=y)
    f_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=f)
    # make output
    pgram = np.empty_like(f)
    pgram_g = cl.Buffer(ctx, mf.WRITE_ONLY, pgram.nbytes)
    prg = cl.Program(ctx, lomb_txt32)

    try:
        prg.build()
    except Exception:
        print("Error:")
        print(prg.get_build_info(ctx.devices[0], cl.program_build_info.LOG))
        raise

    prg.lombscargle(queue, pgram.shape, None, x_g, y_g, f_g, pgram_g, Nx)
    # Wait for the transfer: on PyOpenCL versions where the read defaults to
    # non-blocking, returning right away would hand back unfilled memory.
    cl.enqueue_read_buffer(queue, pgram_g, pgram).wait()

    return pgram
	def exchange_boundary_e(s):
		"""Exchange boundary E-field data between neighbouring devices.

		Every device except the first reads ``eh_fields[1]`` and
		``eh_fields[2]`` into its host scratch arrays; each scratch pair is then
		written into the previous device's fields at that device's offset.
		"""
		# Slice the inputs rather than the zip result: identical pairing, and
		# it also works on Python 3 where zip() is not subscriptable.
		for queue, eh_fields, tmpf in zip(s.queues[1:], s.eh_fields_gpus[1:], s.tmpfs[1:]):
			cl.enqueue_read_buffer(queue, eh_fields[1], tmpf[0])	# ey_gpu
			cl.enqueue_read_buffer(queue, eh_fields[2], tmpf[1])	# ez_gpu
		# NOTE(review): the reads above are not waited on before these writes
		# reuse the same host arrays on another queue - confirm this is safe.
		for queue, eh_fields, tmpf, offset in zip(s.queues[:-1], s.eh_fields_gpus[:-1], s.tmpfs[1:], s.offsets[:-1]):
			cl.enqueue_write_buffer(queue, eh_fields[1], tmpf[0], offset)
			cl.enqueue_write_buffer(queue, eh_fields[2], tmpf[1], offset)
    def execute(self):
        """
        Build and run the test OpenCL kernel and return its execution time.

        The kernel source is loaded from ``self.src``, ``matrix_mul`` is
        launched over an ``(m, p)`` global range with ``(block, block)``
        work-groups, and the result is read back into ``self.C``.

        Returns the kernel time in seconds, taken from the event's profiling
        counters (assumes ``self.queue`` was created with profiling enabled -
        TODO confirm).
        """

        kernel = self.LoadKernelSrc(self.src)

        # build opencl kernel
        prg = cl.Program(self.ctx, kernel).build()

        # Keep the event itself: Event.wait() returns None, so chaining
        # .wait() onto the launch would lose the profiling information.
        exec_evt = prg.matrix_mul(
            self.queue,
            (
                self.m,
                self.p,
            ),
            self.A_buf,
            self.B_buf,
            self.C_buf,
            np.uint32(self.m),
            np.uint32(self.n),
            np.uint32(self.p),
            local_size=(
                self.block,
                self.block,
            ),
        )
        exec_evt.wait()

        # read result from opencl buffer
        cl.enqueue_read_buffer(self.queue, self.C_buf, self.C).wait()

        # profiling counters are in nanoseconds; return seconds
        return 1e-9 * (exec_evt.profile.end - exec_evt.profile.start)
Beispiel #49
0
def do_opencl_pow(hash, target):
    """Search for a proof-of-work value with the ``kernel_sha512`` kernel.

    Returns 0 immediately when ``ctx == False``. Otherwise the kernel is
    launched repeatedly with an increasing start offset until it writes a
    non-zero value, which is returned.
    """
    global ctx, queue, program, gpus, hash_dt

    found = numpy.zeros(1, dtype=[("v", numpy.uint64, 1)])
    if ctx == False:
        return found[0][0]

    # single work record: 8 zero bytes followed by the decoded hash, plus target
    record = numpy.zeros(1, dtype=hash_dt, order="C")
    record[0]["v"] = ("0000000000000000" + hash).decode("hex")
    record[0]["target"] = target

    flags = cl.mem_flags
    hash_buf = cl.Buffer(ctx, flags.READ_ONLY | flags.COPY_HOST_PTR,
                         hostbuf=record)
    dest_buf = cl.Buffer(ctx, flags.WRITE_ONLY, found.nbytes)

    kernel = program.kernel_sha512
    worksize = kernel.get_work_group_info(
        cl.kernel_work_group_info.WORK_GROUP_SIZE, gpus[0])

    kernel.set_arg(0, hash_buf)
    kernel.set_arg(1, dest_buf)

    start = time.time()
    progress = 0
    # each launch covers 2000 work-groups
    globamt = worksize * 2000

    while found[0][0] == 0:
        # argument 2 is the start offset as a little-endian 64-bit value
        kernel.set_arg(2, pack("<Q", progress))
        cl.enqueue_nd_range_kernel(queue, kernel, (globamt,), (worksize,))
        cl.enqueue_read_buffer(queue, dest_buf, found)
        queue.finish()
        progress = progress + globamt
        sofar = time.time() - start
        # logger.debug("Working for %.3fs, %.2f Mh/s", sofar, (progress / sofar) / 1000000)
    taken = time.time() - start
    # logger.debug("Took %d tries.", progress)
    return found[0][0]
Beispiel #50
0
 def __call__(self, ctx, x, y, rx, ry, sw, sh, ez, ex, ey):
     """Run the ``distance_2_point`` kernel and return its float32 output.

     Array arguments are converted to float32 arrays and uploaded; the
     scalar arguments are passed as ``np.float32``. One work-item runs per
     element of ``x`` and the result has the shape of ``x``. All device
     buffers are released before returning.
     """
     self.build(ctx)

     # float32 host arrays, converted in the original order
     x, y, ez, ex, ey = [np.array(arr, dtype=np.float32, copy=False)
                         for arr in (x, y, ez, ex, ey)]
     # float32 scalars
     rx, ry, sw, sh = [np.float32(val) for val in (rx, ry, sw, sh)]

     host_input = cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR
     x_buf, y_buf, ez_buf, ex_buf, ey_buf = [
         cl.Buffer(self.ctx, host_input, hostbuf=arr)
         for arr in (x, y, ez, ex, ey)]
     out_buf = cl.Buffer(self.ctx, cl.mem_flags.WRITE_ONLY, x.nbytes)

     queue = cl.CommandQueue(self.ctx)
     self.prg.distance_2_point(queue, x.shape, None,
                               x_buf, y_buf, rx, ry, sw, sh,
                               ez_buf, ex_buf, ey_buf, out_buf)
     out = np.empty_like(x)
     cl.enqueue_read_buffer(queue, out_buf, out).wait()

     # release device memory explicitly
     for device_buf in (x_buf, y_buf, ez_buf, ex_buf, ey_buf, out_buf):
         device_buf.release()
     return out
Beispiel #51
0
 def copy_array(self, arr_like, arr_device):
     """
     Copy a device buffer to the host and return it.

     The host array is allocated with ``np.empty_like(arr_like)``.
     """
     host_copy = np.empty_like(arr_like)
     read_event = cl.enqueue_read_buffer(self.queue, arr_device, host_copy)
     read_event.wait()
     return host_copy
Beispiel #52
0
 def __call__(self, ctx, x1, y1, x2, y2, rx, ry, sw, sh, ez, ex, ey):
     """Run the ``subtended_angle_naive`` kernel and return its float32 output.

     Array arguments are converted to float32 arrays and uploaded; the
     scalar arguments are passed as ``np.float32``. One work-item runs per
     element of ``x1`` and the result has the shape of ``x1``. All device
     buffers are released before returning.
     """
     self.build(ctx)

     # float32 host arrays, converted in the original order
     x1, y1, x2, y2, ez, ex, ey = [
         np.array(arr, dtype=np.float32, copy=False)
         for arr in (x1, y1, x2, y2, ez, ex, ey)]
     # float32 scalars
     rx, ry, sw, sh = [np.float32(val) for val in (rx, ry, sw, sh)]

     host_input = cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR
     x1_buf, y1_buf, x2_buf, y2_buf, ez_buf, ex_buf, ey_buf = [
         cl.Buffer(self.ctx, host_input, hostbuf=arr)
         for arr in (x1, y1, x2, y2, ez, ex, ey)]
     out_buf = cl.Buffer(self.ctx, cl.mem_flags.WRITE_ONLY, x1.nbytes)

     queue = cl.CommandQueue(self.ctx)
     self.prg.subtended_angle_naive(queue, x1.shape, None,
                                    x1_buf, y1_buf, x2_buf, y2_buf,
                                    rx, ry, sw, sh,
                                    ez_buf, ex_buf, ey_buf, out_buf)
     out = np.empty_like(x1)
     cl.enqueue_read_buffer(queue, out_buf, out).wait()

     # release device memory explicitly
     for device_buf in (x1_buf, y1_buf, x2_buf, y2_buf,
                        ez_buf, ex_buf, ey_buf, out_buf):
         device_buf.release()
     return out
Beispiel #53
0
    def mineThread(self):
        """Consume work items from ``self.qr`` and run the search kernel on them.

        For every iteration of a work item the kernel is launched, the output
        buffer is read back, and if the flag at index ``OUTPUT_SIZE`` is set
        a copy of the output is handed to ``postprocess`` on the reactor
        thread before the buffer is zeroed again.
        """
        for data in self.qr:
            for i in range(data.iterations):
                state, state2, f = data.state, data.state2, data.f
                kernel_args = (
                    [state[k] for k in (0, 1, 2, 3, 4, 5, 6, 7)]
                    + [state2[k] for k in (1, 2, 3, 5, 6, 7)]
                    + [data.base[i]]
                    + [f[k] for k in (1, 2, 3, 4, 5, 6, 7, 8)]
                    + [self.output_buf]
                )
                self.kernel.search(
                    self.commandQueue, (data.size, ), (self.WORKSIZE, ),
                    *kernel_args)
                cl.enqueue_read_buffer(
                    self.commandQueue, self.output_buf, self.output)
                self.commandQueue.finish()

                # The kernel flags the last item of the output buffer when
                # it finds a valid nonce; nothing more to do when unset.
                if not self.output[self.OUTPUT_SIZE]:
                    continue

                # Hand a copy to the main thread, then reset the buffer
                # for the next pass.
                reactor.callFromThread(self.postprocess,
                                       self.output.copy(), data.nr)
                self.output.fill(0)
                cl.enqueue_write_buffer(
                    self.commandQueue, self.output_buf, self.output)
Beispiel #54
0
	def fromDevice(self, buf, shape=None):
		"""Read a device buffer into a new host array and return it.

		The host array uses ``buf.dtype`` and ``shape``, falling back to
		``buf.shape`` when no shape is given.
		"""
		host_shape = buf.shape if shape is None else shape
		host_array = numpy.empty(host_shape, dtype=buf.dtype)
		read_event = cl.enqueue_read_buffer(self.queue, buf, host_array)
		read_event.wait()
		return host_array
Beispiel #55
0
    def mineThread(self):
        """Consume work items from ``self.qr`` and run the search kernel on them.

        When ``self.GOFFSET`` is set, the unpacked base of the iteration is
        passed as the kernel's global offset. After each launch the output
        buffer is read back, and if the flag at index ``WORKSIZE`` is set a
        copy of the output is handed to ``postprocess`` on the reactor thread
        before the buffer is zeroed again.
        """
        for data in self.qr:
            for i in range(data.iterations):
                if self.GOFFSET:
                    offset = (unpack('I', data.base[i])[0],)
                else:
                    offset = None

                state, state2, f = data.state, data.state2, data.f
                kernel_args = (
                    [state[k] for k in (0, 1, 2, 3, 4, 5, 6, 7)]
                    + [state2[k] for k in (1, 2, 3, 5, 6, 7)]
                    + [data.base[i]]
                    + [f[k] for k in (0, 1, 2, 3, 4, 5, 6, 7)]
                    + [self.output_buf]
                )
                self.kernel.search(
                    self.commandQueue, (data.size, ), (self.WORKSIZE, ),
                    *kernel_args, global_offset=offset)
                cl.enqueue_read_buffer(self.commandQueue, self.output_buf,
                                       self.output, is_blocking=False)
                self.commandQueue.finish()

                # The kernel flags the last item of the output buffer when
                # it finds a valid nonce; nothing more to do when unset.
                if not self.output[self.WORKSIZE]:
                    continue

                # Hand a copy to the main thread, then reset the buffer
                # for the next pass.
                reactor.callFromThread(self.postprocess,
                                       self.output.copy(), data.nr)
                self.output.fill(0)
                cl.enqueue_write_buffer(self.commandQueue, self.output_buf,
                                        self.output, is_blocking=False)
Beispiel #56
0
    def transform_uint32(self,
                         data_np,
                         flip_x=False,
                         flip_y=False,
                         swap_xy=False,
                         out=None):
        """Flip and/or transpose an image with ``image_transform_uint32``.

        The first two axes of ``data_np`` are taken as (height, width); they
        are exchanged in the output shape when ``swap_xy`` is set. The result
        is read into ``out`` when given, otherwise into a new array, and that
        array is returned.
        """
        src_ht, src_wd = data_np.shape[:2]

        # output dimensions are exchanged when the axes are swapped
        dst_ht, dst_wd = (src_wd, src_ht) if swap_xy else (src_ht, src_wd)
        dst_shape = [dst_ht, dst_wd] + list(data_np.shape[2:])

        flags = cl.mem_flags

        # the kernel reads from a contiguous copy of the source
        data_np = np.ascontiguousarray(data_np)
        src_buf = cl.Buffer(self.ctx,
                            flags.READ_ONLY | flags.COPY_HOST_PTR,
                            hostbuf=data_np)
        dst_buf = cl.Buffer(self.ctx, flags.WRITE_ONLY, data_np.nbytes)

        # the three options are passed to the kernel as int32 flags
        option_flags = [np.int32(opt) for opt in (flip_x, flip_y, swap_xy)]
        self.program.image_transform_uint32(
            self.queue, [src_ht, src_wd], None,
            src_buf, dst_buf,
            np.int32(src_wd), np.int32(src_ht),
            *option_flags)

        if out is None:
            out = np.empty_like(data_np).reshape(dst_shape)
        read_event = cl.enqueue_read_buffer(self.queue, dst_buf, out)
        read_event.wait()

        return out
Beispiel #57
0
 def execute(self):
     self.program.part1(self.queue, self.a.shape, None, self.a_buf, self.b_buf, self.dest_buf)
     c = numpy.empty_like(self.a)
     cl.enqueue_read_buffer(self.queue, self.dest_buf, c).wait()
     print "a", self.a
     print "b", self.b
     print "c", c
Beispiel #58
0
    def resize_uint32(self, data_np, scale_x, scale_y, out=None):
        """Resize an image on the device with the ``image_resize_uint32`` kernel.

        The first two axes of ``data_np`` are taken as (height, width) and
        scaled by ``scale_y`` and ``scale_x`` (truncated to int). The result
        is read into ``out`` when given, otherwise into a new array with the
        dtype of ``data_np``; that array is returned.
        """
        old_ht, old_wd = data_np.shape[:2]

        scaled_ht = int(old_ht * scale_y)
        scaled_wd = int(old_wd * scale_x)
        out_shape = [scaled_ht, scaled_wd]
        out_shape = out_shape + list(data_np.shape[2:])

        mem = cl.mem_flags

        # the kernel reads from a contiguous copy of the source
        data_np = np.ascontiguousarray(data_np)
        src_buf = cl.Buffer(self.ctx,
                            mem.READ_ONLY | mem.COPY_HOST_PTR,
                            hostbuf=data_np)

        # the destination holds one uint32 per output pixel
        pixel_bytes = np.uint32(0).nbytes
        dst_bytes = scaled_ht * scaled_wd * pixel_bytes
        dst_buf = cl.Buffer(self.ctx, mem.WRITE_ONLY, dst_bytes)

        kernel_scalars = (np.int32(old_wd), np.int32(scaled_wd),
                          np.float64(scale_x), np.float64(scale_y))
        self.program.image_resize_uint32(self.queue, [scaled_ht, scaled_wd],
                                         None, src_buf, dst_buf,
                                         *kernel_scalars)

        if out is None:
            out = np.empty(out_shape, dtype=data_np.dtype)
        cl.enqueue_read_buffer(self.queue, dst_buf, out).wait()

        return out
Beispiel #59
0
	def prepare_environment(self, filename, camera_index):
		"""Build the OpenCL program from ``filename`` and run the camera loop.

		The kernel source is read from ``filename`` and built into
		``self.program``.  Frames are then captured from the camera with
		index ``camera_index``; for every interior row of each frame the
		``calculate_differences`` kernel is run on that row together with
		the rows directly above and below it, and the assembled result is
		shown in the "camera_window" window.  The loop ends when
		``cv.WaitKey`` reports key code 27 (Escape), after which the window
		is destroyed.

		Parameters:
			filename: path of the OpenCL kernel source file.
			camera_index: camera index handed to ``cv.CaptureFromCAM``.
		"""
		# build kernel for videocard; the context manager guarantees the
		# source file is closed even if reading fails
		with open(filename, 'r') as kernel_file:
			kernel_string = kernel_file.read()
		self.program = cl.Program(self.gpu_context, kernel_string).build()

		# get frames from the webcam
		self.stream = cv.CaptureFromCAM(camera_index)
		self.line_cols = cv.GetMat(cv.QueryFrame(self.stream)).cols

		# loop-invariant flag combination for the three input buffers
		read_only_copy = (self.memory_flags.READ_ONLY |
		                  self.memory_flags.COPY_HOST_PTR)

		while True:
			self.frame = cv.QueryFrame(self.stream)
			self.frame = cv.GetMat(self.frame)
			self.image_data = numpy.asarray(self.frame)
			self.image_data = numpy.array(self.image_data, dtype=numpy.int32)
			final = numpy.zeros(shape=(self.image_data.shape))
			last_row = self.image_data.shape[0] - 1
			for position, line in enumerate(self.image_data):
				# the first and last rows lack a neighbour above/below
				if position == 0 or position == last_row:
					continue
				line = line.ravel()
				self.line_buffer = cl.Buffer(
					self.gpu_context, read_only_copy, hostbuf=line)
				self.top_line_buffer = cl.Buffer(
					self.gpu_context, read_only_copy,
					hostbuf=self.image_data[position - 1])
				self.bottom_line_buffer = cl.Buffer(
					self.gpu_context, read_only_copy,
					hostbuf=self.image_data[position + 1])
				self.contour_buffer = cl.Buffer(
					self.gpu_context, self.memory_flags.WRITE_ONLY,
					line.nbytes)
				self.program.calculate_differences(
					self.command_queue, line.shape, None,
					self.top_line_buffer, self.line_buffer,
					self.bottom_line_buffer, self.contour_buffer)
				contour = numpy.empty_like(line)
				cl.enqueue_read_buffer(
					self.command_queue, self.contour_buffer, contour).wait()
				# back from the flat kernel output to (columns, 3)
				final[position] = contour.reshape(self.line_cols, 3)
			img = numpy.uint8(final)
			img = cv.fromarray(img)
			cv.ShowImage("camera_window", img)
			if cv.WaitKey(10) == 27:
				break
		# The original had the fused token ``breakcv.DestroyWindow(...)``,
		# which raised NameError instead of leaving the loop.
		cv.DestroyWindow("camera_window")
Beispiel #60
0
 def subtended_angle(self, x1, y1, x2, y2, rx, ry, sw, sh, ez, ex, ey):
     """Run the ``cl_subtended_angle`` kernel and return its output array.

     ``x1, y1, x2, y2, ez, ex, ey`` are converted to float32 arrays and
     uploaded as read-only device buffers; ``rx, ry, sw, sh`` are passed
     to the kernel as float32 scalars.  The kernel is launched over
     ``x1.shape`` and writes into a buffer of ``x1.nbytes`` bytes.

     Returns:
         A float32 array shaped like ``x1`` holding the kernel output.
     """
     def as_float32(values):
         # asarray copies only when a conversion is required.  This is what
         # ``np.array(..., copy=False)`` did on NumPy 1.x; on NumPy 2 that
         # spelling raises when a copy is unavoidable.
         return np.asarray(values, dtype=np.float32)

     def upload(host_array):
         # Read-only device buffer initialised from the host array.
         return cl.Buffer(
             self.ctx,
             cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
             hostbuf=host_array)

     x1 = as_float32(x1)
     y1 = as_float32(y1)
     x2 = as_float32(x2)
     y2 = as_float32(y2)
     ez = as_float32(ez)
     ex = as_float32(ex)
     ey = as_float32(ey)
     rx = np.float32(rx)
     ry = np.float32(ry)
     sw = np.float32(sw)
     sh = np.float32(sh)

     x1_buf = upload(x1)
     y1_buf = upload(y1)
     x2_buf = upload(x2)
     y2_buf = upload(y2)
     ez_buf = upload(ez)
     ex_buf = upload(ex)
     ey_buf = upload(ey)
     out_buf = cl.Buffer(self.ctx, cl.mem_flags.WRITE_ONLY, x1.nbytes)

     self.cl_subtended_angle(self.queue, x1.shape, None,
                             x1_buf, y1_buf, x2_buf, y2_buf,
                             rx, ry, sw, sh,
                             ez_buf, ex_buf, ey_buf, out_buf)
     self.queue.finish()

     out = np.empty_like(x1)
     # Wait for the read-back to complete; without this the array could be
     # returned before the transfer has finished when the read is
     # non-blocking.
     cl.enqueue_read_buffer(self.queue, out_buf, out).wait()
     return out