def setup_opencl(data, cube_size):
    """Run the ``score_matrix_to_rms`` kernel and return its score array.

    Creates a context over the devices reported by ``pycl.clGetDeviceIDs()``,
    builds the program from the module-level ``SOURCE``, uploads the channel
    arrays found in ``data``, launches the kernel with a global size of
    ``cube_size ** 3`` and reads the score buffer back to the host.

    Args:
        data: Mapping providing the keys ``'in_r'``, ``'in_g'``, ``'in_b'``
            and ``'out_r'``. Each value is handed to
            ``pycl.buffer_from_pyarray`` (presumably float ``array.array``
            objects -- confirm against the caller).
        cube_size: Passed to the kernel as its last integer argument; its
            cube is both the number of score slots and the global work size.

    Returns:
        The host-side array produced by ``pycl.buffer_to_pyarray`` for the
        score buffer (created ``like`` a float array of ``cube_size ** 3``
        zeros).
    """
    import pycl

    blocking = True

    with timeify("Making context, loading kernel"):
        devices = pycl.clGetDeviceIDs()
        ctx = pycl.clCreateContext(devices=devices)
        queue = pycl.clCreateCommandQueue(ctx)
        program = pycl.clCreateProgramWithSource(ctx, SOURCE).build()
        score_matrix = program['score_matrix_to_rms']
        # Five buffer arguments followed by two integer arguments.
        score_matrix.argtypes = (pycl.cl_mem, pycl.cl_mem, pycl.cl_mem,
                                 pycl.cl_mem, pycl.cl_mem,
                                 pycl.cl_int, pycl.cl_int)

    sub_divisions = cube_size ** 3

    with timeify("Creating buffers"):
        in_r_buf, in_evt1 = pycl.buffer_from_pyarray(queue, data['in_r'], blocking=blocking)
        in_g_buf, in_evt2 = pycl.buffer_from_pyarray(queue, data['in_g'], blocking=blocking)
        in_b_buf, in_evt3 = pycl.buffer_from_pyarray(queue, data['in_b'], blocking=blocking)
        out_r = data['out_r']
        # out_r is uploaded and its event is waited on below, but the buffer
        # itself is not passed to the kernel (see the note at the launch).
        out_r_buf, in_evt4 = pycl.buffer_from_pyarray(queue, out_r, blocking=blocking)
        # One zero-initialised float slot per sub-division.
        score = array.array('f', [0] * sub_divisions)
        score_buf, in_evt5 = pycl.buffer_from_pyarray(queue, score, blocking=blocking)

    with timeify("Run kernel r"):
        # NOTE(review): the fourth buffer argument is in_r_buf, not out_r_buf.
        # An earlier, commented-out variant passed out_r_buf here; this looks
        # like a deliberate self-comparison left in for debugging -- confirm.
        # NOTE(review): wait_for is handed to the argument-binding call rather
        # than to .on(); kept as-is -- confirm pycl honours it there.
        run_evt = score_matrix(
            in_r_buf, in_g_buf, in_b_buf, in_r_buf, score_buf,
            len(data['in_r']), cube_size,
            wait_for=[in_evt1, in_evt2, in_evt3, in_evt4, in_evt5]).on(queue, sub_divisions)

    with timeify("Retrive data"):
        score_from_gpu, evt = pycl.buffer_to_pyarray(queue, score_buf, wait_for=run_evt, like=score)
        # Make sure the read-back has finished before handing the host array
        # to the caller; a no-op if the transfer already completed.
        evt.wait()

    return score_from_gpu
def run_conv(prog, queue):
    """Launch one fixed convolution kernel, then re-launch it 1000 times.

    Looks the kernel up in ``prog`` by its full generated name, uploads a
    100-million-element float array as the input buffer, allocates filter,
    bias and output buffers of the same shape, runs the kernel once and reads
    the output back, then re-enqueues the same kernel 1000 times, waiting for
    each launch to finish. Progress markers and the first ten values of the
    first read-back are printed.

    Args:
        prog: Built program object, indexable by kernel name.
        queue: Command queue used for the upload, the launches and the
            read-back.

    Returns:
        None.
    """
    func = prog['conv__num_imgs_20__in_pad_3__in_dim_0_227__in_dim_1_227__conv_has_relu_1__kern_sz_7__stride_2__out_chans_64__in_chans_3']
    print(func)
    func.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_mem, cl.cl_mem)

    in_ar = cl.array('f', range(100 * 1000 * 1000))
    in_buf, in_evt = cl.buffer_from_pyarray(queue, in_ar, blocking=False)
    filt_buf = in_buf.empty_like_this()
    bias_buf = in_buf.empty_like_this()
    out_buf = in_buf.empty_like_this()

    # Arguments are bound one by one and the launch goes through
    # clEnqueueNDRangeKernel directly, so the loop below can re-enqueue the
    # kernel without binding the arguments again.
    func.setarg(0, filt_buf)
    func.setarg(1, bias_buf)
    func.setarg(2, in_buf)
    func.setarg(3, out_buf)

    # Launch geometry shared by the first run and every repeat.
    gsize = (120 * 2166,)
    lsize = (120,)

    # First launch waits for the (non-blocking) input upload.
    run_evt = cl.clEnqueueNDRangeKernel(queue, func, gsize=gsize, lsize=lsize, wait_for=in_evt)
    out, evt = cl.buffer_to_pyarray(queue, out_buf, wait_for=run_evt, like=in_ar)
    print("start wait")
    evt.wait()
    print("end wait, start loop")

    for _ in range(1000):
        run_evt = cl.clEnqueueNDRangeKernel(queue, func, gsize=gsize, lsize=lsize)
        run_evt.wait()
    print("end loop")

    # `out` is the read-back taken after the first launch only.
    print(out[0:10])
def run_conv(prog, queue):
    """Time-style driver for one fixed convolution kernel taken from ``prog``.

    Steps, in order:

    1. Fetch the kernel by its full generated name and print the kernel object.
    2. Upload a float array of 100 * 1000 * 1000 elements (non-blocking) and
       create filter, bias and output buffers via ``empty_like_this()``.
    3. Bind the four buffers with ``setarg`` and enqueue the kernel once,
       waiting on the upload event.
    4. Read the output buffer back and wait for that read to complete.
    5. Re-enqueue the kernel 1000 times, waiting on each launch.
    6. Print the first ten values of the read-back from step 4.

    Args:
        prog: Built program object, indexable by kernel name.
        queue: Command queue every operation is enqueued on.

    Returns:
        None.
    """
    func = prog[
        "conv__num_imgs_20__in_pad_3__in_dim_0_227__in_dim_1_227__conv_has_relu_1__kern_sz_7__stride_2__out_chans_64__in_chans_3"
    ]
    print(func)
    func.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_mem, cl.cl_mem)

    in_ar = cl.array("f", range(100 * 1000 * 1000))
    in_buf, in_evt = cl.buffer_from_pyarray(queue, in_ar, blocking=False)
    filt_buf = in_buf.empty_like_this()
    bias_buf = in_buf.empty_like_this()
    out_buf = in_buf.empty_like_this()

    # Bind arguments explicitly; they stay bound for every later enqueue of
    # this kernel, which is why the repeat loop does not set them again.
    for index, buf in enumerate((filt_buf, bias_buf, in_buf, out_buf)):
        func.setarg(index, buf)

    # Same global/local sizes for the initial launch and all repeats.
    launch = {"gsize": (120 * 2166,), "lsize": (120,)}

    run_evt = cl.clEnqueueNDRangeKernel(queue, func, wait_for=in_evt, **launch)
    out, evt = cl.buffer_to_pyarray(queue, out_buf, wait_for=run_evt, like=in_ar)
    print("start wait")
    evt.wait()
    print("end wait, start loop")

    for _ in range(1000):
        run_evt = cl.clEnqueueNDRangeKernel(queue, func, **launch)
        run_evt.wait()
    print("end loop")

    # Values come from the single read-back above, not from the repeat loop.
    print(out[0:10])
def setup_opencl(data, cube_size):
    """Build the OpenCL program, score the input on the device, return scores.

    The function creates a context and command queue for the devices returned
    by ``pycl.clGetDeviceIDs()``, compiles the module-level ``SOURCE``,
    uploads the arrays in ``data``, enqueues ``score_matrix_to_rms`` with a
    global size of ``cube_size ** 3`` and copies the score buffer back.

    Args:
        data: Mapping with the keys ``'in_r'``, ``'in_g'``, ``'in_b'`` and
            ``'out_r'``; every value is uploaded with
            ``pycl.buffer_from_pyarray`` (presumably float ``array.array``
            objects -- verify against the caller).
        cube_size: Forwarded to the kernel as its final integer argument;
            ``cube_size ** 3`` sizes both the score array and the launch.

    Returns:
        The host array returned by ``pycl.buffer_to_pyarray`` for the score
        buffer.
    """
    import pycl

    blocking = True

    with timeify("Making context, loading kernel"):
        devices = pycl.clGetDeviceIDs()
        ctx = pycl.clCreateContext(devices=devices)
        queue = pycl.clCreateCommandQueue(ctx)
        program = pycl.clCreateProgramWithSource(ctx, SOURCE).build()
        score_matrix = program['score_matrix_to_rms']
        # Kernel signature as declared here: five buffers, then two ints.
        score_matrix.argtypes = (pycl.cl_mem, pycl.cl_mem, pycl.cl_mem,
                                 pycl.cl_mem, pycl.cl_mem,
                                 pycl.cl_int, pycl.cl_int)

    sub_divisions = cube_size ** 3

    with timeify("Creating buffers"):
        in_r_buf, in_evt1 = pycl.buffer_from_pyarray(queue, data['in_r'], blocking=blocking)
        in_g_buf, in_evt2 = pycl.buffer_from_pyarray(queue, data['in_g'], blocking=blocking)
        in_b_buf, in_evt3 = pycl.buffer_from_pyarray(queue, data['in_b'], blocking=blocking)
        out_r = data['out_r']
        # Uploaded and waited on, although the launch below does not pass
        # this buffer to the kernel.
        out_r_buf, in_evt4 = pycl.buffer_from_pyarray(queue, out_r, blocking=blocking)
        # Zero-filled float array with one entry per sub-division.
        score = array.array('f', [0] * sub_divisions)
        score_buf, in_evt5 = pycl.buffer_from_pyarray(queue, score, blocking=blocking)

    upload_events = [in_evt1, in_evt2, in_evt3, in_evt4, in_evt5]

    with timeify("Run kernel r"):
        # NOTE(review): in_r_buf is passed twice (positions 1 and 4); a
        # previously commented-out call used out_r_buf in position 4.
        # Presumably a debugging self-comparison -- confirm before changing.
        # NOTE(review): wait_for is passed to the argument-binding call, not
        # to .on(); left unchanged -- confirm pycl accepts it there.
        run_evt = score_matrix(
            in_r_buf, in_g_buf, in_b_buf, in_r_buf, score_buf,
            len(data['in_r']), cube_size,
            wait_for=upload_events).on(queue, sub_divisions)

    with timeify("Retrive data"):
        score_from_gpu, evt = pycl.buffer_to_pyarray(queue, score_buf, wait_for=run_evt, like=score)
        # Block until the read-back event completes so the caller never sees
        # a partially filled array; returns immediately if already done.
        evt.wait()

    return score_from_gpu
def run_mxplusb(prog, queue):
    """Run the ``mxplusb`` kernel over the values 0..99 and print ten results.

    The kernel is called with the scalar arguments 2 and 5 around an input
    buffer holding ``range(100)`` and an output buffer of the same shape
    (presumably computing ``2 * x + 5`` per element -- the kernel source is
    not visible here).

    Args:
        prog: Built program object, indexable by kernel name.
        queue: Command queue used for the upload, launch and read-back.

    Returns:
        None. The kernel object and the first ten output values are printed.
    """
    func = prog['mxplusb']
    print(func)
    func.argtypes = (cl.cl_float, cl.cl_mem, cl.cl_float, cl.cl_mem)

    x = cl.array('f', range(100))
    # Non-blocking upload; the launch below waits on its event.
    x_buf, in_evt = cl.buffer_from_pyarray(queue, x, blocking=False)
    y_buf = x_buf.empty_like_this()

    # Global size is one work-item per input element.
    run_evt = func(2, x_buf, 5, y_buf).on(queue, len(x), wait_for=in_evt)
    y, evt = cl.buffer_to_pyarray(queue, y_buf, wait_for=run_evt, like=x)
    # Wait for the read-back before touching the host array.
    evt.wait()
    print(y[0:10])
def run_mxplusb(prog, queue):
    """Smoke-test the ``mxplusb`` kernel from ``prog`` on a 100-element input.

    Uploads ``range(100)`` as floats, launches the kernel with the scalars
    2 and 5 and an equally sized output buffer, reads the output back and
    prints its first ten entries. (The name suggests ``m * x + b``; the
    kernel source is not part of this block, so treat that as an assumption.)

    Args:
        prog: Built program object, indexable by kernel name.
        queue: Command queue every operation is enqueued on.

    Returns:
        None.
    """
    func = prog["mxplusb"]
    print(func)
    func.argtypes = (cl.cl_float, cl.cl_mem, cl.cl_float, cl.cl_mem)

    x = cl.array("f", range(100))
    x_buf, in_evt = cl.buffer_from_pyarray(queue, x, blocking=False)
    y_buf = x_buf.empty_like_this()

    # Bind the arguments, then enqueue once the upload event has fired.
    bound = func(2, x_buf, 5, y_buf)
    run_evt = bound.on(queue, len(x), wait_for=in_evt)

    y, evt = cl.buffer_to_pyarray(queue, y_buf, wait_for=run_evt, like=x)
    # The host array is only safe to read after the read-back event completes.
    evt.wait()
    print(y[0:10])