# OpenCL timing harness for the generated `mt_brute` kernel: build the kernel,
# launch it once with base 0, then TEST_ITERATIONS more times, reading the
# result buffer back into host memory after every launch.
# NOTE(review): `np`, `cl` and `time` are used below but are not imported in
# the visible part of this file -- presumably imported elsewhere; confirm.
from shared_gpu_kernels import gen_kernel
from config import (
    SIGNIFICANT_LENGTH,
    SIZE,
    MT_N,
    M,
    STATE_SIZE,
    TEST_ITERATIONS,
)

# Host array that receives the contents of MT_state_res_buf after each launch.
MT_state_result = np.zeros((SIGNIFICANT_LENGTH, SIZE), dtype=np.uint32)

ctx = cl.create_some_context()

# Two queues are created with profiling enabled; only queue_instruction is
# actually used by the code visible here.
profiling_flag = cl.command_queue_properties.PROFILING_ENABLE
queue_instruction = cl.CommandQueue(ctx, properties=profiling_flag)
queue_data = cl.CommandQueue(ctx, properties=profiling_flag)

mf = cl.mem_flags
# Presumably SIZE generator states of MT_N 32-bit words each -- TODO confirm.
MT_state_buf = cl.Buffer(ctx, mf.WRITE_ONLY, SIZE * MT_N * 4)
MT_state_res_buf = cl.Buffer(ctx, mf.WRITE_ONLY, MT_state_result.nbytes)

kernel_source = gen_kernel(MT_N, STATE_SIZE, M, SIZE, SIGNIFICANT_LENGTH)
prg = cl.Program(ctx, kernel_source).build()

# Launch geometry shared by every kernel invocation below.
global_size = (SIZE,)
local_size = (STATE_SIZE,)

# Marker and wall-clock timestamp taken right before the first launch.
z = cl.enqueue_marker(queue_instruction)
zzz = time.time()

# Initial launch with base 0, followed by a read-back that waits on it.
instr_event = prg.mt_brute(
    queue_instruction,
    global_size,
    local_size,
    np.uint32(0),
    MT_state_buf,
    MT_state_res_buf,
)
data_event = cl.enqueue_copy(
    queue_instruction,
    MT_state_result,
    MT_state_res_buf,
    wait_for=[instr_event],
)

# An earlier revision apparently iterated 2**31 / SIZE times here instead of
# TEST_ITERATIONS.
for i in xrange(TEST_ITERATIONS):
    # Each launch waits for the previous read-back, and each read-back waits
    # for the launch that produced its data.
    instr_event = prg.mt_brute(
        queue_instruction,
        global_size,
        local_size,
        np.uint32(i * SIZE),
        MT_state_buf,
        MT_state_res_buf,
        wait_for=[data_event],
    )
    data_event = cl.enqueue_copy(
        queue_instruction,
        MT_state_result,
        MT_state_res_buf,
        wait_for=[instr_event],
    )
    # NOTE(review): the collapsed original does not show whether this wait
    # sat inside or after the loop; it is kept per iteration here, which is
    # where the (removed) per-row dump of MT_state_result[0] would have run.
    data_event.wait()

# Closing marker on the same queue as the opening one.
z2 = cl.enqueue_marker(queue_instruction)
from shared_gpu_kernels import gen_kernel, transform_to_cuda from config import SIGNIFICANT_LENGTH, SIZE, MT_N, M, STATE_SIZE, TEST_ITERATIONS MT_state_result = np.zeros((SIGNIFICANT_LENGTH, SIZE)).astype(np.uint32) Stream = drv.Stream() Stream2 = drv.Stream() MT_state_buf = drv.mem_alloc(SIZE * MT_N * 4) MT_state_res_buf = drv.mem_alloc(MT_state_result.nbytes) prg = SourceModule( transform_to_cuda( gen_kernel(MT_N, STATE_SIZE, M, SIZE, SIGNIFICANT_LENGTH) ) ) prog = prg.get_function('mt_brute') zzz = time.time() ev = prog(np.uint32(0), MT_state_buf, MT_state_res_buf, block=(STATE_SIZE, 1, 1), grid=(SIZE/STATE_SIZE, 1), stream=Stream) drv.memcpy_dtoh_async(MT_state_result, MT_state_res_buf, stream=Stream2) for i in xrange(TEST_ITERATIONS): prog(np.uint32(i*SIZE), MT_state_buf, MT_state_res_buf, block=(STATE_SIZE, 1, 1), grid=(SIZE/STATE_SIZE, 1), stream=Stream) drv.memcpy_dtoh(MT_state_result, MT_state_res_buf)#, stream=Stream2) zzz = time.time() - zzz print '>>>', zzz