def TestCompileExec():
    """Compile a small IL kernel with cal_exec, run it once, and print timings.

    The kernel text samples resource 0 at ``v0.xyxx`` into ``o0`` and moves
    ``r0`` into ``g[0]``.  Two SIZE x SIZE ``FMT_FLOAT32_4`` buffers obtained
    from ``cal_exec.alloc_remote`` are passed as the remote bindings ``i0``
    and ``o0``; ``o1`` and ``g[]`` are passed as local bindings described by
    (SIZE-or-4096, SIZE-or-4096, format) tuples.  The compile time and the
    run time (wall clock, seconds) are printed to stdout.

    The buffers and the compiled image are released in a ``finally`` clause,
    so they are not leaked when an allocation or ``run_stream`` raises.

    Returns None.
    """
    import time

    SIZE = 1024

    # Adjacent string literals are concatenated by the parser; the resulting
    # kernel text is identical to the original '+' chain.  The commented-out
    # lines are IL statements the original author had disabled.
    kernel = (
        "il_ps_2_0\n"
        "dcl_input_position_interp(linear_noperspective) v0\n"
        "dcl_output_generic o0\n"
        "dcl_output_generic o1\n"
        # "dcl_output_generic o2\n"
        "dcl_resource_id(0)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float)\n"
        # "mov r0, g[0]\n"
        "sample_resource(0)_sampler(0) o0, v0.xyxx\n"
        "mov g[0], r0\n"
        "end\n"
    )

    # Each print call receives one pre-formatted string, which prints the
    # same bytes as the Python 2 statement form `print "label", value`.
    started = time.time()
    image = cal_exec.compile(kernel)
    finished = time.time()
    print("compile time %s" % (finished - started,))

    # Buffers are recorded as soon as they exist so that the finally clause
    # frees exactly those that were successfully allocated.
    buffers = []
    try:
        in_buf = cal_exec.alloc_remote(cal_exec.FMT_FLOAT32_4, SIZE, SIZE, 0)
        buffers.append(in_buf)
        out_buf = cal_exec.alloc_remote(cal_exec.FMT_FLOAT32_4, SIZE, SIZE, 0)
        buffers.append(out_buf)
        # Disabled in the original:
        # glob = cal_exec.alloc_remote(cal_exec.FMT_FLOAT32_4, 4096, 4096,
        #                              cal_exec.GLOBAL_BUFFER)

        print("input %s" % (in_buf,))
        print("output %s" % (out_buf,))

        remote = {"o0": out_buf, "i0": in_buf}
        local = {
            "o1": (SIZE, SIZE, cal_exec.FMT_FLOAT32_4),
            "g[]": (4096, 4096, cal_exec.FMT_FLOAT32_4),
        }
        domain = (0, 0, SIZE, SIZE)

        print("remote bindings %s" % (remote,))
        print("local bindings %s" % (local,))

        # run_stream arguments: image, dev num, domain (x, y, w, h),
        # local bindings, remote bindings.
        started = time.time()
        cal_exec.run_stream(image, 0, domain, local, remote)
        finished = time.time()
        print("run time %s" % (finished - started,))
    finally:
        # Same release order as the original: input, output, then the image.
        for buf in buffers:
            cal_exec.free_remote(buf)
        # cal_exec.free_remote(glob)  -- disabled together with its allocation
        cal_exec.free_image(image)
def TestCompileExec():
    """Compile a small IL kernel with cal_exec, run it once, and print timings.

    The kernel text samples resource 0 at ``v0.xyxx`` into ``o0`` and moves
    ``r0`` into ``g[0]``.  Two SIZE x SIZE ``FMT_FLOAT32_4`` buffers obtained
    from ``cal_exec.alloc_remote`` are passed as the remote bindings ``i0``
    and ``o0``; ``o1`` and ``g[]`` are passed as local bindings described by
    (SIZE-or-4096, SIZE-or-4096, format) tuples.  The compile time and the
    run time (wall clock, seconds) are printed to stdout.

    The buffers (released with ``cal_exec.free``) and the compiled image
    (released with ``cal_exec.free_image``) are cleaned up in a ``finally``
    clause, so they are not leaked when an allocation or ``run_stream``
    raises.

    Returns None.
    """
    import time

    SIZE = 1024

    # Implicit literal concatenation yields the same kernel text as the
    # original '+' chain.  The commented-out entries are IL statements that
    # were already disabled in the original.
    kernel = (
        "il_ps_2_0\n"
        "dcl_input_position_interp(linear_noperspective) v0\n"
        "dcl_output_generic o0\n"
        "dcl_output_generic o1\n"
        # "dcl_output_generic o2\n"
        "dcl_resource_id(0)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float)\n"
        # "mov r0, g[0]\n"
        "sample_resource(0)_sampler(0) o0, v0.xyxx\n"
        "mov g[0], r0\n"
        "end\n"
    )

    # A single pre-formatted string per print keeps the output identical to
    # the Python 2 statement form `print "label", value`.
    t_start = time.time()
    image = cal_exec.compile(kernel)
    t_end = time.time()
    print("compile time %s" % (t_end - t_start,))

    # Track successful allocations so cleanup never touches a buffer that
    # was not actually created.
    allocated = []
    try:
        src = cal_exec.alloc_remote(cal_exec.FMT_FLOAT32_4, SIZE, SIZE, 0)
        allocated.append(src)
        dst = cal_exec.alloc_remote(cal_exec.FMT_FLOAT32_4, SIZE, SIZE, 0)
        allocated.append(dst)
        # Disabled in the original:
        # glob = cal_exec.alloc_remote(cal_exec.FMT_FLOAT32_4, 4096, 4096,
        #                              cal_exec.GLOBAL_BUFFER)

        print("input %s" % (src,))
        print("output %s" % (dst,))

        remote = {"o0": dst, "i0": src}
        local = {
            "o1": (SIZE, SIZE, cal_exec.FMT_FLOAT32_4),
            "g[]": (4096, 4096, cal_exec.FMT_FLOAT32_4),
        }
        domain = (0, 0, SIZE, SIZE)

        print("remote bindings %s" % (remote,))
        print("local bindings %s" % (local,))

        # run_stream arguments: image, dev num, domain (x, y, w, h),
        # local bindings, remote bindings.
        t_start = time.time()
        cal_exec.run_stream(image, 0, domain, local, remote)
        t_end = time.time()
        print("run time %s" % (t_end - t_start,))
    finally:
        # Release in the original order: input, output, then the image.
        for handle in allocated:
            cal_exec.free(handle)
        # cal_exec.free(glob)  -- disabled together with its allocation
        cal_exec.free_image(image)
class Processor(spe.Processor): exec_module = cal_exec def __init__(self, device): spe.Processor.__init__(self) if device < 0 or device > N_GPUS: raise Exception("Invalid device number %d" % device) self.device = device return def execute(self, code, domain=None, async=False): code.cache_code() if domain is None: try: input = code.get_remote_binding("i0") except KeyError: raise Exception( "No domain specified and no remote i0 register bound") domain = (0, 0, input.gpu_width, len(input) / input.gpu_width) if async: th = cal_exec.run_stream_async(code.render_code, self.device, domain, code._local_bindings, code._remote_bindings, code._copy_bindings) return (th, code) else: cal_exec.run_stream(code.render_code, self.device, domain, code._local_bindings, code._remote_bindings, code._copy_bindings) try: import numpy for (key, arr) in code._remote_bindings_data.items(): if isinstance(arr, extarray.extarray): arr.set_memory(arr.gpu_mem_handle[0], arr.data_len * arr.itemsize) elif isinstance(arr, numpy.ndarray): cal_exec.set_ndarray_ptr(arr, code._remote_bindings[key][0]) for (key, arr) in code._copy_bindings_data.items(): if isinstance(arr, extarray.extarray): arr.set_memory(arr.gpu_mem_handle[0], arr.data_len * arr.itemsize) elif isinstance(arr, numpy.ndarray): cal_exec.set_ndarray_ptr(arr, code._remote_bindings[key][0]) except ImportError: for arr in code._remote_bindings_data.values(): arr.set_memory(arr.gpu_mem_handle[0], arr.data_len * arr.itemsize) for arr in code._copy_bindings_data.values(): arr.set_memory(arr.gpu_mem_handle[0], arr.data_len * arr.itemsize) return
if isinstance(arr, extarray.extarray): domain = (0, 0, arr.gpu_width, arr.gpu_height) elif isinstance(arr, numpy.ndarray): domain = (0, 0, arr.base.width, arr.base.height) elif isinstance(arr, LocalMemory): domain = (0, 0, arr.width, arr.height) else: raise Exception("Invalid o0 binding!") if async: th = cal_exec.run_stream_async(prgm.render_code, self.ctx, domain, prgm._bindings) return (th, prgm) else: cal_exec.run_stream(prgm.render_code, self.ctx, domain, prgm._bindings) # Go through the bindings and re-set all the pointers # When a kernel is executed, remote memory has to be unmapped and # remapped, meaning the memory location can change. for (key, arr) in prgm._bindings_data.items(): binding = prgm._bindings[key] if isinstance(arr, extarray.extarray): arr.set_memory(binding[1], arr.data_len * arr.itemsize) elif isinstance(arr, numpy.ndarray) and HAS_NUMPY: cal_exec.set_ndarray_ptr(arr, binding[1]) return def join(self, hdl): # TODO - do something better to differentiate