def allocate_buffered_data_arrays(self, **kwargs):
    n0 = kwargs.get('n0', self.n0)
    if self.buffered_transfer:
        n0 = kwargs.get('n0_buffer', self.n0_buffer)
    assert n0 is not None

    kw = dict(dtype=self.real_type,
              alignment=resource.getpagesize())

    self.t = cuda.aligned_zeros(shape=(n0,), **kw)
    self.t = cuda.register_host_memory(self.t)

    self.y = cuda.aligned_zeros(shape=(n0,), dtype=self.ytype,
                                alignment=resource.getpagesize())
    self.y = cuda.register_host_memory(self.y)

    if self.weighted:
        self.dy = cuda.aligned_zeros(shape=(n0,), **kw)
        self.dy = cuda.register_host_memory(self.dy)

    if self.balanced_magbins:
        self.mag_bwf = cuda.aligned_zeros(shape=(self.mag_bins,), **kw)
        self.mag_bwf = cuda.register_host_memory(self.mag_bwf)

    if self.compute_log_prob:
        self.mag_bin_fracs = cuda.aligned_zeros(shape=(self.mag_bins,), **kw)
        self.mag_bin_fracs = cuda.register_host_memory(self.mag_bin_fracs)

    return self
def allocate_buffered_data_arrays(self, **kwargs):
    """
    Allocates pinned memory for lightcurves if we're
    reusing this container
    """
    n0 = kwargs.get('n0', self.n0)
    if self.buffered_transfer:
        n0 = kwargs.get('n0_buffer', self.n0_buffer)
    assert n0 is not None

    self.t = cuda.aligned_zeros(shape=(n0,), dtype=self.real_type,
                                alignment=resource.getpagesize())
    self.t = cuda.register_host_memory(self.t)

    self.yw = cuda.aligned_zeros(shape=(n0,), dtype=self.real_type,
                                 alignment=resource.getpagesize())
    self.yw = cuda.register_host_memory(self.yw)

    self.w = cuda.aligned_zeros(shape=(n0,), dtype=self.real_type,
                                alignment=resource.getpagesize())
    self.w = cuda.register_host_memory(self.w)

    return self
def inference(features, tokens):
    global h_output

    _NetworkOutput = collections.namedtuple(  # pylint: disable=invalid-name
        "NetworkOutput",
        ["start_logits", "end_logits", "feature_index"])
    networkOutputs = []

    eval_time_elapsed = 0
    for feature_index, feature in enumerate(features):
        # Copy inputs
        input_ids_batch = np.dstack([feature.input_ids] * args.batch_size).squeeze()
        segment_ids_batch = np.dstack([feature.segment_ids] * args.batch_size).squeeze()
        input_mask_batch = np.dstack([feature.input_mask] * args.batch_size).squeeze()

        input_ids = cuda.register_host_memory(np.ascontiguousarray(input_ids_batch.ravel()))
        segment_ids = cuda.register_host_memory(np.ascontiguousarray(segment_ids_batch.ravel()))
        input_mask = cuda.register_host_memory(np.ascontiguousarray(input_mask_batch.ravel()))

        eval_start_time = time.time()
        cuda.memcpy_htod_async(d_inputs[0], input_ids, stream)
        cuda.memcpy_htod_async(d_inputs[1], segment_ids, stream)
        cuda.memcpy_htod_async(d_inputs[2], input_mask, stream)

        # Run inference
        context.execute_async_v2(
            bindings=[0 for i in range(binding_idx_offset)] +
                     [int(d_inp) for d_inp in d_inputs] + [int(d_output)],
            stream_handle=stream.handle)

        # Synchronize the stream
        stream.synchronize()
        eval_time_elapsed += (time.time() - eval_start_time)

        # Transfer predictions back from GPU
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        stream.synchronize()

        # Only retrieve and post-process the first batch
        batch = h_output[0]

        networkOutputs.append(_NetworkOutput(
            start_logits=np.array(batch.squeeze()[:, 0]),
            end_logits=np.array(batch.squeeze()[:, 1]),
            feature_index=feature_index
        ))

    eval_time_elapsed /= len(features)

    # Total number of n-best predictions to generate in the nbest_predictions.json output file
    n_best_size = 20

    # The maximum length of an answer that can be generated. This is needed
    # because the start and end predictions are not conditioned on one another
    max_answer_length = 30

    prediction, nbest_json, scores_diff_json = dp.get_predictions(
        tokens, features, networkOutputs, args.n_best_size, args.max_answer_length)

    return eval_time_elapsed, prediction, nbest_json
def test_register_host_memory(self):
    if drv.get_version() < (4,):
        from py.test import skip
        skip("register_host_memory only exists on CUDA 4.0 and later")

    import sys
    if sys.platform == "darwin":
        from py.test import skip
        skip("register_host_memory is not supported on OS X")

    a = drv.aligned_empty((2**20,), np.float64, alignment=4096)
    drv.register_host_memory(a)
def inference(features, tokens):
    global h_output

    _NetworkOutput = collections.namedtuple(  # pylint: disable=invalid-name
        "NetworkOutput",
        ["start_logits", "end_logits", "feature_index"])
    networkOutputs = []

    eval_time_elapsed = 0
    for feature_index, feature in enumerate(features):
        # Copy inputs
        input_ids = cuda.register_host_memory(
            np.ascontiguousarray(feature.input_ids.ravel()))
        segment_ids = cuda.register_host_memory(
            np.ascontiguousarray(feature.segment_ids.ravel()))
        input_mask = cuda.register_host_memory(
            np.ascontiguousarray(feature.input_mask.ravel()))

        eval_start_time = time.time()
        cuda.memcpy_htod_async(d_inputs[0], input_ids, stream)
        cuda.memcpy_htod_async(d_inputs[1], segment_ids, stream)
        cuda.memcpy_htod_async(d_inputs[2], input_mask, stream)

        # Run inference
        context.execute_async_v2(
            bindings=[int(d_inp) for d_inp in d_inputs] + [int(d_output)],
            stream_handle=stream.handle)

        # Synchronize the stream
        stream.synchronize()
        eval_time_elapsed += (time.time() - eval_start_time)

        # Transfer predictions back from GPU
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        stream.synchronize()

        for index, batch in enumerate(h_output):
            # Data Post-processing
            networkOutputs.append(
                _NetworkOutput(
                    start_logits=np.array(batch.squeeze()[:, 0]),
                    end_logits=np.array(batch.squeeze()[:, 1]),
                    feature_index=feature_index))

    eval_time_elapsed /= len(features)

    prediction, nbest_json, scores_diff_json = dp.get_predictions(
        tokens, features, networkOutputs, args.n_best_size,
        args.max_answer_length)

    return eval_time_elapsed, prediction, nbest_json
def _set_thread_args(self, dev_id: int, ctx: cuda.Context,
                     moment: np.ndarray, w_out: np.ndarray,
                     x_out: np.ndarray, y_out: np.ndarray):
    ''' Set the input moment for all the streams of a specific GPU '''
    ctx.push()
    # number of inputs for this GPU
    max_size = moment.shape[1]
    # loop through the streams to set their input
    for i in range(0, self.num_stream, 1):
        # size of input allocated for each stream
        size_per_batch = int(np.ceil(max_size / self.num_stream))
        # location on the original input array where the input to this stream starts
        loc = np.int32((i) * size_per_batch)
        if loc + size_per_batch > max_size:
            size_per_batch = max_size - loc

        self.moment_chunk_host[dev_id].append(
            np.ascontiguousarray(moment[:, loc:loc + size_per_batch],
                                 dtype=np.float32))
        self.moment_chunk_host[dev_id][i] = cuda.register_host_memory(
            self.moment_chunk_host[dev_id][i],
            cuda.mem_host_register_flags.PORTABLE)

        self.w_chunk_host[dev_id].append(
            np.ascontiguousarray(
                np.zeros_like(w_out[:, loc:loc + size_per_batch])))
        self.w_chunk_host[dev_id][i] = cuda.register_host_memory(
            self.w_chunk_host[dev_id][i],
            cuda.mem_host_register_flags.PORTABLE)

        self.x_chunk_host[dev_id].append(
            np.ascontiguousarray(
                np.zeros_like(x_out[:, loc:loc + size_per_batch])))
        self.x_chunk_host[dev_id][i] = cuda.register_host_memory(
            self.x_chunk_host[dev_id][i],
            cuda.mem_host_register_flags.PORTABLE)

        self.y_chunk_host[dev_id].append(
            np.ascontiguousarray(
                np.zeros_like(y_out[:, loc:loc + size_per_batch])))
        self.y_chunk_host[dev_id][i] = cuda.register_host_memory(
            self.y_chunk_host[dev_id][i],
            cuda.mem_host_register_flags.PORTABLE)

    ctx.synchronize()
    ctx.pop()
def allocate(self, data):
    if len(data) > len(self.streams):
        self._create_streams(len(data) - len(self.streams))

    gpu_data, pow_cpus = [], []

    for t, y, w, freqs in data:
        pow_cpu = cuda.aligned_zeros(shape=(len(freqs),),
                                     dtype=np.float32,
                                     alignment=resource.getpagesize())
        pow_cpu = cuda.register_host_memory(pow_cpu)

        t_g, y_g, w_g = None, None, None
        if len(t) > 0:
            t_g, y_g, w_g = tuple([
                gpuarray.zeros(len(t), dtype=np.float32) for i in range(3)
            ])

        pow_g = gpuarray.zeros(len(pow_cpu), dtype=pow_cpu.dtype)
        freqs_g = gpuarray.to_gpu(np.asarray(freqs).astype(np.float32))

        gpu_data.append((t_g, y_g, w_g, freqs_g, pow_g))
        pow_cpus.append(pow_cpu)

    return gpu_data, pow_cpus
def copy_htod(self, np_buffer, stream=None):
    if stream:
        # PyCUDA requires the host buffer to be pagelocked for asynchronous memcpys.
        pagelocked = cuda.register_host_memory(
            np.ascontiguousarray(np_buffer.ravel()))
        cuda.memcpy_htod_async(self.ptr, pagelocked, stream)
    else:
        cuda.memcpy_htod(self.ptr, np.ascontiguousarray(np_buffer.ravel()))
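# A minimal, self-contained sketch of the same pattern outside a class. The
# array shape, names, and sizes below are illustrative assumptions, not part of
# the snippet above: allocate a page-aligned host array, register (pin) it,
# issue an asynchronous host-to-device copy on a stream, then unregister it.
import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates a default context
import pycuda.driver as cuda

stream = cuda.Stream()
host = cuda.aligned_empty((1 << 20,), dtype=np.float32)  # page-aligned host buffer
host[:] = 1.0
pinned = cuda.register_host_memory(host)      # page-lock the buffer in place
dev = cuda.mem_alloc(pinned.nbytes)           # device buffer of the same size
cuda.memcpy_htod_async(dev, pinned, stream)   # async copies require pinned memory
stream.synchronize()
pinned.base.unregister()                      # release the page-locking
dev.free()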
def allocate_pinned_cpu(self, **kwargs):
    """ Allocates pinned CPU memory for asynchronous transfer of results """
    nf = kwargs.get('nf', self.nf)
    assert nf is not None

    self.lsp_c = cuda.aligned_zeros(shape=(nf,), dtype=self.real_type,
                                    alignment=resource.getpagesize())
    self.lsp_c = cuda.register_host_memory(self.lsp_c)

    return self
def allocate_pinned_cpu(self, **kwargs):
    nf = kwargs.get('nf', self.nf)
    assert nf is not None

    self.ce_c = cuda.aligned_zeros(shape=(nf,), dtype=self.real_type,
                                   alignment=resource.getpagesize())
    self.ce_c = cuda.register_host_memory(self.ce_c)

    return self
def batch_memcpy_cmp(size: int, batch: int):
    event_start_1 = cuda.Event()
    event_stop_1 = cuda.Event()
    event_start_2 = cuda.Event()
    event_stop_2 = cuda.Event()

    array = np.random.rand(size, 9)
    array = array.astype(np.float32)

    # one large pinned transfer
    mem = cuda.aligned_zeros_like(array)
    mem = cuda.register_host_memory(mem, cuda.mem_host_register_flags.DEVICEMAP)
    mem_d = cuda.mem_alloc_like(mem)

    event_start_1.record()
    cuda.memcpy_htod(mem_d, mem)
    event_stop_1.record()
    event_stop_1.synchronize()

    # the same data transferred as many smaller pinned chunks
    mem2 = []
    this_mem = []
    size_per_batch = int(size / batch)
    for i in range(batch):
        mem2.append(
            cuda.mem_alloc_like(array[i * size_per_batch:(i + 1) * size_per_batch]))
        this_mem.append(array[i * size_per_batch:(i + 1) * size_per_batch])
        this_mem[i] = cuda.register_host_memory(
            this_mem[i], cuda.mem_host_register_flags.DEVICEMAP)

    event_start_2.record()
    for i in range(batch):
        cuda.memcpy_htod(mem2[i], this_mem[i])
    event_stop_2.record()
    event_stop_2.synchronize()

    t1 = event_stop_1.time_since(event_start_1)
    t2 = event_stop_2.time_since(event_start_2)
    print("batch_memcpy_cmp size", size, " batch ", batch)
    print(t1)
    print(t2)
def allocate_pinned_arrays(self, nfreqs=None, ndata=None):
    if nfreqs is None:
        nfreqs = int(self.max_nfreqs)
    if ndata is None:
        ndata = int(self.max_ndata)

    self.bls = cuda.aligned_zeros(shape=(nfreqs,), dtype=self.rtype,
                                  alignment=resource.getpagesize())
    self.bls = cuda.register_host_memory(self.bls)

    self.nbins0 = cuda.aligned_zeros(shape=(nfreqs,), dtype=np.int32,
                                     alignment=resource.getpagesize())
    self.nbins0 = cuda.register_host_memory(self.nbins0)

    self.nbinsf = cuda.aligned_zeros(shape=(nfreqs,), dtype=np.int32,
                                     alignment=resource.getpagesize())
    self.nbinsf = cuda.register_host_memory(self.nbinsf)

    self.t = cuda.aligned_zeros(shape=(ndata,), dtype=self.rtype,
                                alignment=resource.getpagesize())
    self.t = cuda.register_host_memory(self.t)

    self.yw = cuda.aligned_zeros(shape=(ndata,), dtype=self.rtype,
                                 alignment=resource.getpagesize())
    self.yw = cuda.register_host_memory(self.yw)

    self.w = cuda.aligned_zeros(shape=(ndata,), dtype=self.rtype,
                                alignment=resource.getpagesize())
    self.w = cuda.register_host_memory(self.w)
def test_register_host_memory(self):
    if drv.get_version() < (4,):
        from py.test import skip
        skip("register_host_memory only exists on CUDA 4.0 and later")

    import sys
    if sys.platform == "darwin":
        from py.test import skip
        skip("register_host_memory is not supported on OS X")

    a = drv.aligned_empty((2**20,), np.float64)
    a_pin = drv.register_host_memory(a)

    gpu_ary = drv.mem_alloc_like(a)
    stream = drv.Stream()
    drv.memcpy_htod_async(gpu_ary, a_pin, stream)
    drv.Context.synchronize()
def trt_inference(stream, trt_ctx, d_input, d_output,
                  input_signal, input_signal_length):
    print("infer with shape: {}".format(input_signal.shape))

    trt_ctx.set_binding_shape(0, input_signal.shape)
    assert trt_ctx.all_binding_shapes_specified

    h_output = cuda.pagelocked_empty(tuple(trt_ctx.get_binding_shape(1)),
                                     dtype=np.float32)

    h_input_signal = cuda.register_host_memory(
        np.ascontiguousarray(input_signal.cpu().numpy().ravel()))
    cuda.memcpy_htod_async(d_input, h_input_signal, stream)
    trt_ctx.execute_async_v2(bindings=[int(d_input), int(d_output)],
                             stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(h_output, d_output, stream)
    stream.synchronize()

    greedy_predictions = torch.tensor(h_output).argmax(dim=-1, keepdim=False)

    return greedy_predictions
def test_kernel(n, a, x_gpu, y_gpu):
    code = """
    #include <stdio.h>
    __global__ void saxpy(int n, float a, float *x, float *y)
    {
        int index = blockIdx.x * blockDim.x + threadIdx.x;
        int stride = blockDim.x * gridDim.x;
        for (int i = index; i < n; i += stride) {
            y[i] = a*x[i] + y[i];
        }
    }
    """
    mod = SourceModule(code)
    saxpy = mod.get_function("saxpy")
    saxpy(n, a, x_gpu, y_gpu, block=(1024, 1, 1), grid=(1, 1))

    out = cuda.register_host_memory(np.empty(n, dtype=np.float32))
    cuda.memcpy_dtoh_async(out, y_gpu)
    return out
# omega_e = omega_e.astype(np.float32); omega_eps = omega_eps.astype(np.float32)
# omega_q = omega_q.astype(np.float32); omega_nu = omega_nu.astype(np.float32)

# Allocate memory on GPU
fin_g = cuda.mem_alloc(fin.size * fin.dtype.itemsize)
ftemp_g = cuda.mem_alloc(fin.nbytes)
feq_g = cuda.mem_alloc(fin.size * fin.dtype.itemsize)
rho_g = cuda.mem_alloc(rho.size * rho.dtype.itemsize)
taus_g = cuda.mem_alloc(tauS.size * tauS.dtype.itemsize)
u_g = cuda.mem_alloc(u.nbytes)
c_g = cuda.mem_alloc(c.nbytes)
t_g = cuda.mem_alloc(t.nbytes)
#fpost_g = cuda.mem_alloc(fin.size * fin.dtype.itemsize)

# Pinning memory for faster transfers between CPU and GPU
fin_pin = cuda.register_host_memory(fin)  # this didn't seem to make a difference, so using fin only for transfers

cuda.memcpy_htod(fin_g, fin)
cuda.memcpy_htod(ftemp_g, fin)
cuda.memcpy_htod(feq_g, fin)
cuda.memcpy_htod(rho_g, rho)
cuda.memcpy_htod(taus_g, tauS)
cuda.memcpy_htod(u_g, u)
#cuda.memcpy_htod(fpost_g, fin)
# cuda.memcpy_htod(c_g, c)
# cuda.memcpy_htod(t_g, t)
        const float x = g_x[i];
        g_y[i] = cos(x)*exp(sin(x)-sqrt(x*x));
    }
"""

mod = SourceModule(code)
custom_kernel = mod.get_function("custom_kernel")

size = 5120000
block_size = 512
grid_size = size / block_size
block = (block_size, 1, 1)
grid = (grid_size, 1)

x = np.linspace(1, size, size).astype(np.float32)
print x.shape
x_pin = cuda.register_host_memory(x)

x_gpu = cuda.mem_alloc(x.nbytes)
cuda.memcpy_htod(x_gpu, x)
y_gpu = cuda.mem_alloc(x.nbytes)

custom_kernel(y_gpu, x_gpu, block=block, grid=grid)

ans = np.zeros_like(x_pin)
cuda.memcpy_dtoh(ans, y_gpu)
ans = np.sum(ans)
print ans
print np.sum(np.cos(x) * np.exp(np.sin(x) - np.sqrt(x * x)))
def compare_performance():
    # a quick warm up..
    n = 25000000
    a = np.random.randn(n).astype(np.float32)
    # allocate space on GPU
    mem_gpu = cuda.mem_alloc(a.nbytes)
    cuda.memcpy_htod(mem_gpu, a)
    # free space on GPU
    mem_gpu.free()

    h2d_nopin = []
    h2d_nopin_bw = []
    # measure timing without pinning
    for n in num_elems:
        # the data to be transferred
        a = np.random.randn(n).astype(np.float32)
        # allocate space on GPU
        mem_gpu = cuda.mem_alloc(a.nbytes)
        # only measure h2d transfer time
        start = time.perf_counter()
        cuda.memcpy_htod(mem_gpu, a)
        te = time.perf_counter() - start  # te: time elapsed
        h2d_nopin.append(te)
        h2d_nopin_bw.append(a.nbytes / (10**9 * te))  # convert to a bandwidth
        # free space on GPU
        mem_gpu.free()

    # now do pinning and measure time to pin and time to transfer
    h2h_pinned = []     # records the transfer time from unpinned -> pinned memory
    h2d_pin = []        # records the host to device transfer time with data in pinned memory
    h2d_pin_total = []  # records the total (sum of the previous two)
    h2d_pin_bw = []     # h2d_pin, converted to a bandwidth (GB/sec)

    for i, n in enumerate(num_elems):
        a = np.random.randn(n).astype(np.float32)
        # allocate space on GPU
        mem_gpu = cuda.mem_alloc(a.nbytes)
        # allocate page locked memory
        a_pin = cuda.register_host_memory(a)
        # copy data from np array to pinned memory and measure transfer time
        start = time.perf_counter()
        copy_np_to_pinned_memory(a, a_pin)
        te = time.perf_counter() - start  # te: time elapsed
        h2h_pinned.append(te)
        # measure h2d transfer time
        start = time.perf_counter()
        cuda.memcpy_htod(mem_gpu, a_pin)
        te = time.perf_counter() - start  # te: time elapsed
        h2d_pin.append(te)
        h2d_pin_bw.append(a.nbytes / (10**9 * te))
        h2d_pin_total.append(h2d_pin[i] + h2h_pinned[i])
        # free allocated pinned memory
        a_pin.base.unregister()
        # free space on GPU
        mem_gpu.free()

    fig = plt.figure()
    num_elems_mb = [x * 4 / 10**6 for x in num_elems]
    plt.plot(num_elems_mb, h2d_nopin, 'g', label='h2d transfer_time (no pinning)')
    plt.plot(num_elems_mb, h2d_pin, 'r', label='h2d transfer_time (with pinning)')
    plt.plot(num_elems_mb, h2h_pinned, 'b', label='h2h transfer_time')
    plt.plot(num_elems_mb, h2d_pin_total, 'k', label='h2d transfer_time (with pinning, total)')
    plt.legend()
    plt.xlabel('data size (MB)')
    plt.ylabel('time (sec)')
    plt.show()
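# num_elems and copy_np_to_pinned_memory are defined outside this snippet. A
# minimal, hypothetical stand-in for the helper, assuming it only fills the
# already page-locked destination with the source data, could look like this
# (it is not the original implementation):
def copy_np_to_pinned_memory(src, dst_pinned):
    # elementwise copy into the page-locked destination array
    np.copyto(dst_pinned, src)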
def inference(features, tokens):
    global h_output

    _NetworkOutput = collections.namedtuple(  # pylint: disable=invalid-name
        "NetworkOutput",
        ["start_logits", "end_logits", "feature_index"])
    networkOutputs = []

    eval_time_elapsed = 0
    for feature_index, feature in enumerate(features):
        # Copy inputs
        B = 1
        S = np.sum(feature.input_mask)
        input_ids = feature.input_ids[0:S]
        segment_ids = feature.segment_ids[0:S]
        cu_seq_lens = np.array([0, S], dtype=np.int32)

        if context.get_binding_shape(0)[0] != S:
            context.set_binding_shape(0, (S,))
        if context.get_binding_shape(1)[0] != S:
            context.set_binding_shape(1, (S,))
        if context.get_binding_shape(2)[0] != 2:
            context.set_binding_shape(2, (2,))
        if context.get_binding_shape(3)[0] != S:
            context.set_binding_shape(3, (S,))

        h_input_ids = cuda.register_host_memory(
            np.ascontiguousarray(input_ids.ravel()))
        h_segment_ids = cuda.register_host_memory(
            np.ascontiguousarray(segment_ids.ravel()))
        h_cu_seq_lens = cuda.register_host_memory(
            np.ascontiguousarray(cu_seq_lens.ravel()))

        eval_start_time = time.time()
        cuda.memcpy_htod_async(d_inputs[0], h_input_ids, stream)
        cuda.memcpy_htod_async(d_inputs[1], h_segment_ids, stream)
        cuda.memcpy_htod_async(d_inputs[2], h_cu_seq_lens, stream)

        # Run inference
        context.execute_async_v2(
            bindings=[int(d_inp) for d_inp in d_inputs] + [int(d_output)],
            stream_handle=stream.handle)

        # Synchronize the stream
        stream.synchronize()
        eval_time_elapsed += (time.time() - eval_start_time)

        # Transfer predictions back from GPU
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        stream.synchronize()

        # Only retrieve and post-process the first batch
        networkOutputs.append(
            _NetworkOutput(start_logits=np.array(h_output[0:S]),
                           end_logits=np.array(h_output[S:S * 2]),
                           feature_index=feature_index))

    eval_time_elapsed /= len(features)

    # Total number of n-best predictions to generate in the nbest_predictions.json output file
    n_best_size = 20

    # The maximum length of an answer that can be generated. This is needed
    # because the start and end predictions are not conditioned on one another
    max_answer_length = 30

    prediction, nbest_json, scores_diff_json = dp.get_predictions(
        tokens, features, networkOutputs, args.n_best_size,
        args.max_answer_length)

    return eval_time_elapsed, prediction, nbest_json
    x_orig = x.copy()
    start = time()
    increment(
        drv.InOut(x), np.uint32(N),
        block=(320, 1, 1), grid=(N/320, 1, 1))
    times[i] = time() - start
    assert np.allclose(x_orig + 1, x), "%r %r" % (x_orig, x)

print "Average kernel execution time with pageable memory: %3.7f" % np.mean(times)

# Time use of pinned host memory:
#x = drv.aligned_empty((N, N), dtype=np.float64, order='C')
x = drv.register_host_memory(x, flags=drv.mem_host_register_flags.DEVICEMAP)
x_gpu_ptr = np.intp(x.base.get_device_pointer())

times = np.empty(M)
for i in xrange(M):
    #x[:, :] = np.random.rand(N, N)
    x_orig = x.copy()
    start = time()
    increment(
        x_gpu_ptr, np.uint32(N),
        block=(320, 1, 1), grid=(N/320, 1, 1))
    times[i] = time() - start
def reg_mem(a):
    temp = drv.register_host_memory(
        a, flags=drv.mem_host_register_flags.DEVICEMAP)
    return numpy.intp(temp.base.get_device_pointer())
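# A hedged usage sketch of the same zero-copy pattern the helper above wraps:
# register a host array with the DEVICEMAP flag and hand its device-visible
# pointer straight to a kernel, as the x_gpu_ptr snippet further up also does.
# The kernel, sizes, and the assumption that the current context permits mapped
# host memory are illustrative, not part of the original code; the registered
# array is kept in a variable so the registration stays alive for the launch.
import numpy
import pycuda.autoinit  # noqa: F401
import pycuda.driver as drv
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void scale(float *x, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] *= 2.0f;
}
""")
scale = mod.get_function("scale")

a = drv.aligned_empty((1024,), dtype=numpy.float32)
a[:] = 1.0
a_reg = drv.register_host_memory(a, flags=drv.mem_host_register_flags.DEVICEMAP)
a_dev_ptr = numpy.intp(a_reg.base.get_device_pointer())  # what reg_mem() returns
scale(a_dev_ptr, numpy.int32(a_reg.size), block=(256, 1, 1), grid=(4, 1))
drv.Context.synchronize()  # the doubled values are now visible in a_reg
a_reg.base.unregister()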
def chyqmom9_pycuda(moments: np.ndarray, size: int,
                    w: np.ndarray, x: np.ndarray, y: np.ndarray,
                    batch_size: int):
    mem_d_size_in_byte = np.ones(size).astype(np.float32).nbytes
    sizeof_float = np.int32(np.dtype(np.float32).itemsize)
    size = np.int32(size)

    # Allocate 1 concurrent stream to each batch
    num_stream = batch_size
    streams = []
    for i in range(num_stream):
        streams.append(cuda.Stream())

    BlockSize = (256, 1, 1)
    GridSize = (size + BlockSize[0] - 1) / BlockSize[0]
    GridSize = (int(GridSize), 1, 1)

    # timers
    event_start = cuda.Event()
    event_stop = cuda.Event()

    size_per_batch = np.int32(np.ceil(float(size) / batch_size))
    print("size_per_batch: ", size_per_batch)

    # initialize kernels
    c_kernel = CHYQMOM9.get_function('chyqmom9_cmoments')
    float_value_set = CHYQMOM9.get_function('float_value_set')
    float_array_set = CHYQMOM9.get_function('float_array_set')
    chyqmom9_mu_yf = CHYQMOM9.get_function('chyqmom9_mu_yf')
    chyqmom9_wout = CHYQMOM9.get_function('chyqmom9_wout')
    chyqmom9_xout = CHYQMOM9.get_function('chyqmom9_xout')
    chyqmom9_yout = CHYQMOM9.get_function('chyqmom9_yout')

    moments_d = []
    this_moment = []
    this_x = []
    this_w = []
    this_y = []

    w_out_d = []
    x_out_d = []
    y_out_d = []

    c_moments = []
    mu = []
    yf = []

    m1 = []
    x1 = []
    w1 = []
    x2 = []
    w2 = []

    for i in range(0, num_stream, 1):
        loc = np.int32((i) * size_per_batch)
        if loc + size_per_batch > size:
            size_per_batch = size - loc

        # allocate memory on device
        moments_d.append(cuda.mem_alloc(int(sizeof_float * size_per_batch * 10)))
        w_out_d.append(cuda.mem_alloc(int(sizeof_float * size_per_batch * 9)))
        x_out_d.append(cuda.mem_alloc(int(sizeof_float * size_per_batch * 9)))
        y_out_d.append(cuda.mem_alloc(int(sizeof_float * size_per_batch * 9)))

        this_moment.append(
            np.ascontiguousarray(moments[:, loc:loc + size_per_batch],
                                 dtype=np.float32))
        this_moment[i] = cuda.register_host_memory(
            this_moment[i], cuda.mem_host_register_flags.PORTABLE)

        this_w.append(
            np.ascontiguousarray(np.zeros_like(w[:, loc:loc + size_per_batch])))
        this_w[i] = cuda.register_host_memory(
            this_w[i], cuda.mem_host_register_flags.PORTABLE)

        this_x.append(
            np.ascontiguousarray(np.zeros_like(x[:, loc:loc + size_per_batch])))
        this_x[i] = cuda.register_host_memory(
            this_x[i], cuda.mem_host_register_flags.PORTABLE)

        this_y.append(
            np.ascontiguousarray(np.zeros_like(y[:, loc:loc + size_per_batch])))
        this_y[i] = cuda.register_host_memory(
            this_y[i], cuda.mem_host_register_flags.PORTABLE)

        c_moments.append(cuda.mem_alloc(int(sizeof_float * size_per_batch * 7)))
        mu.append(cuda.mem_alloc(int(sizeof_float * size_per_batch * 3)))
        yf.append(cuda.mem_alloc(int(sizeof_float * size_per_batch * 3)))

        m1.append(cuda.mem_alloc(int(sizeof_float * size_per_batch * 5)))
        float_value_set(m1[i], np.float32(1), size_per_batch, np.int32(0),
                        block=BlockSize, grid=GridSize, stream=streams[i])
        float_value_set(m1[i], np.float32(0), size_per_batch, size_per_batch,
                        block=BlockSize, grid=GridSize, stream=streams[i])

        x1.append(cuda.mem_alloc(int(sizeof_float * size_per_batch * 3)))
        w1.append(cuda.mem_alloc(int(sizeof_float * size_per_batch * 3)))
        x2.append(cuda.mem_alloc(int(sizeof_float * size_per_batch * 3)))
        w2.append(cuda.mem_alloc(int(sizeof_float * size_per_batch * 3)))

    hyq = hyqmom.Hyqmom(BlockSize, GridSize)

    event_start.record()
    for i in range(0, num_stream, 1):
        loc = np.int32((i) * size_per_batch)
        if loc + size_per_batch > size:
            size_per_batch = size - loc

        cuda.memcpy_htod_async(moments_d[i], this_moment[i], stream=streams[i])
        c_kernel(moments_d[i], c_moments[i], size_per_batch,
                 block=BlockSize, grid=GridSize, stream=streams[i])
        float_array_set(m1[i], c_moments[i],
                        np.int32(size_per_batch),
                        np.int32(size_per_batch * 2), np.int32(0),
                        block=BlockSize, grid=GridSize, stream=streams[i])
        float_array_set(m1[i], c_moments[i],
                        np.int32(size_per_batch * 2),
                        np.int32(size_per_batch * 3),
                        np.int32(size_per_batch * 4),
                        block=BlockSize, grid=GridSize, stream=streams[i])
        hyq.hyqmom3(m1[i], x1[i], w1[i], size_per_batch,
                    block=BlockSize, grid=GridSize, stream=streams[i])
        chyqmom9_mu_yf(c_moments[i], x1[i], w1[i], yf[i], mu[i], size_per_batch,
                       block=BlockSize, grid=GridSize, stream=streams[i])
        float_array_set(m1[i], mu[i],
                        np.int32(size_per_batch * 3),
                        np.int32(size_per_batch * 2), np.int32(0),
                        block=BlockSize, grid=GridSize, stream=streams[i])
        hyq.hyqmom3(m1[i], x2[i], w2[i], size_per_batch, size_per_batch,
                    block=BlockSize, grid=GridSize, stream=streams[i])

    for i in range(0, num_stream, 1):
        streams[i].synchronize()
        chyqmom9_wout(moments_d[i], w1[i], w2[i], w_out_d[i], size_per_batch,
                      block=BlockSize, grid=GridSize, stream=streams[i])
        # w[:, loc:loc+size_per_batch] = cuda.from_device(w_out_d[i], (9, size_per_batch), np.float32, order="C")
        cuda.memcpy_dtoh_async(this_w[i], w_out_d[i], stream=streams[i])

        chyqmom9_xout(moments_d[i], x1[i], x_out_d[i], size_per_batch,
                      block=BlockSize, grid=GridSize, stream=streams[i])
        # x[:, loc:loc+size_per_batch] = cuda.from_device(x_out_d[i], (9, size_per_batch), np.float32, order="C")
        cuda.memcpy_dtoh_async(this_x[i], x_out_d[i], stream=streams[i])

        chyqmom9_yout(moments_d[i], x2[i], yf[i], y_out_d[i], size_per_batch,
                      block=BlockSize, grid=GridSize, stream=streams[i])
        # y[:, loc:loc+size_per_batch] = cuda.from_device(y_out_d[i], (9, size_per_batch), np.float32, order="C")
        cuda.memcpy_dtoh_async(this_y[i], y_out_d[i], stream=streams[i])

    event_stop.record()
    event_stop.synchronize()

    for i in range(0, num_stream, 1):
        loc = np.int32((i) * size_per_batch)
        if loc + size_per_batch > size:
            size_per_batch = size - loc
        w[:, loc:loc + size_per_batch] = this_w[i]
        y[:, loc:loc + size_per_batch] = this_y[i]
        x[:, loc:loc + size_per_batch] = this_x[i]

    for i in range(0, num_stream, 1):
        # free device memory and unregister pinned host chunks
        moments_d[i].free()
        w_out_d[i].free()
        x_out_d[i].free()
        y_out_d[i].free()

        this_moment[i].base.unregister()
        this_w[i].base.unregister()
        this_x[i].base.unregister()
        this_y[i].base.unregister()

        c_moments[i].free()
        mu[i].free()
        yf[i].free()

        m1[i].free()
        x1[i].free()
        w1[i].free()
        x2[i].free()
        w2[i].free()

    calc_time = event_stop.time_since(event_start)
    return calc_time
s = cuda.Event()
e = cuda.Event()
s.record()

N = np.int32(1 << 20)
a = np.float32(2)
x = np.ones(N, dtype=np.float32)
y = 2. * np.ones(N, dtype=np.float32)

nStreams = 2
streams = [cuda.Stream() for i in range(nStreams)]
x_pin = [
    cuda.register_host_memory(x[i * N / nStreams:(i + 1) * N / nStreams])
    for i in range(nStreams)
]
y_pin = [
    cuda.register_host_memory(y[i * N / nStreams:(i + 1) * N / nStreams])
    for i in range(nStreams)
]

h = cublas.cublasCreate()

x_gpu = np.empty(nStreams, dtype=object)
y_gpu = np.empty(nStreams, dtype=object)
ans = np.empty(nStreams, dtype=object)
for i in range(nStreams):
    cublas.cublasSetStream(h, streams[i].handle)
def clean(res, ker, mdl=None, area=None, gain=0.1, maxiter=10000, tol=1e-3,
          stop_if_div=True, verbose=False):
    #s = cuda.Event()
    #e = cuda.Event()
    res = np.array(res)
    ker = np.array(ker)
    if mdl is not None:
        mdl = np.array(mdl)
    if area is not None:
        area = np.array(area)

    isComplex = (res.dtype == np.complex64)
    imgType = res.dtype
    oneImg = (res.ndim == 1)

    gain = np.float64(gain)
    maxiter = np.int32(maxiter)
    tol = np.float64(tol)
    stop_if_div = np.int32(stop_if_div)

    if oneImg:
        dim = len(res)
        res = np.array([res], dtype=imgType)
        ker = np.array([ker], dtype=imgType)
        if mdl is None:
            mdl = np.array([np.zeros(dim, dtype=imgType)], dtype=imgType)
        else:
            res[0] = res[0] - np.fft.ifft(
                np.fft.fft(mdl) * np.fft.fft(ker[0])).astype(imgType)
            mdl = np.array([mdl], dtype=imgType)
        if area is None:
            area = np.array([np.ones(dim, dtype=np.int32)], dtype=np.int32)
        else:
            area = np.array([area], dtype=np.int32)
    else:
        dim = len(res[0])
        numImgs = len(res)
        res = np.array(res, dtype=imgType)
        if ker.ndim == 1:
            ker = np.array([ker] * numImgs, dtype=imgType)
        else:
            ker = np.array(ker, dtype=imgType)
        if mdl is None:
            mdl = np.array([np.zeros(dim, dtype=imgType)] * numImgs, dtype=imgType)
        elif mdl.ndim == 1:
            res = np.array([
                res[i] - np.fft.ifft(
                    np.fft.fft(mdl) * np.fft.fft(ker[i])).astype(imgType)
                for i in xrange(numImgs)
            ])
            mdl = np.array([mdl] * numImgs, dtype=imgType)
        else:
            res = np.array([
                res[i] - np.fft.ifft(
                    np.fft.fft(mdl[i]) * np.fft.fft(ker[i])).astype(imgType)
                for i in xrange(numImgs)
            ])
            mdl = np.array(mdl, dtype=imgType)
        if area is None:
            area = np.array([np.ones(dim, dtype=np.int32)] * numImgs, dtype=np.int32)
        elif area.ndim == 1:
            area = np.array([area] * numImgs, dtype=np.int32)
        else:
            area = np.array(area, dtype=np.int32)

    blockDimX = min(1024, len(ker))
    block = (blockDimX, 1, 1)
    grid = (int(ceil(len(ker) / blockDimX)), 1, 1)
    # block=(1,1,1)
    # grid=(1,1,1)

    # make all the arguments 1 level deeper of a pointer, use thread index to
    # choose which one at the very start, then continue through like normal
    code_complex = """
    #pragma comment(linker, "/HEAP:40000000")
    #include <cuComplex.h>
    #include <stdio.h>
    #include <cmath>
    __global__ void clean(cuFloatComplex *resP, cuFloatComplex *kerP, cuFloatComplex *mdlP, int* areaP, int stop_if_div)
    {
        const int dim = %(DIM)s;
        const int maxiter = %(MAXITER)s;
        const double gain = %(GAIN)s;
        const double tol = %(TOL)s;
        const int index = blockDim.x * blockIdx.x + threadIdx.x;
        cuFloatComplex *res = resP + index * %(DIM)s;
        cuFloatComplex *ker = kerP + index * %(DIM)s;
        cuFloatComplex *mdl = mdlP + index * %(DIM)s;
        int *area = areaP + index * %(DIM)s;
        float maxr=0, maxi=0, valr=0, vali, stepr, stepi, qr=0, qi=0;
        float score=-1, nscore, best_score=-1;
        float mmax, mval, mq=0;
        float firstscore=-1;
        int argmax=0, nargmax=0, wrap_n;
        cuFloatComplex best_mdl[%(DIM)s];
        cuFloatComplex best_res[%(DIM)s];
        cuFloatComplex stepComplex;
        // Compute gain/phase of kernel
        for (int n = 0; n < %(DIM)s; n++) {
            valr = cuCrealf(ker[n]);
            vali = cuCimagf(ker[n]);
            mval = valr * valr + vali * vali;
            if (mval > mq && area[n]) {
                mq = mval;
                qr = valr;
                qi = vali;
            }
        }
        qr /= mq;
        qi /= -mq;
        // The clean loop
        for (int i = 0; i < maxiter; i++) {
            nscore = 0;
            mmax = -1;
            stepr = (float) gain * (maxr * qr - maxi * qi);
            stepi = (float) gain * (maxr * qi + maxi * qr);
            stepComplex = make_cuFloatComplex(stepr, stepi);
            mdl[argmax] = cuCaddf(mdl[argmax], stepComplex);
            // Take next step and compute score
            for (int n = 0; n < %(DIM)s; n++) {
                wrap_n = (n + argmax) %% dim;
                float kr = cuCrealf(ker[n]), ki = cuCimagf(ker[n]);
                float realSub = kr * stepr - ki * stepi;
                float imagSub = kr * stepi + ki * stepr;
                res[wrap_n] = cuCsubf(res[wrap_n], make_cuFloatComplex(realSub, imagSub));
                valr = cuCrealf(res[wrap_n]);
                vali = cuCimagf(res[wrap_n]);
                mval = valr * valr + vali * vali;
                nscore += mval;
                if (mval > mmax && area[wrap_n]) {
                    nargmax = wrap_n;
                    maxr = valr;
                    maxi = vali;
                    mmax = mval;
                }
            }
            nscore = sqrt(nscore/dim);
            if (firstscore < 0) firstscore = nscore;
            if (score > 0 && nscore > score) {
                if (stop_if_div) {
                    // We've diverged: undo last step and give up
                    mdl[argmax] = cuCsubf(mdl[argmax], stepComplex);
                    for (int n=0; n < dim; n++) {
                        wrap_n = (n + argmax) %% dim;
                        float kr = cuCrealf(ker[n]), ki = cuCimagf(ker[n]);
                        float realAdd = kr * stepr - ki * stepi;
                        float imagAdd = kr * stepi + ki * stepr;
                        res[wrap_n] = cuCaddf(res[wrap_n], make_cuFloatComplex(realAdd, imagAdd));
                    }
                    return;
                } else if (best_score < 0 || score < best_score) {
                    // We've diverged: buf prev score in case it's global best
                    for (int n=0; n < dim; n++) {
                        wrap_n = (n + argmax) %% dim;
                        best_mdl[n] = mdl[n];
                        float kr = cuCrealf(ker[n]), ki = cuCimagf(ker[n]);
                        float realAdd = kr * stepr - ki * stepi;
                        float imagAdd = kr * stepi + ki * stepr;
                        best_res[wrap_n] = cuCaddf(res[wrap_n], make_cuFloatComplex(realAdd, imagAdd));
                    }
                    best_mdl[argmax] = cuCsubf(best_mdl[argmax], stepComplex);
                    best_score = score;
                    i = 0; // Reset maxiter counter
                }
            } else if (score > 0 && (score - nscore) / firstscore < tol) {
                // We're done
                return;
            } else if (not stop_if_div && (best_score < 0 || nscore < best_score)) {
                i = 0; // Reset maxiter counter
            }
            score = nscore;
            argmax = nargmax;
        }
        // If we end on maxiter, then make sure mdl/res reflect best score
        if (best_score > 0 && best_score < nscore) {
            for (int n=0; n < dim; n++) {
                mdl[n] = best_mdl[n];
                res[n] = best_res[n];
            }
        }
    }
    """

    code = """
    #pragma comment(linker, "/HEAP:40000000")
    #include <stdio.h>
    #include <cmath>
    __global__ void clean(float *resP, float *kerP, float *mdlP, int *areaP, int stop_if_div)
    {
        const int dim = %(DIM)s;
        const int maxiter = %(MAXITER)s;
        const double gain = %(GAIN)s;
        const double tol = %(TOL)s;
        const int index = blockDim.x * blockIdx.x + threadIdx.x;
        float *res = resP + index * %(DIM)s;
        float *ker = kerP + index * %(DIM)s;
        float *mdl = mdlP + index * %(DIM)s;
        int *area = areaP + index * %(DIM)s;
        float score=-1, nscore, best_score=-1;
        float max=0, mmax, val, mval, step, q=0, mq=0;
        float firstscore=-1;
        int argmax=0, nargmax=0, wrap_n;
        float best_mdl[%(DIM)s], best_res[%(DIM)s];
        // Compute gain/phase of kernel
        for (int n=0; n < dim; n++) {
            val = ker[n];
            mval = val * val;
            if (mval > mq && area[n]) {
                mq = mval;
                q = val;
            }
        }
        q = 1/q;
        // The clean loop
        for (int i=0; i < maxiter; i++) {
            nscore = 0;
            mmax = -1;
            step = (float) gain * max * q;
            mdl[argmax] += step;
            // Take next step and compute score
            for (int n=0; n < dim; n++) {
                wrap_n = (n + argmax) %% dim;
                res[wrap_n] -= ker[n] * step;
                val = res[wrap_n];
                mval = val * val;
                nscore += mval;
                if (mval > mmax && area[wrap_n]) {
                    nargmax = wrap_n;
                    max = val;
                    mmax = mval;
                }
            }
            nscore = sqrt(nscore / dim);
            if (firstscore < 0) firstscore = nscore;
            if (i > 10000) {
                printf("MY CLEAN Iter %%d: Max=(%%d), Score = %%f, Prev = %%f\\n", \
                       i, nargmax, (double) (nscore/firstscore), \
                       (double) (score/firstscore));
            }
            if (score > 0 && nscore > score) {
                if (stop_if_div) {
                    // We've diverged: undo last step and give up
                    mdl[argmax] -= step;
                    for (int n=0; n < dim; n++) {
                        wrap_n = (n + argmax) %% dim;
                        res[wrap_n] += ker[n] * step;
                    }
                    return;
                } else if (best_score < 0 || score < best_score) {
                    // We've diverged: buf prev score in case it's global best
                    for (int n=0; n < dim; n++) {
                        wrap_n = (n + argmax) %% dim;
                        best_mdl[n] = mdl[n];
                        best_res[wrap_n] = res[wrap_n] + ker[n] * step;
                    }
                    best_mdl[argmax] -= step;
                    best_score = score;
                    i = 0; // Reset maxiter counter
                }
            } else if (score > 0 && (score - nscore) / firstscore < tol) {
                // We're done
                return;
            } else if (!stop_if_div && (best_score < 0 || nscore < best_score)) {
                i = 0; // Reset maxiter counter
            }
            score = nscore;
            argmax = nargmax;
        }
        // If we end on maxiter, then make sure mdl/res reflect best score
        if (best_score > 0 && best_score < nscore) {
            for (int n=0; n < dim; n++) {
                mdl[n] = best_mdl[n];
                res[n] = best_res[n];
            }
        }
    }
    """

    code = code % {
        'DIM': dim,
        'MAXITER': maxiter,
        'GAIN': gain,
        'TOL': tol,
    }
    code_complex = code_complex % {
        'DIM': dim,
        'MAXITER': maxiter,
        'GAIN': gain,
        'TOL': tol,
    }

    if isComplex:
        mod = SourceModule(code_complex, options=["-fmad=false"])
    else:
        mod = SourceModule(code, options=["-fmad=false"])
    clean = mod.get_function("clean")

    res_pin = cuda.register_host_memory(res)
    ker_pin = cuda.register_host_memory(ker)
    mdl_pin = cuda.register_host_memory(mdl)
    area_pin = cuda.register_host_memory(area)

    res_gpu = cuda.mem_alloc(res.nbytes)
    ker_gpu = cuda.mem_alloc(ker.nbytes)
    mdl_gpu = cuda.mem_alloc(mdl.nbytes)
    area_gpu = cuda.mem_alloc(area.nbytes)

    cuda.memcpy_htod(res_gpu, res_pin)
    cuda.memcpy_htod(ker_gpu, ker_pin)
    cuda.memcpy_htod(mdl_gpu, mdl_pin)
    cuda.memcpy_htod(area_gpu, area_pin)

    clean.prepare("PPPPi")
    clean.prepared_call(grid, block, res_gpu, ker_gpu, mdl_gpu, area_gpu,
                        stop_if_div)

    cuda.memcpy_dtoh(res_pin, res_gpu)
    cuda.memcpy_dtoh(mdl_pin, mdl_gpu)

    if oneImg:
        return mdl_pin[0], res_pin[0]

    return mdl_pin, res_pin
stream = []
for i in range(nStreams):
    stream.append(cuda.Stream())

idata = np.tril(np.ones((N, N), dtype=np.float32))
odata = np.zeros_like(idata, dtype=np.float32)

idata_pin_list = []
odata_pin_list = []
for i in range(nStreams):
    i_slice = idata[i * N / nStreams:(i + 1) * N / nStreams]
    o_slice = odata[i * N / nStreams:(i + 1) * N / nStreams]
    idata_pin_list.append(cuda.register_host_memory(i_slice))
    odata_pin_list.append(cuda.register_host_memory(o_slice))
#print idata_pin_list[0]

# Using independent host->device, kernel, device->host streams?
idata_gpu_list = []
odata_gpu_list = []
for i in range(nStreams):
    idata_gpu_list.append(cuda.mem_alloc(idata.nbytes / nStreams))
    odata_gpu_list.append(cuda.mem_alloc(odata.nbytes / nStreams))
    cuda.memcpy_htod_async(idata_gpu_list[i], idata_pin_list[i])
    cuda.memcpy_htod_async(odata_gpu_list[i], odata_pin_list[i])
def MedianFilter(input=None, kernel_size=3, bw=32, bh=32):
    #s = cuda.Event()
    #e = cuda.Event()
    input_list = input
    BLOCK_WIDTH = bw
    BLOCK_HEIGHT = bh

    if isinstance(kernel_size, (int, long)):
        kernel_size = [kernel_size] * 2
    WS_x, WS_y = kernel_size
    padding_y = WS_x / 2
    padding_x = WS_y / 2

    input_list = np.asarray(input_list)
    if input_list.ndim == 3:
        _, N, M = input_list.shape
    elif input_list.ndim == 2:
        N, M = input_list.shape
        input_list = [input_list]

    expanded_N = N + (2 * padding_y)
    expanded_M = M + (2 * padding_x)

    gridx = int(np.ceil((expanded_N) / BLOCK_WIDTH)) + 1
    gridy = int(np.ceil((expanded_M) / BLOCK_HEIGHT)) + 1
    grid = (gridx, gridy, 1)
    block = (BLOCK_WIDTH, BLOCK_HEIGHT, 1)

    code = """
    #pragma comment(linker, "/HEAP:4000000")

    /* Some sample C code for the quickselect algorithm,
       taken from Numerical Recipes in C. */
    #define SWAP(a,b) temp=(a);(a)=(b);(b)=temp;

    __device__ float quickselect(float *arr, int n, int k) {
        unsigned long i,ir,j,l,mid;
        float a,temp;

        l=0;
        ir=n-1;
        for(;;) {
            if (ir <= l+1) {
                if (ir == l+1 && arr[ir] < arr[l]) {
                    SWAP(arr[l],arr[ir]);
                }
                return arr[k];
            }
            else {
                mid=(l+ir) >> 1;
                SWAP(arr[mid],arr[l+1]);
                if (arr[l] > arr[ir]) {
                    SWAP(arr[l],arr[ir]);
                }
                if (arr[l+1] > arr[ir]) {
                    SWAP(arr[l+1],arr[ir]);
                }
                if (arr[l] > arr[l+1]) {
                    SWAP(arr[l],arr[l+1]);
                }
                i=l+1;
                j=ir;
                a=arr[l+1];
                for (;;) {
                    do i++; while (arr[i] < a);
                    do j--; while (arr[j] > a);
                    if (j < i) break;
                    SWAP(arr[i],arr[j]);
                }
                arr[l+1]=arr[j];
                arr[j]=a;
                if (j >= k) ir=j-1;
                if (j <= k) l=i;
            }
        }
    }

    /* https://softwareengineering.stackexchange.com/questions/284767/kth-selection-routine-floyd-algorithm-489
     * Implementation from Stack Exchange user: Andy Dansby */
    __device__ float FloydWirth_kth(float arr[], const int kTHvalue)
    {
    #define F_SWAP(a,b) { float temp=(a);(a)=(b);(b)=temp; }
    #define SIGNUM(x) ((x) < 0 ? -1 : ((x) > 0 ? 1 : (x)))

        int left = 0;
        int right = %(WS^2)s - 1;
        int left2 = 0;
        int right2 = %(WS^2)s - 1;

        while (left < right)
        {
            if( arr[right2] < arr[left2] ) F_SWAP(arr[left2],arr[right2]);
            if( arr[right2] < arr[kTHvalue] ) F_SWAP(arr[kTHvalue],arr[right2]);
            if( arr[kTHvalue] < arr[left2] ) F_SWAP(arr[left2],arr[kTHvalue]);

            int rightleft = right - left;

            if (rightleft < kTHvalue)
            {
                int n = right - left + 1;
                int ii = kTHvalue - left + 1;
                int s = (n + n) / 3;
                int sd = (n * s * (n - s) / n) * SIGNUM(ii - n / 2);
                int left2 = max(left, kTHvalue - ii * s / n + sd);
                int right2 = min(right, kTHvalue + (n - ii) * s / n + sd);
            }

            float x=arr[kTHvalue];

            while ((right2 > kTHvalue) && (left2 < kTHvalue))
            {
                do
                {
                    left2++;
                }while (arr[left2] < x);

                do
                {
                    right2--;
                }while (arr[right2] > x);

                F_SWAP(arr[left2],arr[right2]);
            }
            left2++;
            right2--;

            if (right2 < kTHvalue)
            {
                while (arr[left2]<x)
                {
                    left2++;
                }
                left = left2;
                right2 = right;
            }

            if (kTHvalue < left2)
            {
                while (x < arr[right2])
                {
                    right2--;
                }
                right = right2;
                left2 = left;
            }

            if( arr[left] < arr[right] ) F_SWAP(arr[right],arr[left]);
        }

    #undef F_SWAP
    #undef SIGNUM
        return arr[kTHvalue];
    }

    texture<float, 2> tex;

    __global__ void mf(float* in, float* out, int imgDimY, int imgDimX)
    {
        float window[%(WS^2)s];

        int x_thread_offset = %(BY)s * blockIdx.x + threadIdx.x;
        int y_thread_offset = %(BX)s * blockIdx.y + threadIdx.y;

        for (int y = %(WSx/2)s + y_thread_offset; y < imgDimX - %(WSx/2)s; y += %(y_stride)s)
        {
            for (int x = %(WSy/2)s + x_thread_offset; x < imgDimY - %(WSy/2)s; x += %(x_stride)s)
            {
                int i = 0;
                for (int fx = 0; fx < %(WSy)s; ++fx)
                {
                    for (int fy = 0; fy < %(WSx)s; ++fy)
                    {
                        //window[i] = tex2D(tex, (float) (x + fx - %(WSy/2)s), (float) (y + fy - %(WSx/2)s));
                        window[i] = in[(x + fx - %(WSy/2)s) + (y + fy - %(WSx/2)s)*imgDimY];
                        i += 1;
                    }
                }

                // Sort to find the median
                //for (int j = 0; j < %(WS^2)s/2 + 1; j++)
                //{
                //    for (int k = j + 1; k < %(WS^2)s; k++)
                //    {
                //        if (window[j] > window[k])
                //        {
                //            float tmp = window[j];
                //            window[j] = window[k];
                //            window[k] = tmp;
                //        }
                //    }
                //}
                //out[y*imgDimY + x] = window[%(WS^2)s/2];
                out[y*imgDimY + x] = FloydWirth_kth(window, %(WS^2)s/2);
                //out[y*imgDimY + x] = quickselect(window, %(WS^2)s, %(WS^2)s/2);
            }
        }
    }

    __global__ void mf_shared(float *in, float* out, int imgDimY, int imgDimX)
    {
        const int TSx = %(BX)s + %(WSx)s - 1;
        const int TSy = %(BY)s + %(WSy)s - 1;
        __shared__ float tile[TSx][TSy];

        float window[%(WS^2)s];

        const int x_thread_offset = %(BX)s * blockIdx.x + threadIdx.x;
        const int y_thread_offset = %(BY)s * blockIdx.y + threadIdx.y;

        const int thread_index = blockDim.y * threadIdx.x + threadIdx.y;
        int imgX = blockIdx.x * blockDim.x + thread_index;
        int imgY;

        // Load into the tile for this block
        if (thread_index < TSx && imgX < imgDimX)
        {
            for (int i = 0; i < TSy && i < imgDimY - blockIdx.y * blockDim.y; i++)
            {
                imgY = blockIdx.y * blockDim.y + i;
                tile[thread_index][i] = in[imgX * imgDimY + imgY];
                //tile[thread_index][i] = tex2D(tex, (float) imgY, (float) imgX);
            }
        }
        __syncthreads();

        int x = %(WSx/2)s + x_thread_offset;
        int y = %(WSy/2)s + y_thread_offset;

        if (x >= imgDimX - %(WSx/2)s || y >= imgDimY - %(WSy/2)s)
        {
            return;
        }

        int i = 0;
        for (int fx = 0; fx < %(WSx)s; ++fx)
        {
            for (int fy = 0; fy < %(WSy)s; ++fy)
            {
                window[i++] = tile[threadIdx.x + fx][threadIdx.y + fy];
            }
        }

        // Sort to find the median
        //for (int j = 0; j <= %(WS^2)s/2; j++)
        //{
        //    for (int k = j + 1; k < %(WS^2)s; k++)
        //    {
        //        if (window[j] > window[k])
        //        {
        //            float tmp = window[j];
        //            window[j] = window[k];
        //            window[k] = tmp;
        //        }
        //    }
        //}
        //out[x*imgDimY + y] = window[%(WS^2)s/2];
        out[x*imgDimY + y] = FloydWirth_kth(window, %(WS^2)s/2);
        //forgetfulSelection(window, %(WSx)s);
        //out[x*imgDimY + y] = window[%(WS^2)s/2];
        //out[x*imgDimY + y] = myForgetfulSelection(window);
    }
    """

    code = code % {
        'BY': BLOCK_WIDTH,
        'BX': BLOCK_HEIGHT,
        'WS^2': WS_x * WS_y,
        'x_stride': BLOCK_WIDTH * gridx,
        'y_stride': BLOCK_HEIGHT * gridy,
        'WSx': WS_x,
        'WSy': WS_y,
        'WSx/2': WS_x / 2,
        'WSy/2': WS_y / 2,
    }
    mod = SourceModule(code)
    #mf_shared = mod.get_function('mf_shared')
    mf = mod.get_function('mf')
    texref = mod.get_texref("tex")

    # NSTREAMS := NUMBER OF INPUT IMAGES
    nStreams = len(input_list)

    # Initialize the streams
    stream = [cuda.Stream()] * nStreams

    # Pad all the images with zeros
    input_list = [np.array(np.pad(img, ((padding_y, padding_y), (padding_x, padding_x)),
                                  'constant', constant_values=0),
                           dtype=np.float32) for img in input_list]

    # Use pinned memory for all the images
    in_pin_list = [cuda.register_host_memory(img) for img in input_list]
    imgBytes = in_pin_list[0].nbytes

    # Initialize the outputs to empty images (assuming all images are of the same shape)
    outdata_list = [cuda.pagelocked_empty_like(img) for img in input_list]

    # Malloc on the GPU for each input and output image
    #in_gpu_list = [cuda.mem_alloc(pinnedImg.nbytes) for pinnedImg in in_pin_list]
    in_gpu_list = [None] * nStreams
    #out_gpu_list = [cuda.mem_alloc(pinnedImg.nbytes) for pinnedImg in in_pin_list]
    out_gpu_list = [None] * nStreams

    mf.prepare("PPii")
    for i in xrange(nStreams + 2):
        ii = i - 1
        iii = i - 2

        if 0 <= iii < nStreams:
            st = stream[iii]
            cuda.memcpy_dtoh_async(outdata_list[iii], out_gpu_list[iii], stream=st)

        if 0 <= ii < nStreams:
            st = stream[ii]
            out_gpu_list[ii] = cuda.mem_alloc(imgBytes)

            # s.record(stream=stream[0])
            # mf_shared.prepare("Pii")
            # mf_shared.prepared_async_call(grid, block, st, out_gpu_list[ii], expanded_M, expanded_N)

            #mf.prepare("PPii")
            mf.prepared_async_call(grid, block, st, in_gpu_list[ii],
                                   out_gpu_list[ii], expanded_M, expanded_N)

            # e.record(stream=stream[0])
            # e.synchronize()
            # print s.time_till(e), "ms for the kernel"

        if 0 <= i < nStreams:
            st = stream[i]
            #cuda.matrix_to_texref(in_pin_list[i], texref, order="C")
            in_gpu_list[i] = cuda.mem_alloc(imgBytes)
            cuda.memcpy_htod_async(in_gpu_list[i], in_pin_list[i], stream=st)

    if (padding_y > 0):
        outdata_list = [out[padding_y:-padding_y] for out in outdata_list]
    if (padding_x > 0):
        outdata_list = [out[:, padding_x:-padding_x] for out in outdata_list]

    return outdata_list
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy as np
import skcuda.linalg as linalg
import skcuda.misc as misc
import pycuda.driver as cuda

s = cuda.Event()
e = cuda.Event()
s.record()

linalg.init()

M = 4096
N = 4096

A = np.asarray(np.random.rand(M, N), dtype=np.float32)
B = np.asarray(np.random.rand(N, M), dtype=np.float32)

A_pin = cuda.register_host_memory(A)
B_pin = cuda.register_host_memory(B)

A_gpu = gpuarray.to_gpu(A)
B_gpu = gpuarray.to_gpu(B)

C_gpu = linalg.dot(A_gpu, B_gpu)
#print np.allclose(np.dot(A,B), C_gpu.get())

e.record()
e.synchronize()
print s.time_till(e) / 1000., "s"
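# The snippet above registers A and B as pinned memory but then uploads them
# with gpuarray.to_gpu, a synchronous copy that does not need pinning. A hedged
# sketch of an upload that actually exploits the page-locked buffers would use
# the asynchronous variant on a stream (the stream variable is an assumption,
# not part of the original code):
stream = cuda.Stream()
A_gpu = gpuarray.to_gpu_async(A_pin, stream=stream)
B_gpu = gpuarray.to_gpu_async(B_pin, stream=stream)
stream.synchronize()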
def convert_image_rgb(self, image):
    global program
    start = time.time()
    iplanes = image.get_planes()
    w = image.get_width()
    h = image.get_height()
    stride = image.get_rowstride()
    pixels = image.get_pixels()
    debug("convert_image(%s) planes=%s, pixels=%s, size=%s", image, iplanes,
          type(pixels), len(pixels))
    assert iplanes == ImageWrapper.PACKED, "must use packed format as input"
    assert image.get_pixel_format() == self.src_format, \
        "invalid source format: %s (expected %s)" % (
            image.get_pixel_format(), self.src_format)
    divs = get_subsampling_divs(self.dst_format)

    #copy packed rgb pixels to GPU:
    upload_start = time.time()
    stream = driver.Stream()
    mem = numpy.frombuffer(pixels, dtype=numpy.byte)
    in_buf = driver.mem_alloc(len(pixels))
    hmem = driver.register_host_memory(
        mem, driver.mem_host_register_flags.DEVICEMAP)
    pycuda.driver.memcpy_htod_async(in_buf, mem, stream)

    out_bufs = []
    out_strides = []
    out_sizes = []
    for i in range(3):
        x_div, y_div = divs[i]
        out_stride = roundup(self.dst_width / x_div, 4)
        out_height = roundup(self.dst_height / y_div, 2)
        out_buf, out_stride = driver.mem_alloc_pitch(out_stride, out_height, 4)
        out_bufs.append(out_buf)
        out_strides.append(out_stride)
        out_sizes.append((out_stride, out_height))
    #ensure uploading has finished:
    stream.synchronize()
    #we can now unpin the host memory:
    hmem.base.unregister()
    debug("allocation and upload took %.1fms",
          1000.0 * (time.time() - upload_start))

    kstart = time.time()
    kargs = [in_buf, numpy.int32(stride)]
    for i in range(3):
        kargs.append(out_bufs[i])
        kargs.append(numpy.int32(out_strides[i]))
    blockw, blockh = 16, 16
    #figure out how many pixels we process at a time in each dimension:
    xdiv = max([x[0] for x in divs])
    ydiv = max([x[1] for x in divs])
    gridw = max(1, w / blockw / xdiv)
    if gridw * 2 * blockw < w:
        gridw += 1
    gridh = max(1, h / blockh / ydiv)
    if gridh * 2 * blockh < h:
        gridh += 1
    debug("calling %s%s, with grid=%s, block=%s", self.kernel_function_name,
          tuple(kargs), (gridw, gridh), (blockw, blockh, 1))
    self.kernel_function(*kargs, block=(blockw, blockh, 1), grid=(gridw, gridh))

    #we can now free the GPU source buffer:
    in_buf.free()
    kend = time.time()
    debug("%s took %.1fms", self.kernel_function_name, (kend - kstart) * 1000.0)
    self.frames += 1

    #copy output YUV channel data to host memory:
    read_start = time.time()
    pixels = []
    strides = []
    for i in range(3):
        x_div, y_div = divs[i]
        out_size = out_sizes[i]
        #direct full plane async copy keeping current GPU padding:
        plane = driver.aligned_empty(out_size, dtype=numpy.byte)
        driver.memcpy_dtoh_async(plane, out_bufs[i], stream)
        pixels.append(plane.data)
        stride = out_strides[min(len(out_strides) - 1, i)]
        strides.append(stride)
    stream.synchronize()
    #the copying has finished, we can now free the YUV GPU memory:
    #(the host memory will be freed by GC when 'pixels' goes out of scope)
    for out_buf in out_bufs:
        out_buf.free()
    self.cuda_context.synchronize()
    read_end = time.time()
    debug("strides=%s", strides)
    debug("read back took %.1fms, total time: %.1f",
          (read_end - read_start) * 1000.0, 1000.0 * (time.time() - start))
    return ImageWrapper(0, 0, self.dst_width, self.dst_height, pixels,
                        self.dst_format, 24, strides,
                        planes=ImageWrapper._3_PLANES)
import numpy as np

s = cuda.Event()
e = cuda.Event()
s.record()

TILE_DIM = 32
BLOCK_ROWS = 8

N = 32 * 1024
idata = np.tril(np.ones((N, N), dtype=np.float32))
odata = np.empty_like(idata, dtype=np.float32)

idata_pin = cuda.register_host_memory(idata)
odata_pin = cuda.register_host_memory(odata)

#s.record()
idata_gpu = cuda.mem_alloc(idata.nbytes)
odata_gpu = cuda.mem_alloc(odata.nbytes)
cuda.memcpy_htod_async(idata_gpu, idata_pin)
cuda.memcpy_htod_async(odata_gpu, odata_pin)
#e.record()
#e.synchronize()
#print s.time_till(e)

code = """
    context.append(device[i].make_context())

# init streams
for i in range(0, n_devs):
    # activate context for current device
    context[i].push()
    # create streams
    stream.append(cuda.Stream())
    # deactivate context for current device
    context[i].pop()

# activate the first context and register memory on host
context[0].push()
# define matrices in which each dev will compute a slice of the final matrix
j_regs = []
i_reg = cuda.register_host_memory(image, 1)
for dev in device:
    # each slice that will contain the partial matrix of the final image
    # needs to be L rows greater than the fraction of n_rows/n_devs to
    # correctly compute elements at the edges
    image_final_gpu_slice = np.empty(shape=(imageDims[0]//n_devs + L, imageDims[1]),
                                     dtype=np.float32)
    j_reg = cuda.register_host_memory(image_final_gpu_slice, 1)
    j_regs.append(j_reg)
    # here flag 1 stands for cudaHostRegisterPortable, which means that the
    # pinned memory will be available to all contexts at the same time
context[0].pop()

# split the data in array 'image' into equal parts
# and transfer the data to the GPUs asynchronously
for i in range(0, n_devs):
    # input matrix is divided in vertical direction