def _apply_transform_cp(self): source_shape_y: cp.int32 = cp.int32(self._source_cp.shape[0]) source_shape_x: cp.int32 = cp.int32(self._source_cp.shape[1]) frame_shape_y: cp.int32 = cp.int32(self.frame_shape.y) frame_shape_x: cp.int32 = cp.int32(self.frame_shape.x) matrix = cp.asarray(self._matrix, dtype=cp.float32) vector = cp.asarray(self._vector, dtype=cp.float32) frame_to_source_fp32 = cp.matmul(self._frame_pixels_cp, matrix.T) + vector frame_to_source_int32 = cp.rint(frame_to_source_fp32).astype(cp.int32) # for 3D array: 0 is x, 1 is y source_x = frame_to_source_int32[:, :, 0] # 2D array of x pixel in source source_y = frame_to_source_int32[:, :, 1] # 2D array of y pixel in source # boolean 2-D array of portal pixels that map to a pixel on the source # will be false if the pixel on source would fall outside of source mapped = (source_y >= 0) & (source_y < source_shape_y) & \ (source_x >= 0) & (source_x < source_shape_x) frame_rgba = cp.zeros(shape=(frame_shape_y, frame_shape_x, 4), dtype=cp.uint8) # if cp.all(mapped): # frame_rgba[:, :] = self._source_cp[source_y, source_x, :] # else: # frame_rgba[mapped, :] = self._source_cp[source_y[mapped], source_x[mapped], :] frame_rgba[mapped, :] = self._source_cp[source_y[mapped], source_x[mapped], :] # self.image_rgba[~mapped, :] = self._zero_uint probably slower than just zeroing everything first self._frame_rgba = cp.asnumpy(frame_rgba)
def _numba_psf(in_x, in_y, psf, spacing_x, spacing_y, O_begin, O_end, Nx, Ny, width_x, width_y, mesh_ratio): btx, bty = cuda.grid(2) x = in_x[bty, btx] y = in_y[bty, btx] temp = 0 for order in range(cupy.int32(O_begin), cupy.int32(O_end), 1): if order == 0: continue intensity = (math.sin(order / mesh_ratio) / (order / mesh_ratio))**2 for i in range(cupy.int32(len(spacing_x))): dx = spacing_x[i] dy = spacing_y[i] x_centered = x - (0.5 * Nx + dx * order + 0.5) y_centered = y - (0.5 * Ny + dy * order + 0.5) temp += math.exp(-width_x * x_centered * x_centered - width_y * y_centered * y_centered) * intensity psf[bty, btx] = temp
def load_image(path_dir, mode): # ----------------------------------------------------------------------------- if mode: # 学習用データセットの準備 fnames = sorted(glob.glob(path_dir + "/*")) # file = [(cp.load(f)/255.0).astype(cp.float32).transpose([2, 0, 1]) for f in fnames] file = [] for index, item in enumerate(fnames): item = cp.load(item).astype(cp.float32).transpose([2, 0, 1]) / 255.0 file.append(item) # label = [os.path.basename(f) for f in fnames] labels = [] for num in range(len(fnames) / 2): labels.append(cp.int32(num)) labels.append(cp.int32(num)) # List = list(zip(file, labels)) # data = chainer.datasets.TupleDataset(List) data = chainer.datasets.TupleDataset(file, labels) return data # ------------------------------------------------------------------------------- if not mode: # 識別用データセットの準備 data = [] labels = [] filelist = sorted(glob.glob(path_dir + "/*")) for index, item in enumerate(filelist): item = np.load(item).astype(np.float32).transpose([2, 0, 1]) / 255.0 item = item.reshape((1, item.shape[0], item.shape[1], item.shape[2])) data.append(item) labels.append(int(index)) return data, labels
def conv_forward_pass(image, filt, bias): # print("CUDA cODE") (n_f, n_c_f, f, _) = filt.shape # filter dimensions n_c, in_dim, _ = image.shape # image dimensions convfilter_kernel = cp.RawKernel(r''' extern "C" __global__ void my_conv(const float* img, const float * filt, const float * bias, float * out, const int depth, const int img_dim, const int f, const int out_dim ) { const unsigned int j = blockDim.x*blockIdx.x + threadIdx.x; const unsigned int i = blockDim.y*blockIdx.y + threadIdx.y; const unsigned int pp = blockIdx.z; const unsigned int dp = depth; //printf("%d,%d ::: ",i,j); if(i<img_dim && j<img_dim){ unsigned int oPixelPos = i*out_dim + j+ pp*out_dim*out_dim; out[oPixelPos] = 0; for(int h=0;h<dp;h++){ for(int k=0;k<f;k++){ for(int l=0;l<f;l++){ unsigned int iPixelPos = ((i+k)*img_dim + (j+l)) + img_dim*img_dim*h; unsigned int filtPos = (k*f+l) + f*f*h + f*f*dp*pp; out[oPixelPos] += img[iPixelPos] * filt[filtPos] + bias[pp]; } } } } } ''', 'my_conv') out_dim = int(in_dim - f)+1 # calculate output dimensions img_gpu = cp.asarray(image, dtype=cp.float32) #28X28 img_dim_gpu = cp.int32(in_dim) #28 filt_size_gpu = cp.int32(f) # 5 out_dim_gpu = cp.int32(out_dim) #24 depth_gpu = cp.int32(n_c) threads_per_block = f num_blocks = (in_dim//f) +1 filt_gpu = cp.asarray(filt, dtype=cp.float32) #5X5 # print(filt) # filt_gpu = cp.zeros((n_f, n_c_f, f, f), dtype=cp.float32) #5X5 filt_gpu = cp.asarray(filt_gpu.flatten(), dtype=cp.float32) # print(filt_gpu) # out_gpu = cp.zeros((out_dim,out_dim,n_f), dtype=cp.float32) # 24 X 24 out_gpu = cp.zeros((n_f,out_dim,out_dim), dtype=cp.float32) # 24 X 24 out_gpu = out_gpu.flatten() bias_gpu = cp.asarray(bias, dtype=cp.float32) #5X5 bias_gpu = bias_gpu.flatten() convfilter_kernel((num_blocks,num_blocks,8), (threads_per_block,threads_per_block), (img_gpu, filt_gpu, bias_gpu, out_gpu, depth_gpu, img_dim_gpu, filt_size_gpu, out_dim_gpu)) # # convfilter_kernel((576,), (24,5), (img_gpu, filt_gpu, out_gpu, depth_gpu, img_dim_gpu, filt_size_gpu, out_dim_gpu)) out_gpu = cp.reshape(out_gpu,(n_f,out_dim,out_dim)) output = cp.asnumpy(out_gpu) return output
def prominent_peaks_optimized(img, min_xdistance=1, min_ydistance=1, threshold=None, num_peaks=cp.inf): """Return peaks with non-maximum suppression. Identifies most prominent features separated by certain distances. Non-maximum suppression with different sizes is applied separately in the first and second dimension of the image to identify peaks. Parameters ---------- image : (M, N) ndarray Input image. min_xdistance : int Minimum distance separating features in the x dimension. min_ydistance : int Minimum distance separating features in the y dimension. threshold : float Minimum intensity of peaks. Default is `0.5 * max(image)`. num_peaks : int Maximum number of peaks. When the number of peaks exceeds `num_peaks`, return `num_peaks` coordinates based on peak intensity. Returns ------- intensity, xcoords, ycoords : tuple of array Peak intensity values, x and y indices. Notes ----- Modified from https://github.com/mritools/cupyimg _prominent_peaks method """ THREADS_PER_BLOCK = (32, 1) # Each thread is responsible for a (min_ydistance * min_xdistance) patch # THREADS_PER_BLOCK is in the order of (x, y), but img.shape is in the order of (y, x) NUM_BLOCKS = (img.shape[1] // (THREADS_PER_BLOCK[0] * min_xdistance) + ((img.shape[1] % (THREADS_PER_BLOCK[0] * min_xdistance)) > 0), img.shape[0] // (THREADS_PER_BLOCK[1] * min_ydistance) + ((img.shape[0] % (THREADS_PER_BLOCK[1] * min_ydistance)) > 0)) NUM_THREADS = np.multiply(THREADS_PER_BLOCK, NUM_BLOCKS) elems = (NUM_THREADS[0] * NUM_THREADS[1], ) intensity, xcoords, ycoords = cp.zeros(elems, dtype=cp.float32), cp.zeros( elems, dtype=cp.int32), cp.zeros(elems, dtype=cp.int32) prominent_peaks_kernel( NUM_BLOCKS, THREADS_PER_BLOCK, (img, cp.int32(img.shape[0]), cp.int32( img.shape[1]), cp.int32(min_xdistance), cp.int32(min_ydistance), cp.float32(threshold), intensity, xcoords, ycoords)) indices = intensity != 0.0 return intensity[indices], xcoords[indices], ycoords[indices]
def forward(self, one, two): rbot0 = one.new_zeros([ one.shape[0], one.shape[2] + 40, one.shape[3] + 40, one.shape[1] ]) rbot1 = one.new_zeros([ one.shape[0], one.shape[2] + 40, one.shape[3] + 40, one.shape[1] ]) one = one.contiguous(); assert(one.is_cuda == True) two = two.contiguous(); assert(two.is_cuda == True) output = one.new_zeros([ one.shape[0], 441, one.shape[2], one.shape[3] ]) if one.is_cuda == True: n = one.shape[2] * one.shape[3] cupy_launch('kernel_Correlation_rearrange', cupy_kernel('kernel_Correlation_rearrange', { 'input': one, 'output': rbot0 }))( grid=tuple([ int((n + 16 - 1) / 16), one.shape[1], one.shape[0] ]), block=tuple([ 16, 1, 1 ]), args=[ cupy.int32(n), one.data_ptr(), rbot0.data_ptr() ] ) n = two.shape[2] * two.shape[3] cupy_launch('kernel_Correlation_rearrange', cupy_kernel('kernel_Correlation_rearrange', { 'input': two, 'output': rbot1 }))( grid=tuple([ int((n + 16 - 1) / 16), two.shape[1], two.shape[0] ]), block=tuple([ 16, 1, 1 ]), args=[ cupy.int32(n), two.data_ptr(), rbot1.data_ptr() ] ) n = output.shape[1] * output.shape[2] * output.shape[3] cupy_launch('kernel_Correlation_updateOutput', cupy_kernel('kernel_Correlation_updateOutput', { 'rbot0': rbot0, 'rbot1': rbot1, 'top': output }))( grid=tuple([ output.shape[3], output.shape[2], output.shape[0] ]), block=tuple([ 32, 1, 1 ]), shared_mem=one.shape[1] * 4, args=[ cupy.int32(n), rbot0.data_ptr(), rbot1.data_ptr(), output.data_ptr() ] ) elif one.is_cuda == False: raise NotImplementedError() # end self.save_for_backward(one, two, rbot0, rbot1) return output
def backward(self, gradOutput): one, two, rbot0, rbot1 = self.saved_tensors gradOutput = gradOutput.contiguous(); assert(gradOutput.is_cuda == True) gradOne = one.new_zeros([ one.shape[0], one.shape[1], one.shape[2], one.shape[3] ]) if self.needs_input_grad[0] == True else None gradTwo = one.new_zeros([ one.shape[0], one.shape[1], one.shape[2], one.shape[3] ]) if self.needs_input_grad[1] == True else None if one.is_cuda == True: if gradOne is not None: for intSample in range(one.shape[0]): n = one.shape[1] * one.shape[2] * one.shape[3] cupy_launch('kernel_Correlation_updateGradOne', cupy_kernel('kernel_Correlation_updateGradOne', { 'rbot0': rbot0, 'rbot1': rbot1, 'gradOutput': gradOutput, 'gradOne': gradOne, 'gradTwo': None }))( grid=tuple([ int((n + 512 - 1) / 512), 1, 1 ]), block=tuple([ 512, 1, 1 ]), args=[ cupy.int32(n), intSample, rbot0.data_ptr(), rbot1.data_ptr(), gradOutput.data_ptr(), gradOne.data_ptr(), None ] ) # end # end if gradTwo is not None: for intSample in range(one.shape[0]): n = one.shape[1] * one.shape[2] * one.shape[3] cupy_launch('kernel_Correlation_updateGradTwo', cupy_kernel('kernel_Correlation_updateGradTwo', { 'rbot0': rbot0, 'rbot1': rbot1, 'gradOutput': gradOutput, 'gradOne': None, 'gradTwo': gradTwo }))( grid=tuple([ int((n + 512 - 1) / 512), 1, 1 ]), block=tuple([ 512, 1, 1 ]), args=[ cupy.int32(n), intSample, rbot0.data_ptr(), rbot1.data_ptr(), gradOutput.data_ptr(), None, gradTwo.data_ptr() ] ) # end # end elif one.is_cuda == False: raise NotImplementedError() # end return gradOne, gradTwo
def CFAR_CA_GPU(signal_ext, origSignalLen, guardBandLen_1side, validSampLen_1side, scratchPad, noiseMargin, outputBoolVector): thrdID = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x if (thrdID < origSignalLen - 1) or (thrdID > 2 * origSignalLen - 2): return # check for local maxima on the CUT i.e. signal_ext[thrdID] if (signal_ext[thrdID] >= signal_ext[thrdID - 1]) and ( signal_ext[thrdID] >= signal_ext[thrdID + 1]): count = cp.int32(0) for i in range(thrdID - guardBandLen_1side - validSampLen_1side, thrdID - guardBandLen_1side): # scratchPad[count] = signal_ext[i]; # This should not be done. There should be a separate scratch pad for each thread when it is vector/matrix copying scratchPad[thrdID - (origSignalLen - 1), count] = signal_ext[i] count += 1 for j in range(thrdID + guardBandLen_1side + 1, thrdID + guardBandLen_1side + validSampLen_1side + 1): # scratchPad[count] = signal_ext[j]; # This should not be done. There should be a separate scratch pad for each thread when it is vector/matrix copying scratchPad[thrdID - (origSignalLen - 1), count] = signal_ext[j] count += 1 avgNoisePower = cp.float32(0) for ele in range(2 * validSampLen_1side): avgNoisePower += scratchPad[thrdID - (origSignalLen - 1), ele] avgNoisePower = avgNoisePower / (2 * validSampLen_1side) if (signal_ext[thrdID] > noiseMargin * avgNoisePower): outputBoolVector[thrdID - (origSignalLen - 1)] = 1
def compress(self, tensor, name): shape = tensor.size() tensor_flatten = tensor.flatten() cupy_tensor = cupy.fromDlpack(to_dlpack(tensor_flatten)) tensor_cast = cupy_tensor.view(cupy.int32) sign = tensor_cast & cupy.int32(0b10000000000000000000000000000000) exp = tensor_cast & cupy.int32(0b01111111100000000000000000000000) mantissa = tensor_cast & cupy.int32(0b00000000011111111111111111111111) exp_add_one = mantissa > cupy.random.randint(low=0, high=0b00000000011111111111111111111111, size=cupy_tensor.shape, dtype=cupy.int32) exponent = cupy.where(exp_add_one, exp + 0b00000000100000000000000000000000, exp) exp_shift = cupy.clip(exponent, a_min=0b00001001000000000000000000000000, a_max=0b01001000100000000000000000000000) exps = cupy.right_shift(exp_shift, 23) exps = cupy.bitwise_or(cupy.right_shift(sign, 24), exps - 18) tensor_compressed = exps.astype(cupy.uint8) return [from_dlpack(tensor_compressed.toDlpack())], shape
def Q_inner_product_cupy(Q, A, start_indices, window_size): num_time_points, num_lms = Q.shape num_extrinsic_samples, _ = A.shape assert not cupy.isfortran(Q) assert not cupy.isfortran(A) out = cupy.empty( (num_extrinsic_samples, window_size), dtype=cupy.complex128, order="C", ) global _cuda_code if _cuda_code is None: # it's assumed that cuda_Q_inner_product.cu is placed in the same folder as this code path = os.path.join(os.path.dirname(__file__), 'cuda_Q_inner_product.cu') # alternative to deal with packaging in another directory if not (os.path.isfile(path)): path = os.path.join( os.path.split(os.path.dirname(__file__))[0], 'cuda_Q_inner_product.cu') with open(path, 'r') as f: _cuda_code = f.read() Q_prod_fn = cupy.RawKernel(_cuda_code, "Q_inner") else: Q_prod_fn = cupy.RawKernel(_cuda_code, "Q_inner") float_prec = 16 num_threads_x = 4 num_threads_y = 1024 // 4 block_size = num_threads_x, num_threads_y, 0 grid_size = ( (num_extrinsic_samples + num_threads_x - 1) // num_threads_x, 0, 0, ) args = ( Q, A, start_indices, window_size, num_time_points, num_extrinsic_samples, num_lms, out, ) Q_prod_fn( grid_size, block_size, args, shared_mem=cupy.int32(num_threads_x * num_lms * float_prec), ) return out
def test_const_memory(self): mod = cupy.RawModule(code=test_const_mem, backend=self.backend) ker = mod.get_function('multiply_by_const') mem_ptr = mod.get_global('some_array') const_arr = cupy.ndarray((100, ), cupy.float32, mem_ptr) data = cupy.arange(100, dtype=cupy.float32) const_arr[...] = data output_arr = cupy.ones(100, dtype=cupy.float32) ker((1, ), (100, ), (output_arr, cupy.int32(100))) assert (data == output_arr).all()
def xengine_full(signalsFX, validFX, signalsFY, validFY, blockDim=(4,16)): """ X-engine for the outputs of fengine(). """ nStand, nChan, nWin = signalsFX.shape nBL = nStand*(nStand+1) // 2 with cupy.cuda.Stream(): try: signalsFX = cupy.asarray(signalsFX) signalsFY = cupy.asarray(signalsFY) validFX = cupy.asarray(validFX) validFY = cupy.asarray(validFY) except cupy.cuda.memory.OutOfMemoryError: _CACHE.free() signalsFX = cupy.asarray(signalsFX) signalsFY = cupy.asarray(signalsFY) validFX = cupy.asarray(validFX) validFY = cupy.asarray(validFY) try: combined = _CACHE[(2*nStand,nChan,nWin,numpy.complex64)] valid = _CACHE[(2*nStand,nWin,numpy.uint8)] except KeyError: combined = cupy.empty((2*nStand,nChan,nWin), dtype=numpy.complex64) valid = cupy.empty((2*nStand,nWin), dtype=numpy.uint8) _CACHE[(2*nStand,nChan,nWin,numpy.complex64)] = combined _CACHE[(2*nStand,nWin,numpy.uint8)] = valid nct, nwt = blockDim ncb = int(numpy.ceil(nChan/nct)) nwb = int(numpy.ceil(nWin/nwt)) _INTERLEAVE((nStand,ncb,nwb), (nct,nwt), (signalsFX, signalsFY, validFX, validFY, cupy.int32(nStand), cupy.int32(nChan), cupy.int32(nWin), combined, valid)) try: output = _CACHE[(4,nBL,nChan,numpy.complex64)] except KeyError: output = cupy.empty((4,nBL,nChan), dtype=numpy.complex64) _CACHE[(4,nBL,nChan,numpy.complex64)] = output nbt, nct = blockDim nbb = int(numpy.ceil(nBL/nbt)) ncb = int(numpy.ceil(nChan/nct)) _XENGINE3((nbb,ncb), (nbt, nct), (combined, valid, cupy.int32(nStand), cupy.int32(nBL), cupy.int32(nChan), cupy.int32(nWin), output)) output_cpu = cupy.asnumpy(output) return output_cpu[0,:,:], output_cpu[1,:,:], output_cpu[2,:,:], output_cpu[3,:,:]
def start(self, rand_seed=None): if rand_seed is None: rand_seed = np.random.randint(1e5) self.nPh = int(self.nPh) self._reset_results() self._generate_initial_coodinate(self.nPh) M = np.int32(self.model.voxel_model.shape[1]) L = np.int32(self.model.voxel_model.shape[2]) print("") print("###### Start (Random seed: %s) ######" % rand_seed) print("") start_ = time.time() cp.get_default_memory_pool().free_all_blocks() cp.get_default_pinned_memory_pool().free_all_blocks() add_ = cp.asarray(self.add.astype(np.int32), dtype=np.int32) p_ = cp.asarray(self.p.astype(np.float32), dtype=np.float32) v_ = cp.asarray(self.v.astype(np.float32), dtype=np.float32) w_ = cp.asarray(self.w.astype(np.float32), dtype=np.float32) ma_ = cp.asarray(self.model.ma.astype(np.float32)) ms_ = cp.asarray(self.model.ms.astype(np.float32)) n_ = cp.asarray(self.model.n.astype(np.float32)) g_ = cp.asarray(self.model.g.astype(np.float32)) v_model = cp.asarray(self.model.voxel_model.astype(np.int8), dtype=np.int8) l_ = cp.float32(self.model.voxel_space) nph = cp.int32(self.nPh) end_p = cp.int8(self.model.end_point) func((int((self.nPh + self.threadnum - 1) / self.threadnum), 1), (self.threadnum, 1), (add_, p_, v_, w_, ma_, ms_, n_, g_, v_model, l_, M, L, nph, end_p, np.int32(rand_seed))) self.add = cp.asnumpy(add_) self.p = cp.asnumpy(p_) self.v = cp.asnumpy(v_) self.w = cp.asnumpy(w_) del add_, p_, v_, w_, ma_, ms_, n_, g_, del v_model, l_, M, L, nph, end_p, rand_seed, cp.get_default_memory_pool().free_all_blocks() cp.get_default_pinned_memory_pool().free_all_blocks() gc.collect() self._end_process() print("###### End ######") self.getRdTtRate() calTime(time.time(), start_) return self
def xengine(signalsF1, validF1, signalsF2, validF2, blockDim=(4,16)): """ X-engine for the outputs of fengine(). """ nStand, nChan, nWin = signalsF1.shape nBL = nStand*(nStand+1) // 2 with cupy.cuda.Stream(): try: signalsF1 = cupy.asarray(signalsF1) signalsF2 = cupy.asarray(signalsF2) validF1 = cupy.asarray(validF1) validF2 = cupy.asarray(validF2) except cupy.cuda.memory.OutOfMemoryError: _CACHE.free() signalsF1 = cupy.asarray(signalsF1) signalsF2 = cupy.asarray(signalsF2) validF1 = cupy.asarray(validF1) validF2 = cupy.asarray(validF2) try: output = _CACHE[(1,nBL,nChan,numpy.complex64)] except KeyError: output = cupy.empty((nBL,nChan), dtype=numpy.complex64) _CACHE[(1,nBL,nChan,numpy.complex64)] = output nbt, nct = blockDim nbb = int(numpy.ceil(nBL/nbt)) ncb = int(numpy.ceil(nChan/nct)) _XENGINE2((nbb,ncb), (nbt, nct), (signalsF1, signalsF2, validF1, validF2, cupy.int32(nStand), cupy.int32(nBL), cupy.int32(nChan), cupy.int32(nWin), output)) output_cpu = cupy.asnumpy(output) return output_cpu
def image_to_data(image, code): # 大きさを揃える img64 = image.resize((64, 64)) # 画像を数値配列に pixels = xp.array(img64, dtype=xp.float32) pixels = pixels.reshape((1, 64, 64)) pixels /= 255 # インデックスを決める if code in all_labels: label = all_labels[code] else: label = len(all_labels) all_labels[code] = label return (pixels, xp.int32(label))
def test_shfl_width(self): @jit.rawkernel() def f(a, b, w): laneId = jit.threadIdx.x & 0x1f value = jit.shfl_sync(0xffffffff, b[jit.threadIdx.x], 0, width=w) b[laneId] = value c = cupy.arange(32, dtype=cupy.int32) for w in (2, 4, 8, 16, 32): a = cupy.int32(100) b = cupy.arange(32, dtype=cupy.int32) f[1, 32](a, b, w) c[c % w != 0] = c[c % w == 0] assert (b == c).all()
def _call_nms_kernel(bbox, thresh): n_bbox = bbox.shape[0] threads_per_block = 64 col_blocks = np.ceil(n_bbox / threads_per_block).astype(np.int32) blocks = (col_blocks, col_blocks, 1) threads = (threads_per_block, 1, 1) mask_dev = cp.zeros((n_bbox * col_blocks,), dtype=np.uint64) bbox = cp.ascontiguousarray(bbox, dtype=np.float32) kern = _load_kernel('nms_kernel', _nms_gpu_code) kern(blocks, threads, args=(cp.int32(n_bbox), cp.float32(thresh), bbox, mask_dev)) mask_host = mask_dev.get() selection, n_selec = _nms_gpu_post( mask_host, n_bbox, threads_per_block, col_blocks) return selection, n_selec
def _call_nms_kernel(bbox, thresh): n_bbox = bbox.shape[0] threads_per_block = 64 col_blocks = np.ceil(n_bbox / threads_per_block).astype(np.int32) blocks = (col_blocks, col_blocks, 1) threads = (threads_per_block, 1, 1) mask_dev = cp.zeros((n_bbox * col_blocks,), dtype=np.uint64) bbox = cp.ascontiguousarray(bbox, dtype=np.float32) kern = _load_kernel('nms_kernel', _nms_gpu_code) kern(blocks, threads, args=(cp.int32(n_bbox), cp.float32(thresh), bbox, mask_dev)) mask_host = mask_dev.get() selection, n_selec = _nms_gpu_post( mask_host, n_bbox, threads_per_block, col_blocks) return selection, n_selec
def forward(self, tenOne, tenTwo): tenOne = tenOne.contiguous() assert (tenOne.is_cuda == True) tenTwo = tenTwo.contiguous() assert (tenTwo.is_cuda == True) tenOut = tenOne.new_zeros([ tenOne.shape[0], tenOne.shape[1], tenOne.shape[2], tenOne.shape[3] ]) if tenOne.is_cuda == True: cuda_launch( 'hadamard_out', ''' extern "C" __global__ void __launch_bounds__(512) hadamard_out( const int n, const float* __restrict__ tenOne, const float* __restrict__ tenTwo, float* __restrict__ tenOut ) { int intIndex = (blockIdx.x * blockDim.x) + threadIdx.x; if (intIndex >= n) { return; } tenOut[intIndex] = tenOne[intIndex] * tenTwo[intIndex]; } ''')(grid=tuple([int((tenOut.nelement() + 512 - 1) / 512), 1, 1]), block=tuple([512, 1, 1]), args=[ cupy.int32(tenOut.nelement()), tenOne.data_ptr(), tenTwo.data_ptr(), tenOut.data_ptr() ], stream=collections.namedtuple('Stream', 'ptr')( torch.cuda.current_stream().cuda_stream)) elif tenOne.is_cuda == False: raise NotImplementedError() # end self.save_for_backward(tenOne, tenTwo) return tenOut
def _call_nms_kernel(bbox, thresh): # PyTorch does not support unsigned long Tensor. # Doesn't matter,since it returns ndarray finally. # So I'll keep it unmodified. n_bbox = bbox.shape[0] threads_per_block = 64 col_blocks = np.ceil(n_bbox / threads_per_block).astype(np.int32) blocks = (col_blocks, col_blocks, 1) threads = (threads_per_block, 1, 1) mask_dev = cp.zeros((n_bbox * col_blocks,), dtype=np.uint64) bbox = cp.ascontiguousarray(bbox, dtype=np.float32) kern = _load_kernel('nms_kernel', _nms_gpu_code) kern(blocks, threads, args=(cp.int32(n_bbox), cp.float32(thresh), bbox, mask_dev)) mask_host = mask_dev.get() selection, n_selec = _nms_gpu_post( mask_host, n_bbox, threads_per_block, col_blocks) return selection, n_selec
def _call_nms_kernel(bbox, thresh): # PyTorch does not support unsigned long Tensor. # Doesn't matter,since it returns ndarray finally. # So I'll keep it unmodified. n_bbox = bbox.shape[0] #框的个数 threads_per_block = 64 #一个block有多少thread col_blocks = np.ceil(n_bbox / threads_per_block).astype(np.int32)#cuda常用的对齐block操作 保证线程数最小限度全覆盖数据 blocks = (col_blocks, col_blocks, 1) #因为对齐一个blocks按理说是(n_blocks,1,1) 说明后面要全排列了 threads = (threads_per_block, 1, 1) mask_dev = cp.zeros((n_bbox * col_blocks,), dtype=np.uint64)#开辟64*n_box*sizeof(np.uint64)的连续内存 置为0 用于存放结果 bbox = cp.ascontiguousarray(bbox, dtype=np.float32) #将bbox从numpy转成cupycuda计算 放到连续的内存中以便cuda运算 很重要 kern = _load_kernel('nms_kernel', _nms_gpu_code)#/加载自己写的c-cuda核函数 kern(blocks, threads, args=(cp.int32(n_bbox), cp.float32(thresh), #调用核函数 bbox, mask_dev)) mask_host = mask_dev.get() #将计算结果从gpu取到本地 selection, n_selec = _nms_gpu_post( mask_host, n_bbox, threads_per_block, col_blocks) #调用我们Cython导入的nms函数进行计算 return selection, n_selec
def CFAR_OS_GPU(signal_ext, origSignalLen, guardBandLen_1side, validSampLen_1side, scratchPad, noiseMargin, ordStat, outputBoolVector): thrdID = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x if (thrdID < origSignalLen - 1) or (thrdID > 2 * origSignalLen - 2): return # check for local maxima on the CUT i.e. signal_ext[thrdID] if (signal_ext[thrdID] >= signal_ext[thrdID - 1]) and ( signal_ext[thrdID] >= signal_ext[thrdID + 1]): count = cp.int32(0) for i in range(thrdID - guardBandLen_1side - validSampLen_1side, thrdID - guardBandLen_1side): scratchPad[thrdID - (origSignalLen - 1), count] = signal_ext[i] count += 1 for j in range(thrdID + guardBandLen_1side + 1, thrdID + guardBandLen_1side + validSampLen_1side + 1): scratchPad[thrdID - (origSignalLen - 1), count] = signal_ext[j] count += 1 temp = cp.float32(0) ordStat_largestVal = cp.float32(0) # sort in decreasing order of strength upto the ordStat kth largest value for i in range(ordStat): for j in range(i + 1, 2 * validSampLen_1side): if (scratchPad[thrdID - (origSignalLen - 1), i] < scratchPad[thrdID - (origSignalLen - 1), j]): temp = scratchPad[thrdID - (origSignalLen - 1), i] scratchPad[thrdID - (origSignalLen - 1), i] = scratchPad[thrdID - (origSignalLen - 1), j] scratchPad[thrdID - (origSignalLen - 1), j] = temp ordStat_largestVal = scratchPad[thrdID - (origSignalLen - 1), ordStat - 1] if (signal_ext[thrdID] > noiseMargin * ordStat_largestVal): outputBoolVector[thrdID - (origSignalLen - 1)] = 1
def get_number_of_ranges(record: OrderedDict) -> int: """ Gets the number of ranges for the record. Parameters ---------- record: OrderedDict hdf5 record containing antennas_iq data and metadata Returns ------- num_ranges: int The number of ranges of the data """ # Infer the number of ranges from the record metadata first_range_offset = ProcessAntennasIQ2Bfiq.calculate_first_range_rtt(record) * 1e-6 * record['rx_sample_rate'] num_ranges = record['num_samps'] - xp.int32(first_range_offset) - record['blanked_samples'][-1] # 3 extra samples taken for each record (not sure why) num_ranges = num_ranges - 3 return xp.uint32(num_ranges)
def _numba_upfirdn_1d( x, h_trans_flip, up, down, axis, x_shape_a, h_per_phase, padded_len, out ): X = cuda.grid(1) strideX = cuda.gridsize(1) for i in range(X, cp.int32(out.shape[0]), strideX): x_idx = cp.int32(cp.int32(cp.int32(i * down) // up) % padded_len) h_idx = cp.int32(cp.int32(cp.int32(i * down) % up) * h_per_phase) x_conv_idx = x_idx - h_per_phase + 1 if x_conv_idx < 0: h_idx -= x_conv_idx x_conv_idx = 0 # If axis = 0, we need to know each column in x. for x_conv_idx in range(x_conv_idx, x_idx + 1): if x_conv_idx < x_shape_a and x_conv_idx >= 0: out[i] += x[x_conv_idx] * h_trans_flip[h_idx] h_idx += 1
def batch(self, batchsize=2): x, c = self.data.next(batchSize=batchsize, dataSize=[8190], dataSelect=[0]) x = x[0].reshape(batchsize, 1, 1, -1) c = xp.asarray(c[0]) c_ = xp.random.randint(0, 111, batchsize) c_ = c_ + (c_ >= c) # t = next(self.test) # t = self.data.test(size=6143) # _ = lambda x:self.encode(x) # _ = lambda x:x/xp.float32(32768) # B0_ = _(B0) A_gen = self.generator(x, c_) # print(A_gen.shape) B_gen = self.generator(x, c) F_tf, F_c = self.discriminator(A_gen[:, :, :, 5119:]) T_tf, T_c = self.discriminator(x[:, :, :, 2047:-5119]) dis_acc = (F.argmax(F_tf, axis=1).data.sum(), xp.int32(batchsize) - F.argmax(T_tf, axis=1).data.sum(), (T_c.data.argmax(axis=-1) == c).sum()) # acc = (dis_acc[0]+dis_acc[1])/8 # self.dataRate = self.dataRate if dis_acc[0] == dis_acc[1] else self.dataRate / xp.float32(0.99) if dis_acc[0] > dis_acc[1] else self.dataRate * xp.float32(0.99) # receptionSize = B0.shape[-1] - B_gen.shape[-1] # L_gen0 = F.softmax_cross_entropy(B_gen, B0[:,:,receptionSize:].reshape(batchsize,-1)) # print(B_gen.shape) # print(B0_.shape) # L_gen0 = 0 L_gen0 = F.mean_squared_error(B_gen, x[:, :, :, 1023:-1024]) L_gen1 = F.softmax_cross_entropy(F_tf, xp.zeros(batchsize, dtype=np.int32)) L_gen2 = F.softmax_cross_entropy(F_c, c_) gen_loss = (L_gen0.data, L_gen1.data) L_gen = L_gen1 + L_gen0 + L_gen2 # L_gen = L_gen1 + (L_gen0 if L_gen0.data > 0.0001 else 0) L_dis0 = F.softmax_cross_entropy(F_tf, xp.ones(batchsize, dtype=np.int32)) L_dis1 = F.softmax_cross_entropy(T_tf, xp.zeros(batchsize, dtype=np.int32)) L_dis2 = F.softmax_cross_entropy(T_c, c) dis_loss = (L_dis0.data.get(), L_dis1.data.get(), L_dis2.data.get()) # L_dis = L_dis0 * min(xp.float32(1), 1 / self.dataRate) + L_dis1 * min(xp.float32(1), self.dataRate) L_dis = L_dis0 + L_dis1 + L_dis2 self.generator.cleargrads() L_gen.backward() self.gen_opt.update() self.discriminator.cleargrads() L_dis.backward() self.dis_opt.update() self.dis_opt.alpha *= 0.99999 self.gen_opt.alpha *= 0.99999 return (gen_loss, dis_loss, dis_acc, self.dataRate, (F_tf.data, T_tf.data))
def cuda_int32(intIn: int): return cupy.int32(intIn)
# for GPU timing using CuPy start = cp.cuda.Event() end = cp.cuda.Event() timing_cp = 0 timing_cp_wall = 0 # running the kernel using CuPy's functionality for i in range(4): if i > 0: # warm-up not needed if using RawModule start.record() _s = time.time() brute_force_pairs_kernel( (blocks, ), (threads, ), (d_x1, d_y1, d_z1, d_w1, d_x2, d_y2, d_z2, d_w2, d_rbins_squared, d_result_cp, cp.int32(d_x1.shape[0]), cp.int32( d_x2.shape[0]), cp.int32(d_rbins_squared.shape[0]))) if i > 0: # warm-up not needed if using RawModule end.record() end.synchronize() _e = time.time() timing_cp += cp.cuda.get_elapsed_time(start, end) timing_cp_wall += (_e - _s) print('cupy+CUDA events:', timing_cp / 3, 'ms') print('cupy+CUDA wall :', timing_cp_wall / 3 * 1000, 'ms') d_result_cp = d_result_cp.copy() if kind in ['both', 'numba']: # for GPU timing using Numba @cuda.jit
def _calculate_to(self, end_point: int) -> int: end = cp.int32(end_point) self._mandel_kernel((self._total_blocks, ), (self._BLOCK_SIZE, ), (self._c, self._z, self._iteration, end)) # approximation to the work done return self._request_size * self.iterations_per_kernel
def CFAR_OS_2D_cross_GPU(signal_ext, origSignalLenX, origSignalLenY, guardBandLen_1sideX, guardBandLen_1sideY, validSampLen_1sideX, validSampLen_1sideY, scratchPad, noiseMargin, ordStat, outputBoolVector): thrdIDx = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x thrdIDy = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y if (thrdIDx < guardBandLen_1sideX + validSampLen_1sideX) or ( thrdIDx > origSignalLenX + guardBandLen_1sideX + validSampLen_1sideX - 1) or (thrdIDy < guardBandLen_1sideY + validSampLen_1sideY) or ( thrdIDy > origSignalLenY + guardBandLen_1sideY + validSampLen_1sideY - 1): return # check for local maxima on the CUT i.e. signal_ext[thrdID] if (signal_ext[thrdIDy, thrdIDx] >= signal_ext[thrdIDy, thrdIDx - 1]) and ( signal_ext[thrdIDy, thrdIDx] >= signal_ext[thrdIDy, thrdIDx + 1] ) and (signal_ext[thrdIDy, thrdIDx] >= signal_ext[thrdIDy - 1, thrdIDx] ) and (signal_ext[thrdIDy, thrdIDx] >= signal_ext[thrdIDy + 1, thrdIDx]): count = cp.int32(0) for i in range(thrdIDx - guardBandLen_1sideX - validSampLen_1sideX, thrdIDx - guardBandLen_1sideX): scratchPad[thrdIDy-(guardBandLen_1sideY + validSampLen_1sideY), \ thrdIDx-(guardBandLen_1sideX + validSampLen_1sideX),count] = signal_ext[thrdIDy,i] count += 1 for j in range(thrdIDx + guardBandLen_1sideX + 1, thrdIDx + guardBandLen_1sideX + validSampLen_1sideX + 1): scratchPad[thrdIDy-(guardBandLen_1sideY + validSampLen_1sideY), \ thrdIDx-(guardBandLen_1sideX + validSampLen_1sideX),count] = signal_ext[thrdIDy,j] count += 1 for k in range(thrdIDy - guardBandLen_1sideY - validSampLen_1sideY, thrdIDy - guardBandLen_1sideY): scratchPad[thrdIDy-(guardBandLen_1sideY + validSampLen_1sideY), \ thrdIDx-(guardBandLen_1sideX + validSampLen_1sideX),count] = signal_ext[k,thrdIDx] count += 1 for l in range(thrdIDy + guardBandLen_1sideY + 1, thrdIDy + guardBandLen_1sideY + validSampLen_1sideY + 1): scratchPad[thrdIDy-(guardBandLen_1sideY + validSampLen_1sideY), \ thrdIDx-(guardBandLen_1sideX + validSampLen_1sideX),count] = signal_ext[l,thrdIDx] count += 1 temp = cp.float32(0) ordStat_largestVal = cp.float32(0) # sort in decreasing order of strength upto the ordStat kth largest value for i in range(ordStat): for j in range(i + 1, 2 * validSampLen_1sideX + 2 * validSampLen_1sideX): if (scratchPad[thrdIDy - (guardBandLen_1sideY + validSampLen_1sideY), thrdIDx - (guardBandLen_1sideX + validSampLen_1sideX), i] < scratchPad[thrdIDy - (guardBandLen_1sideY + validSampLen_1sideY), thrdIDx - (guardBandLen_1sideX + validSampLen_1sideX), j]): temp = scratchPad[ thrdIDy - (guardBandLen_1sideY + validSampLen_1sideY), thrdIDx - (guardBandLen_1sideX + validSampLen_1sideX), i] scratchPad[thrdIDy - (guardBandLen_1sideY + validSampLen_1sideY), thrdIDx - (guardBandLen_1sideX + validSampLen_1sideX), i] = scratchPad[ thrdIDy - (guardBandLen_1sideY + validSampLen_1sideY), thrdIDx - (guardBandLen_1sideX + validSampLen_1sideX), j] scratchPad[thrdIDy - (guardBandLen_1sideY + validSampLen_1sideY), thrdIDx - (guardBandLen_1sideX + validSampLen_1sideX), j] = temp ordStat_largestVal = scratchPad[ thrdIDy - (guardBandLen_1sideY + validSampLen_1sideY), thrdIDx - (guardBandLen_1sideX + validSampLen_1sideX), ordStat - 1] if (signal_ext[thrdIDy, thrdIDx] > noiseMargin * ordStat_largestVal): outputBoolVector[thrdIDy - (guardBandLen_1sideY + validSampLen_1sideY), thrdIDx - (guardBandLen_1sideX + validSampLen_1sideX)] = 1
def CFAR_CA_2D_cross_GPU(signal_ext, origSignalLenX, origSignalLenY, guardBandLen_1sideX, guardBandLen_1sideY, validSampLen_1sideX, validSampLen_1sideY, scratchPad, noiseMargin, outputBoolVector): thrdIDx = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x thrdIDy = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y if (thrdIDx < guardBandLen_1sideX + validSampLen_1sideX) or ( thrdIDx > origSignalLenX + guardBandLen_1sideX + validSampLen_1sideX - 1) or (thrdIDy < guardBandLen_1sideY + validSampLen_1sideY) or ( thrdIDy > origSignalLenY + guardBandLen_1sideY + validSampLen_1sideY - 1): return # check for local maxima on the CUT i.e. signal_ext[thrdID] if (signal_ext[thrdIDy, thrdIDx] >= signal_ext[thrdIDy, thrdIDx - 1]) and ( signal_ext[thrdIDy, thrdIDx] >= signal_ext[thrdIDy, thrdIDx + 1] ) and (signal_ext[thrdIDy, thrdIDx] >= signal_ext[thrdIDy - 1, thrdIDx] ) and (signal_ext[thrdIDy, thrdIDx] >= signal_ext[thrdIDy + 1, thrdIDx]): count = cp.int32(0) for i in range(thrdIDx - guardBandLen_1sideX - validSampLen_1sideX, thrdIDx - guardBandLen_1sideX): scratchPad[thrdIDy-(guardBandLen_1sideY + validSampLen_1sideY), \ thrdIDx-(guardBandLen_1sideX + validSampLen_1sideX),count] = signal_ext[thrdIDy,i] count += 1 for j in range(thrdIDx + guardBandLen_1sideX + 1, thrdIDx + guardBandLen_1sideX + validSampLen_1sideX + 1): scratchPad[thrdIDy-(guardBandLen_1sideY + validSampLen_1sideY), \ thrdIDx-(guardBandLen_1sideX + validSampLen_1sideX),count] = signal_ext[thrdIDy,j] count += 1 for k in range(thrdIDy - guardBandLen_1sideY - validSampLen_1sideY, thrdIDy - guardBandLen_1sideY): scratchPad[thrdIDy-(guardBandLen_1sideY + validSampLen_1sideY), \ thrdIDx-(guardBandLen_1sideX + validSampLen_1sideX),count] = signal_ext[k,thrdIDx] count += 1 for l in range(thrdIDy + guardBandLen_1sideY + 1, thrdIDy + guardBandLen_1sideY + validSampLen_1sideY + 1): scratchPad[thrdIDy-(guardBandLen_1sideY + validSampLen_1sideY), \ thrdIDx-(guardBandLen_1sideX + validSampLen_1sideX),count] = signal_ext[l,thrdIDx] count += 1 avgNoisePower = cp.float32(0) for ele in range(2 * validSampLen_1sideX + 2 * validSampLen_1sideX): avgNoisePower += scratchPad[thrdIDy-(guardBandLen_1sideY + validSampLen_1sideY), \ thrdIDx-(guardBandLen_1sideX + validSampLen_1sideX),ele] avgNoisePower = avgNoisePower / (2 * validSampLen_1sideX + 2 * validSampLen_1sideX) if (signal_ext[thrdIDy, thrdIDx] > noiseMargin * avgNoisePower): outputBoolVector[thrdIDy - (guardBandLen_1sideY + validSampLen_1sideY), thrdIDx - (guardBandLen_1sideX + validSampLen_1sideX)] = 1
def correlations_from_samples(beamformed_samples_1: np.array, beamformed_samples_2: np.array, record: OrderedDict) -> np.array: """ Correlate two sets of beamformed samples together. Correlation matrices are used and indices corresponding to lag pulse pairs are extracted. Parameters ---------- beamformed_samples_1: ndarray [num_slices, num_beams, num_samples] The first beamformed samples. beamformed_samples_2: ndarray [num_slices, num_beams, num_samples] The second beamformed samples. record: OrderedDict hdf5 record containing bfiq data and metadata Returns ------- values: np.array Array of correlations for each beam, range, and lag """ # beamformed_samples_1: [num_beams, num_samples] # beamformed_samples_2: [num_beams, num_samples] # correlated: [num_beams, num_samples, num_samples] correlated = xp.einsum('jk,jl->jkl', beamformed_samples_1, beamformed_samples_2.conj()) if cupy_available: correlated = xp.asnumpy(correlated) values = [] if record['lags'].size == 0: values.append(xp.array([])) return values # First range offset in samples sample_off = record['first_range_rtt'] * 1e-6 * record['rx_sample_rate'] sample_off = xp.int32(sample_off) # Helpful values converted to units of samples range_off = xp.arange(record['num_ranges'], dtype=xp.int32) + sample_off tau_in_samples = record['tau_spacing'] * 1e-6 * record['rx_sample_rate'] lag_pulses_as_samples = xp.array(record['lags'], xp.int32) * xp.int32(tau_in_samples) # [num_range_gates, 1, 1] # [1, num_lags, 2] samples_for_all_range_lags = (range_off[..., xp.newaxis, xp.newaxis] + lag_pulses_as_samples[xp.newaxis, :, :]) # [num_range_gates, num_lags, 2] row = samples_for_all_range_lags[..., 1].astype(xp.int32) # [num_range_gates, num_lags, 2] column = samples_for_all_range_lags[..., 0].astype(xp.int32) # [num_beams, num_range_gates, num_lags] values = correlated[:, row, column] # Find the sample that corresponds to the second pulse transmitting second_pulse_sample_num = xp.int32( tau_in_samples) * record['pulses'][1] - sample_off - 1 # Replace all ranges which are contaminated by the second pulse for lag 0 # with the data from those ranges after the final pulse. values[:, second_pulse_sample_num:, 0] = values[:, second_pulse_sample_num:, -1] return values