def __init__(self):
    """Create the humanoid MuJoCo environment and load its reference data.

    After initialising the ``MujocoEnv`` and ``EzPickle`` bases, two
    hard-coded text files are read and their comma-separated fields stored
    as ``float32`` values:

    * ``self.pos`` / ``self.vel`` come from lines 1 and 2 of the state file.
    * ``self.motion`` holds one list per line for lines 4..30 of the
      retargeted mocap motion file.

    Raises:
        OSError: if either data file cannot be opened.
        IndexError: if a file has fewer lines than the code indexes.
    """
    mujoco_env.MujocoEnv.__init__(self, 'humanoidasimoMRD3.xml', 5)
    utils.EzPickle.__init__(self)

    # ``with`` guarantees the handle is closed even if reading fails.
    with open(
            '/home/initial/my_project_folder/my_project/src/python_code3/trpo-master/data/states/biped3d_sim_walk_state-asimo.txt',
            'r') as state_file:
        state_lines = state_file.readlines()

    # NOTE(review): the fixed slices presumably strip a label prefix and the
    # closing bracket/newline -- confirm against the file format.
    self.pos = [float32(field) for field in state_lines[1][8:-3].split(',')]
    self.vel = [float32(field) for field in state_lines[2][7:-2].split(',')]

    with open(
            '/home/initial/my_project_folder/my_project/src/python_code3/trpo-master/data/motions/mocap/asimo/0007_Walking001_motion_00000_retargeted_asimo.txt',
            'r') as motion_file:
        motion_lines = motion_file.readlines()

    # Index explicitly (rather than slicing) so that a short file still
    # raises IndexError instead of silently yielding fewer frames.
    self.motion = [
        [float32(field) for field in motion_lines[row][1:-3].split(',')]
        for row in range(4, 31)
    ]

    self.time_step = 0.0  # e-3
    self.i = 0
def conv_2d(inputImageExtnd, pointSpreadFn, outputImage, InputLenX, InputLenY,
            psfOneSideLenX, psfOneSideLenY):
    """Per-thread 2D convolution of an extended image with a point spread function.

    Each thread owns one pixel ``(row, col)`` of ``inputImageExtnd``. Threads
    outside the window ``[psfOneSideLen, psfOneSideLen + InputLen + psfLen - 1)``
    on either axis exit immediately. The remaining threads accumulate the
    product of the image neighbourhood with the index-flipped PSF and store
    the sum in ``outputImage`` at the coordinate shifted by the one-sided PSF
    length.

    NOTE(review): presumably compiled as a Numba CUDA kernel and fed an image
    that is already padded for the PSF support -- confirm at the call site.
    """
    col = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    row = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y

    kernelLenX = 2 * psfOneSideLenX + 1
    kernelLenY = 2 * psfOneSideLenY + 1
    colLimit = psfOneSideLenX + InputLenX + kernelLenX - 1
    rowLimit = psfOneSideLenY + InputLenY + kernelLenY - 1

    # Guard clauses: only threads inside the valid output window do work.
    if col < psfOneSideLenX or col >= colLimit:
        return
    if row < psfOneSideLenY or row >= rowLimit:
        return

    acc = cp.float32(0)
    for dx in range(-psfOneSideLenX, psfOneSideLenX + 1):
        for dy in range(-psfOneSideLenY, psfOneSideLenY + 1):
            # The PSF is read mirrored (centre minus offset): true convolution.
            acc += inputImageExtnd[row + dy, col + dx] * pointSpreadFn[
                psfOneSideLenY - dy, psfOneSideLenX - dx]

    outputImage[row - psfOneSideLenY, col - psfOneSideLenX] = acc
def updateGeometry(self):
    """Rebuild the coordinate-dependent GPU buffers and their host mirrors.

    Allocates zeroed device arrays of ``self.shape``, derives the scaled
    coordinate vectors from ``self.width``/``self.height``, the offsets
    ``self.xs``/``self.ys`` and the angle ``self.phis`` (degrees), fills
    ``_theta`` and ``_rho`` through the ``outeratan2f``/``outerhypot``
    helpers, copies selected buffers to the host and finally emits
    ``sigUpdateGeometry``.
    """
    shape = self.shape

    # --- device buffers ---------------------------------------------------
    self._psi = cp.zeros(shape, dtype=cp.complex64)
    self._phi = cp.zeros(shape, dtype=cp.uint8)
    self._theta = cp.zeros(shape, dtype=cp.float32)
    self._rho = cp.zeros(shape, dtype=cp.float32)

    tilt = cp.cos(cp.radians(self.phis, dtype=cp.float32))
    columns = cp.arange(self.width, dtype=cp.float32)
    rows = cp.arange(self.height, dtype=cp.float32)
    # NOTE(review): xs is cast to float64 while ys is cast to float32;
    # kept as-is, but confirm whether the asymmetry is intentional.
    x = tilt * (columns - cp.float64(self.xs))
    y = rows - cp.float32(self.ys)

    qx = self.qprp * x
    qy = self.qprp * y

    self._iqx = (1j * qx).astype(cp.complex64)
    self._iqy = (1j * qy).astype(cp.complex64)
    self._iqxz = (1j * self.qpar * x * x).astype(cp.complex64)
    self._iqyz = (1j * self.qpar * y * y).astype(cp.complex64)

    # The helpers write their result into the third argument.
    self.outeratan2f(y, x, self._theta)
    self.outerhypot(qy, qx, self._rho)

    # --- host mirrors -----------------------------------------------------
    self.phi = self._phi.get()
    self.iqx = self._iqx.get()
    self.iqy = self._iqy.get()
    self.theta = self._theta.get().astype(cp.float64)
    self.qr = self._rho.get().astype(cp.float64)

    self.sigUpdateGeometry.emit()
def em(lp, init_recon, tomo0, num_iter, reg_par, gpu):
    """Reconstruct with multiplicative Expectation-Maximization updates.

    Parameters
    ----------
    lp :
        Projector object exposing ``fwdp(out, in, gpu)`` and
        ``adjp(out, in, gpu)``; both are called with an output buffer as the
        first argument.
    init_recon :
        Initial reconstruction; copied to the selected GPU.
    tomo0 :
        Measured data; copied to the selected GPU.
    num_iter : int
        Number of EM iterations.
    reg_par : float
        Constant added to the forward projection before dividing, chosen
        manually to avoid division by zero.
    gpu : int
        CUDA device index to use.

    Returns
    -------
    Host copy of the final reconstruction (``recon.get()``).
    """
    cp.cuda.Device(gpu).use()

    # Device copies of the inputs plus work buffers of matching shape.
    recon = cp.array(init_recon)
    tomo = cp.array(tomo0)
    norm = recon * 0
    update = recon * 0
    fwd = tomo * 0

    # Normalisation term: adjoint applied to an all-ones sinogram, nudged
    # away from zero.
    lp.adjp(norm, tomo * 0 + 1, gpu)
    norm = norm + 1e-5

    eps32 = cp.float32(reg_par)
    for _ in range(num_iter):
        lp.fwdp(fwd, recon, gpu)
        lp.adjp(update, tomo / (fwd + eps32), gpu)
        recon = recon * (update / norm)

    return recon.get()
def CFAR_CA_GPU(signal_ext, origSignalLen, guardBandLen_1side,
                validSampLen_1side, scratchPad, noiseMargin, outputBoolVector):
    """Cell-averaging CFAR test for one cell under test (CUT) per thread.

    Threads whose global index lies outside
    ``[origSignalLen - 1, 2 * origSignalLen - 2]`` return immediately. For a
    CUT that is a local maximum, the ``validSampLen_1side`` samples on each
    side (beyond the guard band) are copied into this thread's own row of
    ``scratchPad``, averaged, and the output flag is set to 1 when the CUT
    exceeds ``noiseMargin`` times that average.

    Every thread needs its own scratch row because the copies are written
    concurrently.
    """
    cutIdx = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    if (cutIdx < origSignalLen - 1) or (cutIdx > 2 * origSignalLen - 2):
        return

    # Only local maxima are tested.
    cutVal = signal_ext[cutIdx]
    if not ((cutVal >= signal_ext[cutIdx - 1]) and
            (cutVal >= signal_ext[cutIdx + 1])):
        return

    outIdx = cutIdx - (origSignalLen - 1)

    # Gather the leading then the lagging reference cells.
    slot = cp.int32(0)
    leadStop = cutIdx - guardBandLen_1side
    for src in range(leadStop - validSampLen_1side, leadStop):
        scratchPad[outIdx, slot] = signal_ext[src]
        slot += 1
    lagStart = cutIdx + guardBandLen_1side + 1
    for src in range(lagStart, lagStart + validSampLen_1side):
        scratchPad[outIdx, slot] = signal_ext[src]
        slot += 1

    noise = cp.float32(0)
    for k in range(2 * validSampLen_1side):
        noise += scratchPad[outIdx, k]
    noise = noise / (2 * validSampLen_1side)

    if cutVal > noiseMargin * noise:
        outputBoolVector[outIdx] = 1
def GeneralizedDiceLossFunction(y, t, w):
    """Return the negative weighted (generalized) Dice score.

    ``y`` is soft-maxed along axis 1. For every sample ``i`` and class ``j``
    the flattened weight map ``w[i][j]``, the flattened class probability and
    the one-hot target mask ``t[i] == j`` are combined into a global
    numerator and denominator; the result is
    ``-(2 * numerator / (denominator + eps))``.

    Parameters
    ----------
    y : scores indexed as (batch, class, ...).
    t : integer labels, indexed by batch.
    w : per-sample, per-class weight maps, indexed as ``w[i][j]``.
    """
    eps = 0.0001
    dice_numerator = 0.0
    dice_denominator = 0.0

    y = F.softmax(y, axis=1)
    n_batch, n_class = y.shape[0], y.shape[1]

    for i in range(n_batch):
        soft = y[i]
        labels = cp.array(t[i].flatten()).astype(cp.float32)
        for j in range(n_class):
            weights = cp.array(w[i][j].flatten()).astype(cp.float32)
            # One-hot mask of the pixels labelled with class j.
            one_hot = cp.where(labels == j, 1, 0).astype(cp.float32)

            t_temp = chainer.Variable(one_hot)
            w_temp = chainer.Variable(weights)
            soft_temp = F.flatten(soft[j])

            dice_numerator += F.sum(w_temp * soft_temp * t_temp)
            dice_denominator += F.sum(w_temp * (soft_temp + t_temp))

    # eps keeps the ratio finite when the denominator is zero.
    loss = 2.0 * dice_numerator / (dice_denominator + eps)
    return -loss
def valid_positions(R, vertices, depth, K, mask, lower, grid_size):
    """Launch ``valid_positions_device`` with flattened device arguments.

    The launch uses 512 threads per block and
    ``(mask.size * len(vertices)) // 512 + 1`` blocks. ``K``, ``R``,
    ``vertices``, ``depth`` and ``mask`` are flattened and moved to the GPU;
    the shapes of ``depth`` and ``mask`` are passed as integer arrays.

    The function returns ``None``; any result is produced by the kernel
    itself.
    """
    threads_per_block = 512
    n_vertices = len(vertices)
    n_blocks = (mask.size * n_vertices) // threads_per_block + 1

    # ``cp.int`` was only an alias of the builtin ``int`` and no longer
    # exists in current NumPy/CuPy releases; the builtin is used directly,
    # which yields exactly the same dtypes and scalar type.
    kernel_args = (
        cp.asarray(K.flatten()),
        cp.asarray(R.flatten()),
        cp.asarray(vertices.flatten()),
        cp.asarray(depth.flatten()),
        cp.array(depth.shape, int),
        cp.float32(grid_size),
        cp.asarray(mask.flatten(), int),
        cp.array(mask.shape, int),
        cp.asarray(lower),
        int(n_vertices),
    )
    valid_positions_device((n_blocks, ), (threads_per_block, ), kernel_args)
def reduction(x, y, size):
    """Sum the first ``size`` elements of ``x`` and store the total in ``y[0]``.

    Each thread accumulates a block-strided slice of ``x``, publishes its
    partial sum in a 1024-element ``float32`` shared-memory buffer, and after
    a barrier thread 0 adds up ``blockDim.x`` partial sums.

    Only ``threadIdx``/``blockDim`` are used, so the block size must not
    exceed 1024.
    NOTE(review): presumably launched with a single block -- confirm.
    """
    lane = jit.threadIdx.x
    lanes = jit.blockDim.x

    # Per-thread partial sum over a strided slice of the input.
    partial = cupy.float32(0)
    for idx in range(lane, size, lanes):
        partial += x[idx]

    scratch = jit.shared_memory(cupy.float32, 1024)
    scratch[lane] = partial
    jit.syncthreads()

    # Thread 0 folds all partial sums and writes the result.
    if lane == cupy.uint32(0):
        partial = cupy.float32(0)
        for idx in range(lanes):
            partial += scratch[idx]
        y[0] = partial
def reduction(x, y, size):
    """Add the sum of the first ``size`` elements of ``x`` onto ``y[0]``.

    Threads walk the input with a grid-stride loop, store their partial sums
    in a 1024-element ``float32`` shared-memory buffer, and after a barrier
    the first thread of every block adds the block total to ``y[0]`` with an
    atomic add.

    The block size must not exceed 1024.
    NOTE(review): ``y[0]`` is only accumulated into, so it presumably has to
    be zeroed by the caller -- confirm.
    """
    local = jit.threadIdx.x
    start = jit.blockIdx.x * jit.blockDim.x + local
    stride = jit.blockDim.x * jit.gridDim.x

    # Grid-stride accumulation of this thread's share.
    partial = cupy.float32(0)
    for idx in range(start, size, stride):
        partial += x[idx]

    scratch = jit.shared_memory(cupy.float32, 1024)
    scratch[local] = partial
    jit.syncthreads()

    # One thread per block combines the block's partial sums.
    if local == cupy.uint32(0):
        partial = cupy.float32(0)
        for idx in range(jit.blockDim.x):
            partial += scratch[idx]
        jit.atomic_add(y, 0, partial)
def cumulative_distribution(data, bins):
    """Return the normalised cumulative histogram of ``data``.

    Values are mapped to bin ``floor(data * (bins - 1))``; the relative
    frequency of every occupied bin is written into a zero-initialised array
    of length ``bins`` and the cumulative sum of that array is returned.

    Parameters
    ----------
    data : array with all values inside ``[0, 1]`` (checked with ``assert``).
    bins : int, number of histogram bins.

    Returns
    -------
    Array of length ``bins`` whose last entry is 1.
    """
    assert cup.min(data) >= 0.0 and cup.max(data) <= 1.0

    # The histogram needs how often each bin occurs (return_counts), not the
    # position of its first occurrence (return_index).
    bin_ids, counts = cup.unique(cup.floor(data * (bins - 1)),
                                 return_counts=True)

    # ``astype`` converts on the array itself; calling the scalar
    # constructors on an array is not portable across array libraries.
    weights = counts.astype(cup.float32)
    weights /= cup.sum(weights)

    res = cup.zeros((bins, ))
    res[bin_ids.astype(cup.int64)] = weights
    return cup.cumsum(res)
def CFAR_OS_GPU(signal_ext, origSignalLen, guardBandLen_1side,
                validSampLen_1side, scratchPad, noiseMargin, ordStat,
                outputBoolVector):
    """Ordered-statistic CFAR test for one cell under test (CUT) per thread.

    Threads whose global index lies outside
    ``[origSignalLen - 1, 2 * origSignalLen - 2]`` return immediately. For a
    CUT that is a local maximum, the reference cells on both sides (beyond
    the guard band) are copied into this thread's row of ``scratchPad``. The
    first ``ordStat`` slots are then put in decreasing order with a partial
    selection sort, and the output flag is set to 1 when the CUT exceeds
    ``noiseMargin`` times the ``ordStat``-th largest reference value.
    """
    cutIdx = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    if (cutIdx < origSignalLen - 1) or (cutIdx > 2 * origSignalLen - 2):
        return

    # Only local maxima are tested.
    cutVal = signal_ext[cutIdx]
    if not ((cutVal >= signal_ext[cutIdx - 1]) and
            (cutVal >= signal_ext[cutIdx + 1])):
        return

    outIdx = cutIdx - (origSignalLen - 1)

    # Gather the leading then the lagging reference cells.
    slot = cp.int32(0)
    leadStop = cutIdx - guardBandLen_1side
    for src in range(leadStop - validSampLen_1side, leadStop):
        scratchPad[outIdx, slot] = signal_ext[src]
        slot += 1
    lagStart = cutIdx + guardBandLen_1side + 1
    for src in range(lagStart, lagStart + validSampLen_1side):
        scratchPad[outIdx, slot] = signal_ext[src]
        slot += 1

    held = cp.float32(0)
    kthLargest = cp.float32(0)

    # Partial selection sort: only the first ordStat positions are settled.
    for head in range(ordStat):
        for probe in range(head + 1, 2 * validSampLen_1side):
            if scratchPad[outIdx, head] < scratchPad[outIdx, probe]:
                held = scratchPad[outIdx, head]
                scratchPad[outIdx, head] = scratchPad[outIdx, probe]
                scratchPad[outIdx, probe] = held

    kthLargest = scratchPad[outIdx, ordStat - 1]
    if cutVal > noiseMargin * kthLargest:
        outputBoolVector[outIdx] = 1
def prominent_peaks_optimized(img,
                              min_xdistance=1,
                              min_ydistance=1,
                              threshold=None,
                              num_peaks=cp.inf):
    """Return peaks with non-maximum suppression.

    Identifies the most prominent features separated by certain distances.
    Each GPU thread handles one ``min_ydistance x min_xdistance`` patch of
    the image.

    Parameters
    ----------
    img : (M, N) ndarray
        Input image (device array).
    min_xdistance : int
        Minimum distance separating features in the x dimension.
    min_ydistance : int
        Minimum distance separating features in the y dimension.
    threshold : float, optional
        Minimum intensity of peaks. Default is ``0.5 * max(img)``.
    num_peaks : int
        Accepted for interface compatibility; this function does not use it.

    Returns
    -------
    intensity, xcoords, ycoords : tuple of array
        Peak intensity values, x and y indices. Slots whose intensity is
        exactly 0 are dropped.

    Notes
    -----
    Modified from https://github.com/mritools/cupyimg _prominent_peaks method
    """
    # Resolve the documented default; float32(None) would otherwise be NaN.
    if threshold is None:
        threshold = 0.5 * float(img.max())

    THREADS_PER_BLOCK = (32, 1)
    # THREADS_PER_BLOCK is ordered (x, y) while img.shape is (y, x).
    span_x = THREADS_PER_BLOCK[0] * min_xdistance
    span_y = THREADS_PER_BLOCK[1] * min_ydistance
    full_x, rest_x = divmod(img.shape[1], span_x)
    full_y, rest_y = divmod(img.shape[0], span_y)
    # Round up so that partially covered borders still get a block.
    NUM_BLOCKS = (full_x + (rest_x > 0), full_y + (rest_y > 0))

    NUM_THREADS = np.multiply(THREADS_PER_BLOCK, NUM_BLOCKS)
    elems = (NUM_THREADS[0] * NUM_THREADS[1], )

    # One output slot per launched thread.
    intensity = cp.zeros(elems, dtype=cp.float32)
    xcoords = cp.zeros(elems, dtype=cp.int32)
    ycoords = cp.zeros(elems, dtype=cp.int32)

    prominent_peaks_kernel(
        NUM_BLOCKS, THREADS_PER_BLOCK,
        (img, cp.int32(img.shape[0]), cp.int32(img.shape[1]),
         cp.int32(min_xdistance), cp.int32(min_ydistance),
         cp.float32(threshold), intensity, xcoords, ycoords))

    indices = intensity != 0.0
    return intensity[indices], xcoords[indices], ycoords[indices]
def start(self, rand_seed=None):
    """Run the GPU simulation kernel and copy the per-photon state back.

    Steps: pick a random seed when none is given, reset results and generate
    initial coordinates, upload the photon state (``add``, ``p``, ``v``,
    ``w``) and the model arrays to the GPU, launch ``func`` with
    ``self.threadnum`` threads per block, download the photon state, free
    the device memory pools, run the post-processing hooks and report the
    elapsed time.

    Returns
    -------
    self, to allow call chaining.
    """
    if rand_seed is None:
        rand_seed = np.random.randint(1e5)

    self.nPh = int(self.nPh)
    self._reset_results()
    self._generate_initial_coodinate(self.nPh)

    model = self.model
    M = np.int32(model.voxel_model.shape[1])
    L = np.int32(model.voxel_model.shape[2])

    print("")
    print("###### Start (Random seed: %s) ######" % rand_seed)
    print("")
    start_ = time.time()

    device_pool = cp.get_default_memory_pool()
    pinned_pool = cp.get_default_pinned_memory_pool()
    device_pool.free_all_blocks()
    pinned_pool.free_all_blocks()

    # Photon state (read back after the kernel has run).
    add_ = cp.asarray(self.add.astype(np.int32), dtype=np.int32)
    p_ = cp.asarray(self.p.astype(np.float32), dtype=np.float32)
    v_ = cp.asarray(self.v.astype(np.float32), dtype=np.float32)
    w_ = cp.asarray(self.w.astype(np.float32), dtype=np.float32)

    # Model arrays and scalars.
    ma_ = cp.asarray(model.ma.astype(np.float32))
    ms_ = cp.asarray(model.ms.astype(np.float32))
    n_ = cp.asarray(model.n.astype(np.float32))
    g_ = cp.asarray(model.g.astype(np.float32))
    v_model = cp.asarray(model.voxel_model.astype(np.int8), dtype=np.int8)
    l_ = cp.float32(model.voxel_space)
    nph = cp.int32(self.nPh)
    end_p = cp.int8(model.end_point)

    # Enough blocks to give every photon its own thread.
    grid = (int((self.nPh + self.threadnum - 1) / self.threadnum), 1)
    block = (self.threadnum, 1)
    func(grid, block,
         (add_, p_, v_, w_, ma_, ms_, n_, g_, v_model, l_, M, L, nph, end_p,
          np.int32(rand_seed)))

    self.add = cp.asnumpy(add_)
    self.p = cp.asnumpy(p_)
    self.v = cp.asnumpy(v_)
    self.w = cp.asnumpy(w_)

    # Drop every reference first so the pools can actually release memory.
    del add_, p_, v_, w_, ma_, ms_, n_, g_
    del v_model, l_, M, L, nph, end_p, rand_seed
    device_pool.free_all_blocks()
    pinned_pool.free_all_blocks()
    gc.collect()

    self._end_process()
    print("###### End ######")
    self.getRdTtRate()
    calTime(time.time(), start_)
    return self
def __init__(self):
    """Prepare the data set and model state, then start training at once.

    The constructor resets the data set and the object itself, configures
    the learning rate, records the start time, stores two constants and
    finally calls ``self.training(batchsize=6)``; it therefore only returns
    once that call has returned.
    """
    self.data = dataset()
    self.data.reset()
    self.reset()
    self.setLR()

    self.time = time.time()
    self.dataRate = xp.float32(0.8)

    # 442-point Hann window in single precision
    # ("mado" presumably means "window" -- confirm).
    hann = xp.hanning(442)
    self.mado = hann.astype(xp.float32)

    self.training(batchsize=6)
def _call_nms_kernel(bbox, thresh):
    """Run the CUDA NMS kernel and post-process its mask on the host.

    Parameters
    ----------
    bbox : array of bounding boxes, one per row; converted to a contiguous
        ``float32`` device array.
    thresh : float, overlap threshold forwarded to the kernel.

    Returns
    -------
    (selection, n_selec) as produced by ``_nms_gpu_post``.
    """
    n_bbox = bbox.shape[0]
    threads_per_block = 64
    # Number of 64-box column blocks needed to cover all boxes.
    col_blocks = np.ceil(n_bbox / threads_per_block).astype(np.int32)
    blocks = (col_blocks, col_blocks, 1)
    threads = (threads_per_block, 1, 1)

    # One 64-bit word per (box, column block) pair, zero-initialised.
    mask_dev = cp.zeros((n_bbox * col_blocks, ), dtype=np.uint64)
    bbox = cp.ascontiguousarray(bbox, dtype=np.float32)

    kern = _load_kernel('nms_kernel', _nms_gpu_code)
    # Scalars are passed with explicit 32-bit types: a bare Python int would
    # be handed to the kernel as a 64-bit integer.
    # NOTE(review): assumes the kernel declares a 32-bit ``int`` box count.
    kern(blocks,
         threads,
         args=(cp.int32(n_bbox), cp.float32(thresh), bbox, mask_dev))

    mask_host = mask_dev.get()
    selection, n_selec = _nms_gpu_post(mask_host, n_bbox, threads_per_block,
                                       col_blocks)
    return selection, n_selec
def backward(ctx, grad_output):
    """Backward pass of psRoI pooling.

    :param ctx: context object holding the tensors saved by the forward pass
    :param grad_output: gradient flowing back into the psRoI module
    :return: ``(grad_input, None, None)``; all three entries are ``None``
        when ``grad_output`` is ``None``
    """
    # Nothing to propagate: skip the kernel launch entirely.
    if grad_output is None:
        return None, None, None

    grad_output = grad_output.contiguous()

    saved = ctx.saved_tensors
    int_info, in_size, spatial_scale, rois, mapping_channel = saved
    count, N, out_dim, outh, outw = int_info.tolist()
    in_size = tuple(in_size.tolist())
    B, C, H, W = in_size  # e.g. (b, 21 * 7 * 7, h, w)

    # Device buffer that receives the gradient w.r.t. the input feature map.
    grad_input = t.zeros(in_size).cuda()

    # Wrap torch's current CUDA stream so the kernel runs on it.
    stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)

    kernel_args = [
        count,
        grad_output.data_ptr(),
        mapping_channel.data_ptr(),
        N,
        cp.float32(spatial_scale),
        C,
        H,
        W,
        outh,
        outw,
        out_dim,
        grad_input.data_ptr(),
        rois.data_ptr(),
    ]
    psROI_backward_fn(args=kernel_args,
                      block=(CUDA_NUM_THREADS, 1, 1),
                      grid=(GET_BLOCKS(grad_output.numel()), 1, 1),
                      stream=stream)

    # The two ``None`` entries: no gradient is propagated to rois and info.
    return grad_input, None, None
def _call_nms_kernel(bbox, thresh):
    """Launch ``nms_kernel`` for ``bbox`` and decode the resulting mask.

    ``bbox`` is made a contiguous ``float32`` device array, the kernel is
    launched on a square grid of 64-thread blocks, and the ``uint64`` mask
    it fills is copied to the host and handed to ``_nms_gpu_post``.

    Returns the ``(selection, n_selec)`` pair from ``_nms_gpu_post``.
    """
    block_size = 64
    num_boxes = bbox.shape[0]
    num_col_blocks = np.ceil(num_boxes / block_size).astype(np.int32)

    grid_dim = (num_col_blocks, num_col_blocks, 1)
    block_dim = (block_size, 1, 1)

    device_mask = cp.zeros((num_boxes * num_col_blocks,), dtype=np.uint64)
    device_boxes = cp.ascontiguousarray(bbox, dtype=np.float32)

    nms = _load_kernel('nms_kernel', _nms_gpu_code)
    nms(grid_dim, block_dim,
        args=(cp.int32(num_boxes), cp.float32(thresh), device_boxes,
              device_mask))

    host_mask = device_mask.get()
    return _nms_gpu_post(host_mask, num_boxes, block_size, num_col_blocks)
def compute_similarities(input_a, input_b, chunk_id, threshold):
    """Compute thresholded pairwise similarities on the GPU and time the work.

    The dense ``float16`` similarity matrix is filled by ``dice_kernel``,
    reduced with ``apply_threshold``, sorted in place with
    ``sort_sparse_similarities`` and finally copied to the host. Compute and
    transfer times are measured separately and printed.

    Returns
    -------
    (data, count) where ``data`` is the host copy of the sparse result and
    ``count`` is ``len(data.data)``.
    """
    compute_started = time.time()

    # NOTE(review): presumably every record occupies 32 input elements.
    size_a = len(input_a) // 32
    size_b = len(input_b) // 32
    similarities = cp.zeros((size_a, size_b), dtype=cp.float16)

    block_dim = (16, 16)
    # NOTE(review): floor division leaves trailing rows/columns without a
    # block when a size is not a multiple of 16 -- confirm this is intended.
    grid_dim = (size_a // block_dim[0], size_b // block_dim[1])

    dice_kernel(grid_dim, block_dim,
                (similarities, input_a, input_b, size_a, size_b,
                 cp.float32(threshold)))
    sparse_similarities = apply_threshold(similarities,
                                          size_a,
                                          size_b,
                                          threshold=threshold)
    sort_sparse_similarities(sparse_similarities)

    # Wait for the GPU so the measured compute time is meaningful.
    cp.cuda.Stream.null.synchronize()
    compute_time = time.time() - compute_started

    # Device -> host transfer of the sparse matrix, timed on its own.
    transfer_started = time.time()
    data = sparse_similarities.get()
    transfer_time = time.time() - transfer_started

    comparisons = size_a * size_b
    cmp_per_sec = comparisons / compute_time
    if chunk_id % 1 == 0:
        print(
            f"{chunk_id}: Comparisons: {humanize.intword(comparisons)}, Rate: {humanize.intword(cmp_per_sec)} cmp/s. Computation: {compute_time:.3f} Result transfer: {transfer_time:.6f}s"
        )

    return data, len(data.data)
def calculate_first_range_rtt(record: OrderedDict) -> float:
    """
    Compute the round-trip time to the first range of a record.

    Parameters
    ----------
    record: OrderedDict
        hdf5 record containing antennas_iq data and metadata; its
        ``'first_range'`` entry (km) is read.

    Returns
    -------
    first_range_rtt: float
        Time for the signal to reach the first range gate and return, in
        microseconds, as a 32-bit float.
    """
    # there-and-back distance, converted from km to metres
    round_trip_m = record['first_range'] * 2.0 * 1.0e3
    # metres -> seconds via c, expressed in microseconds
    rtt_us = round_trip_m * 1e6 / speed_of_light
    return xp.float32(rtt_us)
def calculate_range_separation(record: OrderedDict) -> float:
    """
    Compute the spacing between adjacent ranges.

    Parameters
    ----------
    record: OrderedDict
        hdf5 record containing antennas_iq data and metadata; its
        ``'rx_sample_rate'`` entry is read.

    Returns
    -------
    range_sep: float
        Separation between adjacent ranges in km, as a 32-bit float.
    """
    # duration of one sample
    sample_period = 1 / record['rx_sample_rate']
    # distance travelled in one sample, metres -> km, halved (there and back)
    separation_km = sample_period * speed_of_light / 1.0e3 / 2.0
    return xp.float32(separation_km)
def __init__(self, n_layers, rg, measurement, f, beta=1, variant="dunlop",
             verbose=True, hybrid_mode=False, mempool=None):
    """Store the sampler configuration and prepare the measurement matrices.

    Parameters
    ----------
    n_layers : number of layers.
    rg : random generator, stored as ``self.random_gen``.
    measurement : measurement object; ``stdev``, ``num_sample`` and ``y``
        are read from it.
    f : Fourier helper, stored as ``self.fourier``.
    beta : step parameter; ``betaZ`` is ``sqrt(1 - beta**2)``.
    variant, verbose, hybrid_mode : stored unchanged.
    mempool : CuPy memory pool used for the verbose usage report; the
        default pool is used when ``None``.
    """
    self.n_layers = n_layers
    self.beta = cp.float32(beta)
    self.betaZ = cp.sqrt(1 - beta**2).astype(cp.float32)
    self.random_gen = rg
    self.measurement = measurement
    self.fourier = f
    self.variant = variant
    self.meas_var = self.measurement.stdev**2
    self.verbose = verbose
    self.hybrid_mode = hybrid_mode
    self.epsilon = 0
    self.cholesky_stabilizer = 0

    if mempool is None:
        mempool = cp.get_default_memory_pool()

    # Load H from its cache file when present, otherwise build it.
    self.create_matrix_file_name()
    if self.measurement_matrix_file.exists():
        self.load_H_matrix()
    else:
        self.create_H_matrix()

    # Normalise by the measurement standard deviation.
    self.H /= self.measurement.stdev
    if self.verbose:
        print("Used bytes so far, after creating H {}".format(mempool.used_bytes()))

    # NOTE(review): H_t_H is not assigned in this method; presumably
    # create_H_matrix()/load_H_matrix() provide it -- confirm.
    self.H_t_H /= self.meas_var

    self.I = cp.eye(self.measurement.num_sample, dtype=cp.float32)
    self.In = cp.eye(self.fourier.basis_number_2D_sym, dtype=cp.float32)

    self.y = self.measurement.y
    zero_tail = cp.zeros(2 * self.fourier.basis_number_2D_ravel - 1)
    self.yBar = cp.concatenate((self.y, zero_tail))

    # Adaptation and recording settings.
    self.beta_feedback_gain = 2.1
    self.target_acceptance_rate = 0.234
    self.record_skip = 1
    self.record_count = 0
    self.max_record_history = 1000000
def calculate_first_range(record: OrderedDict) -> float:
    """
    Return the distance from the main array to the first range.

    Parameters
    ----------
    record: OrderedDict
        hdf5 record containing antennas_iq data and metadata. It is not
        consulted at present; the value is a fixed constant.

    Returns
    -------
    first_range: float
        Distance to the first range in km, as a 32-bit float.
    """
    # TODO: Get this from somewhere, probably linked to the experiment ran.
    # Might need to look up based on githash.
    first_range_km = 180.0  # scf.FIRST_RANGE
    return xp.float32(first_range_km)
def _call_nms_kernel(bbox, thresh):
    """Run the GPU NMS kernel on ``bbox`` and return the selected indices.

    The mask buffer stays a CuPy ``uint64`` array: PyTorch has no unsigned
    64-bit tensor type, and the result is converted to an ndarray anyway.

    Returns the ``(selection, n_selec)`` pair from ``_nms_gpu_post``.
    """
    box_count = bbox.shape[0]
    lanes = 64
    col_blocks = np.ceil(box_count / lanes).astype(np.int32)

    launch_grid = (col_blocks, col_blocks, 1)
    launch_block = (lanes, 1, 1)

    mask_gpu = cp.zeros((box_count * col_blocks,), dtype=np.uint64)
    boxes_gpu = cp.ascontiguousarray(bbox, dtype=np.float32)

    kernel = _load_kernel('nms_kernel', _nms_gpu_code)
    kernel_args = (cp.int32(box_count), cp.float32(thresh), boxes_gpu,
                   mask_gpu)
    kernel(launch_grid, launch_block, args=kernel_args)

    mask_cpu = mask_gpu.get()
    selection, n_selec = _nms_gpu_post(mask_cpu, box_count, lanes, col_blocks)
    return selection, n_selec
def _call_nms_kernel(bbox, thresh):
    """Run the CUDA NMS kernel over ``bbox`` and decode its mask on the host.

    PyTorch does not support unsigned 64-bit tensors; since an ndarray is
    returned in the end, the CuPy buffers are used unmodified.

    Returns the ``(selection, n_selec)`` pair from ``_nms_gpu_post``.
    """
    total_boxes = bbox.shape[0]  # number of boxes
    per_block = 64  # threads in one block

    # Usual CUDA round-up so the launched threads cover all the data.
    n_col_blocks = np.ceil(total_boxes / per_block).astype(np.int32)

    # A square grid rather than (n_blocks, 1, 1): blocks are paired up.
    grid_shape = (n_col_blocks, n_col_blocks, 1)
    block_shape = (per_block, 1, 1)

    # Zero-filled contiguous buffer of uint64 words that receives the result.
    result_mask = cp.zeros((total_boxes * n_col_blocks,), dtype=np.uint64)

    # Move the boxes into contiguous float32 device memory for the kernel.
    gpu_boxes = cp.ascontiguousarray(bbox, dtype=np.float32)

    # Load the hand-written CUDA C kernel and launch it.
    launch = _load_kernel('nms_kernel', _nms_gpu_code)
    launch(grid_shape, block_shape,
           args=(cp.int32(total_boxes), cp.float32(thresh), gpu_boxes,
                 result_mask))

    # Fetch the result from the GPU and finish with the imported post step.
    fetched_mask = result_mask.get()
    return _nms_gpu_post(fetched_mask, total_boxes, per_block, n_col_blocks)
def integrate(self, t, x, y, S_T, L, rho, m, f, lamb, x_0, y_0, z_0, S, dx_0,
              dy_0, out, shape):
    """Launch the integration kernel and return the result on the host.

    Scalars are cast to ``float32`` and the initial-condition/source arrays
    to ``float32`` device arrays before ``self._integrate`` is launched with
    ``self.grid``/``self.block``. The kernel receives ``out`` as its output
    buffer.

    Parameters
    ----------
    t : time samples; the step is ``(t[-1] - t[0]) / t.size``.
    x, y : forwarded unchanged to the kernel.
    S_T, L, rho, m, f, lamb : scalar parameters.
    x_0, y_0, z_0, dx_0, dy_0, S : array parameters.
    out : device buffer passed to the kernel as its output argument.
    shape : pair ``(nx, ny)``.

    Returns
    -------
    numpy.ndarray
        Host copy of ``out``. Earlier revisions made this copy but dropped
        it and returned ``None``; callers that ignore the return value are
        unaffected.
    """
    # Problem size and time step.
    nx, ny = shape
    nt = t.size
    dt = cp.float32((t[-1] - t[0]) / t.size)

    # Kernel arguments need explicit 32-bit floating point types.
    S_T, L = cp.float32(S_T), cp.float32(L)
    rho, m = cp.float32(rho), cp.float32(m)
    f, lamb = cp.float32(f), cp.float32(lamb)
    x_0 = cp.asarray(x_0, dtype=cp.float32)
    y_0 = cp.asarray(y_0, dtype=cp.float32)
    z_0 = cp.asarray(z_0, dtype=cp.float32)
    dx_0 = cp.asarray(dx_0, dtype=cp.float32)
    dy_0 = cp.asarray(dy_0, dtype=cp.float32)
    S = cp.asarray(S, dtype=cp.float32)

    self._integrate(self.grid, self.block,
                    (x, y, S_T, L, rho, m, f, lamb, x_0, y_0, z_0, S, dx_0,
                     dy_0, out, dt, nx, ny, nt))

    return cp.asnumpy(out)
def create_gl(N0, Nproj, Nslices, cor, interp_type):
    """Build the global log-polar parameter structure and its flat array form.

    Parameters
    ----------
    N0 : original size in the radial direction.
    Nproj : number of projection angles (oversampled internally).
    Nslices : number of slices.
    cor : centre of rotation.
    interp_type : interpolation name; ``'cubic'`` is flagged in the array.

    Returns
    -------
    (P, params) where ``P`` is the ``Pgl`` structure and ``params`` is a host
    ``float32`` array holding ten scalar parameters followed by the
    flattened ``exp(rho)`` grid.
    """
    Nspan = 3
    beta = cp.pi / Nspan

    # Radial size after zero padding, rounded up to a multiple of 16.
    N = int(cp.ceil((N0 + abs(N0 / 2 - cor) * 2.0) / 16.0) * 16)

    # Angular oversampling for sparse angle sets.
    osangles = int(max(round(3.0 * N / 2.0 / Nproj), 1))
    Nproj = osangles * Nproj

    # Polar grid.
    proj = cp.arange(0, Nproj) * cp.pi / Nproj - beta / 2
    s = cp.linspace(-1, 1, N)

    # Log-polar parameters.
    dproj = proj[1] - proj[0]
    ds = 2.0 / (N - 1)
    (Nrho, Ntheta, dtheta, drho, aR, am, g) = getparameters(
        beta, dproj, ds, N, Nproj)

    # Log-polar grid.
    theta_idx = cp.arange(-Ntheta / 2, Ntheta / 2)
    thsp = (theta_idx * cp.float32(dtheta)).astype('float32')
    rhosp = (cp.arange(-Nrho, 0) * drho).astype('float32')
    erho = cp.tile(cp.exp(rhosp)[..., cp.newaxis], [1, Ntheta])

    # Compensation for cubic interpolation: spectra of the B3 splines.
    B3th = cp.fft.fft(cp.fft.ifftshift(splineB3(thsp, 1)))
    B3rho = cp.fft.fft(cp.fft.ifftshift(splineB3(rhosp, 1)))
    B3com = cp.outer(B3rho, B3th)

    # Structure with the global parameters.
    P = Pgl(Nspan, N, N0, Nproj, Nslices, Ntheta, Nrho, proj, s, thsp, rhosp,
            aR, beta, B3com, am, g, cor, osangles, interp_type)

    # Flat array form: scalar header followed by the exp(rho) grid.
    header = cp.array([
        P.N, P.N0, P.Ntheta, P.Nrho, P.Nspan, P.Nproj, P.Nslices, P.cor,
        P.osangles, P.interp_type == 'cubic'
    ], dtype='float32')
    params = cp.concatenate((header, erho.flatten())).get()
    return (P, params)
def DiceLossFunction(y, t):
    """Return the negative mean per-class Dice score.

    ``y`` is soft-maxed along axis 1. For every sample and class the Dice
    ratio between the flattened class probability and the one-hot target
    mask is accumulated. The sum is divided by the number of (sample, class)
    pairs, not counting pairs whose class is absent from the target.
    """
    eps = 0.0001
    total = 0.0
    # Start from batch * classes and discount the absent classes below.
    div = cp.float32(y.shape[0] * y.shape[1])
    y = F.softmax(y, axis=1)

    for i in range(y.shape[0]):
        probs = y[i]
        labels = cp.array(t[i].flatten())
        for j in range(y.shape[1]):
            one_hot = cp.where(labels == j, 1, 0).astype(cp.float32)
            if cp.sum(one_hot) == 0.0:
                # Class j does not occur in this sample.
                div -= 1.0
            target = chainer.Variable(one_hot)
            pred = F.flatten(probs[j])
            total += 2.0 * F.sum(pred * target) / (F.sum(pred + target) + eps)

    total = total / div
    return -total
# Remaining in-place steps of the C/CUDA library variant: add the constant,
# square element-wise, then double, all on dst_cu.
__add_const_cu(c_float_p(dst_cu), c_float_p(dst_cu), add, c_int_p(pnsz))
__mul_mat_cu(c_float_p(dst_cu), c_float_p(dst_cu), c_float_p(dst_cu),
             c_int_p(pnsz))
__add_mat_cu(c_float_p(dst_cu), c_float_p(dst_cu), c_float_p(dst_cu),
             c_int_p(pnsz))

## Cupy in GPU
# Same arithmetic with CuPy operators on a private copy of the source:
# scale, offset, square, double.
src_cp = cp.asarray(copy.deepcopy(src))
mul_cp = cp.float32(mul)
add_cp = cp.float32(add)

dst_cp = src_cp * mul_cp
dst_cp = dst_cp + add_cp
dst_cp = dst_cp * dst_cp
dst_cp = dst_cp + dst_cp

# Host copy of the CuPy result.
dst_cp2np = cp.asnumpy(dst_cp)

## Cupy with kernel in GPU
# The CUDA header is looked up next to this script.
cu_file = os.path.join(os.path.dirname(__file__), 'math_cu.cuh')
# Upload the extended signal as float32.
signalExtend = cp.asarray(signalExtend.copy(), dtype=cp.float32)

origSigShapeX = np.int32(signal_mag.shape[1])
origSigShapeY = np.int32(signal_mag.shape[0])

# 16 x 16 threads per block; enough blocks to cover the original signal.
thrdsPerBlkx = 16
thrdsPerBlky = 16
thrdsPerBlk = (thrdsPerBlky, thrdsPerBlkx)
blksPerGridx = np.int32(np.ceil(origSigShapeX / thrdsPerBlkx))
blksPerGridy = np.int32(np.ceil(origSigShapeY / thrdsPerBlky))
blksPerGrid = (blksPerGridy, blksPerGridx)

# One scratch vector per cell, sized for the reference samples on both
# sides along both axes.
valid_samp_num = 2 * (valid_samp_len_x + valid_samp_len_y)
scratchPad = cp.zeros(
    (signal_mag.shape[0], signal_mag.shape[1], valid_samp_num),
    dtype=cp.float32)
noiseMargin = cp.float32(
    valid_samp_num * (false_alarm_rate**(-1 / valid_samp_num) - 1))
outPutBoolArray_CAcross = cp.zeros(
    (signal_mag.shape[0], signal_mag.shape[1]), dtype=cp.int32)

# Report the size of the large buffers in MB.
GPU_memorySize = (signal_mag.nbytes + scratchPad.nbytes +
                  outPutBoolArray_CAcross.nbytes) / (1024 * 1024)
print('Total memory on GPU = {0:.2f} MB\n'.format(GPU_memorySize))

CFAR_CA_2D_cross_GPU[blksPerGrid, thrdsPerBlk](
    signalExtend, origSigShapeX, origSigShapeY, guardband_len_x,
    guardband_len_y, valid_samp_len_x, valid_samp_len_y, scratchPad,
    noiseMargin, outPutBoolArray_CAcross)

# Bring the detection mask back to the host and list the detections.
outPutBoolArray_CAcross = cp.asnumpy(outPutBoolArray_CAcross)
det_indices_caCross_gpu = np.where(outPutBoolArray_CAcross > 0)
t6 = time()
print('CFAR CA cross GPU det range bins:', det_indices_caCross_gpu[0])
print('CFAR CA cross GPU det doppler bins:', det_indices_caCross_gpu[1], '\n')

# Output buffer for the ordered-statistic variant.
outPutBoolArray_OScross = cp.zeros(
    (signal_mag.shape[0], signal_mag.shape[1]), dtype=cp.int32)
def forward(ctx, x, rois, Info: psRoI_Info):
    """
    Forward pass of psRoI pooling.

    :param ctx: context variable (similar to 'self')
    :param x: input feature map of size (B, C, H, W)
    :param rois: rois generated by rpn; rows combine the batch index and the
        box: [batch_ind, x_min, y_min, x_max, y_max]
    :param Info: configuration providing group_size, outh, outw,
        spatial_scale and the compiled forward_fn kernel
    :return: pooled output of size (N, out_dim, Info.outh, Info.outw)
    :raises ValueError: if C is not divisible by group_size * group_size
    """
    # The kernel works on raw pointers, so the memory must be contiguous.
    x = x.contiguous()
    rois = rois.contiguous()

    in_size = B, C, H, W = x.size()  # e.g. (b, 21 * 7 * 7, h, w)
    N = rois.size(0)  # number of rois

    group_area = Info.group_size * Info.group_size
    if C % group_area != 0:
        raise ValueError(
            "The group_size must be an integral multiple of input_channel!"
        )
    out_dim = C // group_area

    output = t.zeros(N, out_dim, Info.outh, Info.outw).cuda()
    count = output.numel()  # number of sub regions for psROI

    # Records which input channel each output element was taken from.
    # ``cp.int`` was only an alias of the builtin ``int`` and no longer
    # exists in current NumPy/CuPy releases; the builtin gives the same dtype.
    mapping_channel = torch.zeros(count, dtype=int).cuda()

    # Kernel arguments; the float parameter must be converted to cp.float32.
    args = [
        count,
        x.data_ptr(),
        cp.float32(Info.spatial_scale),
        C,
        H,
        W,
        Info.outh,
        Info.outw,
        rois.data_ptr(),
        out_dim,
        Info.group_size,
        output.data_ptr(),
        mapping_channel.data_ptr(),
    ]

    # Run the kernel on torch's current CUDA stream.
    stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)

    # One-dimensional indexing for blocks and threads.
    Info.forward_fn(args=args,
                    block=(CUDA_NUM_THREADS, 1, 1),
                    grid=(GET_BLOCKS(count), 1, 1),
                    stream=stream)

    # Everything the backward pass needs.
    saveBackwardInfo_int = torch.tensor(
        [count, N, out_dim, Info.outh, Info.outw])
    ctx.save_for_backward(saveBackwardInfo_int, torch.tensor(in_size),
                          torch.tensor(Info.spatial_scale), rois,
                          mapping_channel)

    return output
def cuda_float32(fltIn: float):
    """Return ``fltIn`` converted to a ``cupy.float32`` scalar."""
    as_single = cupy.float32(fltIn)
    return as_single