def copy2D_array_to_device(dst, src, type_sz, width, height):
    """Copy a 2D CUDA array into linear device memory.

    ``src`` is handed to ``set_src_array`` and ``dst`` to ``set_dst_device``.
    Each row is ``width * type_sz`` bytes wide, and that same byte count is
    used for both the source and the destination pitch.

    Args:
        dst: Destination passed to ``set_dst_device``.
        src: Source passed to ``set_src_array``.
        type_sz: Size of one element in bytes.
        width: Row length in elements.
        height: Number of rows.
    """
    row_bytes = width * type_sz

    transfer = cuda_driver.Memcpy2D()
    transfer.set_src_array(src)
    transfer.set_dst_device(dst)
    transfer.height = height
    transfer.width_in_bytes = row_bytes
    transfer.src_pitch = row_bytes
    transfer.dst_pitch = row_bytes
    transfer(aligned=True)
def get_Memcpy2D_d2d(src, dst, src_pitch, dst_pitch, dim_args, itemsize, **kwargs):
    """Build a reusable device-to-device 2D copy.

    A ``drv.Memcpy2D`` object is configured once and wrapped in a small
    proxy; calling the proxy (``proxy()``) performs the copy.

    Args:
        src: Source object. Its ``gpudata`` attribute is used as the device
            pointer; 0 is used when the attribute is missing.
        dst: Destination object, handled like ``src``.
        src_pitch: Source pitch, assigned to the copy unchanged.
        dst_pitch: Destination pitch, assigned to the copy unchanged.
        dim_args: Two-element sequence unpacked as ``(height, width)``.
            ``width`` is in elements, not bytes.
        itemsize: Size of one element in bytes.
        **kwargs: Ignored; accepted so the signature matches the 3D variant.

    Returns:
        A callable proxy that also exposes ``set_src_device`` and
        ``set_dst_device`` to re-point the copy.
    """
    n_rows, n_cols = dim_args
    row_bytes = n_cols * itemsize

    # A missing ``gpudata`` attribute yields 0 (NULL); the pointer can be
    # supplied later through the proxy setters.
    src_address = getattr(src, 'gpudata', 0)
    dst_address = getattr(dst, 'gpudata', 0)

    transfer = drv.Memcpy2D()
    transfer.set_src_device(src_address)
    transfer.set_dst_device(dst_address)
    transfer.height = np.int64(n_rows)
    transfer.width_in_bytes = np.int64(row_bytes)
    transfer.src_pitch = src_pitch
    transfer.dst_pitch = dst_pitch

    class _copy():
        """Proxy around the configured Memcpy2D object.

        Invoking the proxy always passes ``aligned=True`` to the wrapped
        copy; the two setters forward to the wrapped object so source and
        destination pointers can be changed after construction.
        """

        def __init__(self, memcpy2d):
            self.cpy = memcpy2d

        def set_src_device(self, src_ptr):
            self.cpy.set_src_device(src_ptr)

        def set_dst_device(self, dst_ptr):
            self.cpy.set_dst_device(dst_ptr)

        def __call__(self):
            self.cpy(aligned=True)

    return _copy(transfer)
def enqueue(self, batch_size, inputs, output, workspace, stream):
    """Split ``inputs[0]`` along ``self.axis`` into one output per section.

    For every entry ``pos`` of ``self.sections`` a 2D copy is issued on
    ``stream`` that moves the slice ``[last, pos)`` of the split axis from
    ``inputs[0]`` into ``output[i]``; ``last`` is the previous section
    boundary (0 for the first section).

    Args:
        batch_size: Multiplies the number of rows of every copy.
        inputs: Sequence whose first entry is convertible to an integer
            device address.
        output: Sequence with one integer-convertible device address per
            section.
        workspace: Unused.
        stream: Passed to each copy invocation.
    """
    # Element size in bytes is hard-coded.
    # NOTE(review): presumably 4-byte (e.g. float32) elements — confirm.
    ELEM_SIZE = 4

    # The products of the dimensions after / before the split axis do not
    # depend on the section, so they are computed once up front.
    dims = list(self.in_dims)
    inner = 1
    for d in dims[self.axis + 1:]:
        inner *= d.size
    outer = 1
    for d in dims[:self.axis]:
        outer *= d.size

    src_pitch = inner * self.in_dims[self.axis].size
    height = outer * batch_size

    last = 0
    for i, pos in enumerate(self.sections):
        # Elements per row belonging to this section.
        section_width = inner * (pos - last)

        copy = cuda.Memcpy2D()
        copy.set_src_device(int(inputs[0]))
        copy.set_dst_device(int(output[i]))
        # Start reading at the first element of this section in each row.
        copy.src_x_in_bytes = inner * last * ELEM_SIZE
        copy.src_pitch = src_pitch * ELEM_SIZE
        copy.dst_pitch = section_width * ELEM_SIZE
        copy.width_in_bytes = section_width * ELEM_SIZE
        copy.height = height
        copy(stream)
        last = pos
def extract_columns(mat, start=0, stop=None, target=None):
    """Copy the columns ``[start, stop)`` of a 2D GPU matrix.

    Args:
        mat: C-contiguous 2D array exposing ``gpudata``.
        start: Index of the first column to copy.
        stop: One past the last column; defaults to the number of columns.
        target: Optional destination array exposing ``gpudata``. A new
            ``gpuarray`` of shape ``(rows, stop - start)`` is allocated when
            omitted.

    Returns:
        The array the columns were copied into.
    """
    elem_type = mat.dtype
    elem_bytes = np.dtype(elem_type).itemsize
    n_rows, n_cols = mat.shape

    if stop is None:
        stop = n_cols
    n_out = stop - start

    assert mat.flags.c_contiguous
    assert 0 <= start <= n_cols and 0 <= stop <= n_cols and stop > start

    if target is None:
        target = gpuarray.empty((n_rows, n_out), elem_type)

    out_row_bytes = n_out * elem_bytes

    transfer = drv.Memcpy2D()
    transfer.set_src_device(mat.gpudata)
    transfer.set_dst_device(target.gpudata)
    # Skip the leading columns of every source row.
    transfer.src_x_in_bytes = start * elem_bytes
    transfer.src_pitch = n_cols * elem_bytes
    transfer.dst_pitch = out_row_bytes
    transfer.width_in_bytes = out_row_bytes
    transfer.height = n_rows
    transfer(aligned=True)

    return target
def upload(self, stream, cpu_data, extent=None):
    """Copy a host array into a rectangular region of ``self.data``.

    Args:
        stream: Passed to the copy invocation.
        cpu_data: 2D host array of shape ``(ny, nx)``.
        extent: Optional ``(x, y, nx, ny)`` describing the destination
            region. Defaults to ``(self.x_halo, self.y_halo, self.nx,
            self.ny)``.
    """
    if extent is None:
        x, y = self.x_halo, self.y_halo
        nx, ny = self.nx, self.ny
    else:
        x, y, nx, ny = extent

    # The host array must match the region and the region must fit inside
    # the buffer including its halo on both sides.
    assert nx == cpu_data.shape[1]
    assert ny == cpu_data.shape[0]
    assert x + nx <= self.nx + 2 * self.x_halo
    assert y + ny <= self.ny + 2 * self.y_halo

    # Copy object from host to device.
    transfer = cuda.Memcpy2D()
    transfer.set_dst_device(self.data.gpudata)
    transfer.set_src_host(cpu_data)

    # Offsets and pitch of the destination; no source pitch is set here.
    transfer.dst_x_in_bytes = int(x) * self.data.strides[1]
    transfer.dst_y = int(y)
    transfer.dst_pitch = self.data.strides[0]

    # Bytes per row and number of rows to copy.
    transfer.width_in_bytes = int(nx) * cpu_data.itemsize
    transfer.height = int(ny)

    transfer(stream)
def test_pycuda_memcpy_Surface_Surface(self):
    """Clone each decoded surface with a PyCUDA 2D copy and compare both.

    Surfaces are decoded until an empty one is returned. Every decoded
    surface is copied plane-to-plane into a freshly made surface; both are
    then downloaded and the resulting frames must be equal.
    """
    while True:
        decoded = self.nvDec.DecodeSingleSurface()
        if decoded.Empty():
            break
        plane_in = decoded.PlanePtr()

        clone = nvc.Surface.Make(self.nvDec.Format(), self.nvDec.Width(),
                                 self.nvDec.Height(), self.gpu_id)
        self.assertFalse(clone.Empty())
        plane_out = clone.PlanePtr()

        # Device-to-device copy honouring the pitch of each plane.
        memcpy_2d = cuda.Memcpy2D()
        memcpy_2d.width_in_bytes = plane_in.Width() * plane_in.ElemSize()
        memcpy_2d.src_pitch = plane_in.Pitch()
        memcpy_2d.dst_pitch = plane_out.Pitch()
        memcpy_2d.width = plane_in.Width()
        memcpy_2d.height = plane_in.Height()
        memcpy_2d.set_src_device(plane_in.GpuMem())
        memcpy_2d.set_dst_device(plane_out.GpuMem())
        memcpy_2d(self.cuda_str)

        frame_in = np.ndarray(shape=(0), dtype=np.uint8)
        if not self.nvDwn.DownloadSingleSurface(decoded, frame_in):
            self.fail('Failed to download decoded surface')

        frame_out = np.ndarray(shape=(0), dtype=np.uint8)
        if not self.nvDwn.DownloadSingleSurface(clone, frame_out):
            self.fail('Failed to download decoded surface')

        if not np.array_equal(frame_in, frame_out):
            self.fail('Video frames are not equal')
def download(self, stream, asynch=False):
    """Copy the buffer from ``self.data`` into a pooled host array.

    Args:
        stream: Passed to the copy invocation and synchronized afterwards
            unless ``asynch`` is set.
        asynch: When equal to ``False`` the stream is synchronized before
            returning.

    Returns:
        The float32 host array of shape ``(nz, ny, nx)`` taken from
        ``self.memorypool``.
    """
    host_shape = (self.nz, self.ny, self.nx)
    cpu_data = self.memorypool.allocate(host_shape, dtype=np.float32)

    # Copy object from device to host.
    transfer = cuda.Memcpy2D()
    transfer.set_src_device(self.data.gpudata)
    transfer.set_dst_host(cpu_data)

    # Source offsets (skipping the halo) and source pitch.
    # NOTE(review): src_z and depth are assigned on a Memcpy2D object —
    # confirm whether Memcpy3D was intended for this 3D buffer.
    transfer.src_x_in_bytes = self.x_halo * self.data.strides[1]
    transfer.src_y = self.y_halo
    transfer.src_z = self.z_halo
    transfer.src_pitch = self.data.strides[0]

    # Bytes per row, rows and depth to copy.
    transfer.width_in_bytes = self.nx * cpu_data.itemsize
    transfer.height = self.ny
    transfer.depth = self.nz

    transfer(stream)

    if asynch == False:
        stream.synchronize()

    return cpu_data
def copy2D_host_to_array(arr, host, width, height):
    """Copy a host buffer into a CUDA array.

    Args:
        arr: Destination passed to ``set_dst_array``.
        host: Source passed to ``set_src_host``.
        width: Used both as the number of bytes copied per row and as the
            source pitch.
        height: Number of rows.
    """
    transfer = driver.Memcpy2D()
    transfer.set_src_host(host)
    transfer.set_dst_array(arr)
    transfer.src_pitch = width
    transfer.width_in_bytes = width
    transfer.height = height
    transfer(aligned=True)
def copy_2d_host_to_device(dev, host, src_pitch, dst_pitch, width, height):
    """Copy a pitched 2D region from host memory to device memory.

    Args:
        dev: Destination passed to ``set_dst_device``.
        host: Source passed to ``set_src_host``.
        src_pitch: Pitch assigned to the source.
        dst_pitch: Pitch assigned to the destination.
        width: Assigned to ``width_in_bytes``.
        height: Number of rows.
    """
    transfer = driver.Memcpy2D()
    transfer.set_src_host(host)
    transfer.set_dst_device(dev)
    transfer.width_in_bytes = width
    transfer.height = height
    transfer.src_pitch = src_pitch
    transfer.dst_pitch = dst_pitch
    transfer(aligned=True)
def copy2D_device_to_host(host, dev, src_pitch, dst_pitch, width, height):
    """Copy a pitched 2D region from device memory to host memory.

    Args:
        host: Destination passed to ``set_dst_host``.
        dev: Source passed to ``set_src_device``.
        src_pitch: Pitch assigned to the source.
        dst_pitch: Pitch assigned to the destination.
        width: Assigned to ``width_in_bytes``.
        height: Number of rows.
    """
    transfer = driver.Memcpy2D()
    transfer.set_src_device(dev)
    transfer.set_dst_host(host)
    transfer.width_in_bytes = width
    transfer.height = height
    transfer.src_pitch = src_pitch
    transfer.dst_pitch = dst_pitch
    transfer(aligned=True)
def predict(data):
    """Run the conv and LSTM predictors on ``data`` and print the decoding.

    The input is uploaded, the conv predictor writes into a buffer with
    ``time_step`` steps per sample, a 2D copy re-pitches that buffer to
    ``max_time_step`` steps per sample, and the LSTM predictor produces one
    int32 label per step. Each output row is then printed as a list of
    letters after dropping consecutive repeats and zeros.

    Args:
        data: Host array; ``shape[0]`` is the batch size and
            ``shape[2] // 4`` the number of time steps.
    """
    print('data shape:', data.shape)
    batch_size = data.shape[0]
    print("TRT batch_size:", batch_size)

    d_input = cuda.mem_alloc(data.nbytes)
    cuda.memcpy_htod(d_input, data)

    time_step = data.shape[2] // 4
    print('timestep:', time_step)

    h_time_step = np.array([time_step] * batch_size, np.int32)
    d_time_step = cuda.mem_alloc(h_time_step.nbytes)
    cuda.memcpy_htod(d_time_step, h_time_step)

    # Bytes per sample before and after padding to max_time_step.
    cnn_row_bytes = time_step * 16 * 4
    lstm_row_bytes = max_time_step * 16 * 4

    d_cnn_output = cuda.mem_alloc(batch_size * time_step * 16 * 4)
    d_lstm_input = cuda.mem_alloc(batch_size * max_time_step * 16 * 4)

    output = np.empty((batch_size, max_time_step), dtype=np.int32)
    d_output = cuda.mem_alloc(output.nbytes)

    predictor_conv = TrtPredictor_Conv(False)
    predictor_lstm = TrtPredictor_Lstm(False)

    n_round = 1
    time0 = time.time()
    for _ in range(n_round):
        predictor_conv.infer(data.shape, d_input, d_cnn_output)

        # Re-pitch the conv output: one row per sample, widened from
        # time_step to max_time_step steps.
        m = cuda.Memcpy2D()
        m.set_src_device(d_cnn_output)
        m.set_dst_device(d_lstm_input)
        m.src_pitch = cnn_row_bytes
        m.dst_pitch = lstm_row_bytes
        m.width_in_bytes = cnn_row_bytes
        m.height = batch_size
        m(False)

        predictor_lstm.infer(batch_size, d_lstm_input, d_time_step, d_output)

    cuda.Context.synchronize()
    print("TRT average:", (time.time() - time0) * 1.0 / n_round)

    cuda.memcpy_dtoh(output, d_output)
    print(output)

    for row in output:
        previous = None
        collapsed = []
        for label in row:
            # Drop a label that repeats the one directly before it.
            if previous == label:
                continue
            collapsed.append(label)
            previous = label
        # Label 0 is skipped; label k maps to the k-th lowercase letter.
        print([chr(ord('a') + label - 1) for label in collapsed if label != 0])
def update_2d_texture(texref, newdata):
    """Overwrite the array behind a 2D texture reference with host data.

    Args:
        texref: Texture reference whose array (``get_array()``) is written.
        newdata: 2D host data. It is made contiguous first and must match
            the height and width of the array descriptor, which must have a
            single channel.
    """
    target = texref.get_array()
    newdata = numpy.ascontiguousarray(newdata)
    rows, cols = newdata.shape

    descriptor = target.get_descriptor()
    assert rows == descriptor.height and cols == descriptor.width
    assert descriptor.num_channels == 1

    row_bytes = newdata.strides[0]

    transfer = cuda.Memcpy2D()
    transfer.set_src_host(newdata)
    transfer.set_dst_array(target)
    transfer.src_pitch = row_bytes
    transfer.width_in_bytes = row_bytes
    transfer.height = rows
    transfer.src_height = rows
    transfer(True)
def gpuArray2DtocudaArray(gpuArray):
    """Create a single-channel CUDA array from a 2D GPU array.

    Args:
        gpuArray: 2D array exposing ``shape``, ``dtype``, ``strides`` and
            ``ptr``.

    Returns:
        Tuple ``(cudaArray, copy2D)``: the new CUDA array and the copy
        object that filled it.
    """
    rows, cols = gpuArray.shape

    descriptor = cuda.ArrayDescriptor()
    descriptor.width = cols
    descriptor.height = rows
    descriptor.format = cuda.dtype_to_array_format(gpuArray.dtype)
    descriptor.num_channels = 1
    cudaArray = cuda.Array(descriptor)

    # The row stride of the source serves as both pitch and copied width.
    row_bytes = gpuArray.strides[0]

    copy2D = cuda.Memcpy2D()
    copy2D.set_src_device(gpuArray.ptr)
    copy2D.set_dst_array(cudaArray)
    copy2D.src_pitch = row_bytes
    copy2D.width_in_bytes = row_bytes
    copy2D.height = rows
    copy2D.src_height = rows
    copy2D(aligned=True)

    return cudaArray, copy2D
def resize_gpu(y_gpu, out_shape):
    """Resize a 2D float32 GPU array using linearly filtered texture reads.

    The input is copied into pitched device memory, bound to the ``in_tex``
    texture of the compiled ``resize`` kernel, and the kernel writes the
    resized result.

    Args:
        y_gpu: 2D float32 GPU array.
        out_shape: ``(rows, cols)`` of the result.

    Returns:
        A new GPU array of shape ``out_shape`` with the dtype of ``y_gpu``.

    Raises:
        NotImplementedException: if ``y_gpu`` is not float32.
    """
    dtype = y_gpu.dtype
    if dtype != np.float32:
        # NOTE(review): NotImplementedException is not a Python builtin and
        # must be defined elsewhere in this module — confirm.
        raise NotImplementedException('Only float at the moment')

    out_rows = int(out_shape[0])
    out_cols = int(out_shape[1])

    block_size = (16, 16, 1)
    grid_size = (int(np.ceil(float(out_shape[1]) / block_size[0])),
                 int(np.ceil(float(out_shape[0]) / block_size[1])))

    preproc = _generate_preproc(dtype)
    mod = SourceModule(preproc + resize_code, keep=True)
    resize_fun_gpu = mod.get_function("resize")

    # Builtin int replaces the np.int alias, which NumPy has removed.
    resized_gpu = cua.empty((out_rows, out_cols), y_gpu.dtype)

    # 4 is the size of a float32 element in bytes.
    in_rows, in_cols = y_gpu.shape[0], y_gpu.shape[1]
    in_row_bytes = 4 * in_cols

    # Stage the input in pitched memory so it can back a 2D texture.
    temp_gpu, pitch = cu.mem_alloc_pitch(in_row_bytes, in_rows, 4)
    copy_object = cu.Memcpy2D()
    copy_object.set_src_device(y_gpu.gpudata)
    copy_object.set_dst_device(temp_gpu)
    copy_object.src_pitch = in_row_bytes
    copy_object.dst_pitch = pitch
    copy_object.width_in_bytes = in_row_bytes
    copy_object.height = in_rows
    copy_object(aligned=False)

    in_tex = mod.get_texref('in_tex')
    descr = cu.ArrayDescriptor()
    descr.width = in_cols
    descr.height = in_rows
    descr.format = cu.array_format.FLOAT
    descr.num_channels = 1
    in_tex.set_address_2d(temp_gpu, descr, pitch)
    in_tex.set_filter_mode(cu.filter_mode.LINEAR)
    in_tex.set_flags(cu.TRSF_NORMALIZED_COORDINATES)

    resize_fun_gpu(resized_gpu.gpudata,
                   np.uint32(out_shape[0]), np.uint32(out_shape[1]),
                   block=block_size, grid=grid_size)

    temp_gpu.free()

    return resized_gpu
def insert_columns(src, dst, offset):
    """Write all columns of ``src`` into ``dst`` starting at column ``offset``.

    Args:
        src: 2D array exposing ``gpudata``.
        dst: 2D array exposing ``gpudata`` with the same dtype and number of
            rows as ``src`` and at least ``offset + src.shape[1]`` columns.
        offset: First destination column to overwrite.
    """
    elem_type = src.dtype
    elem_bytes = np.dtype(elem_type).itemsize
    src_rows, src_cols = src.shape
    dst_rows, dst_cols = dst.shape

    assert dst.dtype == elem_type
    assert src_rows == dst_rows
    assert dst_cols >= offset + src_cols

    src_row_bytes = src_cols * elem_bytes

    transfer = drv.Memcpy2D()
    transfer.set_src_device(src.gpudata)
    transfer.set_dst_device(dst.gpudata)
    # Start writing ``offset`` columns into every destination row.
    transfer.dst_x_in_bytes = offset * elem_bytes
    transfer.src_pitch = src_row_bytes
    transfer.width_in_bytes = src_row_bytes
    transfer.dst_pitch = dst_cols * elem_bytes
    transfer.height = src_rows
    transfer(aligned=True)
def extract_columns(mat, start=0, stop=None, target=None):
    """Copy the slice ``[start, stop)`` along axis 1 of a 2D or 3D GPU array.

    A 3D input of shape ``(N, M, Z)`` is treated as a 2D array of shape
    ``(N, M * Z)`` with ``start`` and ``stop`` scaled by ``Z``; the result is
    reshaped back to three dimensions.

    Args:
        mat: C-contiguous 2D or 3D array exposing ``gpudata``.
        start: First index along axis 1.
        stop: One past the last index; defaults to the size of axis 1.
        target: Optional destination exposing ``gpudata``. Allocated from
            ``memory_pool`` when omitted.

    Returns:
        The array holding the extracted slice.

    Raises:
        ValueError: if ``mat`` has neither two nor three dimensions.
    """
    dtype = mat.dtype
    itemsize = np.dtype(dtype).itemsize

    ndim = len(mat.shape)
    if ndim not in (2, 3):
        raise ValueError("mat must have two or three dimensions")

    input_3d = ndim == 3
    if input_3d:
        N, M, Z = mat.shape
        if stop is None:
            stop = M
        # Flatten the two trailing axes into one row of M * Z elements.
        start, stop, M = start * Z, stop * Z, M * Z
        mat = mat.reshape((N, M))
    else:
        N, M = mat.shape
        if stop is None:
            stop = M

    m = stop - start

    assert mat.flags.c_contiguous
    assert 0 <= start <= M and 0 <= stop <= M and stop > start

    if target is None:
        target = gpuarray.empty((N, m), dtype, allocator=memory_pool.allocate)

    out_row_bytes = m * itemsize

    transfer = drv.Memcpy2D()
    transfer.set_src_device(mat.gpudata)
    transfer.set_dst_device(target.gpudata)
    transfer.src_x_in_bytes = start * itemsize
    transfer.src_pitch = M * itemsize
    transfer.dst_pitch = out_row_bytes
    transfer.width_in_bytes = out_row_bytes
    transfer.height = N
    transfer(aligned=True)

    if input_3d:
        assert not m % Z
        target = target.reshape((N, m // Z, Z))

    return target
def np2DtoCudaArray(npArray, allowSurfaceBind=False):
    """Create a single-channel CUDA array from a 2D host array.

    Args:
        npArray: 2D host array exposing ``shape``, ``dtype`` and ``strides``.
        allowSurfaceBind: When true, the ``SURFACE_LDST`` flag is set on the
            descriptor before the array is created.

    Returns:
        Tuple ``(cudaArray, descr2D)``: the new CUDA array and the
        descriptor it was created from.
    """
    h, w = npArray.shape

    descr2D = cuda.ArrayDescriptor()
    descr2D.width = w
    descr2D.height = h
    descr2D.format = cuda.dtype_to_array_format(npArray.dtype)
    descr2D.num_channels = 1
    if allowSurfaceBind:
        # The flag has to go on the descriptor that is handed to cuda.Array.
        # NOTE(review): confirm that a 2D ArrayDescriptor honours ``flags``;
        # a 3D descriptor may be required for surface binding.
        descr2D.flags = cuda.array3d_flags.SURFACE_LDST
    cudaArray = cuda.Array(descr2D)

    # The row stride of the source serves as both pitch and copied width.
    row_bytes = npArray.strides[0]

    copy2D = cuda.Memcpy2D()
    copy2D.set_src_host(npArray)
    copy2D.set_dst_array(cudaArray)
    copy2D.src_pitch = row_bytes
    copy2D.width_in_bytes = row_bytes
    copy2D.src_height = copy2D.height = h
    copy2D(aligned=True)

    return cudaArray, descr2D
def download(self, stream, cpu_data=None, asynch=False, extent=None):
    """Copy a rectangular region of ``self.data`` into a host array.

    Args:
        stream: Passed to the copy invocation and synchronized afterwards
            unless ``asynch`` is set.
        cpu_data: Optional host array of shape ``(ny, nx)``. A float32 array
            is taken from ``self.memorypool`` when omitted.
        asynch: When equal to ``False`` the stream is synchronized before
            returning.
        extent: Optional ``(x, y, nx, ny)`` describing the source region.
            Defaults to ``(self.x_halo, self.y_halo, self.nx, self.ny)``.

    Returns:
        The host array that received the data.
    """
    if extent is None:
        x, y = self.x_halo, self.y_halo
        nx, ny = self.nx, self.ny
    else:
        x, y, nx, ny = extent

    if cpu_data is None:
        cpu_data = self.memorypool.allocate((ny, nx), dtype=np.float32)

    # The host array must match the region and the region must fit inside
    # the buffer including its halo on both sides.
    assert nx == cpu_data.shape[1]
    assert ny == cpu_data.shape[0]
    assert x + nx <= self.nx + 2 * self.x_halo
    assert y + ny <= self.ny + 2 * self.y_halo

    # Copy object from device to host.
    transfer = cuda.Memcpy2D()
    transfer.set_src_device(self.data.gpudata)
    transfer.set_dst_host(cpu_data)

    # Offsets and pitch of the source.
    transfer.src_x_in_bytes = int(x) * self.data.strides[1]
    transfer.src_y = int(y)
    transfer.src_pitch = self.data.strides[0]

    # Bytes per row and number of rows to copy.
    transfer.width_in_bytes = int(nx) * cpu_data.itemsize
    transfer.height = int(ny)

    transfer(stream)

    if asynch == False:
        stream.synchronize()

    return cpu_data
def pad_array(mat, left=0, right=0, val=0., new_shape=None, stream=None):
    """Pad every row of a GPU array on the left and right with ``val``.

    Arrays with more than two dimensions are flattened to
    ``(shape[0], prod(shape[1:]))`` before padding. Arrays of dtype ``|S1``
    are handled as int8 during the copy and switched back afterwards.

    Args:
        mat: C-contiguous GPU array with at least two dimensions.
        left: Number of elements added before each row.
        right: Number of elements added after each row.
        val: Fill value; a ``str`` is converted with ``ord`` for ``|S1``
            input.
        new_shape: Optional shape the padded array is reshaped to.
        stream: Passed to the copy invocation.

    Returns:
        The padded GPU array.

    Raises:
        ValueError: if ``mat`` has fewer than two dimensions.
    """
    assert mat.flags.c_contiguous

    is_chararray = bool(mat.dtype == '|S1')
    if is_chararray:
        # Temporarily view the characters as bytes.
        mat.dtype = np.int8
        if type(val) is str:
            val = ord(val)

    ndim = len(mat.shape)
    if ndim < 2:
        raise ValueError('Array must be at least two-dimensional.')
    if ndim == 2:
        height, width = mat.shape
    else:
        height = mat.shape[0]
        width = np.prod(mat.shape[1:])
        mat = mat.reshape((height, width))

    padded_width = width + left + right
    padded_mat = gpuarray.empty((height, padded_width),
                                dtype=mat.dtype,
                                allocator=memory_pool.allocate).fill(val)

    itemsize = np.dtype(padded_mat.dtype).itemsize
    src_row_bytes = width * itemsize

    transfer = drv.Memcpy2D()
    transfer.set_src_device(mat.gpudata)
    transfer.set_dst_device(padded_mat.gpudata)
    # Leave ``left`` fill elements at the start of every destination row.
    transfer.dst_x_in_bytes = left * itemsize
    transfer.src_pitch = src_row_bytes
    transfer.width_in_bytes = src_row_bytes
    transfer.dst_pitch = padded_width * itemsize
    transfer.height = height
    transfer(stream)

    if new_shape is not None:
        padded_mat = padded_mat.reshape(new_shape)

    if is_chararray:
        mat.dtype = np.dtype('|S1')
        padded_mat.dtype = np.dtype('|S1')

    return padded_mat
def test_pycuda_memcpy_Surface_Tensor(self):
    """Copy each decoded surface plane into a torch tensor and compare.

    Surfaces are decoded until an empty one is returned. The plane of every
    decoded surface is copied with a PyCUDA 2D copy into a uint8 CUDA
    tensor; the flattened tensor must equal the downloaded surface.
    """
    while True:
        decoded = self.nvDec.DecodeSingleSurface()
        if decoded.Empty():
            break
        plane = decoded.PlanePtr()

        surface_tensor = torch.zeros(
            plane.Height(), plane.Width(), 1,
            dtype=torch.uint8,
            device=torch.device(f'cuda:{self.gpu_id}'))
        tensor_address = surface_tensor.data_ptr()

        # Device-to-device copy from the pitched plane into the tensor,
        # whose rows are given a pitch of the decoder width.
        memcpy_2d = cuda.Memcpy2D()
        memcpy_2d.width_in_bytes = plane.Width() * plane.ElemSize()
        memcpy_2d.src_pitch = plane.Pitch()
        memcpy_2d.dst_pitch = self.nvDec.Width()
        memcpy_2d.width = plane.Width()
        memcpy_2d.height = plane.Height()
        memcpy_2d.set_src_device(plane.GpuMem())
        memcpy_2d.set_dst_device(tensor_address)
        memcpy_2d(self.cuda_str)

        frame_src = np.ndarray(shape=(0), dtype=np.uint8)
        if not self.nvDwn.DownloadSingleSurface(decoded, frame_src):
            self.fail('Failed to download decoded surface')

        frame_dst = surface_tensor.to('cpu').numpy()
        frame_dst = frame_dst.reshape((plane.Height() * plane.Width()))

        if not np.array_equal(frame_src, frame_dst):
            self.fail('Video frames are not equal')
print original ''' destImage_gpu = cuda.mem_alloc_like(original) sourceImage_gpu = cuda.mem_alloc_like(original) intermediateImage_gpu = cuda.mem_alloc_like(original) ''' destImage_gpu, pit = cuda.mem_alloc_pitch(7 * 4, 2, numpy.dtype(numpy.float32).itemsize) sourceImage_gpu, pit2 = cuda.mem_alloc_pitch( 7 * 4, 2, numpy.dtype(numpy.float32).itemsize) print pit, pit2 #cuda.memcpy_htod(sourceImage_gpu, original) #cuda.memcpy_htod(destImage_gpu, original) copy = cuda.Memcpy2D() copy.set_src_host(original) copy.set_dst_device(destImage_gpu) copy.height = 2 copy.width_in_bytes = 7 * 4 copy.src_pitch = 7 * 4 copy.dst_pitch = 128 * 4 copy(aligned=True) destImage = original.copy() func(destImage_gpu, sourceImage_gpu, numpy.int32(10), numpy.int32(2), block=(10, 1, 1),
def copy_non_contiguous(dst, src):
    """Copy ``src`` array to ``dst`` array.

    A gpu-array may have a non contiguous block of memory, i.e. it may have
    substantial pitches/strides. However a cpu-array must have a contiguous
    block of memory. All four directions (host or device to host or device)
    are allowed.

    Arrays with one, two or three dimensions are supported. Three
    dimensional arrays whose plane stride is not ``shape[1] * strides[1]``
    are copied plane by plane.

    Args:
        dst: Destination array, either a ``pycuda.gpuarray.GPUArray`` or a
            contiguous host array.
        src: Source array with the same dtype and shape as ``dst``.

    Raises:
        AssertionError: if dtypes or shapes differ.
        RuntimeError: if the arrays do not have 1, 2 or 3 dimensions.
    """
    assert src.dtype == dst.dtype,\
        "src ({}) and dst ({}) must have the same datatype.".format(str(src.dtype), str(dst.dtype))
    assert dst.shape == src.shape,\
        "Shapes do not match: " + str(dst.shape) + " <-> " + str(src.shape)

    itemsize = np.dtype(src.dtype).itemsize
    src_on_gpu = isinstance(src, pycuda.gpuarray.GPUArray)
    dst_on_gpu = isinstance(dst, pycuda.gpuarray.GPUArray)

    def _attach_endpoints(transfer):
        # Device arrays are addressed through gpudata, host arrays directly.
        if src_on_gpu:
            transfer.set_src_device(src.gpudata)
        else:
            transfer.set_src_host(src)
        if dst_on_gpu:
            transfer.set_dst_device(dst.gpudata)
        else:
            transfer.set_dst_host(dst)

    copy = cuda.Memcpy2D()
    _attach_endpoints(copy)

    ndim = len(src.shape)
    if ndim == 1:
        # One element per "row": the pitch is the element stride.
        copy.src_pitch = src.strides[0] if src_on_gpu else itemsize
        copy.dst_pitch = dst.strides[0] if dst_on_gpu else itemsize
        copy.width_in_bytes = itemsize
        copy.height = src.shape[0]
        copy(aligned=False)
    elif ndim == 2:
        # A contiguous host array advances by one full row per copied row,
        # regardless of how many bytes of that row are transferred.
        host_row_bytes = itemsize * src.shape[1]
        copy.src_pitch = src.strides[0] if src_on_gpu else host_row_bytes
        copy.dst_pitch = dst.strides[0] if dst_on_gpu else host_row_bytes
        copy.height = src.shape[0]

        src_cols_strided = src_on_gpu and itemsize != src.strides[1]
        dst_cols_strided = dst_on_gpu and itemsize != dst.strides[1]
        if src_cols_strided or dst_cols_strided:
            # arrays have to be copied column by column, because there are
            # two substantial pitches/strides which is not supported by cuda.
            copy.width_in_bytes = itemsize
            for col in range(src.shape[1]):
                copy.src_x_in_bytes = col * (src.strides[1] if src_on_gpu else itemsize)
                copy.dst_x_in_bytes = col * (dst.strides[1] if dst_on_gpu else itemsize)
                copy(aligned=False)
        else:
            # both arrays have a contiguous block of memory for each row
            copy.width_in_bytes = host_row_bytes
            copy(aligned=False)
    elif ndim == 3:
        src_planes_strided = src_on_gpu and src.strides[0] != src.shape[1] * src.strides[1]
        dst_planes_strided = dst_on_gpu and dst.strides[0] != dst.shape[1] * dst.strides[1]
        if src_planes_strided or dst_planes_strided:
            # arrays have to be copied plane by plane, because there is a
            # substantial pitch/stride for the z-axis which is not supported
            # by cuda.
            for plane in range(src.shape[0]):
                copy_non_contiguous(dst[plane, :, :], src[plane, :, :])
            return

        copy = cuda.Memcpy3D()
        _attach_endpoints(copy)
        host_row_bytes = itemsize * src.shape[2]
        copy.src_pitch = src.strides[1] if src_on_gpu else host_row_bytes
        copy.dst_pitch = dst.strides[1] if dst_on_gpu else host_row_bytes
        copy.width_in_bytes = host_row_bytes
        copy.height = copy.src_height = copy.dst_height = src.shape[1]
        copy.depth = src.shape[0]
        copy()
    else:
        raise RuntimeError("dimension %d is not supported." % len(src.shape))
def np3DtoCudaArray(npArray, prec, order = "C", allowSurfaceBind=False):
    ''' Create a CUDA array from a 2D or 3D host array and fill it.

    Some parameters like stride are explained in PyCUDA:
    driver.py test_driver.py gpuarray.py

    npArray: 2D or 3D host array.
    prec: 'float', 'cfloat', 'double' or 'cdouble'. 'cfloat' and 'double'
          are stored as two SIGNED_INT32 channels, 'cdouble' as four; any
          other value uses the array dtype with a single channel.
    order: "C" or "F"; selects which entry of npArray.strides is used as
           source pitch for 2D input.
    allowSurfaceBind: when true a 3D descriptor with SURFACE_LDST (and, for
                      2D input, ARRAY3D_LAYERED) flags is used and the data
                      is copied with Memcpy3D.

    Returns (cudaArray, copy) where copy is the Memcpy3D or Memcpy2D object
    that performed the upload.

    Raises LogicError for an unknown order or an unsupported number of
    dimensions.
    '''
    dimension = len(npArray.shape)
    if order not in ["C", "F"]:
        raise LogicError("order must be either F or C")

    if dimension == 2:
        stride = 0 if order == "C" else -1
        h, w = npArray.shape
        d = 1
        if allowSurfaceBind:
            descrArr = cuda.ArrayDescriptor3D()
            descrArr.width = w
            descrArr.height = h
            descrArr.depth = d
        else:
            descrArr = cuda.ArrayDescriptor()
            descrArr.width = w
            descrArr.height = h
    elif dimension == 3:
        # Both orders use the stride of axis 1 as pitch.
        stride = 1
        d, h, w = npArray.shape
        descrArr = cuda.ArrayDescriptor3D()
        descrArr.width = w
        descrArr.height = h
        descrArr.depth = d
    else:
        raise LogicError("CUDArray dimesnsion 2 and 3 supported at the moment ... ")

    if prec in ('cfloat', 'double'):
        # Hack: complex64 (re, im) and doubles (hi, lo) are read as int2.
        descrArr.format = cuda.array_format.SIGNED_INT32
        descrArr.num_channels = 2
    elif prec == 'cdouble':
        # Hack: read as int4 (re=(hi,lo), im=(hi,lo)).
        descrArr.format = cuda.array_format.SIGNED_INT32
        descrArr.num_channels = 4
    else:
        # 'float' and every unrecognised value use the array's own dtype.
        descrArr.format = cuda.dtype_to_array_format(npArray.dtype)
        descrArr.num_channels = 1

    if allowSurfaceBind:
        if dimension == 2:
            descrArr.flags |= cuda.array3d_flags.ARRAY3D_LAYERED
        descrArr.flags |= cuda.array3d_flags.SURFACE_LDST

    # Allocated exactly once; both copy paths below fill this array.
    cudaArray = cuda.Array(descrArr)
    row_bytes = npArray.strides[stride]

    if allowSurfaceBind or dimension == 3:
        copy3D = cuda.Memcpy3D()
        copy3D.set_src_host(npArray)
        copy3D.set_dst_array(cudaArray)
        copy3D.width_in_bytes = copy3D.src_pitch = row_bytes
        copy3D.src_height = copy3D.height = h
        copy3D.depth = d
        copy3D()
        return cudaArray, copy3D

    copy2D = cuda.Memcpy2D()
    copy2D.set_src_host(npArray)
    copy2D.set_dst_array(cudaArray)
    copy2D.width_in_bytes = copy2D.src_pitch = row_bytes
    copy2D.src_height = copy2D.height = h
    copy2D(aligned=True)
    return cudaArray, copy2D
def _assignshape(shape, axis, value):
    """Return ``shape`` as a tuple with the entry at index ``axis`` replaced by ``value``.

    ``axis`` is compared against the indices ``0 .. len(shape) - 1``, so a
    negative or out-of-range ``axis`` leaves every entry unchanged.
    """
    a = []
    for i in range(len(shape)):
        if i == axis:
            a.append(value)
        else:
            a.append(shape[i])
    return tuple(a)


def PitchTrans(shape, dst, dst_ld, src, src_ld, dtype, aligned=False, async = False, stream = None):
    """Configure a pitched 2D copy from ``src`` to ``dst``.

    ``src_ld`` and ``dst_ld`` are leading dimensions in elements; they are
    multiplied by the item size of ``dtype`` to obtain the pitches. Each
    endpoint is treated as device memory when it is a
    ``cuda.DeviceAllocation`` or an integer, and as host memory otherwise.

    NOTE(review): only the configuration of the copy is visible here; the
    remainder of the function (which presumably uses ``aligned``, ``async``
    and ``stream`` to launch it) lies outside this view.
    NOTE(review): ``async`` as a parameter name and the ``long`` type are
    only valid in Python 2.
    """
    # Bytes per element.
    size = np.dtype(dtype).itemsize
    trans = cuda.Memcpy2D()
    trans.src_pitch = src_ld * size
    if isinstance(src, (cuda.DeviceAllocation, int, long)):
        trans.set_src_device(src)
    else:
        trans.set_src_host(src)
    trans.dst_pitch = dst_ld * size
    if isinstance(dst, (cuda.DeviceAllocation, int, long)):
        trans.set_dst_device(dst)
    else:
        trans.set_dst_host(dst)
    # _pd is defined elsewhere; presumably the product of the trailing
    # dimensions of shape — TODO confirm.
    trans.width_in_bytes = _pd(shape) * size
    trans.height = int(shape[0])
# So, we attempt to get a contiguous view of dst. dst = _as_strided(dst, shape=(dst.size,), strides=(dst.dtype.itemsize,)) if async: drv.memcpy_dtoh_async(dst, src.gpudata, stream=stream) else: drv.memcpy_dtoh(dst, src.gpudata) else: src = _as_strided(src, shape=(src.size,), strides=(src.dtype.itemsize,)) if async: drv.memcpy_htod_async(dst.gpudata, src, stream=stream) else: drv.memcpy_htod(dst.gpudata, src) return if len(shape) == 2: copy = drv.Memcpy2D() elif len(shape) == 3: copy = drv.Memcpy3D() else: raise ValueError("more than 2 discontiguous axes not supported %s" % (tuple(sorted(axes)),)) if isinstance(src, GPUArray): copy.set_src_device(src.gpudata) else: copy.set_src_host(src) if isinstance(dst, GPUArray): copy.set_dst_device(dst.gpudata) else: copy.set_dst_host(dst)