def set(self, queue_adapter, buf, no_async=False):
    device_idx = queue_adapter._device_idx
    assert device_idx == self._device_idx
    self._context_adapter.activate_device(device_idx)

    # PyCUDA needs pointers to be passed as `numpy.number` to kernels,
    # but `memcpy` functions require Python `int`s.
    ptr = int(self._ptr) if isinstance(self._ptr, numpy.number) else self._ptr

    if isinstance(buf, numpy.ndarray):
        if no_async:
            pycuda_driver.memcpy_htod(ptr, buf)
        else:
            pycuda_driver.memcpy_htod_async(
                ptr, buf, stream=queue_adapter._pycuda_stream)
    else:
        buf_ptr = int(buf._ptr) if isinstance(buf._ptr, numpy.number) else buf._ptr
        if no_async:
            pycuda_driver.memcpy_dtod(ptr, buf_ptr, buf.size)
        else:
            pycuda_driver.memcpy_dtod_async(
                ptr, buf_ptr, buf.size, stream=queue_adapter._pycuda_stream)
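# A minimal standalone sketch of the pointer-type quirk noted above,
# assuming a current context (pycuda.autoinit); the variable names here
# are illustrative, not part of the adapter classes. int() on a
# DeviceAllocation (or on a numpy integer holding an address) yields the
# plain Python int that the memcpy_* driver functions expect.
import numpy
import pycuda.autoinit  # noqa: F401 -- creates a context on the default device
import pycuda.driver as pycuda_driver

host = numpy.arange(16, dtype=numpy.float32)
dev = pycuda_driver.mem_alloc(host.nbytes)
ptr = int(dev)  # plain Python int address
pycuda_driver.memcpy_htod(ptr, host)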
def gpuarray_memcpy(dest, src):
    '''Device memory copy with pycuda from src GPUArray to dest GPUArray.
    '''
    # dest[:] = src
    # memcpy_atoa(dest, 0, src, 0, len(src))
    memcpy_dtod_async(dest.gpudata, src.gpudata, src.nbytes)
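# A hedged usage sketch for gpuarray_memcpy above; the copy is enqueued
# asynchronously on the default stream, so synchronize before reading
# the destination back.
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray
from pycuda.driver import Context

src = gpuarray.to_gpu(np.arange(8, dtype=np.float32))
dest = gpuarray.empty_like(src)
gpuarray_memcpy(dest, src)
Context.synchronize()
assert np.array_equal(dest.get(), src.get())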
def copy(self):
    ret = GPULongint()
    ret.hide_digit = self.hide_digit
    ret.intsize_level = self.intsize_level
    ret.digitN = 1 << self.intsize_level
    dmy = drv.mem_alloc(4 * self.digitN)
    drv.memcpy_dtod_async(dmy, self.number, 4 * self.digitN)
    ret.number = dmy
    return ret
def _copy_array_buffer(self, dest, src, nbytes, src_offset=0, dest_offset=0):
    cuda.memcpy_dtod_async(int(dest.gpudata) + dest_offset,
                           int(src.gpudata) + src_offset,
                           nbytes, stream=self._queue)
def copyBuffer(self, buf, dest=None):
    if dest is None:
        buf_copy = self.allocate(buf.shape, buf.dtype)
    else:
        buf_copy = dest
    cuda.memcpy_dtod_async(buf_copy.gpudata, buf.gpudata, buf.nbytes,
                           stream=self.stream)
    if dest is None:
        return buf_copy
def copy_async(array, out=None, out_device=None, stream=None):
    """Copies a GPUArray object using the given stream.

    This function can copy the device array to the destination array on
    another device.

    Args:
        array (~pycuda.gpuarray.GPUArray): Array to be copied.
        out (~pycuda.gpuarray.GPUArray): Destination array.
            If it is not ``None``, then the ``out_device`` argument is
            ignored.
        out_device: Destination device specifier. The actual device object
            is obtained by passing this value to :func:`get_device`.
        stream (~pycuda.driver.Stream): CUDA stream.

    Returns:
        ~pycuda.gpuarray.GPUArray: Copied array.

        If ``out`` is not specified, then the array is allocated on the
        device specified by the ``out_device`` argument.

    .. warning::

       Currently, ``copy_async`` over different devices raises an
       exception, since PyCUDA drops the definition of
       :func:`pycuda.driver.memcpy_peer_async`.

    """
    in_device = get_device(array)
    if out is None:
        if out_device is None:
            out_device = in_device
        else:
            out_device = get_device(out_device)
        with using_device(out_device):
            out = empty_like(array)
    else:
        out_device = get_device(out)

    with using_device(in_device):
        if in_device == out_device:
            drv.memcpy_dtod_async(out.ptr, array.ptr, out.nbytes,
                                  stream=stream)
        else:
            drv.memcpy_peer_async(out.ptr, array.ptr, out.nbytes,
                                  out_device, in_device, stream=stream)
    return out
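# A hedged same-device usage sketch for copy_async above; the
# cross-device branch additionally relies on the get_device/using_device
# helpers defined elsewhere in this module.
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray

stream = drv.Stream()
src = gpuarray.to_gpu(np.ones(1024, dtype=np.float32))
dst = copy_async(src, stream=stream)  # allocates `out` on the source device
stream.synchronize()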
def add_new_frame(self, x):
    x = x.flatten()
    assert len(x) == self._featuredim
    x_gpu = gpuarray.to_gpu_async(x)

    BLOCK_SIZE = (256, 1, 1)
    nblocks = int(np.ceil(float(self._featuredim) / BLOCK_SIZE[0]))
    GRID_SIZE = (nblocks, self._framecount, 1)

    cudabuffer.cyclebuffer(self._Y_gpu_scratch, x_gpu, self._Y_gpu,
                           np.int32(self._featuredim),
                           np.int32(self._framecount),
                           block=BLOCK_SIZE, grid=GRID_SIZE)

    # Copy self._Y_gpu into self._Y_gpu_scratch
    cuda.memcpy_dtod_async(self._Y_gpu_scratch.gpudata, self._Y_gpu.gpudata,
                           self._Y_gpu.nbytes)
def _assign(self, value):
    stream = self.backend.stream
    if isinstance(value, (int, float)):
        # if we have a contiguous array, then use the speedy driver kernel
        if self.is_contiguous:
            value = self.dtype.type(value)
            if self.dtype.itemsize == 1:
                drv.memset_d8_async(self.gpudata,
                                    unpack_from('B', value)[0],
                                    self.size, stream)
            elif self.dtype.itemsize == 2:
                drv.memset_d16_async(self.gpudata,
                                     unpack_from('H', value)[0],
                                     self.size, stream)
            else:
                drv.memset_d32_async(self.gpudata,
                                     unpack_from('I', value)[0],
                                     self.size, stream)
        # otherwise use our copy kernel
        else:
            OpTreeNode.build("assign", self, value)
    elif isinstance(value, GPUTensor):
        # TODO: add an is_binary_compat like function
        if self.is_contiguous and value.is_contiguous and self.dtype == value.dtype:
            drv.memcpy_dtod_async(self.gpudata, value.gpudata,
                                  self.nbytes, stream)
        else:
            OpTreeNode.build("assign", self, value)
    # collapse and execute an op tree as a kernel
    elif isinstance(value, OpTreeNode):
        OpTreeNode.build("assign", self, value)
    # assign to numpy array (same as set())
    elif isinstance(value, np.ndarray):
        self.set(value, device=None)
    else:
        raise TypeError("Invalid type for assignment: %s" % type(value))
    return self
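# A hedged standalone sketch of the memset fast path above: the fill
# value is reinterpreted as a raw bit pattern with struct.unpack_from,
# so the same width-specific driver memset works for float dtypes too.
import numpy as np
from struct import unpack_from
import pycuda.autoinit  # noqa: F401
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray

stream = drv.Stream()
arr = gpuarray.empty(256, dtype=np.float32)
value = np.float32(1.5)
# fill all 256 float32 elements with the bit pattern of 1.5
drv.memset_d32_async(arr.gpudata, unpack_from('I', value)[0], arr.size, stream)
stream.synchronize()
assert arr.get()[0] == 1.5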
def copyBuffer(self, buf, dest=None, src_offset=0, dest_offset=0, length=None):
    elem_size = buf.dtype.itemsize
    size = buf.nbytes if length is None else elem_size * length
    src_offset *= elem_size
    dest_offset *= elem_size
    if dest is None:
        ddest = self.allocate(buf.shape, buf.dtype)
    else:
        ddest = dest
    cuda.memcpy_dtod_async(int(ddest.gpudata) + dest_offset,
                           int(buf.gpudata) + src_offset,
                           size, stream=self.stream)
    if dest is None:
        return ddest
def copyBuffer(self, gpu_stream, buffer):
    """
    Copies the given device buffer into the already allocated memory.
    """
    if not self.holds_data:
        raise RuntimeError('The buffer has been freed before copying buffer')

    if not buffer.holds_data:
        raise RuntimeError('The provided buffer is either not allocated, '
                           'or has been freed before copying buffer')

    # Make sure that the input is of correct size:
    assert buffer.nx_halo == self.nx_halo, \
        str(buffer.nx_halo) + " vs " + str(self.nx_halo)
    assert buffer.ny_halo == self.ny_halo, \
        str(buffer.ny_halo) + " vs " + str(self.ny_halo)
    assert buffer.bytes_per_float == self.bytes_per_float, \
        "Provided buffer itemsize is " + str(buffer.bytes_per_float) + \
        ", but should have been " + str(self.bytes_per_float)

    # Okay, everything is fine - issue device-to-device copy:
    total_num_bytes = self.bytes_per_float * self.nx_halo * self.ny_halo
    cuda.memcpy_dtod_async(self.data.ptr, buffer.data.ptr, total_num_bytes,
                           stream=gpu_stream)
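# Hedged usage note for copyBuffer above, assuming two buffers of this
# class that were allocated with identical nx_halo/ny_halo (a common
# front/back double-buffering pattern):
#
#     copy_stream = cuda.Stream()
#     backbuffer.copyBuffer(copy_stream, frontbuffer)
#     copy_stream.synchronize()
#
# The byte count is derived from the halo-inclusive extent, so the
# asserts above reject mismatched allocations before the copy is issued.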
def solve(self, wt_n, y_nd, bend_coef, f_res):
    if y_nd.shape[0] != self.n or y_nd.shape[1] != self.d:
        raise RuntimeError("The dimensions of y_nd don't match the dimensions of x_nd")
    if not y_nd.flags.c_contiguous:
        raise RuntimeError("Expected y_nd to be c-contiguous but it isn't")
    self.sqrtWQN_gpu.set_async(np.sqrt(wt_n)[:, None] * self.QN)
    geam(self.NKN_gpu, self.NRN_gpu, self.lhs_gpu, alpha=bend_coef, beta=1)
    gemm(self.sqrtWQN_gpu, self.sqrtWQN_gpu, self.lhs_gpu,
         transa='T', alpha=1, beta=1)

    drv.memcpy_dtod_async(self.rhs_gpu.gpudata, self.NR_gpu.gpudata,
                          self.rhs_gpu.nbytes)
    self.y_dnW_gpu.set_async(y_nd.T * wt_n)  # use transpose so that it is f_contiguous
    gemm(self.QN_gpu, self.y_dnW_gpu, self.rhs_gpu,
         transa='T', transb='T', alpha=1, beta=1)

    if lfd.registration._has_cula:
        culinalg.cho_solve(self.lhs_gpu, self.rhs_gpu)
        z = self.rhs_gpu.get()
        culinalg.dot(self.N_gpu, self.rhs_gpu, out=self.theta_gpu)
        theta = self.theta_gpu.get()
    else:
        # if CULA is not installed, perform the last two computations on the CPU
        z = np.linalg.solve(self.lhs_gpu.get(), self.rhs_gpu.get())
        theta = self.N.dot(z)

    f_res.update(self.x_nd, y_nd, bend_coef, self.rot_coef, wt_n, theta,
                 N=self.N, z=z)
def replica_to_fragment(self, reptsr, fragtsr):
    '''
    Scatters the replica into the fragments (this just discards, so no
    p2p communication is necessary).
    '''
    numrep = self.num_dev
    fragsz = fragtsr.size
    dsz = fragtsr.dtype.itemsize
    assert reptsr.size == fragsz * numrep
    strms = self.strms
    starts = [i * fragsz for i in range(numrep)]
    for dbuf, sbuf, ctx, offset, strm in zip(fragtsr.tlist, reptsr.tlist,
                                             self.ctxs, starts, strms):
        ctx.push()
        drv.memcpy_dtod_async(dbuf.ptr, sbuf.ptr + offset * dsz,
                              fragsz * dsz, strm)
        ctx.pop()
    self.synchronize()
def solve(self, wt_n, y_nd, bend_coef, f_res):
    if y_nd.shape[0] != self.n or y_nd.shape[1] != self.d:
        raise RuntimeError("The dimensions of y_nd don't match the dimensions of x_nd")
    if not y_nd.flags.c_contiguous:
        raise RuntimeError("Expected y_nd to be c-contiguous but it isn't")
    self.sqrtWQN_gpu.set_async(np.sqrt(wt_n)[:, None] * self.QN)
    geam(self.NKN_gpu, self.NRN_gpu, self.lhs_gpu, alpha=bend_coef, beta=1)
    gemm(self.sqrtWQN_gpu, self.sqrtWQN_gpu, self.lhs_gpu,
         transa='T', alpha=1, beta=1)

    drv.memcpy_dtod_async(self.rhs_gpu.gpudata, self.NR_gpu.gpudata,
                          self.rhs_gpu.nbytes)
    self.y_dnW_gpu.set_async(y_nd.T * wt_n)  # use transpose so that it is f_contiguous
    gemm(self.QN_gpu, self.y_dnW_gpu, self.rhs_gpu,
         transa='T', transb='T', alpha=1, beta=1)

    if lfd.registration._has_cula:
        culinalg.cho_solve(self.lhs_gpu, self.rhs_gpu)
        culinalg.dot(self.N_gpu, self.rhs_gpu, out=self.theta_gpu)
        theta = self.theta_gpu.get()
    else:
        # if CULA is not installed, perform the last two computations on the CPU
        z = np.linalg.solve(self.lhs_gpu.get(), self.rhs_gpu.get())
        theta = self.N.dot(z)

    f_res.set_ThinPlateSpline(self.x_nd, y_nd, bend_coef, self.rot_coef, wt_n,
                              theta=theta)
def set_constant_buffer(self, queue_adapter: CuQueueAdapter, name: str,
                        arr: Union[CuBufferAdapter, numpy.ndarray]):
    """
    Uploads a constant array ``arr`` corresponding to the symbol ``name``
    to the context.
    """
    self._context_adapter.activate_device(self._device_idx)
    symbol, size = self._pycuda_program.get_global(name)

    pycuda_stream = queue_adapter._pycuda_stream

    if isinstance(arr, CuBufferAdapter):
        transfer_size = arr.size
    elif isinstance(arr, numpy.ndarray):
        transfer_size = prod(arr.shape) * arr.dtype.itemsize
    else:  # pragma: no cover
        # Shouldn't reach this path because the type is already checked by the caller.
        # Nevertheless leaving it here as a sanity check.
        raise TypeError(f"Unsupported array type: {type(arr)}")

    if transfer_size != size:
        raise ValueError(
            f"Incorrect size of the constant buffer; "
            f"expected {size} bytes, got {transfer_size}")

    if isinstance(arr, CuBufferAdapter):
        pycuda_driver.memcpy_dtod_async(symbol, arr.kernel_arg, arr.size,
                                        stream=pycuda_stream)
    else:
        # This serves two purposes:
        # 1. Gives us a pagelocked array, as PyCUDA requires
        # 2. Makes the array contiguous
        # Constant arrays are usually quite small, so it won't affect the performance.
        buf = pycuda_driver.pagelocked_empty(arr.shape, arr.dtype)
        numpy.copyto(buf, arr)
        pycuda_driver.memcpy_htod_async(symbol, buf, stream=pycuda_stream)
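# A hedged standalone sketch of the PyCUDA pattern this adapter wraps:
# get_global() returns the device address and size of a __constant__
# symbol, and memcpy_htod uploads to it. Module and symbol names here
# are illustrative.
import numpy
import pycuda.autoinit  # noqa: F401
import pycuda.driver as pycuda_driver
from pycuda.compiler import SourceModule

mod = SourceModule("""
__constant__ float coeffs[4];
__global__ void apply(float *out) { out[threadIdx.x] = coeffs[threadIdx.x]; }
""")
symbol, size = mod.get_global("coeffs")
host = numpy.asarray([1.0, 2.0, 3.0, 4.0], dtype=numpy.float32)
assert host.nbytes == size  # same size check as in the adapter above
pycuda_driver.memcpy_htod(symbol, host)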
def run(self, queue):
    # `dst` and `src` are presumably captured from the enclosing scope
    # when this step is constructed.
    cuda.memcpy_dtod_async(dst.data, src.data, dst.nbytes,
                           stream=queue.cuda_stream_comp)
def run_step(self, iter_parameters, iter_limit=1000, debug=False, time=False):
    self.step_init(iter_parameters, debug)
    goal_reached = False
    iteration = 0
    while True:
        start_iter = timer()

        ########## create wave front ###############
        start_wave_f = timer()
        wavefront(self.dev_Gindicator, self.dev_open, self.dev_cost,
                  self.dev_threshold, self.dev_n,
                  block=(self.threadsPerBlock, 1, 1),
                  grid=(self.nBlocksPerGrid, 1))
        self.dev_threshold += 2 * self.dev_radius
        goal_reached = self.dev_Gindicator[self.goal].get() == 1
        end_wave_f = timer()

        start_wave_c = timer()
        dev_Gscan = cuda.to_gpu(self.dev_Gindicator)
        exclusiveScan(dev_Gscan)
        dev_gSize = dev_Gscan[-1] + self.dev_Gindicator[-1]
        gSize = int(dev_gSize.get())

        if iteration >= iter_limit:
            print('### iteration limit ###', iteration)
            return self.route
        elif goal_reached:
            print('### goal reached ### ', iteration)
            self.parent = self.dev_parent.get()
            self.route = []
            self.get_path()
            return self.route
        elif gSize == 0:
            print('### threshold skip ', iteration)
            continue

        dev_G = cuda.GPUArray([gSize], np.int32)
        # dev_G = cuda.zeros(gSize, dtype=np.int32)
        compact(dev_G, dev_Gscan, self.dev_Gindicator, self.dev_waypoints,
                self.dev_n,
                block=(self.threadsPerBlock, 1, 1),
                grid=(self.nBlocksPerGrid, 1))
        end_wave_c = timer()

        ######### scan and compact open set to connect neighbors ###############
        start_open = timer()
        dev_yscan = cuda.to_gpu(self.dev_open)
        exclusiveScan(dev_yscan)
        dev_ySize = dev_yscan[-1] + self.dev_open[-1]
        ySize = int(dev_ySize.get())

        # dev_y = cuda.zeros(ySize, dtype=np.int32)
        dev_y = cuda.GPUArray([ySize], np.int32)
        compact(dev_y, dev_yscan, self.dev_open, self.dev_waypoints,
                self.dev_n,
                block=(self.threadsPerBlock, 1, 1),
                grid=(self.nBlocksPerGrid, 1))
        end_open = timer()

        ########## creating neighbors of wave front to connect open ###############
        # dev_xindicator = cuda.zeros_like(self.dev_open, dtype=np.int32, stream=self.stream1)
        # self.dev_xindicator.fill(self.zero_val, stream=self.stream1)
        start_neighbor = timer()
        drv.memcpy_dtod_async(self.dev_xindicator.gpudata,
                              self.dev_xindicator_zeros.gpudata,
                              self.dev_xindicator_zeros.nbytes)
        gBlocksPerGrid = int((gSize + self.threadsPerBlock - 1) / self.threadsPerBlock)
        neighborIndicator(self.dev_xindicator, dev_G, self.dev_unexplored,
                          self.dev_neighbors, self.dev_num_neighbors,
                          self.neighbors_index, dev_gSize,
                          block=(self.threadsPerBlock, 1, 1),
                          grid=(gBlocksPerGrid, 1))
        end_neighbor = timer()

        start_neighbor_c = timer()
        dev_xscan = cuda.to_gpu(self.dev_xindicator)
        exclusiveScan(dev_xscan)
        # dev_xscan = cuda.to_gpu_async(self.dev_xindicator, stream=self.stream1)
        # dev_xSize = cuda.sum(self.dev_xindicator, stream=self.stream1)
        # exclusiveScan(dev_xscan, stream=self.stream1)
        dev_xSize = dev_xscan[-1] + self.dev_xindicator[-1]
        xSize = int(dev_xSize.get())
        if xSize == 0:
            print('### x skip')
            continue

        dev_x = cuda.GPUArray([xSize], np.int32)
        # dev_x = cuda.zeros(xSize, dtype=np.int32)
        compact(dev_x, dev_xscan, self.dev_xindicator, self.dev_waypoints,
                self.dev_n,
                block=(self.threadsPerBlock, 1, 1),
                grid=(self.nBlocksPerGrid, 1))
        end_neighbor_c = timer()

        ######### connect neighbors ####################
        # launch planning
        start_connect = timer()
        xBlocksPerGrid = int((xSize + self.threadsPerBlock - 1) / self.threadsPerBlock)
        dubinConnection(self.dev_cost, self.dev_parent, dev_x, dev_y,
                        self.dev_states, self.dev_open, self.dev_unexplored,
                        dev_xSize, dev_ySize, self.dev_obstacles,
                        self.dev_num_obs, self.dev_radius,
                        block=(self.threadsPerBlock, 1, 1),
                        grid=(xBlocksPerGrid, 1))
        end_connect = timer()

        end_iter = timer()
        if debug:
            print('dev parents:', self.dev_parent)
            print('dev cost: ', self.dev_cost)
            print('dev unexplored: ', self.dev_unexplored)
            print('dev open: ', self.dev_open)
            print('dev threshold: ', self.dev_threshold)
            print('goal reached: ', goal_reached)
            print('y size: ', ySize, 'y: ', dev_y)
            print('G size: ', gSize, 'G: ', dev_G)
            print('x size: ', dev_xSize, 'x: ', dev_x)
            print('wave front timer: ', end_wave_f - start_wave_f)
            print('wave compact timer: ', end_wave_c - start_wave_c)
            print('open set timer: ', end_open - start_open)
            print('neighbor timer: ', end_neighbor - start_neighbor)
            print('neighbor compact timer: ', end_neighbor_c - start_neighbor_c)
            print('connection timer: ', end_connect - start_connect)
            iteration_time = end_iter - start_iter
            print(f'######### iteration: {iteration} iteration time: {iteration_time}')

        if time and iteration > 0:
            self.time_data["wavefront"].append(end_wave_f - start_wave_f)
            self.time_data["wavefront_compact"].append(end_wave_c - start_wave_c)
            self.time_data["open_compact"].append(end_open - start_open)
            self.time_data["neighbors"].append(end_neighbor - start_neighbor)
            self.time_data["neighbors_compact"].append(end_neighbor_c - start_neighbor_c)
            self.time_data["connection"].append(end_connect - start_connect)
            self.time_data["elapsed"].append(end_iter - start_iter)
            self.time_data["iteration"].append(iteration)

        iteration += 1
def infer_greedy(self):
    (infer_ndtype, dtype_esize) = get_dtype_info(self.args.input_dtype)
    batch_idx = 0

    # Static knobs
    ALWAYS_ADVANCE_TIME = True  # hack to always consume the time pointer regardless of symbol outcome

    # Iterate over batches
    for image_idx in range(0, self.num_samples, self.batch_size):
        # Actual batch size might be smaller than max batch size
        actual_batch_size = self.batch_size \
            if image_idx + self.batch_size <= self.num_samples \
            else self.num_samples - image_idx
        start_time = time.time()

        # output and runtime data structures
        enc_ptr = [0 for tdix in range(actual_batch_size)]  # holds encoder pointer per batch element (Xt)
        out_sym = [list() for tdix in range(actual_batch_size)]  # holds output symbol translation per batch element (Yu)

        # data initialization for the batch ---------- #
        # dec_host_inputs : host data for the decoder transfers
        dec_host_inputs = [
            np.ascontiguousarray(
                np.zeros((actual_batch_size, 1), dtype=np.int32, order='C')),  # input label
            np.ascontiguousarray(
                np.zeros((actual_batch_size, 2 * self.hyperP.decoder_hidden_size),
                         dtype=infer_ndtype, order='C')),  # hidden: layers * hidden
            np.ascontiguousarray(
                np.zeros((actual_batch_size, 2 * self.hyperP.decoder_hidden_size),
                         dtype=infer_ndtype, order='C')),  # cell: layers * hidden
        ]

        # host_outputs : host data for outputs from decoder and joint/beam_search
        host_outputs = [
            np.ascontiguousarray(
                np.zeros((actual_batch_size, 1 * self.hyperP.labels_size),
                         dtype=infer_ndtype, order='C')),  # input: 1 * input
            np.ascontiguousarray(
                np.zeros((actual_batch_size, 2 * self.hyperP.decoder_hidden_size),
                         dtype=infer_ndtype, order='C')),  # hidden: layers * hidden
            np.ascontiguousarray(
                np.zeros((actual_batch_size, 2 * self.hyperP.decoder_hidden_size),
                         dtype=infer_ndtype, order='C')),  # cell: layers * hidden
        ]

        # run the encoder ---------- #
        # outputs[0] - (BS, max_seq_length // 2, enc_hidden_size=1024)
        enc_outputs = self.encoder([self.batch_inputs[:actual_batch_size]],
                                   actual_batch_size)
        self.encoder.stream.synchronize()
        enc_outputs[0] = enc_outputs[0].reshape(
            (actual_batch_size, self.hyperP.max_seq_length // 2,
             self.hyperP.encoder_hidden_size))
        # logging.info(" greedy::enc_output shape {:} type {:}".format(enc_outputs[0].shape, enc_outputs[0].dtype))
        # logging.info(" greedy::enc_output data\n{:}".format(enc_outputs[0]))

        # run the decoder-joint greedy loop ---------- #
        for seq_id in range(self.hyperP.max_seq_length // 2):
            # Gather the encoder frame each batch element currently points to
            # enc_input_seq = np.ascontiguousarray(enc_outputs[0][:, seq_id, :])
            # enc_input_seq = enc_input_seq.reshape(actual_batch_size, 1, self.hyperP.encoder_hidden_size)
            enc_input_seq = np.ascontiguousarray(
                np.zeros((actual_batch_size, 1, self.hyperP.encoder_hidden_size),
                         dtype=infer_ndtype, order='C'))
            for bs_index in range(actual_batch_size):
                enc_input_seq[bs_index, :, :] = enc_outputs[0][bs_index, enc_ptr[bs_index], :]
            # logging.info(" greedy::enc_output seq[{}] shape {:} data\n{:}".format(seq_id, enc_input_seq.shape, enc_input_seq))

            # run decoder/predictor (transfer data first)
            [cuda.memcpy_htod_async(d_input, inp, self.decoder.stream)
             for (d_input, inp) in zip(self.decoder.d_inputs, dec_host_inputs)]
            # self.debug_input_info(self.d_inputs, inputs, "_run_decoder")
            dec_dev_outputs = self.decoder.decoder_step(actual_batch_size)

            # transfer decoding state to host
            cuda.memcpy_dtoh_async(host_outputs[1], dec_dev_outputs[1], self.decoder.stream)
            cuda.memcpy_dtoh_async(host_outputs[2], dec_dev_outputs[2], self.decoder.stream)

            # transfer data for joint
            # logging.info("tensor: shape = {:} -- {} :\n{}".format(enc_input_seq.shape, enc_input_seq.dtype, enc_input_seq))
            cuda.memcpy_htod_async(self.joint.d_inputs[0], enc_input_seq,
                                   self.joint.stream)  # encoder port
            self.decoder.stream.synchronize()
            cuda.memcpy_dtod_async(
                self.joint.d_inputs[1], dec_dev_outputs[0],
                actual_batch_size * self.hyperP.decoder_hidden_size * dtype_esize,
                self.joint.stream)  # predictor

            # run joint
            self.joint.joint_step(actual_batch_size)

            # Transfer result to CPU for greedy decoder
            cuda.memcpy_dtoh_async(host_outputs[0], self.joint.outputs[0].device,
                                   self.joint.stream)
            self.joint.stream.synchronize()

            # greedy decoder
            winner_symbol = np.argmax(host_outputs[0], axis=1)
            # logging.info("Joint outputs:\n{} ".format(host_outputs[0]))
            # logging.info("Winner_symbol:\n{} ".format(winner_symbol))
            for bs_index in range(actual_batch_size):
                new_symbol = winner_symbol[bs_index]
                if new_symbol != self.hyperP.labels_size - 1:
                    # symbol is not blank
                    dec_host_inputs[0][bs_index, 0] = winner_symbol[bs_index]  # update predicted symbol
                    dec_host_inputs[1][bs_index, :] = host_outputs[1][bs_index, :]  # update hidden state
                    dec_host_inputs[2][bs_index, :] = host_outputs[2][bs_index, :]  # update cell state
                    out_sym[bs_index].append(winner_symbol[bs_index])
                    if ALWAYS_ADVANCE_TIME:
                        enc_ptr[bs_index] += 1
                    # logging.info("Adding symbol {} in bs_id {} ".format(winner_symbol[bs_index], bs_index))
                else:
                    # advance the audio time pointer if the symbol is blank
                    enc_ptr[bs_index] += 1

        # Loop epilogue
        logging.info("Batch {:d} (Size {:}) >> Inference time: {:f}".format(
            batch_idx, actual_batch_size, time.time() - start_time))
        batch_idx += 1
        # logging.info(" output sequences:\n{:}".format(out_sym))

    # Function epilogue
    pass
def reset_tps_params(self):
    """
    Sets the tps params to be identity.
    """
    for p in self.tps_params:
        drv.memcpy_dtod_async(p.gpudata, self.default_tps_params.gpudata,
                              p.nbytes)
def _memcpy_dtod(self, dest, src, nbytes, src_offset=0, dest_offset=0):
    cuda.memcpy_dtod_async(int(dest) + dest_offset, int(src) + src_offset,
                           nbytes, stream=self._queue)
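# A hedged standalone sketch of the offset arithmetic used by the helper
# above: byte offsets are added to plain-int device pointers before the
# async copy (here, copying elements 4..11 of src into dst).
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray

stream = cuda.Stream()
src = gpuarray.to_gpu(np.arange(16, dtype=np.float32))
dst = gpuarray.zeros(8, dtype=np.float32)
itemsize = src.dtype.itemsize
cuda.memcpy_dtod_async(int(dst.gpudata), int(src.gpudata) + 4 * itemsize,
                       8 * itemsize, stream=stream)
stream.synchronize()
assert np.array_equal(dst.get(), np.arange(4, 12, dtype=np.float32))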
def execute(self):
    ndevs = len(self.op.device_ids)
    size = self.input_tensor.tensor.size
    dtype = self.input_tensor.dtype
    segment_size = int(size / ndevs)
    if (segment_size * ndevs) < size:
        segment_size += 1

    # Align segment size to a multiple of 4 elements (16 bytes for 4-byte dtypes)
    if segment_size & 0x03:
        segment_size = (segment_size & (~0x03)) + 4

    # Determine GPU active mask based on segment size
    num_active = int(size / segment_size)
    if (segment_size * num_active) < size:
        num_active += 1

    # Copy tensor to output buffer
    drv.memcpy_dtod(self.output_buff.gpudata,
                    self.input_tensor.tensor.gpudata,
                    size * dtype.itemsize)

    # Send each GPU its assigned segment
    device_idx = self.op.device_ids.index(self.device_id)
    for peer_idx, peer_id in enumerate(self.op.device_ids):
        if peer_id == self.device_id:
            continue

        # Only send if peer is active
        if peer_idx >= num_active:
            continue

        # Compute size and offset of this peer's segment
        peer_segment_size = segment_size
        peer_segment_offset = peer_idx * segment_size
        if device_idx > peer_idx:
            peer_scratch_offset = segment_size * (device_idx - 1)
        else:
            peer_scratch_offset = segment_size * device_idx

        if (peer_idx + 1) == num_active:
            peer_segment_size = size - peer_segment_offset

        # Enqueue peer to peer memcpy
        src = int(self.output_buff_dict.get(self.device_id)) + \
            peer_segment_offset * dtype.itemsize
        scratch = int(self.scratch_buff_dict.get(peer_id)) + \
            peer_scratch_offset * dtype.itemsize
        drv.memcpy_dtod_async(scratch, src,
                              peer_segment_size * dtype.itemsize,
                              self.stream)

    # Record event in stream
    self.event.record(self.stream)

    # Sync with other devices
    self.process_sync()

    # Wait for other GPUs' events
    for peer_id in self.op.device_ids:
        if peer_id == self.device_id:
            continue
        self.stream.wait_for_event(self.event_buff_dict[peer_id])

    segment_offset = device_idx * segment_size
    this_segment_size = segment_size
    if (device_idx + 1) == num_active:
        this_segment_size = size - segment_offset

    src = int(self.output_buff_dict.get(self.device_id)) + \
        segment_offset * dtype.itemsize

    # Sum received peer segments
    block_size = 1024
    grid_size = int(this_segment_size / (block_size * ITEMS_PER_THREAD))
    if (grid_size * block_size * ITEMS_PER_THREAD) < this_segment_size:
        grid_size += 1

    # Perform reduction operation
    if device_idx < num_active:
        num_arrays = ndevs - 1
        params = [src, self.scratch_buff_dict[self.device_id],
                  this_segment_size, num_arrays, segment_size]
        grid_dim = (grid_size, 1, 1)
        block_dim = (block_size, 1, 1)
        kernel = _reduction_kernel(self.op.reduce_func)
        kernel.prepared_async_call(grid_dim, block_dim, self.stream, *params)

        # Send other GPUs this GPU's assigned segment
        for peer_id in self.op.device_ids:
            if peer_id == self.device_id:
                continue

            # Enqueue peer to peer memcpy
            dst = int(self.output_buff_dict.get(peer_id)) + \
                segment_offset * dtype.itemsize
            drv.memcpy_dtod_async(dst, src,
                                  this_segment_size * dtype.itemsize,
                                  self.stream)

    # Record event in stream
    self.event.record(self.stream)

    # Sync with other devices
    self.process_sync()

    # Wait for other GPUs' events
    for peer_id in self.op.device_ids:
        if peer_id == self.device_id:
            continue
        self.event_buff_dict[peer_id].synchronize()
    self.event.synchronize()

    drv.memcpy_dtod_async(self.tensor.tensor.gpudata,
                          self.output_buff.gpudata,
                          size * dtype.itemsize, self.stream)

    # This sync is only needed if we call this kernel 'synchronously'.
    # If the assumption is that another kernel is called right after
    # and uses the same streams as us, then we can remove this and
    # rely on the next kernel being put into our stream.

    # Record event in stream
    self.event.record(self.stream)

    # Sync with other devices
    self.process_sync()

    # Wait for other GPUs' events
    for peer_id in self.op.device_ids:
        if peer_id == self.device_id:
            continue
        self.event_buff_dict[peer_id].synchronize()
    self.event.synchronize()
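# Worked example of the segmentation above (illustrative numbers):
# size = 10 elements across ndevs = 3 devices gives
# segment_size = ceil(10 / 3) = 4, which is already a multiple of 4
# elements, so no alignment padding is added; num_active =
# ceil(10 / 4) = 3, and the last active device handles the short tail
# segment of 10 - 2 * 4 = 2 elements.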
            # `async_` (not `async`) because `async` is a reserved word
            # in Python 3.7+.
            if dst_strides[i] < dst_strides[i-1]:
                raise ValueError("src and dst must have same order")
            if (src_strides[i-1] * shape[i-1] == src_strides[i]
                    and dst_strides[i-1] * shape[i-1] == dst_strides[i]):
                shape[i-1:i+1] = [shape[i-1] * shape[i]]
                del src_strides[i]
                del dst_strides[i]
                del axes[i]
            else:
                i += 1

        if len(shape) <= 1:
            if isinstance(src, GPUArray):
                if isinstance(dst, GPUArray):
                    if async_:
                        drv.memcpy_dtod_async(dst.gpudata, src.gpudata,
                                              src.nbytes, stream=stream)
                    else:
                        drv.memcpy_dtod(dst.gpudata, src.gpudata, src.nbytes)
                else:
                    # The arrays might be contiguous in the sense of
                    # having no gaps, but the axes could be transposed
                    # so that the order is neither Fortran or C.
                    # So, we attempt to get a contiguous view of dst.
                    dst = _as_strided(dst, shape=(dst.size,),
                                      strides=(dst.dtype.itemsize,))
                    if async_:
                        drv.memcpy_dtoh_async(dst, src.gpudata, stream=stream)
                    else:
                        drv.memcpy_dtoh(dst, src.gpudata)
            else:
                src = _as_strided(src, shape=(src.size,),
                                  strides=(src.dtype.itemsize,))
                if async_:
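# Worked example of the axis-collapsing step above (axes sorted by
# increasing stride): a contiguous float32 array of shape [8, 4] with
# byte strides [4, 32] satisfies strides[0] * shape[0] == 4 * 8 == 32
# == strides[1] on both src and dst, so the two axes merge into
# shape [32] with stride [4]. Once len(shape) <= 1, the whole transfer
# reduces to the single flat memcpy_dtod(_async) above instead of a
# strided 2D/3D copy.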
def set_tps_params(self, vals):
    for d, s in zip(self.tps_params, vals):
        drv.memcpy_dtod_async(d.gpudata, s.gpudata, d.nbytes)
def initialize_solver(self, b, wt_n):
    drv.memcpy_dtod_async(self.NHN_gpu.gpudata, self.NON_gpu[b].gpudata,
                          self.NHN_gpu.nbytes)
    self.WQN_gpu.set_async(wt_n[:, None] * self.QN)
def _run_decoder(self, inputs, seq_id, batch_size=1):
    (infer_ndtype, dtype_esize) = get_dtype_info(self.args.input_dtype)
    # self.debug_input_info(self.d_inputs, inputs, "_run_decoder")

    # Transfer input data to the GPU
    if seq_id == 0:
        # iteration 0 needs the initial state
        hidden_tensor = np.ascontiguousarray(
            np.zeros((batch_size, 2 * self.hyperP.decoder_hidden_size),
                     dtype=infer_ndtype, order='C'))  # layers * hidden
        cell_tensor = np.ascontiguousarray(
            np.zeros((batch_size, 2 * self.hyperP.decoder_hidden_size),
                     dtype=infer_ndtype, order='C'))  # layers * hidden
        [cuda.memcpy_htod_async(d_input, inp, self.stream)
         for (d_input, inp) in zip(self.d_inputs,
                                   [inputs[0], hidden_tensor, cell_tensor])]
    else:
        # the remaining iterations recur the state on-device
        cuda.memcpy_htod_async(self.d_inputs[0], inputs[0], self.stream)

    # Run inference.
    if self.engine.has_implicit_batch_dimension:
        self.context.execute_async(batch_size=batch_size,
                                   bindings=self.bindings,
                                   stream_handle=self.stream.handle)
    else:
        for inp_idx in range(3):
            input_shape = self.context.get_binding_shape(inp_idx)
            input_shape[0] = batch_size
            self.context.set_binding_shape(inp_idx, input_shape)
        self.context.execute_async_v2(bindings=self.bindings,
                                      stream_handle=self.stream.handle)

    if self.args.debug_mode:
        # Transfer all outputs back from the GPU.
        [cuda.memcpy_dtoh_async(out.host, out.device, self.stream)
         for out in self.outputs]
        # Synchronize the stream
        self.stream.synchronize()
        logging.info("_run_decoder: out[0] = {}".format(self.outputs[0].host))
        self.debug_input_info(self.d_inputs,
                              [out.host for out in self.outputs],
                              "_run_decoder")

    # [cuda.memcpy_dtod_async(d_input, out.device, self.stream) for (d_input, out) in zip(self.d_inputs, self.outputs)]
    hidden_size = self.hyperP.decoder_hidden_size
    input_size = self.hyperP.decoder_input_size

    # Update state for next iteration
    cuda.memcpy_dtod_async(self.d_inputs[1], self.outputs[1].device,
                           batch_size * 2 * hidden_size * dtype_esize,
                           self.stream)
    cuda.memcpy_dtod_async(self.d_inputs[2], self.outputs[2].device,
                           batch_size * 2 * hidden_size * dtype_esize,
                           self.stream)

    # Transfer output to host
    cuda.memcpy_dtoh_async(self.outputs[0].host, self.outputs[0].device,
                           self.stream)
    # logging.info("_run_decoder: out[0] = {}".format(self.outputs[0].host))

    # Synchronize the stream
    self.stream.synchronize()

    # return the 'symbol' host output
    return self.outputs[0].host