def spin_forever(self):
    """Grab depth frames, encode them on the GPU and send them over UDP.

    Starts ``self.pipeline`` with ``self.config`` and then loops: fetch a
    frame, copy its depth data into a mapped buffer, run
    ``rvl_cuda.encode`` on it, and send the encoded sectors to
    ``self.dest`` through ``self.sock`` in packets of at most
    ``MAX_UDP_PACKET_BYTES`` bytes. Latency and skipped-frame statistics
    are updated every iteration and printed every ``self.print_pd``
    seconds.

    Returns when ``try_wait_for_frames`` reports that no frame is
    available, or when a ``KeyboardInterrupt`` is raised inside the loop.
    """
    # Encoder output buffer: one row per sector.
    encoded = cuda.mapped_array((self.NUM_SECTOR, rvl_cuda.SECTOR_LEN),
                                dtype=np.uint64)
    print(self.NUM_SECTOR, "sectors, length", rvl_cuda.SECTOR_LEN)
    sectors_per_packet = int(
        MAX_UDP_PACKET_BYTES /
        (rvl_cuda.SECTOR_LEN * np.dtype(np.uint64).itemsize))
    print(
        "Starting UDP send pipeline (%d sectors/packet, %d packets/frame)" %
        (sectors_per_packet, int(self.NUM_SECTOR / sectors_per_packet)))
    self.pipeline.start(self.config)

    # Encoder input buffer.
    delta = cuda.mapped_array(self.dim, dtype=np.uint16)

    try:
        while True:
            start = time.perf_counter()
            has, frame = self.pipeline.try_wait_for_frames()
            if not has:
                break
            frame_data = np.asanyarray(frame.get_depth_frame().get_data())
            frame_num = frame.get_frame_number()

            # PERF_TODO actually take delta between frames; for now the
            # raw frame is encoded.
            delta[:, :] = frame_data

            enc_start = time.perf_counter()
            rvl_cuda.encode[self.BLOCKS_PER_GRID,
                            self.THREADS_PER_BLOCK](delta, encoded)
            # A kernel launch returns before the kernel has finished.
            # Wait here so that `encoded` is complete before the host
            # reads it, and so that the encode latency covers the kernel
            # run rather than just the launch.
            cuda.synchronize()
            enc_end_send_start = time.perf_counter()

            # Start at sector 0 so that every sector is transmitted and
            # each packet lines up with the receiver, which fills its
            # buffer from sector 0 in steps of sectors_per_packet.
            for first in range(0, encoded.shape[0], sectors_per_packet):
                # PERF_TODO restrict width to max packet width, e.g.
                # np.max(np.right_shift(encoded[first:last, 0], 32))
                last = first + sectors_per_packet
                self.sock.sendto(encoded[first:last, :].tobytes(),
                                 self.dest)
            send_end = time.perf_counter()

            if self.debug:
                cv2.imshow("frame",
                           frame_data / max(1, np.max(frame_data)))
                cv2.imshow("encoded", encoded / max(1, np.max(encoded)))
                cv2.waitKey(1)

            # Update stats
            self.stats['skipped_frames'] += (
                frame_num - self.last_frame_num - 1)
            self.avg_ms("avg_fetch_latency", enc_start - start)
            self.avg_ms("avg_encode_latency",
                        enc_end_send_start - enc_start)
            self.avg_ms("avg_send_latency", send_end - enc_end_send_start)
            self.avg_ms("avg_latency", send_end - start)
            self.last_frame_num = frame_num
            if send_end - self.last_print_stats > self.print_pd:
                self.print_stats()
                self.last_print_stats = send_end
    except KeyboardInterrupt:
        return
def get_shared_mem(
    shape,
    dtype=np.float32,
    strides=None,
    order="C",
    stream=0,
    portable=False,
    wc=True,
):
    """Allocate a mapped (host/device shared) array.

    Thin wrapper: every argument is forwarded unchanged to
    ``cuda.mapped_array`` and its result is returned as is.

    NOTE(review): an earlier description likened this to ``numpy.zeros``.
    If ``cuda`` is ``numba.cuda`` the buffer is presumably uninitialised
    (closer to ``numpy.empty``) -- confirm before relying on zero contents.

    Parameters
    ----------
    shape : ndarray.shape
        Shape of the allocation.
    dtype : cupy.dtype or numpy.dtype
        Element type of the allocation.
    strides : int or None
        Forwarded to ``cuda.mapped_array``.
    order : char
        Forwarded to ``cuda.mapped_array``.
    stream : int
        Stream number (0 for default).
    portable : bool
        Forwarded to ``cuda.mapped_array``.
    wc : bool
        Forwarded to ``cuda.mapped_array``.

    Returns
    -------
    The array object produced by ``cuda.mapped_array``.
    """
    alloc_options = {
        "dtype": dtype,
        "strides": strides,
        "order": order,
        "stream": stream,
        "portable": portable,
        "wc": wc,
    }
    return cuda.mapped_array(shape, **alloc_options)
def get_shared_array(
    data, strides=None, order="C", stream=0, portable=False, wc=True
):
    """Copy ``data`` into a newly allocated mapped (host/device) array.

    The buffer is allocated with ``cuda.mapped_array`` using the shape and
    dtype of ``data``, then filled by slice assignment.

    Parameters
    ----------
    data : cupy.ndarray or numpy.ndarray
        Source array; must expose ``shape`` and ``dtype``.
    strides : int or None
        Forwarded to ``cuda.mapped_array``.
    order : char
        Forwarded to ``cuda.mapped_array``.
    stream : int
        Stream number (0 for default).
    portable : bool
        Forwarded to ``cuda.mapped_array``.
    wc : bool
        Forwarded to ``cuda.mapped_array``.

    Returns
    -------
    The populated mapped array.
    """
    alloc_options = {
        "dtype": data.dtype,
        "strides": strides,
        "order": order,
        "stream": stream,
        "portable": portable,
        "wc": wc,
    }
    # Allocate first, then copy the caller's data into the new buffer.
    populated = cuda.mapped_array(data.shape, **alloc_options)
    populated[:] = data
    return populated
def calculate_correlations(user_centric_array, film_correlations_number):
    """Run the ``calculation_thread`` kernel and return its result matrix.

    ``user_centric_array`` is copied to the device, a square mapped array
    with ``film_correlations_number + 1`` rows and columns of ``float64``
    is allocated for the output, and the kernel is launched with enough
    blocks of 8 threads to cover ``sum(range(film_correlations_number + 1))``
    fields.

    Parameters
    ----------
    user_centric_array
        Sized host array; its length is passed to the kernel as the user
        count.
    film_correlations_number
        Integer that determines the matrix size and the launch grid.

    Returns
    -------
    The mapped output array. It is printed and returned only after the
    stream has been synchronised, so the kernel has finished by then.
    NOTE(review): the buffer is not initialised here; cells the kernel
    does not write presumably hold arbitrary values -- confirm against
    the kernel.
    """
    number_users = len(user_centric_array)
    matrix_side = film_correlations_number + 1

    threadsperblock = 8
    correlation_fields_to_fill = sum(range(matrix_side))
    blockspergrid = math.ceil(correlation_fields_to_fill / threadsperblock)

    stream = cuda.stream()
    with stream.auto_synchronize():
        user_centric_array_global_mem = cuda.to_device(
            user_centric_array, stream=stream)
        correlation_matrix_global_mem = cuda.mapped_array(
            (matrix_side, matrix_side), dtype='float64')

        # Launch on the same stream as the upload so that leaving the
        # `with` block waits for the kernel as well as the copy.
        calculation_thread[blockspergrid, threadsperblock, stream](
            user_centric_array_global_mem,
            correlation_matrix_global_mem,
            int(number_users),
            int(film_correlations_number))

    # The stream is synchronised here, so the host sees the final values.
    print(correlation_matrix_global_mem)
    return correlation_matrix_global_mem
def test_host_alloc_mapped(self):
    """A mapped array reflects a host-side fill and a driver memset."""
    mapped = cuda.mapped_array(10, dtype=np.uint32)

    # Host-side write is visible through the same array object.
    mapped.fill(123)
    self.assertTrue(all(mapped == 123))

    # Zero the whole allocation through the driver and re-check on the host.
    nbytes = driver.device_memory_size(mapped)
    driver.device_memset(mapped, 0, nbytes)
    self.assertTrue(all(mapped == 0))
    nonzero_count = sum(mapped != 0)
    self.assertTrue(nonzero_count == 0)
def __call__(self, U, n):
    """Run the phi kernel for ``U`` and return the result.

    When the pair ``(n, U.shape[0])`` differs from the one used on the
    previous call, the per-problem state is rebuilt first: normalization
    and index tables, host and device result buffers, launch geometry and
    the JIT-compiled kernel. The time spent on that setup is stored in
    ``self.lastTSetup`` (seconds).

    Parameters
    ----------
    U
        Input array; ``U.shape[0]`` is used as ``m``.
    n
        Integer problem parameter; when odd, the normalization is negated.

    Returns
    -------
    ``self.phiU``, the mapped ``complex128`` array of shape ``(N, N)``.
    The same buffer is reused on later calls with unchanged ``(n, m)``,
    so copy it if the values must outlive the next call.
    """
    m = U.shape[0]

    # If dimensionality of problem has changed, re-do setup
    if (n, m) != self.lastNM:
        # Monotonic clock: the value is only used as an elapsed duration.
        setup_start = time.perf_counter()

        normalization, self.idxs = build_norm_and_idxs(n, m)
        self.normalization = normalization.copy()
        if n % 2 == 1:
            self.normalization *= -1

        N = self.idxs.shape[0]
        self.N = N

        # Allocate mapped array for final result
        self.phiU = cuda.mapped_array((N, N), dtype=np.complex128,
                                      stream=self.stream)
        # Allocate array on device for computation
        self.d_phiU = cuda.device_array((N, N), dtype=np.complex128,
                                        stream=self.stream)
        # Copy idxs to the device
        self.d_idxs = cuda.to_device(self.idxs, stream=self.stream)
        # Copy normalization to the device
        self.d_normalization = cuda.to_device(
            self.normalization.astype('complex128'), stream=self.stream)

        # Set up call parameters: enough blocks to cover N in each axis.
        tpb_x = self.threadsperblock[0]
        tpb_y = self.threadsperblock[1]
        self.blockspergrid = ((N + (tpb_x - 1)) // tpb_x,
                              (N + (tpb_y - 1)) // tpb_y)

        # Get the JITed kernel
        self.gpu_phi = self.get_phi(n, m)

        # Record the key only once setup has fully succeeded. If any step
        # above raises, the next call re-runs setup instead of silently
        # reusing half-initialised state.
        self.lastNM = (n, m)
        self.lastTSetup = time.perf_counter() - setup_start

    # Copy U to the device
    d_U = cuda.to_device(U, stream=self.stream)

    # Run the computation
    self.gpu_phi[self.blockspergrid, self.threadsperblock, self.stream](
        d_U, self.d_phiU, self.d_normalization, self.d_idxs)
    self.stream.synchronize()

    # Move the results to the mapped array
    self.d_phiU.copy_to_host(self.phiU, stream=self.stream)
    self.stream.synchronize()

    return self.phiU
def test_produce_stream(self):
    """Arrays allocated on a stream export that stream's handle.

    Checked for both ``cuda.device_array`` and ``cuda.mapped_array`` via
    the ``'stream'`` entry of ``__cuda_array_interface__``.
    """
    for allocate in (cuda.device_array, cuda.mapped_array):
        # Each allocator gets its own freshly created stream.
        stream = cuda.stream()
        arr = allocate(10, stream=stream)
        exported_stream = arr.__cuda_array_interface__['stream']
        self.assertEqual(stream.handle.value, exported_stream)
def test_issue_6505(self):
    """Byte-level writes through a view of a mapped array are visible.

    On Windows, the writes through the byte view would not be visible
    prior to the assertion, because the assignment is done with a kernel
    launch that returns asynchronously. There should now be a sync after
    the kernel launch so the writes are always visible.
    """
    words = cuda.mapped_array(2, dtype=np.int32)
    words[:] = 0

    # Set byte 1 of each 4-byte element (offsets 1 and 5) through a
    # uint8 view of the same memory.
    as_bytes = words.view('u1')
    for offset in (1, 5):
        as_bytes[offset] = 1

    self.assertEqual(sum(words), 512)
def test_nowarn_on_mapped_array(self):
    """Passing a mapped array to a kernel emits no warning.

    Runs with ``CUDA_WARN_ON_IMPLICIT_COPY`` overridden to 1 and checks
    that no warnings were recorded during the launch.
    """
    @cuda.jit
    def store_incremented(out, value):
        out[0] = value + 1

    length = 10
    mapped = cuda.mapped_array(length, dtype=np.float32)

    with override_config('CUDA_WARN_ON_IMPLICIT_COPY', 1), \
            warnings.catch_warnings(record=True) as recorded:
        store_incremented[1, length](mapped, length)

    self.assertEqual(len(recorded), 0)
def get_shared_mem(shape, dtype=np.float32, strides=None, order='C',
                   stream=0, portable=False, wc=True):
    """Allocate a mapped (host/device shared) array.

    Thin wrapper: every argument is forwarded unchanged to
    ``cuda.mapped_array`` and the resulting array is returned.
    """
    forwarded = dict(
        dtype=dtype,
        strides=strides,
        order=order,
        stream=stream,
        portable=portable,
        wc=wc,
    )
    return cuda.mapped_array(shape, **forwarded)
def test_host_operators(self):
    """Host-side operators work on mapped and pinned arrays.

    Each array is filled with 0..9 and the element-wise result of every
    operator is summed and compared with the known total.
    """
    # (operation applied to the array, expected sum of the result)
    cases = (
        (lambda a: a + 1, 55),
        (lambda a: (a + 1) * 2 - 1, 100),
        (lambda a: a < 5, 5),
        (lambda a: a <= 5, 6),
        (lambda a: a > 6, 3),
        (lambda a: a >= 6, 4),
        (lambda a: a**2, 285),
        (lambda a: a // 2, 20),
        (lambda a: a / 2.0, 22.5),
        (lambda a: a % 2, 5),
    )
    host_arrays = [
        cuda.mapped_array(10, dtype=np.uint32),
        cuda.pinned_array(10, dtype=np.uint32),
    ]
    for host_ary in host_arrays:
        host_ary[:] = range(10)
        for operation, expected_total in cases:
            self.assertTrue(sum(operation(host_ary)) == expected_total)
def get_shared_array(data, strides=None, order='C', stream=0,
                     portable=False, wc=True):
    """Copy ``data`` into a newly allocated mapped (host/device) array.

    The buffer takes its shape and dtype from ``data``; the remaining
    arguments are forwarded unchanged to ``cuda.mapped_array``.
    """
    forwarded = dict(
        dtype=data.dtype,
        strides=strides,
        order=order,
        stream=stream,
        portable=portable,
        wc=wc,
    )
    # Allocate mapped, shared memory, then load the data into it.
    populated = cuda.mapped_array(data.shape, **forwarded)
    populated[:] = data
    return populated
def test_produce_no_stream(self):
    """Arrays allocated without a stream export ``None`` as their stream.

    Checked for both ``cuda.device_array`` and ``cuda.mapped_array`` via
    the ``'stream'`` entry of ``__cuda_array_interface__``.
    """
    for allocate in (cuda.device_array, cuda.mapped_array):
        arr = allocate(10)
        exported_stream = arr.__cuda_array_interface__['stream']
        self.assertIsNone(exported_stream)
def test_host_alloc_mapped(self):
    """A mapped array reflects a host-side fill and a driver memset."""
    mapped = cuda.mapped_array(10, dtype=np.uint32)

    # Host-side write is visible through the same array object.
    mapped.fill(123)
    self.assertTrue(all(mapped == 123))

    # Zero the whole allocation through the driver and re-check on the host.
    nbytes = driver.device_memory_size(mapped)
    driver.device_memset(mapped, 0, nbytes)
    self.assertTrue(all(mapped == 0))
def run_forever(self):
    """Receive encoded frames over ``self.sock``, decode and publish them.

    Loops without end. Each iteration collects ``self.NUM_SECTOR`` sectors
    from incoming packets into a per-sender mapped buffer, runs
    ``rvl_cuda.decode`` on the buffer of the sender of the last packet,
    optionally shows the decoded image, invokes the registered sinks and
    updates the latency statistics.

    Buffers are allocated lazily, one encoded/decoded pair per sender
    address.

    NOTE(review): the sector counter is shared by all senders, so packets
    from several senders interleaved within one frame are counted
    together -- confirm whether more than one concurrent sender is
    expected.
    """
    encoded = {}
    decoded = {}
    sector_bytes = rvl_cuda.SECTOR_LEN * np.dtype(np.uint64).itemsize

    while True:
        start = time.perf_counter()
        nsec = 0
        bufrcv = 0
        peekrcv = 0
        while nsec < self.NUM_SECTOR:
            # The final packet of a frame may carry fewer sectors.
            recv_num_sector = min(self.sectors_per_packet,
                                  self.NUM_SECTOR - nsec)
            packet_len = recv_num_sector * sector_bytes

            # Find out who's sending us the next packet
            peekstart = time.perf_counter()
            (_, src) = self.sock.recvfrom(1, socket.MSG_PEEK)
            src = src[0]
            if src not in encoded:
                self.debug("Allocating mapped arrays for new client", src)
                encoded[src] = cuda.mapped_array(
                    (self.NUM_SECTOR, rvl_cuda.SECTOR_LEN),
                    dtype=np.uint64)
                decoded[src] = cuda.mapped_array(self.dim, dtype=np.uint16)

            # Receive straight into the sender's buffer at the current
            # sector offset.
            brstart = time.perf_counter()
            nbytes = self.sock.recv_into(
                encoded[src][nsec:nsec + recv_num_sector, :], packet_len)
            brend = time.perf_counter()
            bufrcv += brend - brstart
            peekrcv += brstart - peekstart
            if nbytes != packet_len:
                # Wrong size: count it and wait for a packet that fits
                # the current slot.
                self.invalid_packets += 1
                continue
            nsec += self.sectors_per_packet

        decode_start = time.perf_counter()
        rvl_cuda.decode[self.BLOCKS_PER_GRID,
                        self.THREADS_PER_BLOCK](encoded[src], decoded[src],
                                                self.deproject)
        # A kernel launch returns before the kernel has finished. Wait
        # here so the host code below reads and writes `decoded` only
        # after the decode is complete, and so the decode latency covers
        # the kernel run.
        cuda.synchronize()

        # Write a fixed value at the (x, y) position packed into the low
        # 32 bits of each sector's first word. NOTE(review): this looks
        # like a debugging aid -- confirm it is still wanted.
        for i in range(self.NUM_SECTOR):
            v = int(encoded[src][i, 0])
            x = v & 0xffff
            y = (v >> 16) & 0xffff
            if x < decoded[src].shape[0] and y < decoded[src].shape[1]:
                decoded[src][x, y] = 15208
        decode_end = time.perf_counter()

        if self.show_output:
            cv2.imshow(src + "dec",
                       decoded[src] / max(1, np.max(decoded[src])))
            cv2.waitKey(1)

        # NOTE(review): `frame` is never assigned in this function, so
        # both sink branches raise NameError unless a global of that name
        # exists. They presumably should publish decoded[src] instead --
        # fix once the sink contract is confirmed.
        if self.sinks is not None:
            frame_data = np.asanyarray(frame.get_depth_frame().get_data())
            for cb in self.sinks:
                cb(frame.get_frame_number(), frame_data)
        if self.color_sinks is not None:
            frame_data = np.asanyarray(
                self.colorizer.colorize(
                    frame.get_depth_frame()).get_data())
            for cb in self.color_sinks:
                cb(frame.get_frame_number(), frame_data)
        end = time.perf_counter()

        self.avg_ms('avg_recv_latency', decode_start - start)
        self.avg_ms('avg_decode_latency', decode_end - decode_start)
        self.avg_ms('avg_viz_latency', end - decode_end)
        self.avg_ms('avg_latency', end - start)
        self.avg_ms('avg_buf_recv', bufrcv)
        self.avg_ms('avg_peek_recv', peekrcv)
        self.num_frames += 1
        if end - self.last_print_stats > self.print_pd:
            self.print_stats()
            self.last_print_stats = end