Esempio n. 1
0
    def spin_forever(self):
        """Capture depth frames, encode them on the GPU and send them via UDP.

        Each iteration fetches a frame from ``self.pipeline``, copies its depth
        image into a mapped array, runs the ``rvl_cuda.encode`` kernel and
        sends the encoded sectors to ``self.dest`` in datagrams holding
        ``sectors_per_packet`` sectors each. Latency and skipped-frame
        statistics are updated after every frame and printed whenever more
        than ``self.print_pd`` seconds have passed since the last report.

        Returns when ``try_wait_for_frames`` reports that no frame is
        available, or when a ``KeyboardInterrupt`` is raised.
        """
        encoded = cuda.mapped_array((self.NUM_SECTOR, rvl_cuda.SECTOR_LEN),
                                    dtype=np.uint64)
        print(self.NUM_SECTOR, "sectors, length", rvl_cuda.SECTOR_LEN)

        # How many whole sectors fit into one UDP datagram.
        sector_bytes = rvl_cuda.SECTOR_LEN * np.dtype(np.uint64).itemsize
        sectors_per_packet = int(MAX_UDP_PACKET_BYTES / sector_bytes)
        # Round up: a trailing, partially filled packet is still a packet.
        packets_per_frame = -(-self.NUM_SECTOR // sectors_per_packet)
        print(
            "Starting UDP send pipeline (%d sectors/packet, %d packets/frame)"
            % (sectors_per_packet, packets_per_frame))

        self.pipeline.start(self.config)
        delta = cuda.mapped_array(self.dim, dtype=np.uint16)
        last_frame = delta
        while True:
            try:
                start = time.perf_counter()
                has, frame = self.pipeline.try_wait_for_frames()
                if not has:
                    break
                frame_data = np.asanyarray(frame.get_depth_frame().get_data())
                fnum = frame.get_frame_number()
                # PERF_TODO actually take delta between frames
                # (delta[:, :] = frame_data - last_frame); for now the raw
                # frame is encoded and last_frame is only kept for that TODO.
                delta[:, :] = frame_data
                last_frame = frame_data

                enc_start = time.perf_counter()
                rvl_cuda.encode[self.BLOCKS_PER_GRID,
                                self.THREADS_PER_BLOCK](delta, encoded)
                enc_end_send_start = time.perf_counter()

                # Start at sector 0 so that every sector is transmitted; the
                # loop previously started at 1, which never sent sector 0 and
                # shifted the contents of every packet by one sector.
                # PERF_TODO restrict width to max packet width
                for i in range(0, encoded.shape[0], sectors_per_packet):
                    self.sock.sendto(
                        encoded[i:i + sectors_per_packet, :].tobytes(),
                        self.dest)
                send_end = time.perf_counter()

                if self.debug:
                    # Normalise to [0, 1] for display; max(1, ...) avoids a
                    # division by zero on an all-zero image.
                    cv2.imshow("frame",
                               frame_data / max(1, np.max(frame_data)))
                    cv2.imshow("encoded", encoded / max(1, np.max(encoded)))
                    cv2.waitKey(1)

                # Update stats
                self.stats['skipped_frames'] += fnum - self.last_frame_num - 1
                self.avg_ms("avg_fetch_latency", enc_start - start)
                self.avg_ms("avg_encode_latency",
                            enc_end_send_start - enc_start)
                self.avg_ms("avg_send_latency", send_end - enc_end_send_start)
                self.avg_ms("avg_latency", send_end - start)
                self.last_frame_num = fnum
                if send_end - self.last_print_stats > self.print_pd:
                    self.print_stats()
                    self.last_print_stats = send_end
            except KeyboardInterrupt:
                return
Esempio n. 2
0
def get_shared_mem(
    shape,
    dtype=np.float32,
    strides=None,
    order="C",
    stream=0,
    portable=False,
    wc=True,
):
    """Allocate a mapped array that is accessible from both CPU and GPU.

    Thin wrapper around ``numba.cuda.mapped_array``; every argument is
    forwarded unchanged. Numba documents ``mapped_array`` as behaving like
    ``numpy.empty``, so do not rely on the buffer being zero-filled.

    Parameters
    ----------
    shape : int or tuple of int
        Shape of the allocation.
    dtype : cupy.dtype or numpy.dtype
        Element type of the allocation.
    strides : tuple of int or None
        Strides forwarded to ``cuda.mapped_array``.
    order : char
        Memory layout forwarded to ``cuda.mapped_array``.
    stream : int
        Stream number (0 for default).
    portable : bool
        Forwarded to ``cuda.mapped_array``.
    wc : bool
        Forwarded to ``cuda.mapped_array``; note that it defaults to ``True``
        in this wrapper.

    Returns
    -------
    The array returned by ``cuda.mapped_array``.
    """

    allocation_options = {
        "dtype": dtype,
        "strides": strides,
        "order": order,
        "stream": stream,
        "portable": portable,
        "wc": wc,
    }
    return cuda.mapped_array(shape, **allocation_options)
Esempio n. 3
0
def get_shared_array(
    data, strides=None, order="C", stream=0, portable=False, wc=True
):
    """Copy ``data`` into a freshly allocated CPU/GPU mapped array.

    The mapped array is created with ``numba.cuda.mapped_array`` using the
    shape and dtype of ``data``; the remaining arguments are forwarded
    unchanged.

    Parameters
    ----------
    data : cupy.ndarray or numpy.ndarray
        The array whose contents are copied into the shared buffer.
    strides : tuple of int or None
        Strides forwarded to ``cuda.mapped_array``.
    order : char
        Memory layout forwarded to ``cuda.mapped_array``.
    stream : int
        Stream number (0 for default).
    portable : bool
        Forwarded to ``cuda.mapped_array``.
    wc : bool
        Forwarded to ``cuda.mapped_array``.

    Returns
    -------
    The mapped array, holding a copy of ``data``.
    """

    # Reserve a mapped buffer with the same geometry as the input.
    mapped = cuda.mapped_array(
        data.shape,
        dtype=data.dtype,
        strides=strides,
        order=order,
        stream=stream,
        portable=portable,
        wc=wc,
    )

    # Fill the buffer with the caller's values.
    mapped[:] = data
    return mapped
Esempio n. 4
0
def calculate_correlations(user_centric_array, film_correlations_number):
	"""Run the ``calculation_thread`` CUDA kernel and return its result matrix.

	``user_centric_array`` is uploaded to the device and a square float64
	mapped array with ``film_correlations_number + 1`` rows and columns is
	allocated for the kernel's output. One thread is requested per entry of
	``1 + 2 + ... + film_correlations_number`` (presumably one triangle of
	the matrix; confirm against the kernel).

	The upload and the kernel launch are both issued on the same stream, so
	leaving the ``auto_synchronize`` block guarantees the kernel has finished
	before the mapped array is printed and returned.

	Parameters
	----------
	user_centric_array : array-like
		Input data; ``len()`` of it is passed to the kernel as the user count.
	film_correlations_number : int
		Determines the size of the result matrix and the number of threads.

	Returns
	-------
	The mapped array that the kernel received as its output argument.
	"""
	number_users = len(user_centric_array)
	matrix_side = film_correlations_number + 1

	stream = cuda.stream()

	with stream.auto_synchronize():

		threadsperblock = 8
		correlation_fields_to_fill = sum(range(matrix_side))
		blockspergrid = math.ceil(correlation_fields_to_fill / threadsperblock)

		user_centric_array_global_mem = cuda.to_device(user_centric_array, stream=stream)
		correlation_matrix_global_mem = cuda.mapped_array((matrix_side, matrix_side), dtype='float64')

		# Launch on `stream` (third launch-configuration item). On the default
		# stream the context manager above would not wait for the kernel, and
		# the mapped result could be read while it is still being written.
		calculation_thread[blockspergrid, threadsperblock, stream](user_centric_array_global_mem, correlation_matrix_global_mem, int(number_users), int(film_correlations_number))

	print(correlation_matrix_global_mem)
	return correlation_matrix_global_mem
Esempio n. 5
0
 def test_host_alloc_mapped(self):
     """A mapped array can be filled on the host and cleared via the driver."""
     mapped = cuda.mapped_array(10, dtype=np.uint32)

     # Host-side write must be visible through the array.
     mapped.fill(123)
     self.assertTrue(all(mapped == 123))

     # Clear the whole buffer with a driver-level memset.
     nbytes = driver.device_memory_size(mapped)
     driver.device_memset(mapped, 0, nbytes)
     self.assertTrue(all(mapped == 0))
     self.assertEqual(sum(mapped != 0), 0)
Esempio n. 6
0
    def __call__(self, U, n):
        """Evaluate the JIT-compiled ``phi`` kernel for ``U`` and return the result.

        The per-problem setup (index table, normalization, device buffers,
        launch grid and the kernel itself) is cached on the instance and only
        rebuilt when ``(n, U.shape[0])`` differs from the previous call; the
        time spent rebuilding is stored in ``self.lastTSetup``.

        Returns ``self.phiU``, the mapped ``complex128`` array of shape
        ``(N, N)`` where ``N`` is ``self.idxs.shape[0]``. The same array
        object is reused between calls with unchanged dimensions.
        """
        m = U.shape[0]

        # Rebuild the cached setup when the problem dimensions have changed.
        if self.lastNM != (n, m):
            setup_began = time.time()
            self.lastNM = (n, m)

            base_norm, self.idxs = build_norm_and_idxs(n, m)
            self.normalization = base_norm.copy()
            # Flip the sign of the normalization for odd n.
            if n % 2 == 1:
                self.normalization *= -1

            size = self.idxs.shape[0]
            self.N = size
            result_shape = (size, size)

            # Mapped array that receives the final result.
            self.phiU = cuda.mapped_array(result_shape,
                                          dtype=np.complex128,
                                          stream=self.stream)

            # Device-only array the kernel writes into.
            self.d_phiU = cuda.device_array(result_shape,
                                            dtype=np.complex128,
                                            stream=self.stream)

            # Upload the index table and the normalization factors.
            self.d_idxs = cuda.to_device(self.idxs, stream=self.stream)
            complex_norm = self.normalization.astype('complex128')
            self.d_normalization = cuda.to_device(complex_norm,
                                                  stream=self.stream)

            # Ceiling division: enough blocks to cover size x size threads.
            tpb_x = self.threadsperblock[0]
            tpb_y = self.threadsperblock[1]
            self.blockspergrid = ((size + (tpb_x - 1)) // tpb_x,
                                  (size + (tpb_y - 1)) // tpb_y)

            # Fetch the JIT-compiled kernel for these dimensions.
            self.gpu_phi = self.get_phi(n, m)
            self.lastTSetup = time.time() - setup_began

        # Upload the input, run the kernel, then wait for it to finish.
        d_U = cuda.to_device(U, stream=self.stream)
        kernel = self.gpu_phi[self.blockspergrid, self.threadsperblock,
                              self.stream]
        kernel(d_U, self.d_phiU, self.d_normalization, self.d_idxs)
        self.stream.synchronize()

        # Copy the device result into the mapped array and wait again so the
        # caller sees complete data.
        self.d_phiU.copy_to_host(self.phiU, stream=self.stream)
        self.stream.synchronize()

        return self.phiU
    def test_produce_stream(self):
        """Arrays created on a stream export that stream's handle.

        Checked for both device arrays and mapped arrays via the ``stream``
        entry of ``__cuda_array_interface__``.
        """
        for allocate in (cuda.device_array, cuda.mapped_array):
            # Use a fresh stream for each kind of array.
            stream = cuda.stream()
            arr = allocate(10, stream=stream)
            exported = arr.__cuda_array_interface__['stream']
            self.assertEqual(stream.handle.value, exported)
    def test_issue_6505(self):
        """Writes through a byte view of a mapped array are visible at once."""
        # On Windows, the writes to the byte view would not be visible prior
        # to the assertion, due to the assignment being done with a kernel
        # launch that returns asynchronously - there should now be a sync
        # after the kernel launch to ensure that the writes are always visible.
        words = cuda.mapped_array(2, dtype=np.int32)
        words[:] = 0

        # Set byte 1 of each 4-byte element, i.e. 1 << 8 == 256 per element.
        as_bytes = words.view('u1')
        for offset in (1, 5):
            as_bytes[offset] = 1

        self.assertEqual(sum(words), 512)
Esempio n. 9
0
    def test_nowarn_on_mapped_array(self):
        """Passing a mapped array to a kernel emits no warning.

        Runs with ``CUDA_WARN_ON_IMPLICIT_COPY`` enabled and asserts that
        nothing was recorded during the launch.
        """
        @cuda.jit
        def foo(r, x):
            r[0] = x + 1

        count = 10
        mapped = cuda.mapped_array(count, dtype=np.float32)

        with override_config('CUDA_WARN_ON_IMPLICIT_COPY', 1), \
                warnings.catch_warnings(record=True) as caught:
            foo[1, count](mapped, count)

        self.assertEqual(len(caught), 0)
Esempio n. 10
0
def get_shared_mem(shape,
                   dtype=np.float32,
                   strides=None,
                   order='C',
                   stream=0,
                   portable=False,
                   wc=True):
    """Allocate a mapped array that is accessible from both CPU and GPU.

    Every argument is forwarded unchanged to ``numba.cuda.mapped_array`` and
    the resulting array is returned. Note that ``wc`` defaults to ``True``
    in this wrapper.
    """
    mapped = cuda.mapped_array(
        shape,
        dtype=dtype,
        strides=strides,
        order=order,
        stream=stream,
        portable=portable,
        wc=wc,
    )
    return mapped
Esempio n. 11
0
 def test_host_operators(self):
     """Host-side arithmetic and comparisons work on mapped and pinned arrays."""
     host_arrays = (
         cuda.mapped_array(10, dtype=np.uint32),
         cuda.pinned_array(10, dtype=np.uint32),
     )
     for ary in host_arrays:
         ary[:] = range(10)
         # (reduced value, expected) pairs over the values 0..9.
         checks = [
             (sum(ary + 1), 55),
             (sum((ary + 1) * 2 - 1), 100),
             (sum(ary < 5), 5),
             (sum(ary <= 5), 6),
             (sum(ary > 6), 3),
             (sum(ary >= 6), 4),
             (sum(ary**2), 285),
             (sum(ary // 2), 20),
             (sum(ary / 2.0), 22.5),
             (sum(ary % 2), 5),
         ]
         for actual, expected in checks:
             self.assertTrue(actual == expected)
Esempio n. 12
0
def get_shared_array(data,
                     strides=None,
                     order='C',
                     stream=0,
                     portable=False,
                     wc=True):
    """Copy ``data`` into a freshly allocated CPU/GPU mapped array.

    The buffer is created with ``numba.cuda.mapped_array`` using the shape
    and dtype of ``data``; all other arguments are forwarded unchanged. The
    populated mapped array is returned.
    """
    # Reserve mapped memory with the same geometry as the input.
    mapped = cuda.mapped_array(
        data.shape,
        dtype=data.dtype,
        strides=strides,
        order=order,
        stream=stream,
        portable=portable,
        wc=wc,
    )

    # Copy the caller's values into the mapped buffer.
    mapped[:] = data
    return mapped
    def test_produce_no_stream(self):
        """Arrays created without a stream export ``None`` as their stream.

        Checked for both device arrays and mapped arrays via the ``stream``
        entry of ``__cuda_array_interface__``.
        """
        for allocate in (cuda.device_array, cuda.mapped_array):
            arr = allocate(10)
            exported = arr.__cuda_array_interface__['stream']
            self.assertIsNone(exported)
 def test_host_alloc_mapped(self):
     """A mapped array can be filled on the host and cleared via the driver."""
     mapped = cuda.mapped_array(10, dtype=np.uint32)

     # Host-side write must be visible through the array.
     mapped.fill(123)
     self.assertTrue(all(mapped == 123))

     # Clear the whole buffer with a driver-level memset.
     nbytes = driver.device_memory_size(mapped)
     driver.device_memset(mapped, 0, nbytes)
     self.assertTrue(all(mapped == 0))
Esempio n. 15
0
    def run_forever(self):
        """Receive encoded sectors over UDP, decode them on the GPU, and loop.

        Each outer iteration collects ``self.NUM_SECTOR`` sectors from
        ``self.sock`` into a per-sender mapped array, runs the
        ``rvl_cuda.decode`` kernel on it, optionally displays the decoded
        image and updates the latency statistics. The loop has no exit
        condition; it only ends if an exception propagates.

        NOTE(review): ``frame`` is used in the sink branches below but is
        never assigned in this method. Unless a module-level ``frame``
        exists, enabling ``self.sinks`` or ``self.color_sinks`` raises
        ``NameError`` - confirm and feed the sinks from ``decoded[src]``.
        """
        # Per-sender buffers, keyed by the first element of the sender address
        # (presumably the IP string for an AF_INET socket - confirm).
        encoded = {}
        decoded = {}
        while (True):
            start = time.perf_counter()
            nsec = 0
            bufrcv = 0
            peekrcv = 0
            # NOTE(review): nsec is a single counter shared by all senders, so
            # packets from different clients arriving interleaved would be
            # stored at offsets belonging to one combined frame - confirm
            # whether multiple simultaneous senders are meant to be supported.
            while nsec < self.NUM_SECTOR:
                # The last packet of a frame may carry fewer sectors.
                recv_num_sector = min(self.sectors_per_packet,
                                      self.NUM_SECTOR - nsec)
                packet_len = recv_num_sector * rvl_cuda.SECTOR_LEN * np.dtype(
                    np.uint64).itemsize

                # Find out who's sending us the next packet
                # (MSG_PEEK leaves the datagram queued for the recv_into below).
                peekstart = time.perf_counter()
                (_, src) = self.sock.recvfrom(1, socket.MSG_PEEK)
                src = src[0]
                if encoded.get(src) is None:
                    self.debug("Allocating mapped arrays for new client", src)
                    encoded[src] = cuda.mapped_array(
                        (self.NUM_SECTOR, rvl_cuda.SECTOR_LEN),
                        dtype=np.uint64)
                    decoded[src] = cuda.mapped_array(self.dim, dtype=np.uint16)

                # Receive straight into the rows reserved for this packet.
                brstart = time.perf_counter()
                (nbytes) = self.sock.recv_into(
                    encoded[src][nsec:nsec + recv_num_sector, :], packet_len)
                brend = time.perf_counter()
                bufrcv += brend - brstart
                peekrcv += brstart - peekstart
                # A packet of unexpected size is counted and its slot is
                # retried with the next datagram (nsec is not advanced).
                if nbytes != packet_len:
                    self.invalid_packets += 1
                    continue

                nsec += self.sectors_per_packet

            # src is the sender of the last packet received above.
            decode_start = time.perf_counter()
            rvl_cuda.decode[self.BLOCKS_PER_GRID,
                            self.THREADS_PER_BLOCK](encoded[src], decoded[src],
                                                    self.deproject)
            # Overwrite one decoded pixel per sector with the constant 15208;
            # the coordinates come from the low two 16-bit fields of the
            # sector's first word.
            # NOTE(review): this looks like a debugging marker that alters the
            # decoded image - confirm it is still wanted.
            # idxs = []
            for i in range(self.NUM_SECTOR):
                v = int(encoded[src][i, 0])
                x = v & 0xffff
                y = (v >> 16) & 0xffff
                if x < decoded[src].shape[0] and y < decoded[src].shape[1]:
                    decoded[src][x, y] = 15208
                # idxs.append((x,y))
            # print(idxs)
            decode_end = time.perf_counter()

            if self.show_output:
                # Normalise for display; max(1, ...) avoids dividing by zero.
                cv2.imshow(src + "dec",
                           decoded[src] / max(1, np.max(decoded[src])))
                # cv2.imshow(src + "enc", encoded[src] / max(1, np.max(encoded[src])))
                # cv2.waitKey(10000)
                cv2.waitKey(1)
            # NOTE(review): `frame` is not assigned anywhere in this method, so
            # both sink branches below appear to fail with NameError - confirm.
            if self.sinks is not None:
                frame_data = np.asanyarray(frame.get_depth_frame().get_data())
                for cb in self.sinks:
                    cb(frame.get_frame_number(), frame_data)
            if self.color_sinks is not None:
                frame_data = np.asanyarray(
                    self.colorizer.colorize(
                        frame.get_depth_frame()).get_data())
                for cb in self.color_sinks:
                    cb(frame.get_frame_number(), frame_data)

            # Record per-frame timings and print them every self.print_pd
            # seconds.
            end = time.perf_counter()
            self.avg_ms('avg_recv_latency', decode_start - start)
            self.avg_ms('avg_decode_latency', decode_end - decode_start)
            self.avg_ms('avg_viz_latency', end - decode_end)
            self.avg_ms('avg_latency', end - start)
            self.avg_ms('avg_buf_recv', bufrcv)
            self.avg_ms('avg_peek_recv', peekrcv)
            self.num_frames += 1
            if end - self.last_print_stats > self.print_pd:
                self.print_stats()
                self.last_print_stats = end