Example 1
    def __init__(self, bit_file_path):
        self.ol = Overlay(bit_file_path)
        self.dma = self.ol.axi_dma_0

        # declare input/output types and shapes for the accelerator
        # input FINN DataType
        self.idt = DataType.UINT2
        # normal, folded and packed input shapes
        self.ishape_normal = (1, 8)
        self.ishape_folded = (1, 1, 8)
        self.ishape_packed = (1, 1, 2)
        # output FINN DataType
        self.odt = DataType.INT32
        # normal, folded and packed output shapes
        self.oshape_normal = (1, 3)
        self.oshape_folded = (1, 1, 3)
        self.oshape_packed = (1, 1, 12)
        # allocate a PYNQ buffer for the packed input buffer
        self.ibuf_packed_device = allocate(shape=self.ishape_packed,
                                           dtype=np.uint8)
        # allocate a PYNQ buffer for the packed output buffer
        self.obuf_packed = allocate(shape=self.oshape_packed, dtype=np.uint8)
        self.mt_node_thresholds = np.asarray(
            [[-0.01027827151119709, 0.01027827151119709]])
        self.multiply_node_const = 0.014718746766448021
        self.add_node_mat = np.asarray(
            [-0.08831246197223663, 0.17662496864795685, -0.058874987065792084])
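The packed input shape follows directly from the input datatype: eight UINT2 values occupy 16 bits, i.e. two bytes, hence ishape_packed = (1, 1, 2). A minimal numpy sketch of that packing, assuming LSB-first ordering within each byte (the FINN driver itself normally relies on its own data-packing utilities):

import numpy as np

# eight 2-bit values (the normal input shape is (1, 8))
x = np.array([1, 0, 3, 2, 1, 1, 0, 2], dtype=np.uint8)

# pack four 2-bit values into each uint8
packed = np.zeros(2, dtype=np.uint8)
for i, v in enumerate(x):
    packed[i // 4] |= (v & 0x3) << (2 * (i % 4))

# packed.shape == (2,), matching the last dimension of ishape_packed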
Example 2
    def __init__(self, fs=1000000, N=6000):

        # Input parameters
        self._fs = fs
        self._N = N

        # Cache timeseries and reusable random data
        self._random_data = [rnd.randint(0, 3) for _ in range(N)]
        self._t = np.array([i / fs for i in range(N)])

        # Overlay config
        bit_name = os.path.dirname(__file__) + '/agc_loopback.bit'
        ol = Overlay(bit_name)
        self._ol = ol

        # Avoid PYNQ's __getattr__ lookup overhead by aliasing the IPs
        self._agc = ol.agc
        self._dma_in_i = ol.dma_in_i
        self._dma_in_q = ol.dma_in_q
        self._dma_agc_i = ol.dma_agc_i
        self._dma_agc_q = ol.dma_agc_q
        self._dma_agc_g = ol.dma_agc_g

        # Allocate buffers
        self._buf_in_i = allocate(shape=(N, ), dtype=np.int16)
        self._buf_in_q = allocate(shape=(N, ), dtype=np.int16)
        self._buf_agc_i = allocate(shape=(N, ), dtype=np.int16)
        self._buf_agc_q = allocate(shape=(N, ), dtype=np.int16)
        self._buf_agc_g = allocate(shape=(N, ), dtype=np.uint32)
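A loopback run with these aliases would fill the two input buffers, start all five DMA transfers, and wait for completion. A minimal sketch under that assumption (the method below is hypothetical and uses the standard PYNQ sendchannel/recvchannel API already aliased above):

    def run_loopback(self, i_samples, q_samples):
        # copy host data into the contiguous input buffers
        self._buf_in_i[:] = i_samples
        self._buf_in_q[:] = q_samples
        # arm the receive transfers first so no samples are dropped
        self._dma_agc_i.recvchannel.transfer(self._buf_agc_i)
        self._dma_agc_q.recvchannel.transfer(self._buf_agc_q)
        self._dma_agc_g.recvchannel.transfer(self._buf_agc_g)
        # stream the stimulus into the AGC
        self._dma_in_i.sendchannel.transfer(self._buf_in_i)
        self._dma_in_q.sendchannel.transfer(self._buf_in_q)
        # block until every channel has completed
        self._dma_in_i.sendchannel.wait()
        self._dma_in_q.sendchannel.wait()
        self._dma_agc_i.recvchannel.wait()
        self._dma_agc_q.recvchannel.wait()
        self._dma_agc_g.recvchannel.wait()
        return (np.array(self._buf_agc_i),
                np.array(self._buf_agc_q),
                np.array(self._buf_agc_g))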
Example 3
    def hw_inference(self, input_data_processed):
        packed_input = self.pack(input_data_processed.image_array)
        directions = 2 if self.bidirectional_enabled else 1
        self.accel_input_buffer = allocate(shape=packed_input.shape,
                                           dtype=np.uint64)
        self.accel_output_buffer = allocate(shape=(128, ), dtype=np.uint64)
        np.copyto(self.accel_input_buffer, packed_input)

        bytes_read = np.ceil((self.input_bitwidth * self.input_size * 8) / 64)
        bytes_read = int(bytes_read * directions * input_data_processed.width)

        self.BLSTM_CTC.numberColumns_V = input_data_processed.width
        self.BLSTM_CTC.numberColumnsTwice_V = directions * input_data_processed.width
        self.BLSTM_CTC.numberBytesRead_V = bytes_read
        self.BLSTM_CTC.input_buffer_V_1 = self.accel_input_buffer.physical_address & 0xffffffff
        self.BLSTM_CTC.input_buffer_V_2 = (
            self.accel_input_buffer.physical_address >> 32) & 0xffffffff
        self.BLSTM_CTC.output_buffer_V_1 = self.accel_output_buffer.physical_address & 0xffffffff
        self.BLSTM_CTC.output_buffer_V_2 = (
            self.accel_output_buffer.physical_address >> 32) & 0xffffffff
        start = time.time()
        self.ExecAccel()
        end = time.time()
        print("Inference took = {} ms...".format((end - start) * 1000))
        print("Frames per second = {}".format(1 / (end - start)))
        predictions = np.copy(
            np.frombuffer(self.accel_output_buffer, dtype=np.uint64))
        return predictions
Example 4
    def fpga_evaluate_dance(self, imu_data):
        global dma

        input_buffer0 = allocate(shape=(100,), dtype=np.float32)
        output_buffer0 = allocate(shape=(12,), dtype=np.float32)
        
        segment = self.preprocess_segment(imu_data)
        segment = (segment - self.scaler_mean) / self.scaler_scale

        self.choose_model[0] = 0
        dma.sendchannel.transfer(self.choose_model)
        dma.sendchannel.wait()
    
        for j in range(100):
            input_buffer0[j] = segment[j]

        dma.sendchannel.transfer(input_buffer0)
        dma.recvchannel.transfer(output_buffer0)
        dma.sendchannel.wait()
        dma.recvchannel.wait()

        raw_output = [output_buffer0[i] for i in range(12)] 
        index = np.argmax(raw_output, axis=-1)
        #print(f"Prediction: {self.dance_labels[index]} | {raw_output[index]}")
        return self.dance_labels[index]
Example 5
    def __init__(self, bit_file_path):
        self.ol = Overlay(bit_file_path)
        self.dma = self.ol.axi_dma_0

        # declare input/output types and shapes for the accelerator
        # input FINN DataType
        self.idt = DataType.UINT2
        # normal, folded and packed input shapes
        self.ishape_normal = (1, 256)
        self.ishape_folded = (1, 4, 64)
        self.ishape_packed = (1, 4, 16)
        # output FINN DataType
        self.odt = DataType.INT32
        # normal, folded and packed output shapes
        self.oshape_normal = (1, 6)
        self.oshape_folded = (1, 1, 6)
        self.oshape_packed = (1, 1, 24)
        # allocate a PYNQ buffer for the packed input buffer
        self.ibuf_packed_device = allocate(shape=self.ishape_packed,
                                           dtype=np.uint8)
        # allocate a PYNQ buffer for the packed output buffer
        self.obuf_packed = allocate(shape=self.oshape_packed, dtype=np.uint8)
        self.mt_node_thresholds = np.asarray(
            [[-0.00000762939453125, 0.00000762939453125]])
        self.multiply_node_const = 0.19533488154411316
        self.add_node_mat = np.asarray([
            0.7813395261764526, 0, 0, 0.7813395261764526, -0.3906697630882263,
            0
        ])
Example 6
    def __init__(self, N, bitfile):
        """Instantiate the FINN accelerator driver.
        Gets batchsize (N) as integer and path to bitfile as string."""
        self.N = N
        # input FINN DataType
        self.idt = DataType.UINT8
        # output FINN DataType
        self.odt = DataType.UINT32
        # input and output shapes
        self.ishape_normal = (N, 256)
        self.oshape_normal = (N, 9)
        self.ishape_folded = (N, 8, 32)
        self.oshape_folded = (N, 1, 9)
        self.ishape_packed = (N, 8, 32)  # datatype np.uint8
        self.oshape_packed = (N, 1, 36)  # datatype np.uint8
        # load bitfile and set up accelerator
        self.ol = Overlay(bitfile)
        self.dma = self.ol.axi_dma_0
        self.ctrl_regs = self.ol.resize_accel_0
        # neuron folding factor of output = iterations per sample
        self.itersPerSample = self.oshape_packed[-2]
        # AXI lite register offset for number of iterations
        # used by TLastMarker to signal end of transmission for AXI CDMA
        self.REG_OFFSET_NUM_ITERS = 0x10
        # set up TLastMarker with correct num. samples
        self.ctrl_regs.write(self.REG_OFFSET_NUM_ITERS,
                             self.N * self.itersPerSample)

        # allocate PYNQ buffers for the packed input and output
        self.ibuf_packed_device = allocate(shape=self.ishape_packed,
                                           dtype=np.uint8)
        self.obuf_packed_device = allocate(shape=self.oshape_packed,
                                           dtype=np.uint8)
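Running a batch through this driver then comes down to two DMA transfers on the packed buffers; a minimal sketch of an execute step (not part of the snippet above; result unpacking is omitted):

    def execute(self, packed_input):
        # copy the host-side packed batch into the contiguous device buffer
        np.copyto(self.ibuf_packed_device, packed_input)
        # stream input to the accelerator and collect the packed output
        self.dma.sendchannel.transfer(self.ibuf_packed_device)
        self.dma.recvchannel.transfer(self.obuf_packed_device)
        self.dma.sendchannel.wait()
        self.dma.recvchannel.wait()
        return np.copy(self.obuf_packed_device)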
Example 7
def benchmark_synthetic(bs, nreps):
    ibuf = pynq.allocate((bs, 3, 224, 224), dtype=np.int8, target=ol.bank0)
    obuf = pynq.allocate((bs, 5), dtype=np.uint32, target=ol.bank0)

    # Start power monitoring
    pwr_rec = setup_power_recording()
    pwr_rec.record(0.1)

    total_duration = time.monotonic()
    for i in range(nreps):
        accelerator.call(ibuf, obuf, fcbuf, bs)
    total_duration = time.monotonic() - total_duration

    # Stop the power monitoring
    pwr_rec.stop()

    latency = total_duration / nreps
    fps = int((nreps / total_duration) * bs)

    # Aggregate board/fpga power into a Pandas dataframe
    f = pwr_rec.frame
    powers = pd.DataFrame(index=f.index)
    powers['board_power'] = f['12v_aux_power'] + f['12v_pex_power']
    powers['fpga_power'] = f['vccint_power']

    return fps, latency, powers
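setup_power_recording is not shown here; on an Alveo card it can be assembled from PYNQ's pmbus helpers, whose sensor names match the dataframe columns used above. A plausible sketch (rail names are assumptions inferred from those columns):

from pynq.pmbus import DataRecorder, get_rails

def setup_power_recording():
    # one power sensor per rail; the recorded frame gets one column per
    # sensor, e.g. '12v_aux_power', '12v_pex_power', 'vccint_power'
    rails = get_rails()
    return DataRecorder(rails["12v_aux"].power,
                        rails["12v_pex"].power,
                        rails["vccint"].power)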
Example 8
 def predict(self, X, y_shape, dtype=np.float32, debug=None, profile=False, encode=None, decode=None):
     """
     Obtain the predictions of the NN implemented in the FPGA.
     Parameters:
     - X : the input vector. Should be numpy ndarray.
     - y_shape : the shape of the output vector. Needed by the accelerator to set the TLAST bit properly
                 and for sizing the output vector.
     - dtype : the data type of the elements of the input/output vectors.
               Note: it should be set depending on the interface of the accelerator; if it uses 'float'
               types for the 'data' AXI-Stream field, 'np.float32' is the correct dtype to use.
               If instead it uses 'ap_fixed<A,B>', 'np.intA' is the correct one (note that A cannot be
               an arbitrary integer value, but must be one of {..., 8, 16, 32, ...}; check the `numpy`
               doc for more info).
               In this case the encoding/decoding has to be computed by the PS. For example, for the
               'ap_fixed<16,6>' type the following two functions are the correct ones to use to
               encode/decode 'float' -> 'ap_fixed<16,6>':
               ```
                 def encode(xi):
                     return np.int16(round(xi * 2**10)) # note 2**10 = 2**(A-B)
                 def decode(yi):
                     return yi * 2**-10
                 encode_v = np.vectorize(encode) # to apply them element-wise
                 decode_v = np.vectorize(decode)
               ```
     - profile : boolean. Set it to `True` to print the performance of the algorithm in terms of `inferences/s`.
     - encode/decode: function pointers. See `dtype` section for more information.
     - return: an output array based on `np.ndarray` with a shape equal to `y_shape` and a `dtype` equal to
               the `dtype` parameter.
     """
     if profile:
         timea = datetime.now()
     if encode is not None:
         X = encode(X)
     with allocate(shape=X.shape, dtype=dtype) as input_buffer, \
          allocate(shape=y_shape, dtype=dtype) as output_buffer:
         input_buffer[:] = X 
         self.hier_0.axi_dma_0.sendchannel.transfer(input_buffer)
         self.hier_0.axi_dma_0.recvchannel.transfer(output_buffer)
         if debug:
             print("Transfer OK")
         self.hier_0.axi_dma_0.sendchannel.wait()
         if debug:
             print("Send OK")
         self.hier_0.axi_dma_0.recvchannel.wait()
         if debug:
             print("Receive OK")
         result = output_buffer.copy()
     if decode is not None:
         result = decode(result)
     if profile:
         timeb = datetime.now()
         dts, rate = self._print_dt(timea, timeb, len(X))
         return result, dts, rate
     return result
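For an ap_fixed<16,6> interface the call combines the vectorised helpers from the docstring with dtype=np.int16; a usage sketch (the overlay object nn and the output shape are placeholders):

encode_v = np.vectorize(lambda xi: np.int16(round(xi * 2**10)))  # 2**(A-B) = 2**10
decode_v = np.vectorize(lambda yi: yi * 2**-10)

y_pred = nn.predict(X, y_shape=(X.shape[0], 10), dtype=np.int16,
                    encode=encode_v, decode=decode_v)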
Example 9
    def __init__(self,
                 N,
                 bitfile,
                 platform="alveo",
                 device_name="xilinx_u50_gen3x16_xdma_201920_3"):
        """Instantiate the FINN accelerator driver.
        Gets batchsize (N) as integer and path to bitfile as string."""
        self.platform = platform
        self.N = N
        # input FINN DataType
        self.idt = DataType.UINT8
        # output FINN DataType
        self.odt = DataType.UINT8
        # input and output shapes
        self.ishape_normal = (N, 32, 32, 3)
        self.oshape_normal = (N, 1)
        self.ishape_folded = (N, 32, 32, 1, 3)
        self.oshape_folded = (N, 1, 1)
        self.ishape_packed = (N, 32, 32, 1, 3)  # datatype np.uint8
        self.oshape_packed = (N, 1, 1)  # datatype np.uint8
        # load bitfile and set up accelerator
        self.device = [i for i in Device.devices if i.name == device_name][0]
        Device.active_device = self.device
        self.ol = Overlay(bitfile)
        # neuron folding factor of output = iterations per sample
        self.itersPerSample = self.oshape_packed[-2]
        # clock frequency as specified by user
        self.fclk_mhz = 100.0
        if self.platform == "alveo":
            self.idma = self.ol.idma0
            self.odma = self.ol.odma0
        elif self.platform == "zynq-iodma":
            self.idma = self.ol.idma0
            self.odma = self.ol.odma0
            # set the clock frequency as specified by user during transformations
            if self.fclk_mhz > 0:
                Clocks.fclk0_mhz = self.fclk_mhz
        else:
            raise ValueError("Supported platforms are zynq-iodma and alveo")

        # allocate PYNQ buffers for the packed input and output
        if self.platform == "alveo":
            self.ibuf_packed_device = allocate(shape=self.ishape_packed,
                                               dtype=np.uint8)
            self.obuf_packed_device = allocate(shape=self.oshape_packed,
                                               dtype=np.uint8)
        else:
            self.ibuf_packed_device = allocate(shape=self.ishape_packed,
                                               dtype=np.uint8,
                                               cacheable=True)
            self.obuf_packed_device = allocate(shape=self.oshape_packed,
                                               dtype=np.uint8,
                                               cacheable=True)
Example 10
 def _create_buffer(
         self,
         data=np.array(
             [72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100, 33],
             dtype=np.uint8),
         eof=1,
         padding=0):
     """Create a buffer that is loaded user data. Append the Extended Barker sequence
     to the user data and then pad with zeros
     """
     self._flags = eof
     if data.size == 0:
         raise ValueError('Message size should be greater than 0.')
     msg = np.array(data, dtype=np.uint8)
     # Append Barker and Random Data
     bkr = np.array([
         0, 0, 63, 112, 28,
         len(msg) + 5, self._frame_number, self._flags, 5,
         len(msg), padding
     ],
                    dtype=np.uint8)
     rnd = np.array([randint(0, 255) for p in range(0, self.random_size)],
                    dtype=np.uint8)
     seq = np.append(bkr, msg)
     seq = np.append(rnd, seq)
     pad = np.append(
         seq,
         np.zeros(int(
             np.ceil((len(rnd) + len(bkr) + len(msg)) / 32) * 32 -
             (len(rnd) + len(bkr) + len(msg))),
                  dtype=np.uint8))
     buf = allocate(shape=(len(pad), ), dtype=np.uint8)
     buf[:] = pad[:]
     return buf
Example 11
def TestStreamMaster(baseval):

    # Set DMA to write S2MM data to a specific physical address
    output_buffer = allocate(shape=(8, ), dtype='u4')
    output_buffer[:] = 0
    dma.recvchannel.start()
    dma.recvchannel.transfer(output_buffer)

    # Configure the peripheral to send baseval to baseval + 7
    ol.AXIMasterSlaveStream_0.write(0x00, 0x02)
    ol.AXIMasterSlaveStream_0.write(0x00, 0x00)
    ol.AXIMasterSlaveStream_0.write(0x04, baseval)
    ol.AXIMasterSlaveStream_0.write(0x00, 0x01)

    dma.recvchannel.wait()
    dma.recvchannel.stop()

    # Check the data matches.
    for i in range(8):
        expected = baseval + i
        if not (output_buffer[i] == expected):
            print("TestStreamMaster: Error on output buffer " + str(i) +      \
                " Expected " + "{:x}".format(expected) +  \
                " Actual " + str(output_buffer[i]))
            return 0
    return 1
Example 12
def render_frame(ipRender, ipIDMA, ipODMA, mesh_buffers, transforms, texture_ids, output_buffer):
    mbuf_trans = []
    for i, mbuf in enumerate(mesh_buffers):
        num_faces = len(mbuf) // 24
        render_set_num_faces(ipRender, num_faces)
        mbuf_trans.append(allocate(shape=len(mbuf), dtype=np.float32))
        render_set_transform(ipRender, transforms[i])
        render_transform(ipRender, ipIDMA, ipODMA, mbuf, mbuf_trans[-1])

    for i in range(8):
        fh0 = 60 * i
        fh1 = 60 * (i + 1)
        ipRender.write(0x00C0, fh0)
        ipRender.write(0x00C4, fh1)
        render_reset_buffer(ipRender)

        for j, mbuf in enumerate(mbuf_trans):
            num_faces = len(mbuf) // 24
            render_set_num_faces(ipRender, num_faces)
            render_set_texture_id(ipRender, texture_ids[j])
            ipRender.write(0x0010, 2)
            ipRender.write(0x0000, 1)
            ipIDMA.sendchannel.transfer(mbuf)
            ipIDMA.sendchannel.wait()
            render_wait_done(ipRender)

        ipRender.write(0x0010, 3)
        ipRender.write(0x0000, 1)
        ipODMA.recvchannel.transfer(output_buffer[fh0*640:fh1*640])
        ipODMA.recvchannel.wait()
        render_wait_done(ipRender)
Example 13
def test_withfree(device):
    cache = MockCache()
    with device.check_memops(allocates=[(10, False, BUFFER_ADDRESS)]):
        with pynq.allocate((1024, 1024), 'u4', target=device) as buf:
            buf.pointer = 1234
            buf.return_to = cache
        assert cache.returns == [1234]
Example 14
    def getframe(self):
        """Retrieve a frame from the cache or create a new frame if the
        cache is empty. The freebuffer method of the returned array is
        overridden to return the object to the cache rather than freeing
        the object.

        """
        if self._cache:
            frame = allocate(
                shape=self._mode.shape, dtype='u1', cacheable=self._cacheable,
                pointer=self._cache.pop(), cache=self)
        else:
            frame = allocate(
                shape=self._mode.shape, dtype=np.uint8,
                cacheable=self._cacheable, cache=self)
        return frame
Example 15
    def allocate_buffer(self, name, num_samples, data_type="unsigned int"):
        """This method allocates the source or the destination buffers.

        Usually, the source buffer stores 32-bit samples, while the
        destination buffer stores 64-bit samples.

        Note that the numpy array has to be deep-copied before users can
        free the buffer.

        Parameters
        ----------
        name : str
            The name of the buffer, used as the key for indexing the buffers.
        num_samples : int
            The number of samples that need to be generated or captured.
        data_type : str
            The type of the data.

        Returns
        -------
        int
            The address of the source or destination buffer.

        """
        dtype = data_type
        for k, v in BYTE_WIDTH_TO_CTYPE.items():
            if v == data_type:
                dtype = BYTE_WIDTH_TO_NPTYPE[k]
                break
        buf = allocate(num_samples, dtype=dtype)
        self.buffers[name] = buf
        return buf.physical_address
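BYTE_WIDTH_TO_CTYPE and BYTE_WIDTH_TO_NPTYPE are lookup tables from the sample width in bytes to the matching C and numpy types; given the 32-bit source / 64-bit destination convention in the docstring they would look roughly like this (the actual contents in the library may differ):

BYTE_WIDTH_TO_CTYPE = {4: 'unsigned int', 8: 'unsigned long long'}
BYTE_WIDTH_TO_NPTYPE = {4: np.uint32, 8: np.uint64}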
Example 16
 def window(self, window):
     window_size = self._window_packetsize
     buffer = allocate(shape=(window_size, ), dtype=np.int32)
     self._window_address = buffer.device_address
     if window == 'rectangular':
         buffer[:] = np.int32(np.ones(window_size)[:] * 2**14)
     elif window == 'bartlett':
         buffer[:] = np.int32(np.bartlett(window_size)[:] * 2**14)
     elif window == 'blackman':
         buffer[:] = np.int32(np.blackman(window_size)[:] * 2**14)
     elif window == 'hamming':
         buffer[:] = np.int32(np.hamming(window_size)[:] * 2**14)
     elif window == 'hanning':
         buffer[:] = np.int32(np.hanning(window_size)[:] * 2**14)
     else:
         buffer[:] = np.int32(np.ones(window_size)[:] * 2**14)
         window = 'rectangular'
     self._window_transfer = 1
     while not self.window_ready:
         pass
     self._window_transfer = 0
     self._window_type = window
     self._window_squaresum = np.sum(
         (np.array(buffer, dtype=np.single) * 2**-14)**2)
     self._window_sum = np.sum((np.array(buffer, dtype=np.single)))
     buffer.freebuffer()
     self._spectrum_typescale = \
         int(struct.unpack('!i',struct.pack('!f',float((self._sample_frequency/self._decimation_factor)/(self._number_samples))))[0])
     self._spectrum_powerscale = \
         int(struct.unpack('!i',struct.pack('!f',float(1/((self._sample_frequency/self._decimation_factor)*self._window_squaresum))))[0])
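The struct.pack/struct.unpack pairs above reinterpret the bit pattern of a float32 as a signed 32-bit integer so the scale factors can be written to a register; the same trick as a stand-alone (hypothetical) helper:

import struct

def float_bits_to_int32(value):
    # same 4 bytes, different interpretation: float32 -> int32
    return int(struct.unpack('!i', struct.pack('!f', float(value)))[0])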
Example 17
    def _transfer(self):
        buff_len = self.controller.receive_size

        if buff_len > 0:
            # Create new receive buffer for message
            self._rx_buff.freebuffer()
            self._rx_buff = allocate(shape=(buff_len, ), dtype=np.uint8)

            # Prepare to receive the message
            self._dma_transfer(self._rx_buff)

            # Obtain the message
            self._message = np.array(self._rx_buff.astype(np.uint32), \
                                     dtype = np.uint8)[5:len(self._rx_buff)]

            # Set frame to allow the user to read the frame data
            self.frame = {
                "number": self._rx_buff[0],
                "flags": self._rx_buff[1],
                "length": {
                    "total": buff_len,
                    "header": self._rx_buff[2],
                    "payload": self._rx_buff[3],
                    "padding": self._rx_buff[4]
                },
                "payload": self._message
            }
Example 18
 def fft_size(self, fft_size):
     if fft_size in [8192, 4096, 2048, 1024, 512, 256, 128, 64]:
         running = False
         if self.dma_enable:
             self.dma_enable = 0
             running = True
         self.ssr_packetsize = 0
         time.sleep(0.2)
         self._dma_length = fft_size
         [self._buffer[x].freebuffer() for x in range(0, 3)]
         self._buffer = [
             allocate(shape=(self._dma_length, ), dtype=np.single)
             for x in range(0, 3)
         ]
         self._dma_bufferaddress_0 = self._buffer[0].device_address
          self._dma_bufferaddress_1 = self._buffer[1].device_address
          self._dma_bufferaddress_2 = self._buffer[2].device_address
         self._spectrum_fftselector = int(np.log2(fft_size) - 6)
         self._window_packetsize = fft_size
         self.window = self._window_type
         self.ssr_packetsize = int(fft_size / 8)
         self.plot.number_samples = fft_size
         self._number_samples = int(2**(self._spectrum_fftselector + 6))
         self._spectrum_typescale = \
             int(struct.unpack('!i',struct.pack('!f',float((self._sample_frequency/self._decimation_factor)/(self._number_samples))))[0])
         self._spectrum_powerscale = \
             int(struct.unpack('!i',struct.pack('!f',float(1/((self._sample_frequency/self._decimation_factor)*self._window_squaresum))))[0])
         if running:
             self.dma_enable = 1
Example 19
def test_allocate(device):
    with device.check_memops(allocates=[(10, False, BUFFER_ADDRESS)]):
        buf = pynq.allocate((1024, 1024), 'u4', target=device)
    assert buf.device_address == BUFFER_ADDRESS
    assert buf.physical_address == BUFFER_ADDRESS
    assert buf.bo == 10
    assert buf.coherent is False
    assert buf.cacheable is True
Example 20
    def __init__(self, width, height, frequency, bitfile_name=None, **kwargs):
        """Construct a new HoughEvaluation
        width: Required. The width of the image to be processed.
        height: Required. The height of the image to be processed.
        frequency: Required. The system frequency of the Hough architecture.
        bitfile_name: Optional. If left None, the 'zcu104_hep.bit' bundled
                      with the hough_evaluation_platform package will be used.
        """

        # Generate default bitfile name
        if bitfile_name is None:
            this_dir = os.path.dirname(__file__)
            bitfile_name = os.path.join(this_dir, 'bitstream',
                                        'zcu104_hep.bit')

        # Set FPD and LPD interface widths
        from pynq import MMIO
        fpd_cfg = MMIO(0xfd615000, 4)
        fpd_cfg.write(0, 0x00000A00)
        lpd_cfg = MMIO(0xff419000, 4)
        lpd_cfg.write(0, 0x00000000)

        # Create Overlay
        super().__init__(bitfile_name, **kwargs)

        # Set system parameters
        self.width = width
        self.height = height
        self.frequency = frequency
        self.nrho = int(
            np.ceil(np.sqrt((self.width / 2)**2 + (self.height / 2)**2)) * 2)
        self.ntheta = np.size(np.arange(0, 180))

        # Initialise system outputs
        self.time = 0
        self.pmvr = 0

        # Extract ipcore from the overlay with friendly name and initialise
        self.hpa = self.hpa_module
        self.hpa.frequency = self.frequency

        # Set up dma buffers
        self.inarray = allocate(shape=(self.height, self.width),
                                dtype=np.uint32)
        self.outarray = allocate(shape=(self.ntheta, self.nrho),
                                 dtype=np.uint32)
Example 21
    def allocate_mem(self,
                     X_shape,
                     y_shape,
                     dtype=np.float32,
                     trg_in=None,
                     trg_out=None):
        """
        Buffer allocation in the card memory
        Parameters
        ----------
        X_shape : input buffer shape.
        y_shape : output buffer shape.
        dtype   : the data type of the elements of the input/output vectors.
                  Note: it should be set depending on the interface of the accelerator; if it uses 'float'
                  types for the 'data' AXI-Stream field, 'np.float32' is the correct dtype to use.
                  If instead it uses 'ap_fixed<A,B>', 'np.intA' is the correct one (note that A cannot be
                  an arbitrary integer value, but must be one of {..., 8, 16, 32, ...}; check the `numpy`
                  doc for more info).
                  In this case the encoding/decoding has to be computed by the host machine. For example,
                  for the 'ap_fixed<16,6>' type the following two functions are the correct ones to use to
                  encode/decode 'float' -> 'ap_fixed<16,6>':
                  ```
                    def encode(xi):
                        return np.int16(round(xi * 2**10)) # note 2**10 = 2**(A-B)
                    def decode(yi):
                        return yi * 2**-10
                    encode_v = np.vectorize(encode) # to apply them element-wise
                    decode_v = np.vectorize(decode)
                  ```
        trg_in  : input buffer target memory. By default the v++ command
                  sets it to HBM[0] for the alveo-u50.
        trg_out : output buffer target memory. By default the v++ command
                  sets it to HBM[0] for the alveo-u50.

        Assigns
        -------
        input_buffer : input PYNQ buffer, must be allocated first and just once.
        output_buffer : output PYNQ buffer, must be allocated first and just once.

        """
        self.input_buffer = allocate(shape=X_shape, dtype=dtype, target=trg_in)
        self.output_buffer = allocate(shape=y_shape,
                                      dtype=dtype,
                                      target=trg_out)
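On an Alveo-U50 the target memories are typically HBM banks exposed as attributes of the overlay; a usage sketch (the wrapper/overlay/bank names are assumptions and depend on the v++ link configuration):

# hypothetical: place the input and output buffers in separate HBM banks
wrapper.allocate_mem(X_shape=(1024, 16), y_shape=(1024, 5),
                     dtype=np.float32,
                     trg_in=overlay.HBM0, trg_out=overlay.HBM1)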
Example 22
 def __init__(self,
              bitfile_name,
              x_shape,
              y_shape,
              dtype=np.float32,
              dtbo=None,
              download=True,
              ignore_version=False,
              device=None):
      super().__init__(bitfile_name,
                       dtbo=dtbo,
                       download=download,
                       ignore_version=ignore_version,
                       device=device)
     self.sendchannel = self.hier_0.axi_dma_0.sendchannel
     self.recvchannel = self.hier_0.axi_dma_0.recvchannel
     self.input_buffer = allocate(shape=x_shape, dtype=dtype)
     self.output_buffer = allocate(shape=y_shape, dtype=dtype)
Example 23
def TestStreamSlave(depth):

    # Send MM2S data from CPU Memory to Stream by DMA.

    sendBuffer = allocate(shape=(depth, ), dtype='u4')
    sendBuffer[:] = [100 + i for i in range(0, depth)]
    recvBuffer = allocate(shape=(depth, ), dtype='u4')
    recvBuffer[:] = 0

    dma.recvchannel.start()
    dma.recvchannel.transfer(recvBuffer)
    dma.sendchannel.start()
    dma.sendchannel.transfer(sendBuffer)
    dma.sendchannel.wait()
    dma.sendchannel.stop()
    dma.recvchannel.wait()
    dma.recvchannel.stop()
    CompareLists(sendBuffer, recvBuffer)
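CompareLists is not defined in the snippet; a minimal element-wise checker in the same spirit as TestStreamMaster's reporting could be (hypothetical helper):

def CompareLists(expected, actual):
    for i, (e, a) in enumerate(zip(expected, actual)):
        if e != a:
            print("CompareLists: Error at index " + str(i) +
                  " Expected " + str(e) + " Actual " + str(a))
            return 0
    return 1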
Example 24
    def __init__(self, description):
        super().__init__(description)

        # Init config register
        self.pkt_size = 16

        # Init buffer
        self.buf_dtype = np.uint32
        self.buf = allocate(shape=(16, ), dtype=np.uint32)
Example 25
 def batch_size(self, value):
     self._batch_size = value
      # free the old buffers by setting them to None
      # (reference counting should take care of it)
     if self.ibuf_packed_device is not None:
         self.ibuf_packed_device = None
     if self.obuf_packed_device is not None:
         self.obuf_packed_device = None
     if self.platform == "alveo":
         self.ibuf_packed_device = allocate(shape=self.ishape_packed, dtype=np.uint8)
         self.obuf_packed_device = allocate(shape=self.oshape_packed, dtype=np.uint8)
     else:
         self.ibuf_packed_device = allocate(
             shape=self.ishape_packed, dtype=np.uint8, cacheable=True
         )
         self.obuf_packed_device = allocate(
             shape=self.oshape_packed, dtype=np.uint8, cacheable=True
         )
     self.obuf_packed = np.empty_like(self.obuf_packed_device)
Example 26
def test_default(device):
    pynq.Device.active_device = device
    with device.check_memops(allocates=[(10, False, BUFFER_ADDRESS)]):
        buf = pynq.allocate((1024, 1024), 'u4')
    assert buf.device_address == BUFFER_ADDRESS
    assert buf.physical_address == BUFFER_ADDRESS
    assert buf.bo == 10
    assert buf.coherent is False
    assert buf.cacheable is True
    pynq.Device.active_device = None
Example 27
def setup_accelerator(xclbin, wfile):
    ol = pynq.Overlay(xclbin)

    fcweights = np.genfromtxt(wfile, delimiter=',', dtype=np.int8)
    fcbuf = pynq.allocate((1000, 2048), dtype=np.int8, target=ol.PLRAM0)

    # csv reader erroneously adds one extra element to the end, so remove it, then reshape
    fcweights = fcweights[:-1].reshape(1000, 2048)
    fcbuf[:] = fcweights
    fcbuf.sync_to_device()

    return ol, fcbuf
Example 28
 def __init__(self, description, pkt_size, buf_dtype=np.int16, buf_words_per_pkt=2):
     super().__init__(description)
     
     # Init config register
     self.reset = 1
     self.enable = 1
     self.pkt_size = pkt_size-1
     self.auto_restart = 0
     self.reset = 0
     
     # Init buffer
     self.buf = allocate(shape=(pkt_size * buf_words_per_pkt, ), dtype=np.int16)
Example 29
 def transfer(self, packetsize):
     """Returns a numpy array with inspected data of length packetsize.
     """
     transfersize = int(np.ceil(packetsize / 8))
     if transfersize > 4096 or transfersize < 2:
         raise ValueError(
             'Packet size incorrect, should be in range 16 to 32768.')
     self._pgen.packetsize = transfersize
     buffer_re = allocate(shape=(transfersize * 8, ), dtype=np.int16)
     buffer_im = allocate(shape=(transfersize * 8, ), dtype=np.int16)
     self._dma_real.recvchannel.transfer(buffer_re)
     self._dma_imag.recvchannel.transfer(buffer_im)
     self._pgen.transfer = 1
     self._dma_real.recvchannel.wait()
     self._dma_imag.recvchannel.wait()
     self._pgen.transfer = 0
     re_data = np.array(buffer_re) * 2**-15
     im_data = np.array(buffer_im) * 2**-15
     buffer_re.freebuffer()
     buffer_im.freebuffer()
     c_data = re_data.astype('double') + 1j * im_data.astype('double')
     return c_data[0:packetsize]
Example 30
 def set_shape(self, shape):
     """Set the buffer shape by first freeing the existing buffer
     and then allocating a new buffer with the given tuple. Obtain the
     tuple product to set the packetsize of the data_inspector_module.
     """
     self.buffer.freebuffer()
     lshape = list(shape)
     lshape[0] = lshape[0] * 2
     tshape = tuple(lshape)
     self.buffer = allocate(shape=tshape, dtype=np.int16)       
     product = 1 
     for i in shape:  
         product *= i
     self.data_inspector_module.packetsize = product