def __init__(self, bit_file_path):
    """Load the overlay and record the accelerator's I/O description.

    Parameters
    ----------
    bit_file_path :
        Path of the bitfile passed to ``Overlay``.
    """
    overlay = Overlay(bit_file_path)
    self.ol = overlay
    self.dma = overlay.axi_dma_0

    # Input side: FINN DataType, then the normal, folded and packed shapes.
    self.idt = DataType.UINT2
    self.ishape_normal = (1, 8)
    self.ishape_folded = (1, 1, 8)
    self.ishape_packed = (1, 1, 2)

    # Output side: FINN DataType, then the normal, folded and packed shapes.
    self.odt = DataType.INT32
    self.oshape_normal = (1, 3)
    self.oshape_folded = (1, 1, 3)
    self.oshape_packed = (1, 1, 12)

    # PYNQ buffers for the packed input and the packed output (bytes).
    byte_dtype = np.uint8
    self.ibuf_packed_device = allocate(shape=self.ishape_packed,
                                       dtype=byte_dtype)
    self.obuf_packed = allocate(shape=self.oshape_packed, dtype=byte_dtype)

    # Constants kept for the threshold, multiply and add nodes; they are
    # only stored here (presumably applied on the host -- confirm in the
    # methods that consume them).
    self.mt_node_thresholds = np.asarray(
        [[-0.01027827151119709, 0.01027827151119709]])
    self.multiply_node_const = 0.014718746766448021
    self.add_node_mat = np.asarray(
        [-0.08831246197223663, 0.17662496864795685, -0.058874987065792084])
def __init__(self, fs=1000000, N=6000):
    """Load the AGC loopback overlay and allocate its DMA buffers.

    Parameters
    ----------
    fs :
        Sample rate used to build the cached time axis (``i / fs``).
    N :
        Number of samples; sizes the cached data and every DMA buffer.
    """
    # Input parameters
    self._fs = fs
    self._N = N

    # Cache timeseries and reusable random data
    self._random_data = [rnd.randint(0, 3) for _ in range(N)]
    self._t = np.array([i / fs for i in range(N)])

    # Overlay config. os.path.join keeps the path relative when
    # dirname(__file__) is empty; plain concatenation would then yield
    # '/agc_loopback.bit', i.e. a file in the filesystem root.
    bit_name = os.path.join(os.path.dirname(__file__), 'agc_loopback.bit')
    ol = Overlay(bit_name)
    self._ol = ol

    # Avoid PYNQ's get_attr overhead by aliasing IPs
    self._agc = ol.agc
    self._dma_in_i = ol.dma_in_i
    self._dma_in_q = ol.dma_in_q
    self._dma_agc_i = ol.dma_agc_i
    self._dma_agc_q = ol.dma_agc_q
    self._dma_agc_g = ol.dma_agc_g

    # Allocate buffers: 16-bit signed I/Q data, 32-bit unsigned for the
    # third AGC stream (dma_agc_g).
    self._buf_in_i = allocate(shape=(N, ), dtype=np.int16)
    self._buf_in_q = allocate(shape=(N, ), dtype=np.int16)
    self._buf_agc_i = allocate(shape=(N, ), dtype=np.int16)
    self._buf_agc_q = allocate(shape=(N, ), dtype=np.int16)
    self._buf_agc_g = allocate(shape=(N, ), dtype=np.uint32)
def hw_inference(self, input_data_processed):
    """Run one inference on the accelerator and return its output words.

    The packed image is copied into a freshly allocated input buffer, the
    accelerator registers are programmed with the column counts, the read
    size and the 64-bit buffer addresses (split into two 32-bit halves),
    and the accelerator is started through ``self.ExecAccel()``.

    Parameters
    ----------
    input_data_processed :
        Object providing ``image_array`` and ``width``.

    Returns
    -------
    A ``np.uint64`` copy of the 128-word output buffer.
    """
    low_word = 0xffffffff

    packed_input = self.pack(input_data_processed.image_array)
    directions = 2 if self.bidirectional_enabled else 1

    self.accel_input_buffer = allocate(shape=packed_input.shape,
                                       dtype=np.uint64)
    self.accel_output_buffer = allocate(shape=(128, ), dtype=np.uint64)
    np.copyto(self.accel_input_buffer, packed_input)

    words_per_column = np.ceil(
        (self.input_bitwidth * self.input_size * 8) / 64)
    width = input_data_processed.width
    bytes_read = int(words_per_column * directions * width)

    # Register writes are kept in their original order.
    accel = self.BLSTM_CTC
    accel.numberColumns_V = width
    accel.numberColumnsTwice_V = directions * width
    accel.numberBytesRead_V = bytes_read

    in_addr = self.accel_input_buffer.physical_address
    accel.input_buffer_V_1 = in_addr & low_word
    accel.input_buffer_V_2 = (in_addr >> 32) & low_word

    out_addr = self.accel_output_buffer.physical_address
    accel.output_buffer_V_1 = out_addr & low_word
    accel.output_buffer_V_2 = (out_addr >> 32) & low_word

    start = time.time()
    self.ExecAccel()
    elapsed = time.time() - start

    print("Inference took = {} ms...".format(elapsed * 1000))
    print("Frames per second = {}".format(1 / elapsed))

    raw_words = np.frombuffer(self.accel_output_buffer, dtype=np.uint64)
    return np.copy(raw_words)
def fpga_evaluate_dance(self, imu_data):
    """Classify one IMU segment on the FPGA and return its dance label.

    The segment is preprocessed, standardised with ``self.scaler_mean`` and
    ``self.scaler_scale``, and its first 100 values are streamed to the
    accelerator after a model-select word. The 12 returned scores are
    reduced with ``argmax``.

    Parameters
    ----------
    imu_data :
        Raw data handed to ``self.preprocess_segment``; the preprocessed
        segment must provide at least 100 values.

    Returns
    -------
    The entry of ``self.dance_labels`` at the index of the highest score.
    """
    segment = self.preprocess_segment(imu_data)
    segment = (segment - self.scaler_mean) / self.scaler_scale

    # The context managers release the DMA buffers as soon as the transfer
    # is finished (also when a DMA call raises), instead of leaving that
    # to garbage collection on every call.
    with allocate(shape=(100,), dtype=np.float32) as input_buffer0, \
            allocate(shape=(12,), dtype=np.float32) as output_buffer0:
        # Send the model-select word first (value 0).
        self.choose_model[0] = 0
        dma.sendchannel.transfer(self.choose_model)
        dma.sendchannel.wait()

        # Copy the 100 input samples in one vectorised assignment.
        input_buffer0[:] = segment[:100]

        dma.sendchannel.transfer(input_buffer0)
        dma.recvchannel.transfer(output_buffer0)
        dma.sendchannel.wait()
        dma.recvchannel.wait()

        # Reduce while the buffer is still valid.
        index = np.argmax(output_buffer0, axis=-1)

    return self.dance_labels[index]
def __init__(self, bit_file_path):
    """Load the overlay and record the accelerator's I/O description.

    Parameters
    ----------
    bit_file_path :
        Path of the bitfile passed to ``Overlay``.
    """
    overlay = Overlay(bit_file_path)
    self.ol = overlay
    self.dma = overlay.axi_dma_0

    # Input side: FINN DataType, then the normal, folded and packed shapes.
    self.idt = DataType.UINT2
    self.ishape_normal = (1, 256)
    self.ishape_folded = (1, 4, 64)
    self.ishape_packed = (1, 4, 16)

    # Output side: FINN DataType, then the normal, folded and packed shapes.
    self.odt = DataType.INT32
    self.oshape_normal = (1, 6)
    self.oshape_folded = (1, 1, 6)
    self.oshape_packed = (1, 1, 24)

    # PYNQ buffers for the packed input and the packed output (bytes).
    byte_dtype = np.uint8
    self.ibuf_packed_device = allocate(shape=self.ishape_packed,
                                       dtype=byte_dtype)
    self.obuf_packed = allocate(shape=self.oshape_packed, dtype=byte_dtype)

    # Constants kept for the threshold, multiply and add nodes; they are
    # only stored here (presumably applied on the host -- confirm in the
    # methods that consume them).
    self.mt_node_thresholds = np.asarray(
        [[-0.00000762939453125, 0.00000762939453125]])
    self.multiply_node_const = 0.19533488154411316
    self.add_node_mat = np.asarray([
        0.7813395261764526, 0, 0,
        0.7813395261764526, -0.3906697630882263, 0
    ])
def __init__(self, N, bitfile):
    """Instantiate the FINN accelerator driver.

    Gets batchsize (N) as integer and path to bitfile as string.
    """
    self.N = N

    # FINN DataTypes of the accelerator input and output
    self.idt = DataType.UINT8
    self.odt = DataType.UINT32

    # input shapes: normal, folded, packed
    self.ishape_normal = (N, 256)
    self.ishape_folded = (N, 8, 32)
    self.ishape_packed = (N, 8, 32)  # datatype np.uint8

    # output shapes: normal, folded, packed
    self.oshape_normal = (N, 9)
    self.oshape_folded = (N, 1, 9)
    self.oshape_packed = (N, 1, 36)  # datatype np.uint8

    # load bitfile and set up accelerator
    overlay = Overlay(bitfile)
    self.ol = overlay
    self.dma = overlay.axi_dma_0
    self.ctrl_regs = overlay.resize_accel_0

    # neuron folding factor of output = iterations per sample
    self.itersPerSample = self.oshape_packed[-2]

    # AXI lite register offset for number of iterations
    # used by TLastMarker to signal end of transmission for AXI CDMA
    self.REG_OFFSET_NUM_ITERS = 0x10

    # set up TLastMarker with correct num. samples
    total_iterations = self.N * self.itersPerSample
    self.ctrl_regs.write(self.REG_OFFSET_NUM_ITERS, total_iterations)

    # allocate PYNQ buffers for the packed input and the packed output
    self.ibuf_packed_device = allocate(shape=self.ishape_packed,
                                       dtype=np.uint8)
    self.obuf_packed_device = allocate(shape=self.oshape_packed,
                                       dtype=np.uint8)
def benchmark_synthetic(bs, nreps):
    """Time ``nreps`` accelerator calls on synthetic data and record power.

    Parameters
    ----------
    bs :
        Batch size; sizes the input/output buffers and is passed to
        ``accelerator.call``.
    nreps :
        Number of back-to-back accelerator calls to time.

    Returns
    -------
    tuple ``(fps, latency, powers)``: frames per second (int), mean seconds
    per call, and a DataFrame with ``board_power`` and ``fpga_power``
    columns built from the recorder's frame.
    """
    ibuf = pynq.allocate((bs, 3, 224, 224), dtype=np.int8, target=ol.bank0)
    obuf = pynq.allocate((bs, 5), dtype=np.uint32, target=ol.bank0)

    # Start power monitoring
    pwr_rec = setup_power_recording()
    pwr_rec.record(0.1)
    try:
        start = time.monotonic()
        for _ in range(nreps):
            accelerator.call(ibuf, obuf, fcbuf, bs)
        total_duration = time.monotonic() - start
    finally:
        # Stop the power monitoring even when an accelerator call fails,
        # so the recorder is never left running.
        pwr_rec.stop()

    latency = total_duration / nreps
    fps = int((nreps / total_duration) * bs)

    # Aggregate board/fpga power into a Pandas dataframe
    f = pwr_rec.frame
    powers = pd.DataFrame(index=f.index)
    powers['board_power'] = f['12v_aux_power'] + f['12v_pex_power']
    powers['fpga_power'] = f['vccint_power']
    return fps, latency, powers
def predict(self, X, y_shape, dtype=np.float32, debug=None, profile=False, encode=None, decode=None):
    """
    Obtain the predictions of the NN implemented in the FPGA.

    Parameters:
    - X : the input vector. Should be a numpy ndarray.
    - y_shape : the shape of the output vector. Needed by the accelerator
        to set the TLAST bit properly and for sizing the output vector.
    - dtype : the data type of the elements of the input/output vectors.
        Note: it should be set depending on the interface of the
        accelerator; if it uses 'float' types for the 'data' AXI-Stream
        field, 'np.float32' is the correct dtype to use. If instead it uses
        'ap_fixed<A,B>', 'np.intA' is the correct one to use (note that A
        cannot be any integer value, only {..., 8, 16, 32, ...}. Check the
        `numpy` doc for more info). In this case the encoding/decoding has
        to be computed by the PS. For example, for the 'ap_fixed<16,6>'
        type the following 2 functions are the correct ones to use for
        encoding/decoding 'float' -> 'ap_fixed<16,6>':
        ```
        def encode(xi):
            return np.int16(round(xi * 2**10))  # note 2**10 = 2**(A-B)
        def decode(yi):
            return yi * 2**-10
        encode_v = np.vectorize(encode)  # to apply them element-wise
        decode_v = np.vectorize(decode)
        ```
    - debug : when truthy, print a message after each transfer stage.
    - profile : boolean. Set it to `True` to print the performance of the
        algorithm in terms of `inference/s`.
    - encode/decode : function pointers. See the `dtype` section for more
        information.
    - return : an output array based on `np.ndarray` with a shape equal to
        `y_shape` and a `dtype` equal to the namesake parameter. When
        `profile` is set, the tuple `(result, dts, rate)` is returned
        instead, with `dts` and `rate` coming from `self._print_dt`.
    """
    def report(stage):
        # Progress message, only emitted in debug mode.
        if debug:
            print(stage)

    started = datetime.now() if profile else None

    if encode is not None:
        X = encode(X)

    with allocate(shape=X.shape, dtype=dtype) as input_buffer, \
            allocate(shape=y_shape, dtype=dtype) as output_buffer:
        input_buffer[:] = X

        dma = self.hier_0.axi_dma_0
        dma.sendchannel.transfer(input_buffer)
        dma.recvchannel.transfer(output_buffer)
        report("Transfer OK")
        dma.sendchannel.wait()
        report("Send OK")
        dma.recvchannel.wait()
        report("Receive OK")

        # Copy out before the buffers are released on leaving the block.
        result = output_buffer.copy()

    if decode is not None:
        result = decode(result)

    if not profile:
        return result

    finished = datetime.now()
    dts, rate = self._print_dt(started, finished, len(X))
    return result, dts, rate
def __init__(self, N, bitfile, platform="alveo", device_name="xilinx_u50_gen3x16_xdma_201920_3"):
    """Instantiate the FINN accelerator driver.

    Gets batchsize (N) as integer and path to bitfile as string.

    Parameters
    ----------
    N :
        Batch size; leading dimension of every I/O shape.
    bitfile :
        Path of the bitfile passed to ``Overlay``.
    platform :
        Either ``"alveo"`` or ``"zynq-iodma"``.
    device_name :
        Name of the entry in ``Device.devices`` to make active.

    Raises
    ------
    ValueError
        If ``platform`` is not supported. Raised before any device or
        bitstream is touched.
    IndexError
        If no device called ``device_name`` is present.
    """
    # Fail fast: reject an unsupported platform before loading a bitstream.
    if platform not in ("alveo", "zynq-iodma"):
        raise ValueError("Supported platforms are zynq-iodma alveo")

    self.platform = platform
    self.N = N
    # input FINN DataType
    self.idt = DataType.UINT8
    # output FINN DataType
    self.odt = DataType.UINT8
    # input and output shapes
    self.ishape_normal = (N, 32, 32, 3)
    self.oshape_normal = (N, 1)
    self.ishape_folded = (N, 32, 32, 1, 3)
    self.oshape_folded = (N, 1, 1)
    self.ishape_packed = (N, 32, 32, 1, 3)  # datatype np.uint8
    self.oshape_packed = (N, 1, 1)  # datatype np.uint8

    # select the device, then load bitfile and set up accelerator
    matches = [dev for dev in Device.devices if dev.name == device_name]
    if not matches:
        # Same exception type as the bare ``[0]`` lookup used to raise,
        # but with a message that says what went wrong.
        raise IndexError(
            "No device named '{}' found; available devices: {}".format(
                device_name, [dev.name for dev in Device.devices]))
    self.device = matches[0]
    Device.active_device = self.device
    self.ol = Overlay(bitfile)

    # neuron folding factor of output = iterations per sample
    self.itersPerSample = self.oshape_packed[-2]
    # clock frequency as specified by user
    self.fclk_mhz = 100.0

    # Both supported platforms use the same pair of IODMA engines.
    self.idma = self.ol.idma0
    self.odma = self.ol.odma0
    if self.platform == "zynq-iodma" and self.fclk_mhz > 0:
        # set the clock frequency as specified by user during
        # transformations (Zynq only)
        Clocks.fclk0_mhz = self.fclk_mhz

    # allocate PYNQ buffers for the packed input and the packed output;
    # only the non-Alveo platform requests cacheable buffers
    buffer_options = {} if self.platform == "alveo" else {"cacheable": True}
    self.ibuf_packed_device = allocate(shape=self.ishape_packed,
                                       dtype=np.uint8, **buffer_options)
    self.obuf_packed_device = allocate(shape=self.oshape_packed,
                                       dtype=np.uint8, **buffer_options)
def _create_buffer(
        self,
        data=np.array(
            [72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100, 33],
            dtype=np.uint8),
        eof=1,
        padding=0):
    """Create a buffer that is loaded with user data.

    The frame is laid out as: random bytes, the Extended Barker sequence
    and header fields, the user data, then zero padding up to the next
    multiple of 32 bytes.

    Parameters
    ----------
    data :
        Message bytes; a ``np.uint8`` array or anything ``np.array`` can
        convert to one (e.g. a list of ints). Must not be empty. The
        default array is only read, never modified.
    eof :
        Value stored in ``self._flags`` and written to the header.
    padding :
        Value written to the last header field.

    Returns
    -------
    The allocated buffer holding the complete frame.

    Raises
    ------
    ValueError
        If the message is empty.
    """
    self._flags = eof

    # Convert before validating so list-like input is accepted as well.
    msg = np.array(data, dtype=np.uint8)
    if msg.size == 0:
        raise ValueError('Message size should be greater than 0.')

    # Append Barker and Random Data
    bkr = np.array([
        0, 0, 63, 112, 28,
        len(msg) + 5, self._frame_number, self._flags, 5,
        len(msg), padding
    ], dtype=np.uint8)
    rnd = np.array([randint(0, 255) for _ in range(self.random_size)],
                   dtype=np.uint8)

    # Zero-pad the frame to a multiple of 32 bytes (integer arithmetic).
    total = len(rnd) + len(bkr) + len(msg)
    zeros = np.zeros((-total) % 32, dtype=np.uint8)
    pad = np.concatenate((rnd, bkr, msg.ravel(), zeros))

    buf = allocate(shape=(len(pad), ), dtype=np.uint8)
    buf[:] = pad[:]
    return buf
def TestStreamMaster(baseval):
    """Check that the peripheral streams ``baseval`` .. ``baseval + 7``.

    An 8-word receive buffer is armed on the DMA, the peripheral is
    configured and started, and the received words are compared against
    the expected ramp.

    Returns 1 when every word matches; on the first mismatch an error
    line is printed and 0 is returned.
    """
    word_count = 8

    # Set the DMA to write the received stream data to a specific
    # physical address.
    output_buffer = allocate(shape=(word_count, ), dtype='u4')
    output_buffer[:] = 0
    receiver = dma.recvchannel
    receiver.start()
    receiver.transfer(output_buffer)

    # Configure the peripheral to send baseval to baseval + 7.
    # NOTE(review): register meanings are not visible here -- the writes
    # are replayed in the original order; confirm against the IP docs.
    peripheral = ol.AXIMasterSlaveStream_0
    for offset, value in ((0x00, 0x02), (0x00, 0x00),
                          (0x04, baseval), (0x00, 0x01)):
        peripheral.write(offset, value)

    receiver.wait()
    receiver.stop()

    # Check the data matches.
    for i in range(word_count):
        expected = baseval + i
        actual = output_buffer[i]
        if actual != expected:
            print("TestStreamMaster: Error on output buffer " + str(i) +
                  " Expected " + "{:x}".format(expected) +
                  " Actual " + str(actual))
            return 0
    return 1
def render_frame(ipRender, ipIDMA, ipODMA, mesh_buffers, transforms, texture_ids, output_buffer):
    """Transform every mesh, then render and read back the frame in bands.

    First pass: each mesh buffer is run through ``render_transform`` into a
    newly allocated float32 buffer of the same length. Second pass: for
    each of 8 bands (60 units each, written to registers 0x00C0/0x00C4)
    the render buffer is reset, every transformed mesh is streamed in
    through ``ipIDMA``, and the band is received through ``ipODMA`` into
    ``output_buffer[fh0*640:fh1*640]``.

    NOTE(review): the meaning of the values written to registers 0x0010
    and 0x0000 is not visible in this block -- confirm against the IP's
    register map.

    Parameters
    ----------
    ipRender, ipIDMA, ipODMA :
        Render core and the input/output DMA engines.
    mesh_buffers :
        Sequence of mesh buffers; face count is ``len(buffer) // 24``.
    transforms, texture_ids :
        Indexed by mesh position.
    output_buffer :
        Destination buffer, written band by band.
    """
    # Pass 1: transform each mesh into its own freshly allocated buffer.
    transformed = []
    for index, mesh in enumerate(mesh_buffers):
        render_set_num_faces(ipRender, len(mesh) // 24)
        target = allocate(shape=len(mesh), dtype=np.float32)
        transformed.append(target)
        render_set_transform(ipRender, transforms[index])
        render_transform(ipRender, ipIDMA, ipODMA, mesh, target)

    # Pass 2: render and read back one band at a time.
    for band in range(8):
        band_start = 60 * band
        band_end = 60 * (band + 1)
        ipRender.write(0x00C0, band_start)
        ipRender.write(0x00C4, band_end)
        render_reset_buffer(ipRender)

        for index, mesh in enumerate(transformed):
            render_set_num_faces(ipRender, len(mesh) // 24)
            render_set_texture_id(ipRender, texture_ids[index])
            ipRender.write(0x0010, 2)
            ipRender.write(0x0000, 1)
            ipIDMA.sendchannel.transfer(mesh)
            ipIDMA.sendchannel.wait()
            render_wait_done(ipRender)

        ipRender.write(0x0010, 3)
        ipRender.write(0x0000, 1)
        band_view = output_buffer[band_start * 640:band_end * 640]
        ipODMA.recvchannel.transfer(band_view)
        ipODMA.recvchannel.wait()
        render_wait_done(ipRender)
def test_withfree(device):
    """Leaving the ``with`` block hands the buffer pointer to its cache."""
    cache = MockCache()
    expected_allocations = [(10, False, BUFFER_ADDRESS)]
    with device.check_memops(allocates=expected_allocations), \
            pynq.allocate((1024, 1024), 'u4', target=device) as buf:
        # Redirect the release: the pointer should go to the mock cache
        # when the buffer's context exits.
        buf.pointer = 1234
        buf.return_to = cache
    assert cache.returns == [1234]
def getframe(self):
    """Retrieve a frame from the cache or create a new frame if the
    cache is empty.

    The freebuffer method of the returned array is overridden to return
    the object to the cache rather than freeing the object.
    """
    if not self._cache:
        # Nothing to recycle: allocate a brand-new frame.
        return allocate(
            shape=self._mode.shape, dtype=np.uint8,
            cacheable=self._cacheable, cache=self)

    # Reuse the most recently cached pointer for the new frame.
    return allocate(
        shape=self._mode.shape, dtype='u1',
        cacheable=self._cacheable, pointer=self._cache.pop(), cache=self)
def allocate_buffer(self, name, num_samples, data_type="unsigned int"):
    """Allocate a source or destination buffer and register it by name.

    Usually, the source buffer stores 32-bit samples, while the
    destination buffer stores 64-bit samples. Note that the numpy array
    has to be deep-copied before users can free the buffer.

    Parameters
    ----------
    name : str
        The name of the buffer, used as its key in `self.buffers`.
    num_samples : int
        The number of samples that needs to be generated or captured.
    data_type : str
        The type of the data. A value found in `BYTE_WIDTH_TO_CTYPE` is
        mapped to the numpy type of the same byte width; any other value
        is passed to `allocate` unchanged.

    Returns
    -------
    int
        The address of the source or destination buffer.

    """
    matching_widths = [width
                       for width, ctype in BYTE_WIDTH_TO_CTYPE.items()
                       if ctype == data_type]
    if matching_widths:
        # First match wins, as with a break on the first hit.
        dtype = BYTE_WIDTH_TO_NPTYPE[matching_widths[0]]
    else:
        dtype = data_type

    buf = allocate(num_samples, dtype=dtype)
    self.buffers[name] = buf
    return buf.physical_address
def window(self, window):
    """Load the coefficients of the named window into the hardware.

    The coefficients are scaled by 2**14, written to a temporary int32
    buffer and handed to the hardware by raising ``_window_transfer`` and
    polling ``window_ready``. The window's sum and sum of squares are
    cached and the spectrum scale registers are recomputed.

    Parameters
    ----------
    window :
        One of 'rectangular', 'bartlett', 'blackman', 'hamming' or
        'hanning'. Any other value falls back to 'rectangular'.
    """
    window_functions = {
        'rectangular': np.ones,
        'bartlett': np.bartlett,
        'blackman': np.blackman,
        'hamming': np.hamming,
        'hanning': np.hanning,
    }
    generator = (window_functions.get(window)
                 if isinstance(window, str) else None)
    if generator is None:
        # Unknown name: fall back to the rectangular window.
        generator, window = np.ones, 'rectangular'

    def float_bits(value):
        # Reinterpret the IEEE-754 single-precision pattern as an integer.
        return int(struct.unpack('!i', struct.pack('!f', float(value)))[0])

    window_size = self._window_packetsize
    buffer = allocate(shape=(window_size, ), dtype=np.int32)
    try:
        self._window_address = buffer.device_address
        buffer[:] = np.int32(generator(window_size) * 2**14)

        # Hand the coefficients over and wait for the hardware.
        self._window_transfer = 1
        while not self.window_ready:
            pass
        self._window_transfer = 0

        self._window_type = window
        coefficients = np.array(buffer, dtype=np.single)
        self._window_squaresum = np.sum((coefficients * 2**-14)**2)
        self._window_sum = np.sum(coefficients)
    finally:
        # Always release the buffer, also when the wait is interrupted
        # or an earlier step raises.
        buffer.freebuffer()

    rate = self._sample_frequency / self._decimation_factor
    self._spectrum_typescale = float_bits(rate / (self._number_samples))
    self._spectrum_powerscale = float_bits(
        1 / (rate * self._window_squaresum))
def _transfer(self):
    """Receive one pending message and publish it as ``self.frame``.

    Does nothing unless ``self.controller.receive_size`` is positive.
    Otherwise the receive buffer is replaced by one of that size, filled
    through ``self._dma_transfer``, and split into five header bytes and
    the remaining bytes, which are stored in ``self._message``.
    """
    buff_len = self.controller.receive_size
    if not buff_len > 0:
        return

    # Create new receive buffer for message
    self._rx_buff.freebuffer()
    rx = allocate(shape=(buff_len, ), dtype=np.uint8)
    self._rx_buff = rx

    # Prepare to receive the message
    self._dma_transfer(rx)

    # Obtain the message: everything after the five header bytes
    widened = rx.astype(np.uint32)
    self._message = np.array(widened, dtype=np.uint8)[5:len(rx)]

    # Set frame to allow the user to read the frame data
    lengths = {
        "total": buff_len,
        "header": rx[2],
        "payload": rx[3],
        "padding": rx[4]
    }
    self.frame = {
        "number": rx[0],
        "flags": rx[1],
        "length": lengths,
        "payload": self._message
    }
def fft_size(self, fft_size):
    """Change the FFT size and rebuild everything that depends on it.

    Sizes other than the supported powers of two (64 .. 8192) are
    ignored. For a supported size the DMA is paused if it was running,
    the three DMA buffers are reallocated, the window is reloaded for the
    new length, the scale registers are recomputed, and the DMA is
    re-enabled if it was running before.

    Parameters
    ----------
    fft_size :
        Requested FFT length.
    """
    if fft_size not in [8192, 4096, 2048, 1024, 512, 256, 128, 64]:
        return

    def float_bits(value):
        # Reinterpret the IEEE-754 single-precision pattern as an integer.
        return int(struct.unpack('!i', struct.pack('!f', float(value)))[0])

    # Pause the DMA while the buffers are swapped.
    running = False
    if self.dma_enable:
        self.dma_enable = 0
        running = True
    self.ssr_packetsize = 0
    time.sleep(0.2)

    self._dma_length = fft_size
    for index in range(3):
        self._buffer[index].freebuffer()
    self._buffer = [
        allocate(shape=(self._dma_length, ), dtype=np.single)
        for _ in range(3)
    ]
    # Each address register gets its own buffer. All three used to be
    # pointed at buffer 0, leaving buffers 1 and 2 allocated but unused.
    self._dma_bufferaddress_0 = self._buffer[0].device_address
    self._dma_bufferaddress_1 = self._buffer[1].device_address
    self._dma_bufferaddress_2 = self._buffer[2].device_address

    self._spectrum_fftselector = int(np.log2(fft_size) - 6)
    self._window_packetsize = fft_size
    # Re-assigning the window reloads it for the new packet size.
    self.window = self._window_type
    self.ssr_packetsize = int(fft_size / 8)
    self.plot.number_samples = fft_size
    self._number_samples = int(2**(self._spectrum_fftselector + 6))

    rate = self._sample_frequency / self._decimation_factor
    self._spectrum_typescale = float_bits(rate / (self._number_samples))
    self._spectrum_powerscale = float_bits(
        1 / (rate * self._window_squaresum))

    if running:
        self.dma_enable = 1
def test_allocate(device):
    """Allocating with an explicit target reports the mocked buffer."""
    expected_allocations = [(10, False, BUFFER_ADDRESS)]
    with device.check_memops(allocates=expected_allocations):
        buf = pynq.allocate((1024, 1024), 'u4', target=device)

    # Both address views point at the mocked allocation.
    assert buf.device_address == BUFFER_ADDRESS
    assert buf.physical_address == BUFFER_ADDRESS
    # Buffer-object handle and memory attributes.
    assert buf.bo == 10
    assert buf.coherent is False
    assert buf.cacheable is True
def __init__(self, width, height, frequency, bitfile_name=None, **kwargs):
    """Construct a new HoughEvaluation

    width: Required. The width of the image to be processed.

    height: Required. The height of the image to be processed.

    frequency: Required. The system frequency of the Hough architecture.

    bitfile_name: Optional. If left None, the 'zcu104_hep.bit' bundled
    with the hough_evaluation_platform package will be used.

    Remaining keyword arguments are forwarded to the parent constructor.
    """
    from pynq import MMIO

    # Fall back to the bitstream bundled next to this module.
    if bitfile_name is None:
        package_dir = os.path.dirname(__file__)
        bitfile_name = os.path.join(package_dir, 'bitstream',
                                    'zcu104_hep.bit')

    # Set FPD and LPD interface widths (done before the overlay is created)
    for base_address, value in ((0xfd615000, 0x00000A00),
                                (0xff419000, 0x00000000)):
        MMIO(base_address, 4).write(0, value)

    # Create Overlay
    super().__init__(bitfile_name, **kwargs)

    # Set system parameters
    self.width = width
    self.height = height
    self.frequency = frequency
    half_diagonal = np.sqrt((self.width / 2)**2 + (self.height / 2)**2)
    self.nrho = int(np.ceil(half_diagonal) * 2)
    self.ntheta = np.size(np.arange(0, 180))

    # Initialise system outputs
    self.time = 0
    self.pmvr = 0

    # Extract ipcore from the overlay with friendly name and initialise
    self.hpa = self.hpa_module
    self.hpa.frequency = self.frequency

    # Set up dma buffers: image in, (theta, rho) accumulator out
    self.inarray = allocate(shape=(self.height, self.width),
                            dtype=np.uint32)
    self.outarray = allocate(shape=(self.ntheta, self.nrho),
                             dtype=np.uint32)
def allocate_mem(self, X_shape, y_shape, dtype=np.float32, trg_in=None, trg_out=None):
    """
    Buffer allocation in the card memory

    Parameters
    ----------
    X_shape : input buffer shape.
    y_shape : output buffer shape.
    dtype : the data type of the elements of the input/output vectors.
        Note: it should be set depending on the interface of the
        accelerator; if it uses 'float' types for the 'data' AXI-Stream
        field, 'np.float32' is the correct dtype to use. If instead it uses
        'ap_fixed<A,B>', 'np.intA' is the correct one to use (note that A
        cannot be any integer value, only {..., 8, 16, 32, ...}. Check the
        `numpy` doc for more info). In this case the encoding/decoding has
        to be computed by the host machine. For example, for the
        'ap_fixed<16,6>' type the following 2 functions are the correct
        ones to use for encoding/decoding 'float' -> 'ap_fixed<16,6>':
        ```
        def encode(xi):
            return np.int16(round(xi * 2**10))  # note 2**10 = 2**(A-B)
        def decode(yi):
            return yi * 2**-10
        encode_v = np.vectorize(encode)  # to apply them element-wise
        decode_v = np.vectorize(decode)
        ```
    trg_in : input buffer target memory. By default the v++ command sets
        it to HBM[0] for alveo-u50.
    trg_out : output buffer target memory. By default the v++ command sets
        it to HBM[0] for alveo-u50.

    Assigns
    -------
    input_buffer : input PYNQ buffer, must be allocated first and just
        once.
    output_buffer : output PYNQ buffer, must be allocated first and just
        once.
    """
    # Input first, then output, each in its own target memory.
    input_spec = dict(shape=X_shape, dtype=dtype, target=trg_in)
    self.input_buffer = allocate(**input_spec)

    output_spec = dict(shape=y_shape, dtype=dtype, target=trg_out)
    self.output_buffer = allocate(**output_spec)
def __init__(self, bitfile_name, x_shape, y_shape, dtype=np.float32, dtbo=None, download=True, ignore_version=False, device=None):
    """Load the overlay and allocate its input and output DMA buffers.

    Parameters
    ----------
    bitfile_name :
        Bitstream passed to the parent constructor.
    x_shape, y_shape :
        Shapes of the input and output buffers.
    dtype :
        Element type of both buffers.
    dtbo, download, ignore_version, device :
        Forwarded unchanged to the parent constructor.
    """
    # Forward the caller's options. They used to be replaced by hard-coded
    # defaults, so e.g. ``download=False`` or an explicit ``device`` was
    # silently ignored.
    super().__init__(bitfile_name, dtbo=dtbo, download=download,
                     ignore_version=ignore_version, device=device)

    self.sendchannel = self.hier_0.axi_dma_0.sendchannel
    self.recvchannel = self.hier_0.axi_dma_0.recvchannel

    self.input_buffer = allocate(shape=x_shape, dtype=dtype)
    self.output_buffer = allocate(shape=y_shape, dtype=dtype)
def TestStreamSlave(depth):
    """Loop ``depth`` words through the stream and compare the result.

    The words 100, 101, ... are sent from CPU memory to the stream by DMA,
    the same number of words is received back, and both buffers are handed
    to ``CompareLists``.
    """
    # Send MM2S data from CPU Memory to Stream by DMA.
    outgoing = allocate(shape=(depth, ), dtype='u4')
    outgoing[:] = list(range(100, 100 + depth))

    incoming = allocate(shape=(depth, ), dtype='u4')
    incoming[:] = 0

    receiver = dma.recvchannel
    sender = dma.sendchannel

    # Arm the receive side before anything is sent.
    receiver.start()
    receiver.transfer(incoming)

    sender.start()
    sender.transfer(outgoing)
    sender.wait()
    sender.stop()

    receiver.wait()
    receiver.stop()

    CompareLists(outgoing, incoming)
def __init__(self, description):
    """Initialise the driver with a 16-word ``np.uint32`` buffer.

    Parameters
    ----------
    description :
        Passed unchanged to the parent constructor.
    """
    super().__init__(description)

    words = 16
    word_dtype = np.uint32

    # Init config register
    self.pkt_size = words

    # Init buffer
    self.buf_dtype = word_dtype
    self.buf = allocate(shape=(words, ), dtype=word_dtype)
def batch_size(self, value):
    """Store the new batch size and reallocate the packed I/O buffers.

    The old device buffers are dropped first, then new ones are allocated
    from ``ishape_packed`` / ``oshape_packed``, and ``obuf_packed`` is
    recreated as an empty array shaped like the new output device buffer.
    """
    self._batch_size = value

    # free the old buffers by setting to None
    # (reference counting should take care of it)
    if self.ibuf_packed_device is not None:
        self.ibuf_packed_device = None
    if self.obuf_packed_device is not None:
        self.obuf_packed_device = None

    # only non-Alveo platforms request cacheable buffers
    extra_options = {}
    if self.platform != "alveo":
        extra_options["cacheable"] = True

    self.ibuf_packed_device = allocate(
        shape=self.ishape_packed, dtype=np.uint8, **extra_options)
    self.obuf_packed_device = allocate(
        shape=self.oshape_packed, dtype=np.uint8, **extra_options)

    self.obuf_packed = np.empty_like(self.obuf_packed_device)
def test_default(device):
    """Allocating without a target uses ``pynq.Device.active_device``."""
    pynq.Device.active_device = device
    try:
        with device.check_memops(allocates=[(10, False, BUFFER_ADDRESS)]):
            buf = pynq.allocate((1024, 1024), 'u4')
        assert buf.device_address == BUFFER_ADDRESS
        assert buf.physical_address == BUFFER_ADDRESS
        assert buf.bo == 10
        assert buf.coherent is False
        assert buf.cacheable is True
    finally:
        # Always restore the global, so a failing assertion cannot leak
        # this test's device into later tests.
        pynq.Device.active_device = None
def setup_accelerator(xclbin, wfile):
    """Load the overlay and upload the weight matrix from a CSV file.

    Parameters
    ----------
    xclbin :
        Bitstream passed to ``pynq.Overlay``.
    wfile :
        Comma-separated weight file read as ``np.int8``.

    Returns
    -------
    tuple ``(ol, fcbuf)``: the overlay and the (1000, 2048) weight buffer,
    already synchronised to the device.
    """
    weight_shape = (1000, 2048)

    ol = pynq.Overlay(xclbin)
    raw_weights = np.genfromtxt(wfile, delimiter=',', dtype=np.int8)
    fcbuf = pynq.allocate(weight_shape, dtype=np.int8, target=ol.PLRAM0)

    # csv reader erroneously adds one extra element to the end, so remove
    # it, then reshape
    fcbuf[:] = raw_weights[:-1].reshape(weight_shape)
    fcbuf.sync_to_device()
    return ol, fcbuf
def __init__(self, description, pkt_size, buf_dtype=np.int16, buf_words_per_pkt=2):
    """Configure the packet registers and allocate the packet buffer.

    Parameters
    ----------
    description :
        Passed unchanged to the parent constructor.
    pkt_size :
        Packet size; ``pkt_size - 1`` is written to ``self.pkt_size``.
    buf_dtype :
        Element type of the buffer.
    buf_words_per_pkt :
        Buffer elements per packet word; the buffer holds
        ``pkt_size * buf_words_per_pkt`` elements.
    """
    super().__init__(description)

    # Init config register (assignments kept in their original order)
    self.reset = 1
    self.enable = 1
    self.pkt_size = pkt_size-1
    self.auto_restart = 0
    self.reset = 0

    # Init buffer with the requested element type. The dtype used to be
    # hard-coded to np.int16, silently ignoring ``buf_dtype``.
    self.buf = allocate(shape=(pkt_size * buf_words_per_pkt, ),
                        dtype=buf_dtype)
def transfer(self, packetsize):
    """Returns a numpy array with inspected data of length packetsize.

    The real and imaginary parts are received over two DMA channels as
    int16 samples, scaled by 2**-15 and combined into one complex array.

    Parameters
    ----------
    packetsize :
        Number of complex samples to return. The transfer itself is
        rounded up to a multiple of 8 samples.

    Raises
    ------
    ValueError
        If ``ceil(packetsize / 8)`` is below 2 or above 4096.
    """
    transfersize = int(np.ceil(packetsize / 8))
    if transfersize > 4096 or transfersize < 2:
        raise ValueError(
            'Packet size incorrect, should be in range 16 to 32768.')
    self._pgen.packetsize = transfersize

    sample_count = transfersize * 8
    buffer_re = allocate(shape=(sample_count, ), dtype=np.int16)
    buffer_im = allocate(shape=(sample_count, ), dtype=np.int16)
    try:
        self._dma_real.recvchannel.transfer(buffer_re)
        self._dma_imag.recvchannel.transfer(buffer_im)
        self._pgen.transfer = 1
        self._dma_real.recvchannel.wait()
        self._dma_imag.recvchannel.wait()
        self._pgen.transfer = 0

        # Copy out of the DMA buffers (float64 after scaling) before
        # they are released.
        re_data = np.array(buffer_re) * 2**-15
        im_data = np.array(buffer_im) * 2**-15
    finally:
        # Release the buffers even when a DMA call raises.
        buffer_re.freebuffer()
        buffer_im.freebuffer()

    c_data = re_data + 1j * im_data
    return c_data[0:packetsize]
def set_shape(self, shape):
    """Replace the buffer with one sized for the given shape tuple.

    The existing buffer is freed first. The new int16 buffer uses the
    given shape with its first dimension doubled. The product of the
    original shape's entries is written to the data_inspector_module's
    packetsize.
    """
    self.buffer.freebuffer()

    # Double the leading dimension for the allocation only.
    doubled = list(shape)
    doubled[0] = doubled[0] * 2
    self.buffer = allocate(shape=tuple(doubled), dtype=np.int16)

    # The packet size is the element count of the *requested* shape.
    element_count = 1
    for dimension in shape:
        element_count *= dimension
    self.data_inspector_module.packetsize = element_count