class User(Elaboratable):
    """Demo user logic.

    Holds a 32-byte RX buffer and a TX buffer pre-loaded with a greeting;
    mirrors the low bits of the first received byte onto two LEDs, and
    requests one transmission after each received packet.
    """

    def __init__(self):
        # RX buffer (written externally) and TX buffer (read externally),
        # pre-initialised with the greeting string.
        self.user_rx_mem = Memory(width=8, depth=32)
        self.user_tx_mem = Memory(width=8, depth=32,
                                  init=[ord(x) for x in "Hello, World!!\r\n"])

        # Ports handed to external logic.
        self.mem_r_port = self.user_tx_mem.read_port()
        self.mem_w_port = self.user_rx_mem.write_port()

        # Control strobes.
        self.packet_received = Signal()
        self.transmit_ready = Signal()
        self.transmit_packet = Signal()

    def elaborate(self, platform):
        m = Module()

        # Local ports: read side of the RX buffer, (tied-off) write side
        # of the TX buffer.
        rx_read = self.user_rx_mem.read_port()
        tx_write = self.user_tx_mem.write_port()
        m.submodules += [self.mem_r_port, self.mem_w_port, rx_read, tx_write]

        led1 = platform.request("user_led", 0)
        led2 = platform.request("user_led", 1)

        # The TX memory is never written from here; hold its write port idle.
        m.d.comb += tx_write.addr.eq(0)
        m.d.comb += tx_write.en.eq(0)
        m.d.comb += tx_write.data.eq(0)

        # Always read byte 0 of the RX buffer.
        m.d.comb += rx_read.addr.eq(0)

        # Mirror the two low bits of that byte onto the LEDs.
        m.d.sync += led1.eq(rx_read.data & 1)
        m.d.sync += led2.eq((rx_read.data & 2) >> 1)

        # One transmit request per received packet: wait for a packet,
        # then pulse transmit_packet once the transmitter is ready.
        with m.FSM():
            with m.State("IDLE"):
                m.d.sync += self.transmit_packet.eq(0)
                with m.If(self.packet_received):
                    m.next = "RX"
            with m.State("RX"):
                with m.If(self.transmit_ready):
                    m.d.sync += self.transmit_packet.eq(1)
                    m.next = "IDLE"

        return m
def elab(self, m):
    """Instantiate the backing memory and wire its ports to the module's
    external read/write interface signals."""
    storage = Memory(width=self.width, depth=self.depth,
                     simulate=self.is_sim)
    m.submodules['wp'] = write_port = storage.write_port()
    m.submodules['rp'] = read_port = storage.read_port(transparent=False)

    # Write side: driven directly from the external write interface.
    m.d.comb += write_port.en.eq(self.w_en)
    m.d.comb += write_port.addr.eq(self.w_addr)
    m.d.comb += write_port.data.eq(self.w_data)

    # Read side: always enabled; being non-transparent, read data appears
    # the cycle after the address is presented.
    m.d.comb += read_port.en.eq(1)
    m.d.comb += read_port.addr.eq(self.r_addr)
    m.d.comb += self.r_data.eq(read_port.data)
def test_rmii_rx():
    """Simulate RMIIRx receiving two back-to-back 102-byte frames and check
    the reported length/offset plus the bytes written to packet memory."""
    import random
    from nmigen.back import pysim
    from nmigen import Memory

    # RMII pins driven by the testbench.
    crs_dv = Signal()
    rxd0 = Signal()
    rxd1 = Signal()

    # 128-byte packet memory the receiver writes into (a ring buffer, so
    # the second frame wraps around).
    mem = Memory(8, 128)
    mem_port = mem.write_port()

    mac_addr = [random.randint(0, 255) for _ in range(6)]

    rmii_rx = RMIIRx(mac_addr, mem_port, crs_dv, rxd0, rxd1)

    def testbench():
        def tx_packet():
            # Assert carrier sense / data valid for the whole frame.
            yield (crs_dv.eq(1))
            # Preamble (variable length; two bits per clock: rxd1:rxd0)
            for _ in range(random.randint(10, 40)):
                yield (rxd0.eq(1))
                yield (rxd1.eq(0))
                yield
            # SFD
            yield (rxd0.eq(1))
            yield (rxd1.eq(1))
            yield
            # Data: each byte is sent LSB-first as four di-bits.
            for txbyte in txbytes:
                for dibit in range(0, 8, 2):
                    yield (rxd0.eq((txbyte >> (dibit + 0)) & 1))
                    yield (rxd1.eq((txbyte >> (dibit + 1)) & 1))
                    yield
            yield (crs_dv.eq(0))
            # Finish clocking
            for _ in range(6):
                yield

        # Let the design settle before the first frame.
        for _ in range(10):
            yield

        # 102-byte test frame (broadcast Ethernet header + payload;
        # presumably the last 4 bytes are the FCS — verify against RMIIRx).
        txbytes = [
            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xF0, 0xDE, 0xF1, 0x38,
            0x89, 0x40, 0x08, 0x00, 0x45, 0x00, 0x00, 0x54, 0x00, 0x00,
            0x40, 0x00, 0x40, 0x01, 0xB6, 0xD0, 0xC0, 0xA8, 0x01, 0x88,
            0xC0, 0xA8, 0x01, 0x00, 0x08, 0x00, 0x0D, 0xD9, 0x12, 0x1E,
            0x00, 0x07, 0x3B, 0x3E, 0x0C, 0x5C, 0x00, 0x00, 0x00, 0x00,
            0x13, 0x03, 0x0F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x57,
            0x6F, 0x72, 0x6C, 0x64, 0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20,
            0x57, 0x6F, 0x72, 0x6C, 0x64, 0x48, 0x65, 0x6C, 0x6C, 0x6F,
            0x20, 0x57, 0x6F, 0x72, 0x6C, 0x64, 0x48, 0x65, 0x6C, 0x6C,
            0x6F, 0x20, 0x57, 0x6F, 0x72, 0x6C, 0x64, 0x48, 0x52, 0x32,
            0x1F, 0x9E
        ]

        # Transmit first packet
        yield from tx_packet()

        # Check packet was received
        assert (yield rmii_rx.rx_valid)
        assert (yield rmii_rx.rx_len) == 102
        assert (yield rmii_rx.rx_offset) == 0
        mem_contents = []
        for idx in range(102):
            mem_contents.append((yield mem[idx]))
        assert mem_contents == txbytes

        # Pause (inter-frame gap)
        for _ in range(20):
            yield
        assert (yield rmii_rx.rx_valid) == 0

        # Transmit a second packet
        yield from tx_packet()

        # Check packet was received; this frame starts at offset 102 and
        # wraps around the 128-byte ring, hence the modulo indexing.
        assert (yield rmii_rx.rx_valid)
        assert (yield rmii_rx.rx_len) == 102
        assert (yield rmii_rx.rx_offset) == 102
        mem_contents = []
        for idx in range(102):
            mem_contents.append((yield mem[(102 + idx) % 128]))
        assert mem_contents == txbytes
        yield

    mod = Module()
    mod.submodules += rmii_rx, mem_port

    # Run the testbench against a 50 MHz clock, dumping a VCD trace.
    vcdf = open("rmii_rx.vcd", "w")
    with pysim.Simulator(mod, vcd_file=vcdf) as sim:
        sim.add_clock(1 / 50e6)
        sim.add_sync_process(testbench())
        sim.run()
class MAC(Elaboratable):
    """
    Ethernet RMII MAC.

    Clock domain:
        This module is clocked at the system clock frequency and generates
        an RMII clock domain internally. All its inputs and outputs are in
        the system clock domain.

    Parameters:
        * `clk_freq`: MAC's clock frequency
        * `phy_addr`: 5-bit address of the PHY
        * `mac_addr`: MAC address in standard XX:XX:XX:XX:XX:XX format

    Memory Ports:
        * `rx_port`: Read port into RX packet memory, 8 bytes by 2048 cells.
        * `tx_port`: Write port into TX packet memory, 8 bytes by 2048 cells.

    Pins:
        * `rmii`: signal group containing txd0, txd1, txen, rxd0, rxd1,
                  crs_dv, ref_clk
        * `mdio`: signal group containing mdc, mdio
        * `phy_rst`: PHY RST pin (output, active low)
        * `eth_led`: Ethernet LED, active high, pulsed on packet traffic

    TX port:
        * `tx_start`: Pulse high to begin transmission of a packet from memory
        * `tx_len`: 11-bit length of packet to transmit
        * `tx_offset`: n-bit address offset of packet to transmit,
                       with n=log2(tx_buf_size)

    RX port:
        * `rx_valid`: Held high while `rx_len` and `rx_offset` are valid
        * `rx_len`: 11-bit length of received packet
        * `rx_offset`: n-bit address offset of received packet,
                       with n=log2(rx_buf_size)
        * `rx_ack`: Pulse high to acknowledge packet receipt

    Inputs:
        * `phy_reset`: Assert to reset the PHY, de-assert for normal operation

    Outputs:
        * `link_up`: High while link is established
    """
    def __init__(self, clk_freq, phy_addr, mac_addr, rmii, mdio, phy_rst,
                 eth_led, tx_buf_size=2048, rx_buf_size=2048):
        # Memory Ports
        self.rx_port = None  # Assigned below
        self.tx_port = None  # Assigned below

        # TX port
        self.tx_start = Signal()
        self.tx_len = Signal(11)
        self.tx_offset = Signal(max=tx_buf_size - 1)

        # RX port
        self.rx_ack = Signal()
        self.rx_valid = Signal()
        self.rx_len = Signal(11)
        self.rx_offset = Signal(max=rx_buf_size - 1)

        # Inputs
        self.phy_reset = Signal()

        # Outputs
        self.link_up = Signal()

        self.clk_freq = clk_freq
        self.phy_addr = phy_addr
        # Parse "XX:XX:XX:XX:XX:XX" into a list of six byte values.
        self.mac_addr = [int(x, 16) for x in mac_addr.split(":")]
        self.rmii = rmii
        self.mdio = mdio
        self.phy_rst = phy_rst
        self.eth_led = eth_led

        # Create packet memories and interface ports.
        # The system-clock-side ports are created here so callers can
        # connect to them; the RMII-side ports are created in elaborate().
        self.tx_mem = Memory(8, tx_buf_size)
        self.tx_port = self.tx_mem.write_port()
        self.rx_mem = Memory(8, rx_buf_size)
        self.rx_port = self.rx_mem.read_port(transparent=False)

    def elaborate(self, platform):
        m = Module()

        # Create RMII clock domain from RMII clock input
        cd = ClockDomain("rmii", reset_less=True)
        m.d.comb += cd.clk.eq(self.rmii.ref_clk)
        m.domains.rmii = cd

        # Create RX write and TX read ports for RMII use.
        # The packet memories are thus dual-clock: one side in "sync",
        # the other in "rmii".
        rx_port_w = self.rx_mem.write_port(domain="rmii")
        tx_port_r = self.tx_mem.read_port(domain="rmii", transparent=False)
        m.submodules += [self.rx_port, rx_port_w, self.tx_port, tx_port_r]
        m.d.comb += [self.rx_port.en.eq(1), tx_port_r.en.eq(1)]

        # Create submodules for PHY management and RMII RX/TX engines.
        m.submodules.phy_manager = phy_manager = PHYManager(
            self.clk_freq, self.phy_addr, self.phy_rst, self.mdio.mdio,
            self.mdio.mdc)
        m.submodules.stretch = stretch = PulseStretch(int(1e6))
        rmii_rx = RMIIRx(self.mac_addr, rx_port_w, self.rmii.crs_dv,
                         self.rmii.rxd0, self.rmii.rxd1)
        rmii_tx = RMIITx(tx_port_r, self.rmii.txen, self.rmii.txd0,
                         self.rmii.txd1)

        # Create FIFOs to interface to RMII modules: each entry packs
        # (offset, len) for one packet and crosses the sync/rmii boundary.
        rx_fifo = AsyncFIFO(width=11 + self.rx_port.addr.nbits, depth=4)
        tx_fifo = AsyncFIFO(width=11 + self.tx_port.addr.nbits, depth=4)

        m.d.comb += [
            # RX FIFO: written by the RMII receiver, read by the user.
            rx_fifo.din.eq(Cat(rmii_rx.rx_offset, rmii_rx.rx_len)),
            rx_fifo.we.eq(rmii_rx.rx_valid),
            Cat(self.rx_offset, self.rx_len).eq(rx_fifo.dout),
            rx_fifo.re.eq(self.rx_ack),
            self.rx_valid.eq(rx_fifo.readable),

            # TX FIFO: written by the user, read by the RMII transmitter.
            tx_fifo.din.eq(Cat(self.tx_offset, self.tx_len)),
            tx_fifo.we.eq(self.tx_start),
            Cat(rmii_tx.tx_offset, rmii_tx.tx_len).eq(tx_fifo.dout),
            tx_fifo.re.eq(rmii_tx.tx_ready),
            rmii_tx.tx_start.eq(tx_fifo.readable),

            # Other submodules
            phy_manager.phy_reset.eq(self.phy_reset),
            self.link_up.eq(phy_manager.link_up),
            stretch.trigger.eq(self.rx_valid),
            self.eth_led.eq(stretch.pulse),
        ]

        # Place each FIFO side and the RMII engines in their clock domains.
        rdr = DomainRenamer({"read": "sync", "write": "rmii"})
        wdr = DomainRenamer({"write": "sync", "read": "rmii"})
        rr = DomainRenamer("rmii")

        m.submodules.rx_fifo = rdr(rx_fifo)
        m.submodules.tx_fifo = wdr(tx_fifo)
        m.submodules.rmii_rx = rr(rmii_rx)
        m.submodules.rmii_tx = rr(rmii_tx)

        return m
def elaborate(self, platform: Platform) -> Module:
    """Build the N-way set-associative cache datapath.

    Stage s1 presents the access address; stage s2 resolves hit/miss.
    Misses trigger a line refill over the bus in the FSM below.  For
    read-only caches an internal snoop path invalidates lines written by
    the data cache (FENCE.i support).
    """
    m = Module()

    snoop_addr = Record(self.pc_layout)
    snoop_valid = Signal()

    # -------------------------------------------------------------------------
    # Performance counter
    # TODO: connect to CSR's performance counter
    with m.If(~self.s1_stall & self.s1_valid & self.s1_access):
        m.d.sync += self.access_cnt.eq(self.access_cnt + 1)
    with m.If(self.s2_valid & self.s2_miss & ~self.bus_valid
              & self.s2_access):
        m.d.sync += self.miss_cnt.eq(self.miss_cnt + 1)
    # -------------------------------------------------------------------------

    # One record per way: cached line data, tag, valid flag, and per-way
    # selection strobes (LRU replacement target, snoop hit, optional write).
    way_layout = [('data', 32 * self.nwords),
                  ('tag', self.s1_address.tag.shape()),
                  ('valid', 1),
                  ('sel_lru', 1),
                  ('snoop_hit', 1)]
    if self.enable_write:
        way_layout.append(('sel_we', 1))

    ways = Array(
        Record(way_layout, name='way_idx{}'.format(_way))
        for _way in range(self.nways))
    fill_cnt = Signal.like(self.s1_address.offset)

    # Check hit/miss
    way_hit = m.submodules.way_hit = Encoder(self.nways)
    for idx, way in enumerate(ways):
        m.d.comb += way_hit.i[idx].eq((way.tag == self.s2_address.tag)
                                      & way.valid)

    m.d.comb += self.s2_miss.eq(way_hit.n)
    if self.enable_write:
        # Assuming a HIT, mark the way that hit as the one to be written.
        m.d.comb += ways[way_hit.o].sel_we.eq(self.s2_we & self.s2_valid)

    # set the LRU
    if self.nways == 1:
        # One way: LRU is useless
        lru = Const(0)  # self.nlines
    else:
        # LRU is a vector of N bits, each one indicating the set to replace.
        # Since NWAYS is at most 2, each per-line LRU entry is one bit.
        lru = Signal(self.nlines)
        _lru = lru.bit_select(self.s2_address.line, 1)
        write_ended = self.bus_valid & self.bus_ack & self.bus_last  # err ^ ack = 1
        access_hit = ~self.s2_miss & self.s2_valid & (way_hit.o == _lru)
        with m.If(write_ended | access_hit):
            m.d.sync += _lru.eq(~_lru)

    # read data from the cache
    m.d.comb += self.s2_rdata.eq(ways[way_hit.o].data.word_select(
        self.s2_address.offset, 32))

    # Internal Snoop: watch data-cache writes that target the line
    # currently being handled, so an in-flight refill can be cancelled.
    snoop_use_cache = Signal()
    snoop_tag_match = Signal()
    snoop_line_match = Signal()
    snoop_cancel_refill = Signal()
    if not self.enable_write:
        bits_range = log2_int(self.end_addr - self.start_addr,
                              need_pow2=False)
        m.d.comb += [
            snoop_addr.eq(self.dcache_snoop.addr),  # aux
            snoop_valid.eq(self.dcache_snoop.we & self.dcache_snoop.valid
                           & self.dcache_snoop.ack),
            # The snoop only applies to addresses inside this cache's range.
            snoop_use_cache.eq(snoop_addr[bits_range:] == (
                self.start_addr >> bits_range)),
            snoop_tag_match.eq(snoop_addr.tag == self.s2_address.tag),
            snoop_line_match.eq(snoop_addr.line == self.s2_address.line),
            snoop_cancel_refill.eq(snoop_use_cache & snoop_valid
                                   & snoop_line_match & snoop_tag_match),
        ]
    else:
        m.d.comb += snoop_cancel_refill.eq(0)

    # Refill FSM: fetch a whole line over the bus, wrapping around the
    # line starting at the requested word offset.
    with m.FSM():
        with m.State('READ'):
            with m.If(self.s2_re & self.s2_miss & self.s2_valid):
                m.d.sync += [
                    self.bus_addr.eq(self.s2_address),
                    self.bus_valid.eq(1),
                    # Last word to fetch is the one before the requested
                    # offset (wrap-around fill).
                    fill_cnt.eq(self.s2_address.offset - 1)
                ]
                m.next = 'REFILL'

        with m.State('REFILL'):
            m.d.comb += self.bus_last.eq(fill_cnt == self.bus_addr.offset)
            with m.If(self.bus_ack):
                m.d.sync += self.bus_addr.offset.eq(self.bus_addr.offset + 1)
            with m.If(self.bus_ack & self.bus_last | self.bus_err):
                m.d.sync += self.bus_valid.eq(0)
            # Abort on flush or on a snoop hit against the refilling line.
            with m.If(~self.bus_valid | self.s1_flush | snoop_cancel_refill):
                m.next = 'READ'
                m.d.sync += self.bus_valid.eq(0)

    # mark the way to use (replace)
    m.d.comb += ways[lru.bit_select(self.s2_address.line,
                                    1)].sel_lru.eq(self.bus_valid)

    # generate for N ways
    for way in ways:
        # create the memory structures for valid, tag and data.
        valid = Signal(self.nlines)  # Valid bits

        tag_m = Memory(width=len(way.tag), depth=self.nlines)  # tag memory
        tag_rp = tag_m.read_port()
        snoop_rp = tag_m.read_port()
        tag_wp = tag_m.write_port()
        m.submodules += tag_rp, tag_wp, snoop_rp

        data_m = Memory(width=len(way.data), depth=self.nlines)  # data memory
        data_rp = data_m.read_port()
        data_wp = data_m.write_port(
            granularity=32
        )  # implies that only whole 32-bit words can be written.
        m.submodules += data_rp, data_wp

        # handle valid
        with m.If(self.s1_flush & self.s1_valid):  # flush
            m.d.sync += valid.eq(0)
        with m.Elif(way.sel_lru & self.bus_last & self.bus_ack):  # refill ok
            m.d.sync += valid.bit_select(self.bus_addr.line, 1).eq(1)
        with m.Elif(way.sel_lru & self.bus_err):  # refill error
            m.d.sync += valid.bit_select(self.bus_addr.line, 1).eq(0)
        with m.Elif(self.s2_evict & self.s2_valid
                    & (way.tag == self.s2_address.tag)):  # evict
            m.d.sync += valid.bit_select(self.s2_address.line, 1).eq(0)

        # assignments
        m.d.comb += [
            # While stalled, keep reading the s2 line so the hit check
            # stays stable; otherwise read ahead for s1.
            tag_rp.addr.eq(
                Mux(self.s1_stall, self.s2_address.line,
                    self.s1_address.line)),
            tag_wp.addr.eq(self.bus_addr.line),
            tag_wp.data.eq(self.bus_addr.tag),
            tag_wp.en.eq(way.sel_lru & self.bus_ack & self.bus_last),
            data_rp.addr.eq(
                Mux(self.s1_stall, self.s2_address.line,
                    self.s1_address.line)),
            way.data.eq(data_rp.data),
            way.tag.eq(tag_rp.data),
            way.valid.eq(valid.bit_select(self.s2_address.line, 1))
        ]

        # update cache: CPU or Refill
        # The write port is multiplexed because the memory can only have
        # one write port.
        if self.enable_write:
            update_addr = Signal(len(data_wp.addr))
            update_data = Signal(len(data_wp.data))
            update_we = Signal(len(data_wp.en))
            aux_wdata = Signal(32)

            with m.If(self.bus_valid):
                m.d.comb += [
                    update_addr.eq(self.bus_addr.line),
                    update_data.eq(Repl(self.bus_data, self.nwords)),
                    update_we.bit_select(self.bus_addr.offset,
                                         1).eq(way.sel_lru & self.bus_ack),
                ]
            with m.Else():
                m.d.comb += [
                    update_addr.eq(self.s2_address.line),
                    update_data.eq(Repl(aux_wdata, self.nwords)),
                    update_we.bit_select(self.s2_address.offset,
                                         1).eq(way.sel_we & ~self.s2_miss)
                ]
            m.d.comb += [
                # Aux data: the write port has no byte granularity, so for
                # a CPU write the word to store must be assembled byte by
                # byte from the write data and the current read data.
                aux_wdata.eq(
                    Cat(
                        Mux(self.s2_sel[0], self.s2_wdata.word_select(0, 8),
                            self.s2_rdata.word_select(0, 8)),
                        Mux(self.s2_sel[1], self.s2_wdata.word_select(1, 8),
                            self.s2_rdata.word_select(1, 8)),
                        Mux(self.s2_sel[2], self.s2_wdata.word_select(2, 8),
                            self.s2_rdata.word_select(2, 8)),
                        Mux(self.s2_sel[3], self.s2_wdata.word_select(3, 8),
                            self.s2_rdata.word_select(3, 8)))),
                #
                data_wp.addr.eq(update_addr),
                data_wp.data.eq(update_data),
                data_wp.en.eq(update_we),
            ]
        else:
            m.d.comb += [
                data_wp.addr.eq(self.bus_addr.line),
                data_wp.data.eq(Repl(self.bus_data, self.nwords)),
                data_wp.en.bit_select(self.bus_addr.offset,
                                      1).eq(way.sel_lru & self.bus_ack),
            ]

        # --------------------------------------------------------------
        # internal snoop
        # for FENCE.i instruction
        _match_snoop = Signal()

        m.d.comb += [
            snoop_rp.addr.eq(snoop_addr.line),  # read tag memory
            _match_snoop.eq(snoop_rp.data == snoop_addr.tag),
            way.snoop_hit.eq(snoop_use_cache & snoop_valid & _match_snoop
                             & valid.bit_select(snoop_addr.line, 1)),
        ]
        # check if the snoop matches a write from this core; if so,
        # invalidate the corresponding line.
        with m.If(way.snoop_hit):
            m.d.sync += valid.bit_select(snoop_addr.line, 1).eq(0)
        # --------------------------------------------------------------

    return m
class USBAnalyzer(Elaboratable):
    """ Core USB analyzer; backed by a small ringbuffer in FPGA block RAM.

    If you're looking to instantiate a full analyzer, you'll probably want to
    grab one of the DRAM-based ringbuffer variants (which are currently
    forthcoming).

    If you're looking to use this with a ULPI PHY, rather than the
    FPGA-convenient UMTI interface, grab the UMTITranslator from
    `luna.gateware.interface.ulpi`.

    I/O port:
        O: data_available -- indicates that new data is available in the
                             analysis stream
        O: data_out[8]    -- the next byte in the captured stream; valid when
                             data_available is asserted
        I: next           -- strobe that indicates when the data_out byte has
                             been accepted; and can be discarded from the
                             local memory
    """

    # Currently, we'll provide a packet header of 16 bits.
    HEADER_SIZE_BITS = 16
    HEADER_SIZE_BYTES = HEADER_SIZE_BITS // 8

    # Support a maximum payload size of 1024B, plus a 1-byte PID and a 2-byte CRC16.
    MAX_PACKET_SIZE_BYTES = 1024 + 1 + 2

    def __init__(self, *, umti_interface, mem_depth=8192):
        """
        Parameters:
            umti_interface -- A record or elaboratable that presents a UMTI interface.
        """
        self.umti = umti_interface

        # Internal storage memory.
        self.mem = Memory(width=8, depth=mem_depth, name="analysis_ringbuffer")
        self.mem_size = mem_depth

        #
        # I/O port
        #
        self.data_available = Signal()
        self.data_out = Signal(8)
        self.next = Signal()

        self.overrun = Signal()
        self.capturing = Signal()

        # Diagnostic I/O.
        self.sampling = Signal()

    def elaborate(self, platform):
        m = Module()

        # Memory read and write ports (both in the "ulpi" clock domain).
        m.submodules.read = mem_read_port = self.mem.read_port(domain="ulpi")
        m.submodules.write = mem_write_port = self.mem.write_port(
            domain="ulpi")

        # Store the memory address of our active packet header, which will store
        # packet metadata like the packet size.
        header_location = Signal.like(mem_write_port.addr)
        write_location = Signal.like(mem_write_port.addr)

        # Read FIFO status.
        read_location = Signal.like(mem_read_port.addr)
        fifo_count = Signal.like(mem_read_port.addr, reset=0)
        fifo_new_data = Signal()

        # Current receive status.
        packet_size = Signal(16)

        #
        # Read FIFO logic.
        #
        m.d.comb += [

            # We have data ready whenever there's data in the FIFO.
            self.data_available.eq(fifo_count != 0),

            # Our data_out is always the output of our read port...
            self.data_out.eq(mem_read_port.data),

            # ... and our read port always reads from our read pointer.
            mem_read_port.addr.eq(read_location),

            self.sampling.eq(mem_write_port.en)
        ]

        # Once our consumer has accepted our current data, move to the next address.
        with m.If(self.next & self.data_available):
            m.d.ulpi += read_location.eq(read_location + 1)

        #
        # FIFO count handling.
        #
        fifo_full = (fifo_count == self.mem_size)

        data_pop = Signal()
        data_push = Signal()
        m.d.comb += [
            data_pop.eq(self.next & self.data_available),
            data_push.eq(fifo_new_data & ~fifo_full)
        ]

        # If we have both a read and a write, don't update the count,
        # as we've both added one and subtracted one.
        with m.If(data_push & data_pop):
            pass

        # Otherwise, add when data's added, and subtract when data's removed.
        with m.Elif(data_push):
            m.d.ulpi += fifo_count.eq(fifo_count + 1)
        with m.Elif(data_pop):
            m.d.ulpi += fifo_count.eq(fifo_count - 1)

        #
        # Core analysis FSM.
        #
        with m.FSM(domain="ulpi") as f:
            m.d.comb += [
                self.overrun.eq(f.ongoing("OVERRUN")),
                self.capturing.eq(f.ongoing("CAPTURE")),
            ]

            # IDLE: wait for an active receive.
            with m.State("IDLE"):

                # Wait until a transmission is active.
                # TODO: add triggering logic?
                with m.If(self.umti.rx_active):
                    m.next = "CAPTURE"
                    # Reserve header space; payload bytes will follow it.
                    m.d.ulpi += [
                        header_location.eq(write_location),
                        write_location.eq(write_location
                                          + self.HEADER_SIZE_BYTES),
                        packet_size.eq(0),
                    ]

                    #with m.If(self.umti.rx_valid):
                    #    m.d.ulpi += [
                    #        fifo_count      .eq(fifo_count + 1),
                    #        write_location  .eq(write_location + self.HEADER_SIZE_BYTES + 1),
                    #        packet_size     .eq(1)
                    #    ]
                    #    m.d.comb += [
                    #        mem_write_port.addr  .eq(write_location + self.HEADER_SIZE_BYTES),
                    #        mem_write_port.data  .eq(self.umti.data_out),
                    #        mem_write_port.en    .eq(1)
                    #    ]

            # Capture data until the packet is complete.
            with m.State("CAPTURE"):

                # Capture data whenever RxValid is asserted.
                m.d.comb += [
                    mem_write_port.addr.eq(write_location),
                    mem_write_port.data.eq(self.umti.data_out),
                    mem_write_port.en.eq(self.umti.rx_valid
                                         & self.umti.rx_active),
                    fifo_new_data.eq(self.umti.rx_valid
                                     & self.umti.rx_active)
                ]

                # Advance the write pointer each time we receive a bit.
                with m.If(self.umti.rx_valid & self.umti.rx_active):
                    m.d.ulpi += [
                        write_location.eq(write_location + 1),
                        packet_size.eq(packet_size + 1)
                    ]

                    # If this would be filling up our data memory,
                    # move to the OVERRUN state.
                    with m.If(fifo_count == self.mem_size - 1
                              - self.HEADER_SIZE_BYTES):
                        m.next = "OVERRUN"

                # If we've stopped receiving, move to the "finalize" state.
                with m.If(~self.umti.rx_active):

                    # Optimization: if we didn't receive any data, there's no need
                    # to create a packet. Clear our header from the FIFO and disarm.
                    with m.If(packet_size == 0):
                        m.next = "IDLE"
                        m.d.ulpi += [write_location.eq(header_location)]
                    with m.Else():
                        m.next = "EOP_1"

            # EOP: handle the end of the relevant packet.
            with m.State("EOP_1"):

                # Now that we're done, add the header to the start of our packet.
                # This will take two cycles, currently, as we're using a 2-byte header,
                # but we only have an 8-bit write port.
                m.d.comb += [
                    mem_write_port.addr.eq(header_location),
                    mem_write_port.data.eq(packet_size[7:16]),
                    #mem_write_port.data .eq(0xAA),
                    mem_write_port.en.eq(1),
                    fifo_new_data.eq(1)
                ]
                m.next = "EOP_2"

            with m.State("EOP_2"):

                # Add the second byte of our header.
                # Note that, if this is an adjacent read, we should have
                # just captured our packet header _during_ the stop turnaround.
                m.d.comb += [
                    mem_write_port.addr.eq(header_location + 1),
                    mem_write_port.data.eq(packet_size[0:8]),
                    mem_write_port.en.eq(1),
                    fifo_new_data.eq(1)
                ]

                # Move to the next state, which will either be another capture,
                # or our idle state, depending on whether we have another rx.
                with m.If(self.umti.rx_active):
                    m.next = "CAPTURE"
                    m.d.ulpi += [
                        header_location.eq(write_location),
                        write_location.eq(write_location
                                          + self.HEADER_SIZE_BYTES),
                        packet_size.eq(0),
                    ]
                    # FIXME: capture if rx_valid

                with m.Else():
                    m.next = "IDLE"

            # BABBLE -- handles the case in which we've received a packet beyond
            # the allowable size in the USB spec
            with m.State("BABBLE"):

                # Trap here, for now.
                pass

            with m.State("OVERRUN"):
                # TODO: we should probably set an overrun flag and then emit an EOP, here?
                pass

        return m
def elaborate(self, platform):
    """Build the polyphase decimating FIR engine.

    Incoming samples are collected in a circular buffer (`sample_RAM`);
    a 4-stage pipeline fetches coefficient/sample pairs, multiplies,
    accumulates, and emits one decimated output per convolution pass.
    """
    m = Module()

    # N is size of RAMs.
    N = self.M + 2
    assert _is_power_of_2(N)

    # kernel_RAM is a buffer of convolution kernel coefficients.
    # It is read-only.  The 0'th element is zero, because the kernel
    # has length N-1.
    # NOTE(review): depth is hard-coded to 256 rather than N — confirm
    # callers guarantee N <= 256.
    kernel_RAM = Memory(width=COEFF_WIDTH, depth=256, init=kernel)
    m.submodules.kr_port = kr_port = kernel_RAM.read_port()

    # sample_RAM is a circular buffer for incoming samples.
    sample_RAM = Memory(width=self.sample_depth, depth=N, init=[0] * N)
    m.submodules.sw_port = sw_port = sample_RAM.write_port()
    m.submodules.sr_port = sr_port = sample_RAM.read_port()

    # The rotors index through sample_RAM.  They have an extra MSB
    # so we can distinguish between buffer full and buffer empty.
    #
    #   w_rotor: write rotor.  Points to the next entry to be written.
    #   s_rotor: start rotor.  Points to the oldest valid entry.
    #   r_rotor: read rotor.   Points to the next entry to be read.
    #
    # The polyphase decimator reads each sample N / R times, so
    # `r_rotor` is **NOT** the oldest needed sample.  Instead,
    # `s_rotor` is the oldest.  `s_rotor` is incremented by `R`
    # at the start of each convolution.
    #
    # We initialize the rotors so that the RAM contains N-1 zero samples,
    # and `r_rotor` is pointing to the first sample to be used.
    # The convolution engine can start immediately and produce a zero
    # result.
    w_rotor = Signal(range(2 * N), reset=N)
    s_rotor = Signal(range(2 * N), reset=1)
    r_rotor = Signal(range(2 * N), reset=1)

    # `c_index` is the next kernel coefficient to read.
    # `c_index` == 0 indicates done, so start at 1.
    c_index = Signal(range(N), reset=1)

    # Buffer occupancy conditions.
    buf_n_used = Signal(range(N + 1))
    buf_is_empty = Signal()
    buf_is_full = Signal()
    buf_n_readable = Signal(range(N + 1))
    buf_has_readable = Signal()
    m.d.comb += [
        buf_n_used.eq(w_rotor - s_rotor),
        buf_is_empty.eq(buf_n_used == 0),
        buf_is_full.eq(buf_n_used == N),
        buf_n_readable.eq(w_rotor - r_rotor),
        buf_has_readable.eq(buf_n_readable != 0),
    ]

    # Put incoming samples into sample_RAM.
    m.d.comb += [
        self.samples_in.o_ready.eq(~buf_is_full),
        sw_port.addr.eq(w_rotor[:-1]),
        sw_port.data.eq(self.samples_in.i_data),
    ]
    m.d.sync += sw_port.en.eq(self.samples_in.received())
    with m.If(self.samples_in.received()):
        m.d.sync += w_rotor.eq(w_rotor + 1)

    # The convolution is pipelined.
    #
    #   stage 0: fetch coefficient and sample from their RAMs.
    #   stage 1: multiply coefficient and sample.
    #   stage 2: add product to accumulator.
    #   stage 3: if complete, try to send accumulated sample.
    p_valid = Signal(4)
    p_ready = Array(
        Signal(name=f'p_ready{i}', reset=True) for i in range(4))
    p_complete = Signal(4)
    m.d.sync += [
        p_valid[1:].eq(p_valid[:-1]),
        p_ready[0].eq(p_ready[1]),
        p_ready[1].eq(p_ready[2]),
        p_ready[2].eq(p_ready[3]),
    ]

    # calculation variables
    coeff = Signal(COEFF_SHAPE)
    sample = Signal(signed(self.sample_depth))
    prod = Signal(signed(COEFF_WIDTH + self.sample_depth))
    acc = Signal(signed(self.acc_width))

    # Stage 0: fetch operands when data is readable, the stage is ready,
    # and a convolution pass is in progress (c_index != 0).
    en0 = Signal()
    # BUGFIX: was `p_ready[0] * (c_index != 0)`.  `*` happened to behave
    # like AND for these 1-bit values, but `&` is what is meant.
    m.d.comb += en0.eq(buf_has_readable & p_ready[0] & (c_index != 0))
    m.d.comb += coeff.eq(kr_port.data)
    m.d.comb += kr_port.addr.eq(c_index)
    with m.If(en0):
        m.d.sync += [
            # NOTE(review): direct Memory indexing; the sr_port read port
            # created above is unused — confirm this elaborates/simulates
            # as intended.
            sample.eq(sample_RAM[r_rotor[:-1]]),
            c_index.eq(c_index + 1),
            r_rotor.eq(r_rotor + 1),
            p_valid[0].eq(True),
            p_complete[0].eq(False),
        ]
    with m.If((~buf_has_readable | ~p_ready[0]) & (c_index != 0)):
        m.d.sync += [
            p_valid[0].eq(False),
            p_complete[0].eq(False),
        ]

    # When c_index is zero, all convolution samples have been read.
    # Set up the rotors for the next sample (and pause the
    # pipelined calculation).
    with m.If(c_index == 0):
        m.d.sync += [
            c_index.eq(c_index + 1),
            s_rotor.eq(s_rotor + self.R),
            r_rotor.eq(s_rotor + self.R),
            p_valid[0].eq(False),
            p_complete[0].eq(True),
        ]

    # Stage 1: multiply.
    with m.If(p_valid[1] & p_ready[1]):
        m.d.sync += [
            prod.eq(coeff * sample),
            p_complete[1].eq(p_complete[0]),
        ]

    # Stage 2: accumulate.
    with m.If(p_valid[2] & p_ready[2]):
        m.d.sync += [
            acc.eq(acc + prod),
            p_complete[2].eq(p_complete[1]),
        ]

    # Stage 3: emit the (shifted) accumulated sample when the sink has room.
    m.d.comb += p_ready[3].eq(~self.samples_out.full())
    m.d.sync += p_complete[3].eq(p_complete[3] | p_complete[2])
    with m.If(p_valid[3] & p_ready[3] & p_complete[2]):
        m.d.sync += [
            self.samples_out.o_valid.eq(1),
            self.samples_out.o_data.eq(acc[self.shift:]),
            acc.eq(0),
            p_complete[3].eq(False),
        ]
    with m.If(self.samples_out.sent()):
        m.d.sync += self.samples_out.o_valid.eq(0)

    return m
def elaborate(self, platform):
    """Build the 6-stage pipelined CPU (A/F/D/X/M/W).

    Wires the pipeline stages together, instantiates the functional
    units (selected by ``self.configuration``), and describes the
    per-stage datapath plus the inter-stage pipeline registers.
    Returns the assembled ``Module``.

    Fix vs. previous revision: the X->M pipeline-register list assigned
    ``x.endpoint_b.compare_result`` twice with the identical value; the
    redundant duplicate statement has been removed.
    """
    cpu = Module()

    # ----------------------------------------------------------------------
    # create the pipeline stages
    a = cpu.submodules.a = Stage(None, _af_layout)
    f = cpu.submodules.f = Stage(_af_layout, _fd_layout)
    d = cpu.submodules.d = Stage(_fd_layout, _dx_layout)
    x = cpu.submodules.x = Stage(_dx_layout, _xm_layout)
    m = cpu.submodules.m = Stage(_xm_layout, _mw_layout)
    w = cpu.submodules.w = Stage(_mw_layout, None)

    # ----------------------------------------------------------------------
    # connect the stages
    cpu.d.comb += [
        a.endpoint_b.connect(f.endpoint_a),
        f.endpoint_b.connect(d.endpoint_a),
        d.endpoint_b.connect(x.endpoint_a),
        x.endpoint_b.connect(m.endpoint_a),
        m.endpoint_b.connect(w.endpoint_a)
    ]

    # ----------------------------------------------------------------------
    # units (optional units are only instantiated when enabled in the config)
    adder = cpu.submodules.adder = AdderUnit()
    logic = cpu.submodules.logic = LogicUnit()
    shifter = cpu.submodules.shifter = ShifterUnit()
    compare = cpu.submodules.compare = CompareUnit()
    decoder = cpu.submodules.decoder = DecoderUnit(self.configuration)
    exception = cpu.submodules.exception = ExceptionUnit(self.configuration)
    data_sel = cpu.submodules.data_sel = DataFormat()
    csr = cpu.submodules.csr = CSRFile()
    if (self.configuration.getOption('icache', 'enable')):
        fetch = cpu.submodules.fetch = CachedFetchUnit(self.configuration)
    else:
        fetch = cpu.submodules.fetch = BasicFetchUnit()
    if (self.configuration.getOption('dcache', 'enable')):
        lsu = cpu.submodules.lsu = CachedLSU(self.configuration)
    else:
        lsu = cpu.submodules.lsu = BasicLSU()
    if self.configuration.getOption('isa', 'enable_rv32m'):
        multiplier = cpu.submodules.multiplier = Multiplier()
        divider = cpu.submodules.divider = Divider()
    if self.configuration.getOption('predictor', 'enable_predictor'):
        predictor = cpu.submodules.predictor = BranchPredictor(
            self.configuration)

    # ----------------------------------------------------------------------
    # register file (GPR): 32 x 32-bit, two read ports + one write port
    gprf = Memory(width=32, depth=32)
    gprf_rp1 = gprf.read_port()
    gprf_rp2 = gprf.read_port()
    gprf_wp = gprf.write_port()
    cpu.submodules += gprf_rp1, gprf_rp2, gprf_wp

    # ----------------------------------------------------------------------
    # CSR
    csr.add_csr_from_list(exception.csr.csr_list)
    csr_rp = csr.create_read_port()
    csr_wp = csr.create_write_port()

    # ----------------------------------------------------------------------
    # forward declaration of signals
    fwd_x_rs1 = Signal()
    fwd_m_rs1 = Signal()
    fwd_w_rs1 = Signal()
    fwd_x_rs2 = Signal()
    fwd_m_rs2 = Signal()
    fwd_w_rs2 = Signal()
    x_result = Signal(32)
    m_result = Signal(32)
    w_result = Signal(32)
    m_kill_bj = Signal()

    # ----------------------------------------------------------------------
    # Address Stage
    a_next_pc = Signal(32)
    a_next_pc_q = Signal(32)
    a_next_pc_fu = Signal(32)
    latched_pc = Signal()

    # set the reset value.
    # to (RA - 4) because the value to feed the fetch unit is the next pc:
    a.endpoint_b.pc.reset = self.configuration.getOption(
        'reset', 'reset_address') - 4

    # select next pc (priority: exception > mret > branch redirect > fence_i > pc+4)
    with cpu.If(exception.m_exception & m.valid):
        cpu.d.comb += a_next_pc.eq(exception.csr.mtvec.read)  # exception
    with cpu.Elif(m.endpoint_a.mret & m.valid):
        cpu.d.comb += a_next_pc.eq(exception.csr.mepc.read)  # mret
    if (self.configuration.getOption('predictor', 'enable_predictor')):
        with cpu.Elif((m.endpoint_a.prediction & m.endpoint_a.branch)
                      & ~m.endpoint_a.take_jmp_branch & m.valid):
            cpu.d.comb += a_next_pc.eq(m.endpoint_a.pc + 4)  # branch not taken
        with cpu.Elif(~(m.endpoint_a.prediction & m.endpoint_a.branch)
                      & m.endpoint_a.take_jmp_branch & m.valid):
            cpu.d.comb += a_next_pc.eq(
                m.endpoint_a.jmp_branch_target)  # branch taken
        with cpu.Elif(predictor.f_prediction):
            cpu.d.comb += a_next_pc.eq(
                predictor.f_prediction_pc)  # prediction
    else:
        with cpu.Elif(m.endpoint_a.take_jmp_branch & m.valid):
            cpu.d.comb += a_next_pc.eq(
                m.endpoint_a.jmp_branch_target)  # jmp/branch
    with cpu.Elif(x.endpoint_a.fence_i & x.valid):
        cpu.d.comb += a_next_pc.eq(x.endpoint_a.pc + 4)  # fence_i.
    with cpu.Else():
        cpu.d.comb += a_next_pc.eq(f.endpoint_a.pc + 4)

    # Latch the next pc while the F stage is stalled, so a kill that
    # arrives during the stall is not lost.
    with cpu.If(f.stall):
        with cpu.If(f.kill & ~latched_pc):
            cpu.d.sync += [a_next_pc_q.eq(a_next_pc), latched_pc.eq(1)]
    with cpu.Else():
        cpu.d.sync += latched_pc.eq(0)

    with cpu.If(latched_pc):
        cpu.d.comb += a_next_pc_fu.eq(a_next_pc_q)
    with cpu.Else():
        cpu.d.comb += a_next_pc_fu.eq(a_next_pc)

    cpu.d.comb += [
        fetch.a_pc.eq(a_next_pc_fu),
        fetch.a_stall.eq(a.stall),
        fetch.a_valid.eq(a.valid),
    ]
    cpu.d.comb += a.valid.eq(1)  # the stage is always valid

    # ----------------------------------------------------------------------
    # Fetch Stage
    cpu.d.comb += fetch.iport.connect(self.iport)  # connect the wishbone port
    cpu.d.comb += [fetch.f_stall.eq(f.stall), fetch.f_valid.eq(f.valid)]

    # Remember a kill that fires while F is stalled, replay it afterwards.
    f_kill_r = Signal()
    with cpu.If(f.stall):
        with cpu.If(f_kill_r == 0):
            cpu.d.sync += f_kill_r.eq(f.kill)
    with cpu.Else():
        cpu.d.sync += f_kill_r.eq(0)

    if (self.configuration.getOption('icache', 'enable')):
        cpu.d.comb += [
            fetch.flush.eq(x.endpoint_a.fence_i & x.valid & ~x.stall),
            fetch.f_pc.eq(f.endpoint_a.pc)
        ]

    f.add_kill_source(f_kill_r)
    f.add_stall_source(fetch.f_busy)
    f.add_kill_source(exception.m_exception & m.valid)
    f.add_kill_source(m.endpoint_a.mret & m.valid)
    f.add_kill_source(m_kill_bj)
    f.add_kill_source(x.endpoint_a.fence_i & x.valid & ~x.stall)

    # ----------------------------------------------------------------------
    # Decode Stage
    cpu.d.comb += decoder.instruction.eq(d.endpoint_a.instruction)

    # Register-file read addresses: take rs1/rs2 straight from the incoming
    # instruction when not stalled; hold the decoded addresses otherwise.
    with cpu.If(~d.stall):
        cpu.d.comb += [
            gprf_rp1.addr.eq(fetch.f_instruction[15:20]),
            gprf_rp2.addr.eq(fetch.f_instruction[20:25])
        ]
    with cpu.Else():
        cpu.d.comb += [
            gprf_rp1.addr.eq(decoder.gpr_rs1),
            gprf_rp2.addr.eq(decoder.gpr_rs2)
        ]

    cpu.d.comb += [
        gprf_wp.addr.eq(w.endpoint_a.gpr_rd),
        gprf_wp.data.eq(w_result),
        gprf_wp.en.eq(w.endpoint_a.gpr_we & w.valid)
    ]

    rs1_data = Signal(32)
    rs2_data = Signal(32)

    # select data for RS1 (with forwarding from X/M/W)
    with cpu.If(decoder.aiupc):
        cpu.d.comb += rs1_data.eq(d.endpoint_a.pc)
    with cpu.Elif((decoder.gpr_rs1 == 0) | decoder.lui):
        cpu.d.comb += rs1_data.eq(0)
    with cpu.Elif(fwd_x_rs1 & x.valid):
        cpu.d.comb += rs1_data.eq(x_result)
    with cpu.Elif(fwd_m_rs1 & m.valid):
        cpu.d.comb += rs1_data.eq(m_result)
    with cpu.Elif(fwd_w_rs1 & w.valid):
        cpu.d.comb += rs1_data.eq(w_result)
    with cpu.Else():
        cpu.d.comb += rs1_data.eq(gprf_rp1.data)

    # select data for RS2 (with forwarding from X/M/W)
    with cpu.If(decoder.csr):
        cpu.d.comb += rs2_data.eq(0)
    with cpu.Elif(~decoder.gpr_rs2_use):
        cpu.d.comb += rs2_data.eq(decoder.immediate)
    with cpu.Elif(decoder.gpr_rs2 == 0):
        cpu.d.comb += rs2_data.eq(0)
    with cpu.Elif(fwd_x_rs2 & x.valid):
        cpu.d.comb += rs2_data.eq(x_result)
    with cpu.Elif(fwd_m_rs2 & m.valid):
        cpu.d.comb += rs2_data.eq(m_result)
    with cpu.Elif(fwd_w_rs2 & w.valid):
        cpu.d.comb += rs2_data.eq(w_result)
    with cpu.Else():
        cpu.d.comb += rs2_data.eq(gprf_rp2.data)

    # Check if the forwarding is needed
    cpu.d.comb += [
        fwd_x_rs1.eq((decoder.gpr_rs1 == x.endpoint_a.gpr_rd)
                     & (decoder.gpr_rs1 != 0) & x.endpoint_a.gpr_we),
        fwd_m_rs1.eq((decoder.gpr_rs1 == m.endpoint_a.gpr_rd)
                     & (decoder.gpr_rs1 != 0) & m.endpoint_a.gpr_we),
        fwd_w_rs1.eq((decoder.gpr_rs1 == w.endpoint_a.gpr_rd)
                     & (decoder.gpr_rs1 != 0) & w.endpoint_a.gpr_we),
        fwd_x_rs2.eq((decoder.gpr_rs2 == x.endpoint_a.gpr_rd)
                     & (decoder.gpr_rs2 != 0) & x.endpoint_a.gpr_we),
        fwd_m_rs2.eq((decoder.gpr_rs2 == m.endpoint_a.gpr_rd)
                     & (decoder.gpr_rs2 != 0) & m.endpoint_a.gpr_we),
        fwd_w_rs2.eq((decoder.gpr_rs2 == w.endpoint_a.gpr_rd)
                     & (decoder.gpr_rs2 != 0) & w.endpoint_a.gpr_we),
    ]

    # Stall D while a needed operand is still being produced downstream.
    d.add_stall_source(((fwd_x_rs1 & decoder.gpr_rs1_use)
                        | (fwd_x_rs2 & decoder.gpr_rs2_use))
                       & ~x.endpoint_a.needed_in_x & x.valid)
    d.add_stall_source(((fwd_m_rs1 & decoder.gpr_rs1_use)
                        | (fwd_m_rs2 & decoder.gpr_rs2_use))
                       & ~m.endpoint_a.needed_in_m & m.valid)
    d.add_kill_source(exception.m_exception & m.valid)
    d.add_kill_source(m.endpoint_a.mret & m.valid)
    d.add_kill_source(m_kill_bj)
    d.add_kill_source(x.endpoint_a.fence_i & x.valid & ~x.stall)

    # ----------------------------------------------------------------------
    # Execute Stage
    x_branch_target = Signal(32)
    x_take_jmp_branch = Signal()
    cpu.d.comb += [
        x_branch_target.eq(x.endpoint_a.pc + x.endpoint_a.immediate),
        x_take_jmp_branch.eq(x.endpoint_a.jump
                             | (x.endpoint_a.branch & compare.cmp_ok))
    ]
    cpu.d.comb += [
        adder.dat1.eq(x.endpoint_a.src_data1),
        adder.dat2.eq(
            Mux(x.endpoint_a.store, x.endpoint_a.immediate,
                x.endpoint_a.src_data2)),
        adder.sub.eq((x.endpoint_a.arithmetic & x.endpoint_a.add_sub)
                     | x.endpoint_a.compare | x.endpoint_a.branch)
    ]
    cpu.d.comb += [
        logic.op.eq(x.endpoint_a.funct3),
        logic.dat1.eq(x.endpoint_a.src_data1),
        logic.dat2.eq(x.endpoint_a.src_data2)
    ]
    cpu.d.comb += [
        shifter.direction.eq(x.endpoint_a.shift_dir),
        shifter.sign_ext.eq(x.endpoint_a.shift_sign),
        shifter.dat.eq(x.endpoint_a.src_data1),
        shifter.shamt.eq(x.endpoint_a.src_data2),
        shifter.stall.eq(x.stall)
    ]
    cpu.d.comb += [
        compare.op.eq(x.endpoint_a.funct3),
        compare.zero.eq(adder.result == 0),
        compare.negative.eq(adder.result[-1]),
        compare.overflow.eq(adder.overflow),
        compare.carry.eq(adder.carry)
    ]

    # select result
    with cpu.If(x.endpoint_a.logic):
        cpu.d.comb += x_result.eq(logic.result)
    with cpu.Elif(x.endpoint_a.jump):
        cpu.d.comb += x_result.eq(x.endpoint_a.pc + 4)
    if (self.configuration.getOption('isa', 'enable_rv32m')):
        with cpu.Elif(x.endpoint_a.multiplier):
            cpu.d.comb += x_result.eq(multiplier.result)
    with cpu.Else():
        cpu.d.comb += x_result.eq(adder.result)

    # load/store unit
    cpu.d.comb += [
        data_sel.x_funct3.eq(x.endpoint_a.funct3),
        data_sel.x_offset.eq(adder.result[:2]),
        data_sel.x_store_data.eq(x.endpoint_a.src_data2),
    ]
    cpu.d.comb += [
        lsu.x_addr.eq(adder.result),
        lsu.x_data_w.eq(data_sel.x_data_w),
        lsu.x_store.eq(x.endpoint_a.store),
        lsu.x_load.eq(x.endpoint_a.load),
        lsu.x_byte_sel.eq(data_sel.x_byte_sel),
        lsu.x_valid.eq(x.valid & ~data_sel.x_misaligned),
        lsu.x_stall.eq(x.stall)
    ]
    if (self.configuration.getOption('dcache', 'enable')):
        cpu.d.comb += lsu.x_fence_i.eq(x.valid & x.endpoint_a.fence_i)
        x.add_stall_source(x.valid & x.endpoint_a.fence_i & m.valid
                           & m.endpoint_a.store)
    if (self.configuration.getOption('isa', 'enable_rv32m')):
        x.add_stall_source(x.valid & x.endpoint_a.multiplier
                           & ~multiplier.ready)
    if (self.configuration.getOption('dcache', 'enable')):
        x.add_stall_source(x.valid & lsu.x_busy)
    x.add_kill_source(exception.m_exception & m.valid)
    x.add_kill_source(m.endpoint_a.mret & m.valid)
    x.add_kill_source(m_kill_bj)

    # ----------------------------------------------------------------------
    # Memory (and CSR) Stage
    csr_wdata = Signal(32)

    # jump/branch: with the predictor, kill on misprediction; without it,
    # kill on every taken jump/branch.
    if (self.configuration.getOption('predictor', 'enable_predictor')):
        cpu.d.comb += m_kill_bj.eq((
            (m.endpoint_a.prediction & m.endpoint_a.branch)
            ^ m.endpoint_a.take_jmp_branch) & m.valid)
    else:
        cpu.d.comb += m_kill_bj.eq(m.endpoint_a.take_jmp_branch & m.valid)

    cpu.d.comb += lsu.dport.connect(self.dport)  # connect the wishbone port

    # select result
    with cpu.If(m.endpoint_a.shifter):
        cpu.d.comb += m_result.eq(shifter.result)
    with cpu.Elif(m.endpoint_a.compare):
        cpu.d.comb += m_result.eq(m.endpoint_a.compare_result)
    if (self.configuration.getOption('isa', 'enable_rv32m')):
        with cpu.Elif(m.endpoint_a.divider):
            cpu.d.comb += m_result.eq(divider.result)
    with cpu.Else():
        cpu.d.comb += m_result.eq(m.endpoint_a.result)

    cpu.d.comb += [
        data_sel.m_data_r.eq(lsu.m_load_data),
        data_sel.m_funct3.eq(m.endpoint_a.funct3),
        data_sel.m_offset.eq(m.endpoint_a.result)
    ]
    cpu.d.comb += [lsu.m_valid.eq(m.valid), lsu.m_stall.eq(m.stall)]
    if (self.configuration.getOption('dcache', 'enable')):
        cpu.d.comb += [
            lsu.m_addr.eq(m.endpoint_a.result),
            lsu.m_load.eq(m.endpoint_a.load),
            lsu.m_store.eq(m.endpoint_a.store)
        ]

    # CSR write-data: source is either the zimm field or rs1's value,
    # optionally inverted for the clear operations.
    csr_src0 = Signal(32)
    csr_src = Signal(32)
    cpu.d.comb += [
        csr_src0.eq(
            Mux(m.endpoint_a.funct3[2], m.endpoint_a.instruction[15:20],
                m.endpoint_a.result)),
        csr_src.eq(
            Mux(m.endpoint_a.funct3[:2] == 0b11, ~csr_src0, csr_src0))
    ]
    with cpu.If(m.endpoint_a.funct3[:2] == 0b01):  # write
        cpu.d.comb += csr_wdata.eq(csr_src)
    with cpu.Elif(m.endpoint_a.funct3[:2] == 0b10):  # set
        cpu.d.comb += csr_wdata.eq(csr_rp.data | csr_src)
    with cpu.Else():  # clear
        cpu.d.comb += csr_wdata.eq(csr_rp.data & csr_src)

    # csr
    cpu.d.comb += [
        csr_rp.addr.eq(m.endpoint_a.csr_addr),
        csr_wp.addr.eq(m.endpoint_a.csr_addr),
        csr_wp.en.eq(m.endpoint_a.csr_we & m.valid),
        csr_wp.data.eq(csr_wdata)
    ]

    # exception unit
    cpu.d.comb += [
        exception.external_interrupt.eq(self.external_interrupt),
        exception.software_interrupt.eq(self.software_interrupt),
        exception.timer_interrupt.eq(self.timer_interrupt),
        exception.m_fetch_misalign.eq(m.endpoint_a.take_jmp_branch & (
            m.endpoint_a.jmp_branch_target[:2] != 0)),
        exception.m_fetch_error.eq(m.endpoint_a.fetch_error),
        exception.m_illegal.eq(m.endpoint_a.illegal
                               | (m.endpoint_a.csr & csr.invalid)),
        exception.m_load_misalign.eq(m.endpoint_a.ls_misalign
                                     & m.endpoint_a.load),
        exception.m_load_error.eq(lsu.m_load_error),
        exception.m_store_misalign.eq(m.endpoint_a.ls_misalign
                                      & m.endpoint_a.store),
        exception.m_store_error.eq(lsu.m_store_error),
        exception.m_ecall.eq(m.endpoint_a.ecall),
        exception.m_ebreak.eq(m.endpoint_a.ebreak),
        exception.m_mret.eq(m.endpoint_a.mret),
        exception.m_pc.eq(m.endpoint_a.pc),
        exception.m_instruction.eq(m.endpoint_a.instruction),
        exception.m_fetch_badaddr.eq(m.endpoint_a.fetch_badaddr),
        exception.m_pc_misalign.eq(m.endpoint_a.jmp_branch_target),
        exception.m_ls_misalign.eq(m.endpoint_a.result),
        exception.m_load_store_badaddr.eq(lsu.m_badaddr),
        exception.m_store.eq(m.endpoint_a.store),
        exception.m_valid.eq(m.valid),
        exception.m_stall.eq(m.stall)
    ]

    m.add_stall_source(m.valid & lsu.m_busy)
    if (self.configuration.getOption('isa', 'enable_rv32m')):
        m.add_stall_source(divider.busy)
    m.add_kill_source(exception.m_exception & m.valid)

    # ----------------------------------------------------------------------
    # Write-back stage
    if self.configuration.getOption('isa', 'enable_extra_csr'):
        cpu.d.comb += exception.w_retire.eq(w.endpoint_a.is_instruction)

    with cpu.If(w.endpoint_a.load):
        cpu.d.comb += w_result.eq(w.endpoint_a.ld_result)
    with cpu.Elif(w.endpoint_a.csr):
        cpu.d.comb += w_result.eq(w.endpoint_a.csr_result)
    with cpu.Else():
        cpu.d.comb += w_result.eq(w.endpoint_a.result)

    # ----------------------------------------------------------------------
    # Optional units: Multiplier/Divider
    if (self.configuration.getOption('isa', 'enable_rv32m')):
        cpu.d.comb += [
            multiplier.op.eq(x.endpoint_a.funct3),
            multiplier.dat1.eq(x.endpoint_a.src_data1),
            multiplier.dat2.eq(x.endpoint_a.src_data2),
            multiplier.valid.eq(x.endpoint_a.multiplier & x.valid)
        ]
        cpu.d.comb += [
            divider.op.eq(x.endpoint_a.funct3),
            divider.dat1.eq(x.endpoint_a.src_data1),
            divider.dat2.eq(x.endpoint_a.src_data2),
            divider.stall.eq(x.stall),
            divider.start.eq(x.endpoint_a.divider)
        ]

    # ----------------------------------------------------------------------
    # Optional units: branch predictor
    if (self.configuration.getOption('predictor', 'enable_predictor')):
        cpu.d.comb += [
            predictor.a_pc.eq(a_next_pc_fu),
            predictor.a_stall.eq(a.stall),
            predictor.f_pc.eq(f.endpoint_a.pc),
            predictor.m_prediction_state.eq(m.endpoint_a.prediction_state),
            predictor.m_take_jmp_branch.eq(m.endpoint_a.take_jmp_branch
                                           & m.valid),
            predictor.m_pc.eq(m.endpoint_a.pc),
            predictor.m_target_pc.eq(m.endpoint_a.jmp_branch_target),
            predictor.m_update.eq(m.endpoint_a.branch & m.valid)
        ]

    # ----------------------------------------------------------------------
    # Pipeline registers

    # A -> F
    with cpu.If(~a.stall):
        cpu.d.sync += a.endpoint_b.pc.eq(a_next_pc_fu)

    # F -> D
    with cpu.If(~f.stall):
        cpu.d.sync += [
            f.endpoint_b.pc.eq(f.endpoint_a.pc),
            f.endpoint_b.instruction.eq(fetch.f_instruction),
            f.endpoint_b.fetch_error.eq(fetch.f_bus_error),
            f.endpoint_b.fetch_badaddr.eq(fetch.f_badaddr)
        ]
        if (self.configuration.getOption('predictor', 'enable_predictor')):
            cpu.d.sync += [
                f.endpoint_b.prediction.eq(predictor.f_prediction),
                f.endpoint_b.prediction_state.eq(
                    predictor.f_prediction_state)
            ]

    # D -> X
    # NOTE: 'aritmetic'/'substract'/'shit_signed' are the attribute names
    # as declared by DecoderUnit; they must match the decoder, typos and all.
    with cpu.If(~d.stall):
        cpu.d.sync += [
            d.endpoint_b.pc.eq(d.endpoint_a.pc),
            d.endpoint_b.instruction.eq(d.endpoint_a.instruction),
            d.endpoint_b.gpr_rd.eq(decoder.gpr_rd),
            d.endpoint_b.gpr_we.eq(decoder.gpr_we),
            d.endpoint_b.src_data1.eq(rs1_data),
            d.endpoint_b.src_data2.eq(rs2_data),
            d.endpoint_b.immediate.eq(decoder.immediate),
            d.endpoint_b.funct3.eq(decoder.funct3),
            d.endpoint_b.gpr_rs1_use.eq(decoder.gpr_rs1_use),
            d.endpoint_b.needed_in_x.eq(decoder.needed_in_x),
            d.endpoint_b.needed_in_m.eq(decoder.needed_in_m),
            d.endpoint_b.arithmetic.eq(decoder.aritmetic),
            d.endpoint_b.logic.eq(decoder.logic),
            d.endpoint_b.shifter.eq(decoder.shift),
            d.endpoint_b.jump.eq(decoder.jump),
            d.endpoint_b.branch.eq(decoder.branch),
            d.endpoint_b.compare.eq(decoder.compare),
            d.endpoint_b.load.eq(decoder.load),
            d.endpoint_b.store.eq(decoder.store),
            d.endpoint_b.csr.eq(decoder.csr),
            d.endpoint_b.add_sub.eq(decoder.substract),
            d.endpoint_b.shift_dir.eq(decoder.shift_direction),
            d.endpoint_b.shift_sign.eq(decoder.shit_signed),
            d.endpoint_b.csr_addr.eq(decoder.immediate),
            d.endpoint_b.csr_we.eq(decoder.csr_we),
            d.endpoint_b.fetch_error.eq(d.endpoint_a.fetch_error),
            d.endpoint_b.fetch_badaddr.eq(d.endpoint_a.fetch_badaddr),
            d.endpoint_b.ecall.eq(decoder.ecall),
            d.endpoint_b.ebreak.eq(decoder.ebreak),
            d.endpoint_b.mret.eq(decoder.mret),
            d.endpoint_b.illegal.eq(decoder.illegal),
            d.endpoint_b.fence_i.eq(decoder.fence_i),
            d.endpoint_b.multiplier.eq(decoder.multiply),
            d.endpoint_b.divider.eq(decoder.divide),
            d.endpoint_b.prediction.eq(d.endpoint_a.prediction),
            d.endpoint_b.prediction_state.eq(d.endpoint_a.prediction_state)
        ]

    # X -> M
    with cpu.If(~x.stall):
        cpu.d.sync += [
            x.endpoint_b.pc.eq(x.endpoint_a.pc),
            x.endpoint_b.instruction.eq(x.endpoint_a.instruction),
            x.endpoint_b.gpr_rd.eq(x.endpoint_a.gpr_rd),
            x.endpoint_b.gpr_we.eq(x.endpoint_a.gpr_we),
            x.endpoint_b.needed_in_m.eq(x.endpoint_a.needed_in_m
                                        | x.endpoint_a.needed_in_x),
            x.endpoint_b.funct3.eq(x.endpoint_a.funct3),
            x.endpoint_b.shifter.eq(x.endpoint_a.shifter),
            x.endpoint_b.compare.eq(x.endpoint_a.compare),
            x.endpoint_b.branch.eq(x.endpoint_a.branch),
            x.endpoint_b.load.eq(x.endpoint_a.load),
            x.endpoint_b.store.eq(x.endpoint_a.store),
            x.endpoint_b.csr.eq(x.endpoint_a.csr),
            x.endpoint_b.csr_addr.eq(x.endpoint_a.csr_addr),
            x.endpoint_b.csr_we.eq(x.endpoint_a.csr_we),
            x.endpoint_b.result.eq(x_result),
            x.endpoint_b.compare_result.eq(compare.cmp_ok),
            x.endpoint_b.jmp_branch_target.eq(
                Mux(x.endpoint_a.jump & x.endpoint_a.gpr_rs1_use,
                    adder.result[1:] << 1, x_branch_target)),
            x.endpoint_b.take_jmp_branch.eq(x_take_jmp_branch),
            x.endpoint_b.fetch_error.eq(x.endpoint_a.fetch_error),
            x.endpoint_b.fetch_badaddr.eq(x.endpoint_a.fetch_badaddr),
            x.endpoint_b.ecall.eq(x.endpoint_a.ecall),
            x.endpoint_b.ebreak.eq(x.endpoint_a.ebreak),
            x.endpoint_b.mret.eq(x.endpoint_a.mret),
            x.endpoint_b.illegal.eq(x.endpoint_a.illegal),
            x.endpoint_b.ls_misalign.eq(data_sel.x_misaligned),
            x.endpoint_b.divider.eq(x.endpoint_a.divider),
            x.endpoint_b.prediction.eq(x.endpoint_a.prediction),
            x.endpoint_b.prediction_state.eq(
                x.endpoint_a.prediction_state),
        ]

    # M -> W
    with cpu.If(~m.stall):
        cpu.d.sync += [
            m.endpoint_b.pc.eq(m.endpoint_a.pc),
            m.endpoint_b.gpr_rd.eq(m.endpoint_a.gpr_rd),
            m.endpoint_b.gpr_we.eq(m.endpoint_a.gpr_we),
            m.endpoint_b.result.eq(m_result),
            m.endpoint_b.ld_result.eq(data_sel.m_load_data),
            m.endpoint_b.csr_result.eq(csr_rp.data),
            m.endpoint_b.load.eq(m.endpoint_a.load),
            m.endpoint_b.csr.eq(m.endpoint_a.csr)
        ]

    return cpu
def elaborate(self, platform):
    """Bit-serial packet decoder FSM.

    Consumes one dewhitened bit per `self.sample` strobe, walking through
    header -> address -> (section header / section content)* -> CRC, writing
    name-section bytes into a payload memory, and handing off to an optional
    printer once the CRC check passes.
    """
    m = Module()

    m.submodules.lfsr = self.lfsr
    m.submodules.crc = self.crc

    # Payload bytes land either in the printer's memory or a private one.
    if self.printer:
        payload_data = self.printer.mem
    else:
        payload_data = Memory(width=8, depth=64)
    m.submodules.payload_wport = wport = payload_data.write_port()

    # 16-bit packet header, assembled LSB-first; pdu/size are its two bytes.
    header = Signal(16)
    header_idx = Signal(range(16))
    pdu = Signal(8)
    m.d.comb += pdu.eq(header[0:8])
    size = Signal(8)
    m.d.comb += size.eq(header[8:16])

    payload_read = Signal(12)  # How many bits of the payload we've read
    payload_addr = Signal(48)
    payload_addr_idx = Signal(8)

    # Per-section header: low byte = section size, high byte = section type.
    payload_sec_header_idx = Signal(4)
    payload_sec_header = Signal(16)
    payload_sec_size = Signal(8)
    payload_sec_type = Signal(8)
    m.d.comb += [
        payload_sec_size.eq(payload_sec_header[0:8]),
        payload_sec_type.eq(payload_sec_header[8:16])
    ]
    payload_sec_read = Signal(12)
    payload_byte = Signal(8)

    # Incoming bit XORed with the LFSR output (dewhitening).
    dewhitened = Signal()
    m.d.comb += dewhitened.eq(self.bitstream ^ self.lfsr.output)

    # Received CRC, compared bit-reversed against the computed CRC.
    crc = Signal(24)
    crc_idx = Signal(8)
    crc_matches = self.crc_matches
    m.d.comb += crc_matches.eq(
        Cat([self.crc.crc[i] == crc[24 - i - 1] for i in range(24)]).all())

    # Set once at least one name byte was stored; gates the readout.
    should_print = Signal()

    with m.FSM() as fsm:
        # Exposed for debugging purposes
        m.d.comb += self.state.eq(fsm.state)

        with m.State("IDLE"):
            # First sampled bit becomes header bit 0 and starts the decode.
            with m.If(self.sample):
                m.next = "READ_HEADER"
                m.d.sync += [
                    header_idx.eq(1),
                    header.eq(Cat(dewhitened, [0] * 7)),
                    self.currentbit.eq(dewhitened),
                    self.lfsr.run_strobe.eq(1),
                    self.crc.input.eq(dewhitened),
                    self.crc.en.eq(1),
                    should_print.eq(0),
                ]
            with m.Else():
                # Reset goes high at the end
                m.d.sync += [self.lfsr.reset.eq(0), self.crc.reset.eq(0)]

        with m.State("READ_HEADER"):
            # Shift in the remaining 15 header bits, LSB-first.
            with m.If(self.sample):
                m.d.sync += [
                    header_idx.eq(header_idx + 1),
                    header.eq(header | (dewhitened << header_idx)),
                    self.currentbit.eq(dewhitened),
                    self.lfsr.run_strobe.eq(1),
                    self.crc.input.eq(dewhitened),
                    self.crc.en.eq(1),
                ]
                with m.If(header_idx == 15):
                    m.d.sync += [
                        payload_read.eq(0),
                        payload_addr.eq(0),
                        payload_addr_idx.eq(0)
                    ]
                    m.next = "READ_PAYLOAD_ADDR"
            with m.Else():
                m.d.sync += [self.lfsr.run_strobe.eq(0), self.crc.en.eq(0)]

        with m.State("READ_PAYLOAD_ADDR"):
            # 48-bit address field, also counted toward payload_read.
            with m.If(self.sample):
                m.d.sync += [
                    payload_read.eq(payload_read + 1),
                    payload_addr_idx.eq(payload_addr_idx + 1),
                    payload_addr.eq(payload_addr
                                    | (dewhitened << payload_addr_idx)),
                    self.currentbit.eq(dewhitened),
                    self.lfsr.run_strobe.eq(1),
                    self.crc.input.eq(dewhitened),
                    self.crc.en.eq(1),
                ]
                with m.If(payload_addr_idx == (48 - 1)):
                    m.d.sync += [
                        payload_sec_header.eq(0),
                        payload_sec_header_idx.eq(0)
                    ]
                    m.next = "READ_PAYLOAD_SECTION_HEADER"
            with m.Else():
                m.d.sync += [self.lfsr.run_strobe.eq(0), self.crc.en.eq(0)]

        with m.State("READ_PAYLOAD_SECTION_HEADER"):
            # 16-bit section header: size byte then type byte.
            with m.If(self.sample):
                m.d.sync += [
                    payload_read.eq(payload_read + 1),
                    payload_sec_header_idx.eq(payload_sec_header_idx + 1),
                    payload_sec_header.eq(payload_sec_header | (
                        dewhitened << payload_sec_header_idx)),
                    self.currentbit.eq(dewhitened),
                    self.lfsr.run_strobe.eq(1),
                    self.crc.input.eq(dewhitened),
                    self.crc.en.eq(1),
                ]
                with m.If(payload_sec_header_idx == 15):
                    m.d.sync += payload_sec_read.eq(0)
                    m.next = "READ_PAYLOAD_SECTION_CONTENT"
            with m.Else():
                m.d.sync += [self.lfsr.run_strobe.eq(0), self.crc.en.eq(0)]
            # If we previously were in READ_PAYLOAD_SECTION_CONTENT, null terminate
            # what we read
            m.d.comb += [
                wport.addr.eq((payload_sec_read >> 3) + 1),
                wport.en.eq(1),
                wport.data.eq(0)
            ]

        with m.State("READ_PAYLOAD_SECTION_CONTENT"):
            with m.If(self.sample):
                m.d.sync += [
                    payload_read.eq(payload_read + 1),
                    payload_sec_read.eq(payload_sec_read + 1),
                    self.currentbit.eq(dewhitened),
                    self.lfsr.run_strobe.eq(1),
                    self.crc.input.eq(dewhitened),
                    self.crc.en.eq(1),
                ]
                # m.d.comb += self.debug.eq(payload_sec_type == 0x9)
                with m.If((payload_sec_type == 0x9) | (payload_sec_type == 0x8)
                          ):  # If this section is a complete local name
                    # Assemble one byte at a time; on the 8th bit of a byte,
                    # commit it to the payload memory.
                    idx = payload_read & 0x7
                    with m.If(idx == 0):
                        m.d.sync += payload_byte.eq(dewhitened)
                    with m.Else():
                        m.d.sync += payload_byte.eq(payload_byte
                                                    | (dewhitened << idx))
                    with m.If(idx == 0b111):
                        m.d.comb += [
                            wport.addr.eq(payload_sec_read >> 3),
                            wport.en.eq(1),
                            wport.data.eq(payload_byte | (dewhitened << idx))
                        ]
                        m.d.sync += should_print.eq(should_print | 1)
                # End of whole payload takes priority over end of section.
                with m.If((payload_read + 1) >= size << 3):
                    m.next = "READ_CRC"
                    m.d.sync += [crc_idx.eq(0), crc.eq(0)]
                with m.Else():
                    with m.If((payload_sec_read + 1) == (payload_sec_size - 1)
                              << 3):
                        m.d.sync += [
                            payload_sec_header.eq(0),
                            payload_sec_header_idx.eq(0)
                        ]
                        m.next = "READ_PAYLOAD_SECTION_HEADER"
            with m.Else():
                m.d.sync += [self.lfsr.run_strobe.eq(0), self.crc.en.eq(0)]

        with m.State("READ_CRC"):
            # Shift in the 24 received CRC bits (CRC engine disabled here).
            with m.If(self.sample):
                m.d.sync += [
                    crc_idx.eq(crc_idx + 1),
                    crc.eq(crc | (dewhitened << crc_idx)),
                    self.currentbit.eq(dewhitened),
                    self.lfsr.run_strobe.eq(1),
                    self.crc.en.eq(0),
                ]
                with m.If(crc_idx == 23):
                    m.next = "CHECK_CRC"
            with m.Else():
                m.d.sync += [self.lfsr.run_strobe.eq(0), self.crc.en.eq(0)]

        with m.State("CHECK_CRC"):
            # Only read out packets whose CRC matched and that stored a name.
            with m.If(crc_matches & should_print):
                m.next = "START_READOUT"
            with m.Else():
                m.next = "IDLE"
                m.d.comb += self.done.eq(1)
                m.d.sync += [
                    self.lfsr.run_strobe.eq(0),
                    self.lfsr.reset.eq(1),
                    self.crc.reset.eq(1)
                ]

        with m.State("START_READOUT"):
            # If we previously were in READ_PAYLOAD_SECTION_CONTENT, null terminate
            # what we read
            m.d.comb += [
                wport.addr.eq((payload_sec_read >> 3) + 1),
                wport.en.eq(1),
                wport.data.eq(0)
            ]
            m.d.comb += self.debug.eq(1)
            if self.printer:
                m.d.comb += self.printer.start.eq(1)
                m.next = "WAIT_READOUT"
            else:
                m.d.comb += self.done.eq(1)
                m.next = "IDLE"

        with m.State("WAIT_READOUT"):
            if self.printer:
                with m.If(self.printer.done):
                    m.next = "IDLE"
                    m.d.comb += self.done.eq(1)
                    m.d.sync += [
                        self.lfsr.run_strobe.eq(0),
                        self.lfsr.reset.eq(1),
                        self.crc.reset.eq(1)
                    ]
            else:
                # Invalid: this state is unreachable without a printer.
                pass

    return m
def elaborate(self, platform):
    """Build a FIFO with commit/discard semantics on both ports.

    Reads and writes each track a *current* pointer and a *committed*
    pointer, so a partially-written (or partially-read) transaction can be
    either committed atomically or rolled back.
    """
    m = Module()

    # Range shortcuts for internal signals.
    # depth + 1 entries: one slot is sacrificed to distinguish full/empty.
    address_range = range(0, self.depth + 1)

    #
    # Core internal "backing store".
    #
    memory = Memory(width=self.width, depth=self.depth + 1, name=self.name)
    m.submodules.read_port = read_port = memory.read_port()
    m.submodules.write_port = write_port = memory.write_port()

    # Always connect up our memory's data/en ports to ours.
    m.d.comb += [
        self.read_data.eq(read_port.data),
        write_port.data.eq(self.write_data),
        write_port.en.eq(self.write_en & ~self.full)
    ]

    #
    # Write port.
    #

    # We'll track two pieces of data: our _committed_ write position, and our current un-committed write one.
    # This will allow us to rapidly backtrack to our pre-commit position.
    committed_write_pointer = Signal(address_range)
    current_write_pointer = Signal(address_range)
    m.d.comb += write_port.addr.eq(current_write_pointer)

    # Compute the location for the next write, accounting for wraparound. We'll not assume a binary-sized
    # buffer; so we'll compute the wraparound manually.
    next_write_pointer = Signal.like(current_write_pointer)
    with m.If(current_write_pointer == self.depth):
        m.d.comb += next_write_pointer.eq(0)
    with m.Else():
        m.d.comb += next_write_pointer.eq(current_write_pointer + 1)

    # If we're writing to the fifo, update our current write position.
    with m.If(self.write_en & ~self.full):
        m.d.sync += current_write_pointer.eq(next_write_pointer)

    # If we're committing a FIFO write, update our committed position.
    with m.If(self.write_commit):
        m.d.sync += committed_write_pointer.eq(current_write_pointer)

    # If we're discarding our current write, reset our current position,
    with m.If(self.write_discard):
        m.d.sync += current_write_pointer.eq(committed_write_pointer)

    #
    # Read port.
    #

    # We'll track two pieces of data: our _committed_ read position, and our current un-committed read one.
    # This will allow us to rapidly backtrack to our pre-commit position.
    committed_read_pointer = Signal(address_range)
    current_read_pointer = Signal(address_range)

    # Compute the location for the next read, accounting for wraparound. We'll not assume a binary-sized
    # buffer; so we'll compute the wraparound manually.
    next_read_pointer = Signal.like(current_read_pointer)
    with m.If(current_read_pointer == self.depth):
        m.d.comb += next_read_pointer.eq(0)
    with m.Else():
        m.d.comb += next_read_pointer.eq(current_read_pointer + 1)

    # Our memory always takes a single cycle to provide its read output; so we'll update its address
    # "one cycle in advance". Accordingly, if we're about to advance the FIFO, we'll use the next read
    # address as our input. If we're not, we'll use the current one.
    with m.If(self.read_en & ~self.empty):
        m.d.comb += read_port.addr.eq(next_read_pointer)
    with m.Else():
        m.d.comb += read_port.addr.eq(current_read_pointer)

    # If we're reading from our the fifo, update our current read position.
    with m.If(self.read_en & ~self.empty):
        m.d.sync += current_read_pointer.eq(next_read_pointer)

    # If we're committing a FIFO read, update our committed read position.
    with m.If(self.read_commit):
        m.d.sync += committed_read_pointer.eq(current_read_pointer)

    # If we're discarding our current read, reset our current read position.
    with m.If(self.read_discard):
        m.d.sync += current_read_pointer.eq(committed_read_pointer)

    #
    # FIFO status.
    #

    # Our FIFO is empty if our read and write pointers are in the same place. We'll use the current
    # read position (which leads ahead) and the committed write position (which lags behind).
    m.d.comb += self.empty.eq(current_read_pointer == committed_write_pointer)

    # For our space available, we'll use the current write position (which leads ahead) and our committed
    # read position (which lags behind). This yields two cases: one where the buffer isn't wrapped around,
    # and one where it is.
    with m.If(self.full):
        m.d.comb += self.space_available.eq(0)
    with m.Elif(committed_read_pointer <= current_write_pointer):
        m.d.comb += self.space_available.eq(
            self.depth - (current_write_pointer - committed_read_pointer))
    with m.Else():
        m.d.comb += self.space_available.eq(
            committed_read_pointer - current_write_pointer - 1)

    # Our FIFO is full if we don't have any space available.
    m.d.comb += self.full.eq(next_write_pointer == committed_read_pointer)

    # If we're not supposed to be in the sync domain, rename our sync domain to the target.
    if self.domain != "sync":
        m = DomainRenamer({"sync": self.domain})(m)

    return m
class IntegratedLogicAnalyzer(Elaboratable):
    """ Super-simple integrated-logic-analyzer generator class for LUNA.

    Attributes
    ----------
    trigger: Signal(), input
        A strobe that determines when we should start sampling.
    sampling: Signal(), output
        Indicates when sampling is in progress.
    complete: Signal(), output
        Indicates when sampling is complete and ready to be read.

    captured_sample_number: Signal(), input
        Selects which sample the ILA will output. Effectively the address for the ILA's
        sample buffer.
    captured_sample: Signal(), output
        The sample corresponding to the relevant sample number.
        Can be broken apart by using Cat(*signals).

    Parameters
    ----------
    signals: iterable of Signals
        An iterable of signals that should be captured by the ILA.
    sample_depth: int
        The depth of the desired buffer, in samples.

    domain: string
        The clock domain in which the ILA should operate.
    sample_rate: float
        Cosmetic indication of the sample rate. Used to format output.
    samples_pretrigger: int
        The number of our samples which should be captured _before_ the trigger.
        This also can act like an implicit synchronizer; so asynchronous inputs
        are allowed if this number is >= 2. Note that the trigger strobe is read
        on the rising edge of the clock.
    """

    def __init__(self,
                 *,
                 signals,
                 sample_depth,
                 domain="sync",
                 sample_rate=60e6,
                 samples_pretrigger=1):
        self.domain = domain
        self.signals = signals
        # All captured signals concatenated into a single word; its width
        # determines the sample memory's width.
        self.inputs = Cat(*signals)
        self.sample_width = len(self.inputs)
        self.sample_depth = sample_depth
        self.samples_pretrigger = samples_pretrigger
        self.sample_rate = sample_rate
        self.sample_period = 1 / sample_rate

        #
        # Create a backing store for our samples.
        #
        self.mem = Memory(width=self.sample_width,
                          depth=sample_depth,
                          name="ila_buffer")

        #
        # I/O port
        #
        self.trigger = Signal()
        self.sampling = Signal()
        self.complete = Signal()

        self.captured_sample_number = Signal(range(0, self.sample_depth))
        self.captured_sample = Signal(self.sample_width)

    def elaborate(self, platform):
        m = Module()

        # TODO: switch this to a single-port RAM

        # Memory ports.
        write_port = self.mem.write_port()
        read_port = self.mem.read_port(domain='comb')
        m.submodules += [write_port, read_port]

        # If necessary, create synchronized versions of the relevant signals.
        if self.samples_pretrigger >= 2:
            delayed_inputs = Signal.like(self.inputs)
            m.submodules += FFSynchronizer(self.inputs,
                                           delayed_inputs,
                                           stages=self.samples_pretrigger)
        elif self.samples_pretrigger == 1:
            delayed_inputs = Signal.like(self.inputs)
            m.d.sync += delayed_inputs.eq(self.inputs)
        else:
            # No pretrigger: capture the inputs directly.
            delayed_inputs = self.inputs

        # Counter that keeps track of our write position.
        write_position = Signal(range(0, self.sample_depth))

        # Set up our write port to capture the input signals,
        # and our read port to provide the output.
        m.d.comb += [
            write_port.data.eq(delayed_inputs),
            write_port.addr.eq(write_position),
            self.captured_sample.eq(read_port.data),
            read_port.addr.eq(self.captured_sample_number)
        ]

        # NOTE(review): looks like a debug leftover — `test` is 1 bit wide
        # while read_port.addr is wider, so this truncates; confirm it is
        # still needed before removing.
        self.test = Signal()
        m.d.comb += self.test.eq(read_port.addr)

        # Don't sample unless our FSM asserts our sample signal explicitly.
        m.d.sync += write_port.en.eq(0)

        with m.FSM() as fsm:

            m.d.comb += self.sampling.eq(~fsm.ongoing("IDLE"))

            # IDLE: wait for the trigger strobe
            with m.State('IDLE'):

                with m.If(self.trigger):
                    m.next = 'SAMPLE'

                    # Grab a sample as our trigger is asserted.
                    m.d.sync += [
                        write_port.en.eq(1),
                        write_position.eq(0),
                        self.complete.eq(0),
                    ]

            # SAMPLE: do our sampling
            with m.State('SAMPLE'):

                # Sample until we run out of samples.
                m.d.sync += [
                    write_port.en.eq(1),
                    write_position.eq(write_position + 1),
                ]

                # If this is the last sample, we're done. Finish up.
                with m.If(write_position + 1 == self.sample_depth):
                    m.next = "IDLE"
                    m.d.sync += [self.complete.eq(1), write_port.en.eq(0)]

        # Convert our sync domain to the domain requested by the user, if necessary.
        if self.domain != "sync":
            m = DomainRenamer({"sync": self.domain})(m)

        return m
def elaborate(self, platform): m = Module() size = self.configuration.getOption('predictor', 'size') if size == 0 or (size & (size - 1)): raise ValueError(f'size must be a power of 2: {size}') _bits_index = log2_int(size) _bits_tag = 32 - _bits_index _btb_width = 1 + 32 + _bits_tag # valid + data + tag _btb_depth = 1 << _bits_index _btb_layout = [('target', 32), ('tag', _bits_tag), ('valid', 1)] _pc_layout = [('index', _bits_index), ('tag', _bits_tag)] btb = Memory(width=_btb_width, depth=_btb_depth) btb_rp = btb.read_port() btb_wp = btb.write_port() bht = Memory(width=2, depth=_btb_depth) bht_rp = bht.read_port() bht_wp = bht.write_port() m.submodules += btb_rp, btb_wp m.submodules += bht_rp, bht_wp btb_r = Record(_btb_layout) a_pc = Record(_pc_layout) f_pc = Record(_pc_layout) m_pc = Record(_pc_layout) hit = Signal() pstate_next = Signal(2) m.d.comb += [ btb_rp.addr.eq(Mux(self.a_stall, f_pc.index, a_pc.index)), bht_rp.addr.eq(Mux(self.a_stall, f_pc.index, a_pc.index)), btb_r.eq(btb_rp.data), # a_pc.eq(self.a_pc), f_pc.eq(self.f_pc), hit.eq(btb_r.valid & (btb_r.tag == f_pc.tag)), # self.f_prediction.eq(hit & bht_rp.data[1]), self.f_prediction_state.eq(bht_rp.data), self.f_prediction_pc.eq(btb_r.target) ] # update m.d.comb += [ btb_wp.addr.eq(m_pc.index), btb_wp.data.eq(Cat(self.m_target_pc, m_pc.tag, 1)), btb_wp.en.eq(self.m_update), bht_wp.addr.eq(m_pc.index), bht_wp.data.eq(pstate_next), bht_wp.en.eq(self.m_update), m_pc.eq(self.m_pc), pstate_next.eq(0) ] with m.Switch(Cat(self.m_prediction_state, self.m_take_jmp_branch)): with m.Case(0b000, 0b001): m.d.comb += pstate_next.eq(0b00) with m.Case(0b010, 0b100): m.d.comb += pstate_next.eq(0b01) with m.Case(0b011, 0b101): m.d.comb += pstate_next.eq(0b10) with m.Case(0b110, 0b111): m.d.comb += pstate_next.eq(0b11) return m
from nmigen import Memory, Signal, Module from nmigen import Elaboratable mem = Memory(width=32, depth=16) rp = mem.read_port() wp = mem.write_port() m = Module() m.submodules.rp = rp m.submodules.wp = wp class Check(Elaboratable): def __init__(self): self.check_in = Signal(range(4)) self.check_out = Signal(range(4)) def elaborate(self, platform): m = Module() m.d.sync += self.check_out.eq(self.check_in) return m check = Check() m.submodules.check = check from nmigen.back.pysim import Simulator, Delay, Settle sim = Simulator(m) sim.add_clock(1e-6)
def elaborate(self, platform):
    """Build a small picorv32 SoC.

    Compiles the Rust firmware with ``cargo objcopy``, loads the raw binary
    into a 32-bit-wide memory behind the CPU's native memory interface, and
    wires up the memory-mapped register accesses described by
    ``self.memory_mappings``.

    Memory layout (in words): ``[0, RAM_SIZE)`` is zero-initialised RAM; the
    app image follows at word ``RAM_SIZE`` (byte address 1024, matching
    ``PROGADDR_RESET``).

    Raises:
        subprocess.CalledProcessError: if the firmware build fails.
    """
    if platform is not None:
        # Hand the Verilog source to the build. add_file consumes the stream
        # eagerly, so the handle can be closed immediately (it previously
        # leaked: the file was opened and never closed).
        with open("picorv32.v", "r") as verilog_src:
            platform.add_file("picorv32.v", verilog_src)

    if not os.path.exists("build"):
        os.makedirs("build")

    # Build the firmware; the raw binary lands in build/app.bin
    # (path is relative to cwd="app").
    subprocess.run(
        [
            "cargo", "objcopy", "--release", "--",
            "-O", "binary", "../build/app.bin"
        ],
        cwd="app",
    ).check_returncode()

    with open("build/app.bin", "rb") as f:
        b = bytearray(f.read())

    # Pad to a whole number of 32-bit words. (Fixed: the previous
    # `4 - (len(b) % 4)` appended a spurious zero word when the binary
    # was already aligned.)
    b.extend([0] * (-len(b) % 4))
    app = np.frombuffer(b, dtype='<u4').tolist()  # little-endian words

    RAM_SIZE = 256  # words of zeroed RAM placed before the app image
    init = ([0] * RAM_SIZE) + app
    MEM_SIZE = len(init)  # words

    mem = Memory(
        width=32,
        depth=MEM_SIZE,
        init=init,
    )

    resetn = Signal()
    mem_valid = Signal()
    mem_ready = Signal()
    mem_addr = Signal(32)
    mem_wdata = Signal(32)
    mem_wstrb = Signal(4)
    mem_rdata = Signal(32)

    m = Module()
    # picorv32 uses an active-low reset.
    m.d.comb += resetn.eq(~ResetSignal())

    m.submodules.picorv32 = Instance(
        "picorv32",
        p_ENABLE_COUNTERS=0,
        p_LATCHED_MEM_RDATA=1,
        p_TWO_STAGE_SHIFT=0,
        p_TWO_CYCLE_ALU=1,
        p_CATCH_MISALIGN=0,
        p_CATCH_ILLINSN=0,
        p_COMPRESSED_ISA=1,
        p_ENABLE_MUL=1,
        p_PROGADDR_RESET=1024,
        p_PROGADDR_IRQ=1024 + 0x10,
        i_clk=ClockSignal(),
        i_resetn=resetn,
        o_mem_valid=mem_valid,
        i_mem_ready=mem_ready,
        o_mem_addr=mem_addr,
        o_mem_wdata=mem_wdata,
        o_mem_wstrb=mem_wstrb,
        i_mem_rdata=mem_rdata,
    )

    m.submodules.read_port = read_port = mem.read_port(transparent=False)
    m.submodules.write_port = write_port = mem.write_port(granularity=8)

    # mem_ready defaults low each cycle; the request logic below pulses it
    # for one cycle per completed access (last sync assignment wins).
    m.d.sync += mem_ready.eq(0)
    m.d.comb += [
        # CPU addresses are byte addresses; the memory is word-indexed.
        read_port.addr.eq(mem_addr >> 2),
        mem_rdata.eq(read_port.data),
        read_port.en.eq((~mem_wstrb).bool()),  # read only when not writing
        write_port.addr.eq(mem_addr >> 2),
        write_port.data.eq(mem_wdata),
        write_port.en.eq(mem_wstrb),  # per-byte write strobes
    ]

    with m.If(resetn & mem_valid & ~mem_ready):
        # Accesses that hit the RAM/app image complete in one cycle.
        with m.If((mem_addr >> 2) < MEM_SIZE):
            m.d.sync += mem_ready.eq(1)
        # Memory-mapped registers living outside the RAM image.
        for mapping in self.memory_mappings:
            if mapping.writing_enabled:
                with m.If(mem_wstrb.bool() & (mem_addr == mapping.addr)):
                    if mapping.write is not None:
                        # Custom write hook supplies its own logic.
                        # NOTE(review): presumably the hook also asserts
                        # mem_ready — confirm, or the CPU would stall here.
                        mapping.write(m, mem_wdata)
                    else:
                        m.d.sync += [
                            mapping.signal.eq(mem_wdata),
                            mem_ready.eq(1),
                        ]
            if mapping.read:
                with m.If((~mem_wstrb).bool() & (mem_addr == mapping.addr)):
                    # Override the RAM read data with the mapped signal.
                    m.d.comb += mem_rdata.eq(mapping.signal)
                    m.d.sync += mem_ready.eq(1)
            if not mapping.read and not (mapping.write or mapping.writing_enabled):
                print(mapping.addr)
                print("mapping doesn't specify read or write", file=sys.stderr)
    return m
def elaborate(self, platform):
    """Build an N-way set-associative cache.

    Each way has its own tag/data memories and per-line valid bits. A 1-bit
    per-line flag selects the way to replace, and a two-state FSM drives the
    backing bus to refill a whole line on a read miss. When ``enable_write``
    is set, CPU write hits byte-merge into the cached word.
    """
    m = Module()

    # Per-way view: a full line of data, its tag, the valid flag, and the
    # replacement-select strobe. `sel_we` only exists for writable caches.
    way_layout = [
        ('data', 32 * self.nwords),
        ('tag', self.s1_address.tag.shape()),
        ('valid', 1),
        ('sel_lru', 1)
    ]
    if self.enable_write:
        way_layout.append(('sel_we', 1))

    ways = Array(Record(way_layout) for _way in range(self.nways))
    fill_cnt = Signal.like(self.s1_address.offset)

    # set the LRU
    if self.nways == 1:
        # Single way: replacement choice is constant, no state needed.
        lru = Const(0)  # self.nlines
    else:
        # One bit per line; toggled on each completed refill so the other
        # way is picked next time (2-way pseudo-LRU).
        lru = Signal(self.nlines)
        with m.If(self.bus_valid & self.bus_ack & self.bus_last):  # err ^ ack == 1
            _lru = lru.bit_select(self.s2_address.line, 1)
            m.d.sync += lru.bit_select(self.s2_address.line, 1).eq(~_lru)

    # hit/miss
    way_hit = m.submodules.way_hit = Encoder(self.nways)
    for idx, way in enumerate(ways):
        m.d.comb += way_hit.i[idx].eq((way.tag == self.s2_address.tag) & way.valid)

    # Miss when no way matched (encoder's "none" output).
    m.d.comb += self.s2_miss.eq(way_hit.n)
    if self.enable_write:
        # Route CPU writes to the hitting way.
        m.d.comb += ways[way_hit.o].sel_we.eq(self.s2_we & self.s2_valid)

    # read data: select the requested 32-bit word out of the hitting line.
    m.d.comb += self.s2_rdata.eq(ways[way_hit.o].data.word_select(self.s2_address.offset, 32))

    with m.FSM():
        with m.State('READ'):
            with m.If(self.s2_re & self.s2_miss & self.s2_valid):
                # Start a refill at the missing word; the burst wraps around
                # the line and stops one word before where it began.
                m.d.sync += [
                    self.bus_addr.eq(self.s2_address),  # WARNING extra_bits
                    self.bus_valid.eq(1),
                    fill_cnt.eq(self.s2_address.offset - 1)
                ]
                m.next = 'REFILL'

        with m.State('REFILL'):
            # Last beat when the running offset reaches the stop count.
            m.d.comb += self.bus_last.eq(fill_cnt == self.bus_addr.offset)
            with m.If(self.bus_ack):
                m.d.sync += self.bus_addr.offset.eq(self.bus_addr.offset + 1)
            with m.If(self.bus_ack & self.bus_last | self.bus_err):
                m.d.sync += self.bus_valid.eq(0)
            with m.If(~self.bus_valid | self.s1_flush):
                # in case of flush, abort ongoing refill.
                m.next = 'READ'
                m.d.sync += self.bus_valid.eq(0)

    # mark the way to use (replace)
    m.d.comb += ways[lru.bit_select(self.s2_address.line, 1)].sel_lru.eq(self.bus_valid)

    # generate for N ways
    for way in ways:
        # create the memory structures for valid, tag and data.
        valid = Signal(self.nlines)  # one valid bit per line

        tag_m = Memory(width=len(way.tag), depth=self.nlines)
        tag_rp = tag_m.read_port()
        tag_wp = tag_m.write_port()
        m.submodules += tag_rp, tag_wp

        data_m = Memory(width=len(way.data), depth=self.nlines)
        data_rp = data_m.read_port()
        # One write-enable bit per 32-bit word within the line.
        data_wp = data_m.write_port(granularity=32)
        m.submodules += data_rp, data_wp

        # handle valid
        with m.If(self.s1_flush & self.s1_valid):  # flush
            m.d.sync += valid.eq(0)
        with m.Elif(way.sel_lru & self.bus_last & self.bus_ack):  # refill ok
            m.d.sync += valid.bit_select(self.bus_addr.line, 1).eq(1)
        with m.Elif(way.sel_lru & self.bus_err):  # refill error
            m.d.sync += valid.bit_select(self.bus_addr.line, 1).eq(0)
        with m.Elif(self.s2_evict & self.s2_valid & (way.tag == self.s2_address.tag)):  # evict
            m.d.sync += valid.bit_select(self.s2_address.line, 1).eq(0)

        # assignments
        m.d.comb += [
            # Read the line addressed by stage 1 (held at the stage-2 line
            # while stalled, keeping the sync read pipeline-aligned).
            tag_rp.addr.eq(Mux(self.s1_stall, self.s2_address.line, self.s1_address.line)),
            tag_wp.addr.eq(self.bus_addr.line),
            tag_wp.data.eq(self.bus_addr.tag),
            tag_wp.en.eq(way.sel_lru & self.bus_ack & self.bus_last),
            data_rp.addr.eq(Mux(self.s1_stall, self.s2_address.line, self.s1_address.line)),
            way.data.eq(data_rp.data),
            way.tag.eq(tag_rp.data),
            way.valid.eq(valid.bit_select(self.s2_address.line, 1))
        ]

        # update cache: CPU or Refill
        if self.enable_write:
            update_addr = Signal(len(data_wp.addr))
            update_data = Signal(len(data_wp.data))
            update_we = Signal(len(data_wp.en))
            aux_wdata = Signal(32)

            with m.If(self.bus_valid):
                # Refill: replicate the bus word across the line and enable
                # only the slot currently being filled.
                m.d.comb += [
                    update_addr.eq(self.bus_addr.line),
                    update_data.eq(Repl(self.bus_data, self.nwords)),
                    update_we.bit_select(self.bus_addr.offset, 1).eq(way.sel_lru & self.bus_ack),
                ]
            with m.Else():
                # CPU write hit: write the byte-merged word into its slot.
                m.d.comb += [
                    update_addr.eq(self.s2_address.line),
                    update_data.eq(Repl(aux_wdata, self.nwords)),
                    update_we.bit_select(self.s2_address.offset, 1).eq(way.sel_we & ~self.s2_miss)
                ]
            m.d.comb += [
                # Byte merge: per byte-select bit, take the written byte from
                # s2_wdata and keep the other bytes from the cached word.
                aux_wdata.eq(Cat(
                    Mux(self.s2_sel[0], self.s2_wdata.word_select(0, 8),
                        self.s2_rdata.word_select(0, 8)),
                    Mux(self.s2_sel[1], self.s2_wdata.word_select(1, 8),
                        self.s2_rdata.word_select(1, 8)),
                    Mux(self.s2_sel[2], self.s2_wdata.word_select(2, 8),
                        self.s2_rdata.word_select(2, 8)),
                    Mux(self.s2_sel[3], self.s2_wdata.word_select(3, 8),
                        self.s2_rdata.word_select(3, 8))
                )),
                #
                data_wp.addr.eq(update_addr),
                data_wp.data.eq(update_data),
                data_wp.en.eq(update_we),
            ]
        else:
            # Read-only cache: only refills ever write the data memory.
            m.d.comb += [
                data_wp.addr.eq(self.bus_addr.line),
                data_wp.data.eq(Repl(self.bus_data, self.nwords)),
                data_wp.en.bit_select(self.bus_addr.offset, 1).eq(way.sel_lru & self.bus_ack),
            ]
    return m