def __init__(self, pads, *, ser_latency, des_latency, serdes_reset_cnt=0, **kwargs):
    """Wrap the parent PHY with one extra serialization stage.

    The parent constructor is called with ``Serializer.LATENCY`` /
    ``Deserializer.LATENCY`` (in ``sys`` cycles) added to the given latencies.
    The bundle the parent created is kept as ``self._out`` and a new
    ``LPDDR4Output`` with ``self.nphases // 2`` phases replaces ``self.out``.
    Every attribute of the new bundle is then bridged to the attribute of the
    same name on the parent's bundle:

    * names ending in ``_oe`` are delayed by ``Serializer.LATENCY`` cycles,
    * names ending in ``_i`` go through a ``Deserializer``
      (new bundle -> parent bundle),
    * everything else goes through a ``Serializer``
      (parent bundle -> new bundle).

    Args:
        pads: Forwarded unchanged to the parent constructor.
        ser_latency: Serialization latency of the stages outside this class.
        des_latency: Deserialization latency of the stages outside this class.
        serdes_reset_cnt: Passed as ``reset_cnt`` to every Serializer and
            Deserializer instantiated here.
        **kwargs: Forwarded unchanged to the parent constructor.
    """
    super().__init__(
        pads,
        ser_latency=ser_latency + Latency(sys=Serializer.LATENCY),
        des_latency=des_latency + Latency(sys=Deserializer.LATENCY),
        **kwargs,
    )

    # Keep the parent's bundle reachable; callers of this class see the half-width one.
    self._out = self.out
    self.out = LPDDR4Output(nphases=self.nphases // 2, databits=self.databits)

    def serialize(i, o):
        # The serialized side must be exactly half as wide as its source.
        assert len(o) == len(i) // 2
        self.submodules += Serializer(
            clkdiv="sys",
            clk="sys2x",
            i_dw=len(i),
            o_dw=len(o),
            i=i,
            o=o,
            reset_cnt=serdes_reset_cnt,
        )

    def deserialize(i, o):
        # Mirror image of `serialize`: the source is the narrow side.
        assert len(i) == len(o) // 2
        self.submodules += Deserializer(
            clkdiv="sys",
            clk="sys2x",
            i_dw=len(i),
            o_dw=len(o),
            i=i,
            o=o,
            reset_cnt=serdes_reset_cnt,
        )

    def paired(i, o):
        # Bundle fields are either lists of signals (like dq) or a lone signal (like cs);
        # present both shapes as (input, output) pairs.
        if isinstance(i, list):
            return zip(i, o)
        return [(i, o)]

    for field in vars(self.out):
        parent_sig = getattr(self._out, field)
        exposed_sig = getattr(self.out, field)
        if field.endswith("_oe"):
            # Output enables are not serialized, only delayed to stay aligned with the data.
            self.comb += exposed_sig.eq(
                delayed(self, parent_sig, cycles=Serializer.LATENCY))
        elif field.endswith("_i"):
            # Inputs travel from the exposed bundle back into the parent's bundle.
            for narrow, wide in paired(exposed_sig, parent_sig):
                deserialize(i=narrow, o=wide)
        else:
            # Everything else is an output of the parent.
            for wide, narrow in paired(parent_sig, exposed_sig):
                serialize(i=wide, o=narrow)
def __init__(self, pads, data_cdc, *, clk_freq, log_level, init_delays=False):
    """Build the command-decoding side of the LPDDR4 simulation model.

    The constructor creates the logger, the mode-register / bank-state storage,
    the CS/CA capture logic, one handler per command type, a set of
    ``PulseTiming`` timers and an FSM with the states ``RESET``, ``EXIT-PD``,
    ``MRW``, ``ZQC`` and ``NORMAL``. Timing violations and unexpected commands
    are reported through ``self.log``.

    Args:
        pads: Pads object; this constructor reads ``cs``, ``ca``, ``cke`` and
            ``reset_n`` from it.
        data_cdc: Module stored as ``self.data`` and added as a submodule.
        clk_freq: Clock frequency, used to convert the timing constants given
            in seconds into clock cycles (and forwarded to ``SimLogger``).
        log_level: Forwarded to ``SimLogger``.
        init_delays: When false, the ``RESET`` and ``EXIT-PD`` states are left
            without waiting for their timers and the tZQCAL/tZQLAT checks are
            disabled.
    """
    self.submodules.log = log = SimLogger(log_level=log_level, clk_freq=clk_freq)
    self.log.add_csrs()

    # Mode Registers storage: 64 registers, 8 bits each
    self.mode_regs = Array([Signal(8) for _ in range(64)])
    # Active banks: one flag and one 17-bit row value per bank (8 banks)
    self.active_banks = Array([Signal() for _ in range(8)])
    self.active_rows = Array([Signal(17) for _ in range(8)])
    # Connection to DataSim
    self.data_en = TappedDelayLine(ntaps=20)
    self.data = data_cdc
    self.submodules += self.data, self.data_en

    # CS/CA shift registers (2 taps each)
    cs = TappedDelayLine(pads.cs, ntaps=2)
    ca = TappedDelayLine(pads.ca, ntaps=2)
    self.submodules += cs, ca

    # CA values captured for the current command, plus the strobe telling handlers to decode it
    self.cs_low = Signal(6)
    self.cs_high = Signal(6)
    self.handle_cmd = Signal()
    self.mpc_op = Signal(7)

    # Driven by the FSM below: commands are only decoded in states that set it.
    cmds_enabled = Signal()
    # NOTE(review): each *_handler() is defined elsewhere in the class; the values are used
    # below as 1-bit "this handler matched" expressions - confirm against their definitions.
    cmd_handlers = OrderedDict(
        MRW=self.mrw_handler(),
        REF=self.refresh_handler(),
        ACT=self.activate_handler(),
        PRE=self.precharge_handler(),
        CAS=self.cas_handler(),
        MPC=self.mpc_handler(),
    )
    self.comb += [
        If(
            cmds_enabled,
            # cs.taps[1] set and cs.taps[0] clear; presumably taps[0] is the newest sample,
            # i.e. a CS-high cycle followed by a CS-low cycle - confirm in TappedDelayLine.
            If(
                Cat(cs.taps) == 0b10,
                self.handle_cmd.eq(1),
                self.cs_high.eq(ca.taps[1]),
                self.cs_low.eq(ca.taps[0]),
            )),
        # A command was captured but none of the handlers claimed it
        If(
            self.handle_cmd & ~reduce(or_, cmd_handlers.values()),
            self.log.error(
                "Unexpected command: cs_high=0b%06b cs_low=0b%06b",
                self.cs_high, self.cs_low)),
    ]

    # Convert a time in seconds into a whole number of clock cycles, rounding up.
    def ck(t):
        return math.ceil(t * clk_freq)

    self.submodules.tinit0 = PulseTiming(
        ck(20e-3))  # makes no sense in simulation
    self.submodules.tinit1 = PulseTiming(ck(200e-6))
    self.submodules.tinit2 = PulseTiming(ck(10e-9))
    self.submodules.tinit3 = PulseTiming(ck(2e-3))
    self.submodules.tinit4 = PulseTiming(
        5)  # TODO: would require counting pads.clk_p ticks
    self.submodules.tinit5 = PulseTiming(ck(2e-6))
    self.submodules.tzqcal = PulseTiming(ck(1e-6))
    self.submodules.tzqlat = PulseTiming(max(8, ck(30e-9)))
    self.submodules.tpw_reset = PulseTiming(ck(100e-9))

    # Timer triggers and edge-triggered log messages for RESET_n and CKE.
    # Edges are detected by comparing each pad with a one-cycle delayed copy of itself.
    self.comb += [
        self.tinit1.trigger.eq(1),
        self.tinit2.trigger.eq(~pads.cke),
        self.tinit3.trigger.eq(pads.reset_n),
        self.tpw_reset.trigger.eq(~pads.reset_n),
        # reset_n rising edge
        If(
            ~delayed(self, pads.reset_n) & pads.reset_n,
            self.log.info("RESET released"),
            If(~self.tinit1.ready,
               self.log.warn(
                   "tINIT1 violated: RESET deasserted too fast")),
            If(
                ~self.tinit2.ready,
                self.log.warn(
                    "tINIT2 violated: CKE LOW too short before RESET being released"
                )),
        ),
        # reset_n falling edge
        If(
            delayed(self, pads.reset_n) & ~pads.reset_n,
            self.log.info("RESET asserted"),
        ),
        If(
            delayed(self, pads.cke) & ~pads.cke,
            self.log.info("CKE falling edge"),
        ),
        If(
            ~delayed(self, pads.cke) & pads.cke,
            self.log.info("CKE rising edge"),
            If(
                ~self.tinit3.ready,
                self.log.warn(
                    "tINIT3 violated: CKE set HIGH too fast after RESET being released"
                )),
        ),
    ]

    # Initialization FSM; it is reset whenever the tpw_reset timer pulses ready_p.
    self.submodules.fsm = fsm = ResetInserter()(FSM())
    self.comb += [
        If(self.tpw_reset.ready_p, fsm.reset.eq(1),
           self.log.info("FSM reset"))
    ]
    # Td/Te/Tf/Tg below are timing-point labels; presumably they refer to the LPDDR4
    # initialization sequence diagram - confirm against the specification.
    fsm.act(
        "RESET",
        If(
            self.tinit3.ready_p | (not init_delays),
            NextState("EXIT-PD")  # Td
        ))
    fsm.act(
        "EXIT-PD",
        self.tinit5.trigger.eq(1),
        If(
            self.tinit5.ready_p | (not init_delays),
            NextState("MRW")  # Te
        ))
    fsm.act(
        "MRW",
        cmds_enabled.eq(1),
        # Anything other than MRW or MPC is only warned about, not rejected
        If(
            self.handle_cmd & ~cmd_handlers["MRW"] & ~cmd_handlers["MPC"],
            self.log.warn(
                "Only MRW/MRR commands expected before ZQ calibration"),
            # Second warning dumps every handler's match flag as "NAME=%d"
            self.log.warn(
                " ".join("{}=%d".format(cmd) for cmd in cmd_handlers.keys()),
                *cmd_handlers.values()),
        ),
        # The only MPC operation that leaves this state is ZQC_START
        If(
            cmd_handlers["MPC"],
            If(
                self.mpc_op != MPC.ZQC_START,
                self.log.error("ZQC-START expected, got op=0b%07b",
                               self.mpc_op)).Else(NextState("ZQC")  # Tf
                                                  )),
    )
    fsm.act(
        "ZQC",
        self.tzqcal.trigger.eq(1),
        cmds_enabled.eq(1),
        # Any command other than MPC with op ZQC_LATCH is an error in this state
        If(
            self.handle_cmd,
            If(~(cmd_handlers["MPC"] & (self.mpc_op == MPC.ZQC_LATCH)),
               self.log.error("Expected ZQC-LATCH")).Else(
                   If(init_delays & ~self.tzqcal.ready,
                      self.log.warn("tZQCAL violated")),
                   NextState("NORMAL")  # Tg
               )),
    )
    fsm.act(
        "NORMAL",
        cmds_enabled.eq(1),
        self.tzqlat.trigger.eq(1),
        # A command arriving before the tzqlat timer is ready is reported (only with init_delays)
        If(init_delays & self.handle_cmd & ~self.tzqlat.ready,
           self.log.warn("tZQLAT violated")),
    )

    # Log state transitions
    # finalize() is called first so that fsm.state and fsm.decoding can be used below.
    fsm.finalize()
    prev_state = delayed(self, fsm.state)
    # Nested Case over (previous state, current state): one info message per possible pair,
    # evaluated only in cycles where the state differs from its delayed copy.
    self.comb += If(
        prev_state != fsm.state,
        Case(
            prev_state, {
                state: Case(
                    fsm.state, {
                        next_state: self.log.info(
                            f"FSM: {state_name} -> {next_state_name}")
                        for next_state, next_state_name in
                        fsm.decoding.items()
                    })
                for state, state_name in fsm.decoding.items()
            }))
def __init__(self, aligned_reset_zero=False, **kwargs):
    """Build the simulation PHY on top of freshly created simulation pads.

    Creates ``LPDDR4SimulationPads``, initializes the parent with
    ``phytype="LPDDR4SimPHY"`` and one Serializer/Deserializer stage of
    latency, then connects every field of ``self.out`` to the pads through
    ``self.ser`` / ``self.des``. Output-enable fields are only delayed by
    ``Serializer.LATENCY`` cycles.

    Args:
        aligned_reset_zero: When true, ``reset_cnt=0`` is passed to the
            serializers/deserializers that use the ``sys8x`` and
            ``sys8x_ddr`` clock settings (the 90-degree variants are left
            untouched).
        **kwargs: Forwarded unchanged to the parent constructor.
    """
    pads = LPDDR4SimulationPads()
    self.submodules += pads
    super().__init__(
        pads,
        ser_latency=Latency(sys=Serializer.LATENCY),
        des_latency=Latency(sys=Deserializer.LATENCY),
        phytype="LPDDR4SimPHY",
        **kwargs,
    )

    # Fake delay settings and CSRs: they make no sense in simulation, but sdram.c expects them.
    self.settings.read_leveling = True
    self.settings.delays = 1
    self._rdly_dq_rst = CSR()
    self._rdly_dq_inc = CSR()

    def delay(sig, cycles):
        return delayed(self, sig, cycles=cycles)

    # Clock-domain keyword sets passed to self.ser / self.des.
    sdr = {"clkdiv": "sys", "clk": "sys8x"}
    sdr_90 = {"clkdiv": "sys", "clk": "sys8x_90"}  # defined for completeness; not used below
    ddr = {"clkdiv": "sys", "clk": "sys8x_ddr"}
    ddr_90 = {"clkdiv": "sys", "clk": "sys8x_90_ddr"}

    if aligned_reset_zero:
        for clocking in (sdr, ddr):
            clocking["reset_cnt"] = 0

    # The clock is sent negated on clk (clk_p): a 180 degree shift that puts its rising edge
    # in the middle of the SDR signals.
    self.ser(i=~self.out.clk, o=self.pads.clk, name='clk', **ddr)

    # Single-signal control lines, followed by chip select
    for sig in ("cke", "odt", "reset_n", "cs"):
        self.ser(i=getattr(self.out, sig), o=getattr(self.pads, sig), name=sig, **sdr)

    # Command/address bits
    for n in range(6):
        self.ser(i=self.out.ca[n], o=self.pads.ca[n], name=f'ca{n}', **sdr)

    # Tristate I/O (output and input paths are separate signals in simulation)
    for n in range(self.databits // 8):
        for lane, clocking in (("dmi", ddr), ("dqs", ddr_90)):
            self.ser(
                i=getattr(self.out, f"{lane}_o")[n],
                o=getattr(self.pads, f"{lane}_o")[n],
                name=f"{lane}_o{n}",
                **clocking,
            )
            self.des(
                o=getattr(self.out, f"{lane}_i")[n],
                i=getattr(self.pads, lane)[n],
                name=f"{lane}_i{n}",
                **clocking,
            )

    for n in range(self.databits):
        self.ser(i=self.out.dq_o[n], o=self.pads.dq_o[n], name=f'dq_o{n}', **ddr)
        self.des(o=self.out.dq_i[n], i=self.pads.dq[n], name=f'dq_i{n}', **ddr)

    # Output enable signals: delayed by the serializer latency instead of being serialized
    for oe in ("dmi_oe", "dqs_oe", "dq_oe"):
        self.comb += getattr(self.pads, oe).eq(
            delay(getattr(self.out, oe), cycles=Serializer.LATENCY))
def __init__(self, pads, *, sys_clk_freq, ser_latency, des_latency, phytype, cmd_delay=None,
             masked_write=True, extended_overlaps_check=False):
    """Build the technology-independent part of the LPDDR4 PHY.

    Computes CL/CWL and the derived read/write latencies and phases, declares
    the control CSRs and ``self.settings``, creates the 8-phase DFI interface
    and fills ``self.out`` (an ``LPDDR4Output``) with the clock pattern,
    commands, DQ, DQS and DMI sequences plus the output-enable controls.

    Args:
        pads: Pads object. ``len(pads.dq)`` gives the data width; an optional
            ``cs_n`` attribute sets the number of ranks (otherwise 1).
        sys_clk_freq: System clock frequency; the DRAM clock period is
            ``1 / (nphases * sys_clk_freq)``.
        ser_latency: Serialization latency; its ``sys8x`` attribute is used.
        des_latency: Deserialization latency; its ``sys8x`` attribute is used.
        phytype: Forwarded to ``PhySettings``.
        cmd_delay: Forwarded to ``PhySettings``.
        masked_write: Forwarded to every ``DFIPhaseAdapter``. When it is a
            ``Signal`` or truthy, DMI is driven from ``wrdata_mask``;
            otherwise DMI outputs and their enable are tied to 0.
        extended_overlaps_check: Forwarded to ``CommandsPipeline``.

    Raises:
        ValueError: If the clock period is too short for every entry of the
            CL/CWL table.
        AssertionError: If the data width is not a multiple of 8, or the
            computed write latency is below one sys cycle.
    """
    self.pads = pads
    self.memtype = memtype = "LPDDR4"
    self.nranks = nranks = 1 if not hasattr(pads, "cs_n") else len(pads.cs_n)
    self.databits = databits = len(pads.dq)
    self.addressbits = addressbits = 17  # for activate row address
    self.bankbits = bankbits = 6  # 3 bankbits, but we use 6 for Mode Register address in MRS
    self.nphases = nphases = 8
    self.tck = tck = 1 / (nphases * sys_clk_freq)
    assert databits % 8 == 0, "number of DQ pads must be a multiple of 8"

    # Parameters -------------------------------------------------------------------------------
    def get_cl_cw(memtype, tck):
        # MT53E256M16D1, No DBI, Set A
        # Each entry is (f, (cl, cwl)), ordered by increasing f; the first entry with
        # tck >= 2 / f is selected.
        f_to_cl_cwl = (
            (532e6, (6, 4)),
            (1066e6, (10, 6)),
            (1600e6, (14, 8)),
            (2132e6, (20, 10)),
            (2666e6, (24, 12)),
            (3200e6, (28, 14)),
            (3732e6, (32, 16)),
            (4266e6, (36, 18)),
        )
        for f, (cl, cwl) in f_to_cl_cwl:
            if tck >= 2 / f:
                return cl, cwl
        raise ValueError(
            f"No CL/CWL setting for tck={tck:.4g} s: "
            f"2/tck={2 / tck:.4g} exceeds the fastest table entry ({f_to_cl_cwl[-1][0]:.4g})")

    # Bitslip introduces latency from 1 up to `cycles + 1`
    # FIXME: (check if True) from tests on hardware it seems we need 1 more cycle
    # of read_latency, probably to have space for manipulating bitslip values
    bitslip_cycles = 1
    bitslip_range = 1
    # Commands are sent over 4 DRAM clocks (sys8x) and we count cl/cwl from last bit
    cmd_latency = 4
    # Commands read from adapters are delayed on ConstBitSlips
    ca_latency = 1

    cl, cwl = get_cl_cw(memtype, tck)
    cl_sys_latency = get_sys_latency(nphases, cl)
    cwl_sys_latency = get_sys_latency(nphases, cwl)
    # For reads we need to account for ser+des latency to make sure we get the data in-phase
    # with sys clock
    rdphase = get_sys_phase(
        nphases, cl_sys_latency,
        cl + cmd_latency + ser_latency.sys8x % 8 + des_latency.sys8x % 8)
    # No need to modify wrphase, because ser_latency applies the same to both CA and DQ
    wrphase = get_sys_phase(nphases, cwl_sys_latency, cwl + cmd_latency)

    # When the calculated phase is negative, it means that we need to increase sys latency
    def updated_latency(phase, sys_latency):
        while phase < 0:
            phase += nphases
            sys_latency += 1
        return phase, sys_latency

    wrphase, cwl_sys_latency = updated_latency(wrphase, cwl_sys_latency)
    rdphase, cl_sys_latency = updated_latency(rdphase, cl_sys_latency)

    # Read latency
    # DFI cmd -> read data on DQ
    read_data_delay = ca_latency + ser_latency.sys8x // 8 + cl_sys_latency
    # data on DQ -> data on DFI rddata
    read_des_delay = des_latency.sys8x // 8 + bitslip_cycles + bitslip_range
    read_latency = read_data_delay + read_des_delay

    # Write latency
    write_latency = cwl_sys_latency

    # Registers --------------------------------------------------------------------------------
    self._rst = CSRStorage()

    self._wlevel_en = CSRStorage()
    self._wlevel_strobe = CSR()

    self._dly_sel = CSRStorage(databits // 8)

    self._rdly_dq_bitslip_rst = CSR()
    self._rdly_dq_bitslip = CSR()

    self._wdly_dq_bitslip_rst = CSR()
    self._wdly_dq_bitslip = CSR()

    # The computed phases are only reset values: both remain writable through their CSRs.
    self._rdphase = CSRStorage(log2_int(nphases), reset=rdphase)
    self._wrphase = CSRStorage(log2_int(nphases), reset=wrphase)

    # PHY settings -----------------------------------------------------------------------------
    self.settings = PhySettings(
        phytype=phytype,
        memtype=memtype,
        databits=databits,
        dfi_databits=2 * databits,
        nranks=nranks,
        nphases=nphases,
        rdphase=self._rdphase.storage,
        wrphase=self._wrphase.storage,
        cl=cl,
        cwl=cwl,
        read_latency=read_latency,
        write_latency=write_latency,
        cmd_latency=cmd_latency,
        cmd_delay=cmd_delay,
        bitslips=16,
    )

    # DFI Interface ----------------------------------------------------------------------------
    # Due to the fact that LPDDR4 has 16n prefetch we use 8 phases to be able to read/write a
    # whole burst during a single controller clock cycle. PHY should use sys8x clock.
    self.dfi = dfi = Interface(addressbits, bankbits, nranks, 2 * databits, nphases=nphases)

    # # #

    adapters = [
        DFIPhaseAdapter(phase, masked_write=masked_write) for phase in self.dfi.phases
    ]
    self.submodules += adapters

    # Now prepare the data by converting the sequences on adapters into sequences on the pads.
    # We have to ignore overlapping commands, and module timings have to ensure that there are
    # no overlapping commands anyway.
    self.out = LPDDR4Output(nphases, databits)

    # Clocks -----------------------------------------------------------------------------------
    self.comb += self.out.clk.eq(bitpattern("-_-_-_-_" * 2))

    # Simple commands --------------------------------------------------------------------------
    # One delayed bit per DFI phase, concatenated in phase order.
    self.comb += [
        self.out.cke.eq(Cat(delayed(self, phase.cke) for phase in self.dfi.phases)),
        self.out.odt.eq(Cat(delayed(self, phase.odt) for phase in self.dfi.phases)),
        self.out.reset_n.eq(Cat(delayed(self, phase.reset_n) for phase in self.dfi.phases)),
    ]

    # LPDDR4 Commands --------------------------------------------------------------------------
    # Each LPDDR4 command can span several phases (2 or 4), so in theory the commands could
    # overlap. No overlap should be guaranteed by the controller based on module timings, but
    # we also include an overlaps check in PHY logic.
    self.submodules.commands = CommandsPipeline(
        adapters,
        cs_ser_width=len(self.out.cs),
        ca_ser_width=len(self.out.ca[0]),
        ca_nbits=len(self.out.ca),
        cmd_nphases_span=4,
        extended_overlaps_check=extended_overlaps_check)

    self.comb += self.out.cs.eq(self.commands.cs)
    for bit in range(6):
        self.comb += self.out.ca[bit].eq(self.commands.ca[bit])

    # DQ ---------------------------------------------------------------------------------------
    dq_oe = Signal()
    self.comb += self.out.dq_oe.eq(delayed(self, dq_oe, cycles=1))

    for bit in range(self.databits):
        # output
        # Each DFI phase carries two data words back to back, so serialized beat i of this
        # DQ bit lives in phase i // 2, at word offset i % 2.
        wrdata = [
            self.dfi.phases[i // 2].wrdata[i % 2 * self.databits + bit]
            for i in range(2 * nphases)
        ]
        self.submodules += BitSlip(
            dw=2 * nphases,
            cycles=bitslip_cycles,
            rst=self.get_rst(bit // 8, self._wdly_dq_bitslip_rst.re),
            slp=self.get_inc(bit // 8, self._wdly_dq_bitslip.re),
            i=Cat(*wrdata),
            o=self.out.dq_o[bit],
        )

        # input
        dq_i_bs = Signal(2 * nphases)
        self.submodules += BitSlip(
            dw=2 * nphases,
            cycles=bitslip_cycles,
            rst=self.get_rst(bit // 8, self._rdly_dq_bitslip_rst.re),
            slp=self.get_inc(bit // 8, self._rdly_dq_bitslip.re),
            i=self.out.dq_i[bit],
            o=dq_i_bs,
        )
        for i in range(2 * nphases):
            self.comb += self.dfi.phases[i // 2].rddata[i % 2 * self.databits + bit].eq(
                dq_i_bs[i])

    # DQS --------------------------------------------------------------------------------------
    dqs_oe = Signal()
    dqs_preamble = Signal()
    dqs_postamble = Signal()
    dqs_pattern = DQSPattern(
        preamble=dqs_preamble,
        postamble=dqs_postamble,
        wlevel_en=self._wlevel_en.storage,
        wlevel_strobe=self._wlevel_strobe.re)
    self.submodules += dqs_pattern
    self.comb += [
        self.out.dqs_oe.eq(delayed(self, dqs_oe, cycles=1)),
    ]

    for byte in range(self.databits // 8):
        # output
        # DQS shares the write bitslip controls with the DQ bits of the same byte.
        self.submodules += BitSlip(
            dw=2 * nphases,
            cycles=bitslip_cycles,
            rst=self.get_rst(byte, self._wdly_dq_bitslip_rst.re),
            slp=self.get_inc(byte, self._wdly_dq_bitslip.re),
            i=dqs_pattern.o,
            o=self.out.dqs_o[byte],
        )

    # DMI --------------------------------------------------------------------------------------
    # DMI signal is used for Data Mask or Data Bus Inversion depending on Mode Registers values.
    # With DM and DBI disabled, this signal is a Don't Care.
    # With DM enabled, masking is performed only when the command used is WRITE-MASKED.
    # We don't support DBI, DM support is configured statically with `masked_write`.
    for byte in range(self.databits // 8):
        # The isinstance check comes first so that a Signal is never evaluated for truthiness.
        if isinstance(masked_write, Signal) or masked_write:
            self.comb += self.out.dmi_oe.eq(self.out.dq_oe)
            wrdata_mask = [
                self.dfi.phases[i // 2].wrdata_mask[i % 2 * self.databits // 8 + byte]
                for i in range(2 * nphases)
            ]
            self.submodules += BitSlip(
                dw=2 * nphases,
                cycles=bitslip_cycles,
                rst=self.get_rst(byte, self._wdly_dq_bitslip_rst.re),
                slp=self.get_inc(byte, self._wdly_dq_bitslip.re),
                i=Cat(*wrdata_mask),
                o=self.out.dmi_o[byte],
            )
        else:
            self.comb += self.out.dmi_o[byte].eq(0)
            self.comb += self.out.dmi_oe.eq(0)

    # Read Control Path ------------------------------------------------------------------------
    # Creates a delay line of read commands coming from the DFI interface. The output is used to
    # signal a valid read data to the DFI interface.
    #
    # The read data valid is asserted for 1 sys_clk cycle when the data is available on the DFI
    # interface, the latency is the sum of the OSERDESE2, CAS, ISERDESE2 and Bitslip latencies.
    rddata_en = TappedDelayLine(
        signal=reduce(or_, [dfi.phases[i].rddata_en for i in range(nphases)]),
        ntaps=self.settings.read_latency)
    self.submodules += rddata_en

    # rddata_valid is also forced high while write leveling is enabled.
    self.comb += [
        phase.rddata_valid.eq(rddata_en.output | self._wlevel_en.storage)
        for phase in dfi.phases
    ]

    # Write Control Path -----------------------------------------------------------------------
    wrtap = cwl_sys_latency - 1
    assert wrtap >= 0, "write latency must be at least one sys clock cycle"

    # Create a delay line of write commands coming from the DFI interface. These taps are used
    # to control DQ/DQS tristates.
    wrdata_en = TappedDelayLine(
        signal=reduce(or_, [dfi.phases[i].wrdata_en for i in range(nphases)]),
        ntaps=wrtap + 2)
    self.submodules += wrdata_en

    self.comb += dq_oe.eq(wrdata_en.taps[wrtap])
    # Always enabled in write leveling mode, else during transfers
    self.comb += dqs_oe.eq(
        self._wlevel_en.storage | (dqs_preamble | dq_oe | dqs_postamble))

    # Write DQS Postamble/Preamble Control Path ------------------------------------------------
    # Generates DQS Preamble 1 cycle before the first write and Postamble 1 cycle after the last
    # write. During writes, DQS tristate is configured as output for at least 3 sys_clk cycles:
    # 1 for Preamble, 1 for the Write and 1 for the Postamble.
    def wrdata_en_tap(i):
        # Index -1 maps to the undelayed input, which allows to have wrtap == 0.
        return wrdata_en.input if i == -1 else wrdata_en.taps[i]

    self.comb += dqs_preamble.eq(wrdata_en_tap(wrtap - 1) & ~wrdata_en_tap(wrtap + 0))
    self.comb += dqs_postamble.eq(wrdata_en_tap(wrtap + 1) & ~wrdata_en_tap(wrtap + 0))