def __init__(self, pads, *, ser_latency, des_latency, serdes_reset_cnt=0, **kwargs):
    """Build the base PHY and insert one extra 2:1 (de)serialization stage.

    The output object created by the base class is kept as ``self._out`` and
    ``self.out`` is replaced by an ``LPDDR4Output`` with half as many phases.
    Every attribute of the new output is then connected to its counterpart
    on the old one: ``*_oe`` signals through a delay, ``*_i`` signals through
    a ``Deserializer`` and everything else through a ``Serializer``.

    Args:
        pads: Forwarded unchanged to the base class.
        ser_latency: Serialization latency of the outer stage; the
            ``Serializer.LATENCY`` of this stage is added to it (as ``sys``
            cycles) before it is forwarded to the base class.
        des_latency: Same as ``ser_latency`` for the deserialization path,
            using ``Deserializer.LATENCY``.
        serdes_reset_cnt: Passed as ``reset_cnt`` to every Serializer and
            Deserializer created here.
        **kwargs: Forwarded unchanged to the base class.
    """
    # Account for the additional stage before handing latencies to the base PHY.
    total_ser_latency = ser_latency + Latency(sys=Serializer.LATENCY)
    total_des_latency = des_latency + Latency(sys=Deserializer.LATENCY)
    super().__init__(
        pads,
        ser_latency=total_ser_latency,
        des_latency=total_des_latency,
        **kwargs
    )

    # Keep the base PHY output and expose a half-phase output instead.
    self._out = self.out
    self.out = LPDDR4Output(nphases=self.nphases // 2, databits=self.databits)

    # Clocking shared by all the (de)serializers of this stage.
    serdes_clocks = {"clkdiv": "sys", "clk": "sys2x"}

    def serialize(wide, narrow):
        assert len(narrow) == len(wide) // 2
        self.submodules += Serializer(
            **serdes_clocks,
            i_dw=len(wide),
            o_dw=len(narrow),
            i=wide,
            o=narrow,
            reset_cnt=serdes_reset_cnt,
        )

    def deserialize(narrow, wide):
        assert len(narrow) == len(wide) // 2
        self.submodules += Deserializer(
            **serdes_clocks,
            i_dw=len(narrow),
            o_dw=len(wide),
            i=narrow,
            o=wide,
            reset_cnt=serdes_reset_cnt,
        )

    def connect(stage, source, sink):
        # Attributes are either lists of signals (like dq) or a single
        # signal (like cs); treat the latter as a one-element list.
        pairs = zip(source, sink) if isinstance(source, list) else [(source, sink)]
        for src, dst in pairs:
            stage(src, dst)

    for name in vars(self.out):
        wide = getattr(self._out, name)
        narrow = getattr(self.out, name)
        if name.endswith("_oe"):
            # Output-enable signals are not serialized, only delayed.
            self.comb += narrow.eq(
                delayed(self, wide, cycles=Serializer.LATENCY))
            continue
        if name.endswith("_i"):
            # Inputs travel from the new (narrow) output to the old (wide) one.
            connect(deserialize, source=narrow, sink=wide)
        else:
            # Everything else is an output: old (wide) -> new (narrow).
            connect(serialize, source=wide, sink=narrow)
def __init__(self, aligned_reset_zero=False, **kwargs):
    """Create the simulation PHY on its own ``LPDDR4SimulationPads``.

    The pads are instantiated here and registered as a submodule, the base
    class is initialised with ``phytype="LPDDR4SimPHY"``, and serialization
    is then set up with ``clkdiv="sys"``.

    Args:
        aligned_reset_zero: Forwarded to ``do_serialization``.
        **kwargs: Forwarded unchanged to the base class.
    """
    pads = LPDDR4SimulationPads()
    self.submodules += pads

    serializer_latency = Latency(Serializer.LATENCY)
    deserializer_latency = Latency(Deserializer.LATENCY)
    super().__init__(
        pads,
        ser_latency=serializer_latency,
        des_latency=deserializer_latency,
        phytype="LPDDR4SimPHY",
        **kwargs
    )

    def delay_on_self(sig, cycles):
        # Delay registers are attached to this module.
        return delayed(self, sig, cycles=cycles)

    self.do_serialization(
        clkdiv="sys",
        delay=delay_on_self,
        aligned_reset_zero=aligned_reset_zero,
    )
def __init__(self, aligned_reset_zero=False, **kwargs):
    """Create the double-rate simulation PHY on its own ``LPDDR4SimulationPads``.

    Compared to the single-rate variant, serialization uses ``clkdiv="sys2x"``
    and the delay registers are attached to a helper submodule whose clock
    domain is renamed to ``sys2x``. The latencies handed to the base class
    are expressed purely in ``sys8x`` cycles (``4 * LATENCY``).

    Args:
        aligned_reset_zero: Forwarded to ``do_serialization``.
        **kwargs: Forwarded unchanged to the base class.
    """
    pads = LPDDR4SimulationPads()
    self.submodules += pads

    serializer_latency = Latency(sys=0, sys8x=4*Serializer.LATENCY)
    deserializer_latency = Latency(sys=0, sys8x=4*Deserializer.LATENCY)
    super().__init__(
        pads,
        ser_latency=serializer_latency,
        des_latency=deserializer_latency,
        phytype="LPDDR4SimPHY",
        **kwargs
    )

    # Empty module used only as a container for delay registers clocked by sys2x.
    self.submodules.half_delay = ClockDomainsRenamer("sys2x")(Module())

    def delay_on_half(sig, cycles):
        return delayed(self.half_delay, sig, cycles=cycles)

    self.do_serialization(
        clkdiv="sys2x",
        delay=delay_on_half,
        aligned_reset_zero=aligned_reset_zero,
    )
def __init__(self, pads, *, sys_clk_freq, ser_latency, des_latency, phytype,
             cmd_delay=None, masked_write=True, extended_overlaps_check=False):
    """Build the generic LPDDR4 PHY logic between the DFI interface and ``self.out``.

    The constructor computes CL/CWL and the derived read/write latencies and
    phases, declares the CSRs, creates the 8-phase DFI interface and wires
    commands, DQ, DQS and DMI to an ``LPDDR4Output`` stored in ``self.out``.

    Args:
        pads: Pads object. ``len(pads.dq)`` gives the data width (must be a
            multiple of 8); ``len(pads.cs_n)`` gives the number of ranks when
            that attribute exists, otherwise a single rank is assumed.
        sys_clk_freq: Controller clock frequency; the DRAM clock period is
            computed as ``1 / (nphases * sys_clk_freq)``.
        ser_latency: Object with ``sys`` and ``sys8x`` attributes describing
            the serialization latency.
        des_latency: Object with ``sys`` and ``sys8x`` attributes describing
            the deserialization latency.
        phytype: Forwarded to ``PhySettings``.
        cmd_delay: Forwarded to ``PhySettings``.
        masked_write: Forwarded to every ``DFIPhaseAdapter``. When it is a
            ``Signal`` or truthy, DMI is driven from the DFI write mask;
            otherwise DMI outputs are tied to 0.
        extended_overlaps_check: When true, the command-overlap history only
            contains commands that were not themselves masked out.

    Raises:
        ValueError: If no CL/CWL entry matches the computed clock period.
    """
    self.pads = pads
    self.memtype = memtype = "LPDDR4"
    self.nranks = nranks = 1 if not hasattr(pads, "cs_n") else len(pads.cs_n)
    self.databits = databits = len(pads.dq)
    self.addressbits = addressbits = 17  # for activate row address
    self.bankbits = bankbits = 6  # 3 bankbits, but we use 6 for Mode Register address in MRS
    self.nphases = nphases = 8
    self.tck = tck = 1 / (nphases * sys_clk_freq)
    assert databits % 8 == 0

    # Parameters -------------------------------------------------------------------------------
    def get_cl_cw(memtype, tck):
        # MT53E256M16D1, No DBI, Set A
        f_to_cl_cwl = OrderedDict()
        f_to_cl_cwl[532e6] = (6, 4)
        f_to_cl_cwl[1066e6] = (10, 6)
        f_to_cl_cwl[1600e6] = (14, 8)
        f_to_cl_cwl[2132e6] = (20, 10)
        f_to_cl_cwl[2666e6] = (24, 12)
        f_to_cl_cwl[3200e6] = (28, 14)
        f_to_cl_cwl[3732e6] = (32, 16)
        f_to_cl_cwl[4266e6] = (36, 18)
        # Entries are in ascending order, so the first match is the slowest
        # setting that still covers the requested clock period.
        for f, (cl, cwl) in f_to_cl_cwl.items():
            if tck >= 2 / f:
                return cl, cwl
        raise ValueError(f"No CL/CWL settings found for tCK={tck}")

    # Bitslip introduces latency from 1 up to `cycles + 1`
    # FIXME: (check if True) from tests on hardware it seems we need 1 more cycle
    # of read_latency, probably to have space for manipulating bitslip values
    bitslip_cycles = 1
    bitslip_range = 1
    # Commands are sent over 4 DRAM clocks (sys8x) and we count cl/cwl from last bit
    cmd_latency = 4
    # Commands read from adapters are delayed on ConstBitSlips
    ca_latency = 1

    cl, cwl = get_cl_cw(memtype, tck)
    cl_sys_latency = get_sys_latency(nphases, cl)
    cwl_sys_latency = get_sys_latency(nphases, cwl)
    # For reads we need to account for ser+des latency to make sure we get the data in-phase
    # with sys clock
    rdphase = get_sys_phase(
        nphases, cl_sys_latency,
        cl + cmd_latency + ser_latency.sys8x + des_latency.sys8x)
    # No need to modify wrphase, because ser_latency applies the same to both CA and DQ
    wrphase = get_sys_phase(nphases, cwl_sys_latency, cwl + cmd_latency)

    # When the calculated phase is negative, it means that we need to increase sys latency
    def updated_latency(phase, sys_latency):
        while phase < 0:
            phase += nphases
            sys_latency += 1
        return phase, sys_latency

    wrphase, cwl_sys_latency = updated_latency(wrphase, cwl_sys_latency)
    rdphase, cl_sys_latency = updated_latency(rdphase, cl_sys_latency)

    # Read latency
    read_data_delay = ca_latency + ser_latency.sys + cl_sys_latency  # DFI cmd -> read data on DQ
    read_des_delay = des_latency.sys + bitslip_cycles + bitslip_range  # data on DQ -> data on DFI rddata
    read_latency = read_data_delay + read_des_delay

    # Write latency
    write_latency = cwl_sys_latency

    # Registers --------------------------------------------------------------------------------
    self._rst = CSRStorage()

    self._wlevel_en = CSRStorage()
    self._wlevel_strobe = CSR()

    self._dly_sel = CSRStorage(databits // 8)

    self._rdly_dq_bitslip_rst = CSR()
    self._rdly_dq_bitslip = CSR()

    self._wdly_dq_bitslip_rst = CSR()
    self._wdly_dq_bitslip = CSR()

    self._rdphase = CSRStorage(log2_int(nphases), reset=rdphase)
    self._wrphase = CSRStorage(log2_int(nphases), reset=wrphase)

    # PHY settings -----------------------------------------------------------------------------
    self.settings = PhySettings(
        phytype=phytype,
        memtype=memtype,
        databits=databits,
        dfi_databits=2 * databits,
        nranks=nranks,
        nphases=nphases,
        rdphase=self._rdphase.storage,
        wrphase=self._wrphase.storage,
        cl=cl,
        cwl=cwl,
        read_latency=read_latency,
        write_latency=write_latency,
        cmd_latency=cmd_latency,
        cmd_delay=cmd_delay,
    )

    # DFI Interface ----------------------------------------------------------------------------
    # Due to the fact that LPDDR4 has 16n prefetch we use 8 phases to be able to read/write a
    # whole burst during a single controller clock cycle. PHY should use sys8x clock.
    self.dfi = dfi = Interface(addressbits, bankbits, nranks, 2 * databits, nphases=nphases)

    # # #

    adapters = [
        DFIPhaseAdapter(phase, masked_write=masked_write)
        for phase in self.dfi.phases
    ]
    self.submodules += adapters

    # Now prepare the data by converting the sequences on adapters into sequences on the pads.
    # We have to ignore overlapping commands, and module timings have to ensure that there are
    # no overlapping commands anyway.
    self.out = LPDDR4Output(nphases, databits)

    # Clocks -----------------------------------------------------------------------------------
    self.comb += self.out.clk.eq(bitpattern("-_-_-_-_" * 2))

    # Simple commands --------------------------------------------------------------------------
    self.comb += [
        self.out.cke.eq(
            Cat(delayed(self, phase.cke) for phase in self.dfi.phases)),
        self.out.odt.eq(
            Cat(delayed(self, phase.odt) for phase in self.dfi.phases)),
        self.out.reset_n.eq(
            Cat(delayed(self, phase.reset_n) for phase in self.dfi.phases)),
    ]

    # LPDDR4 Commands --------------------------------------------------------------------------
    # Each LPDDR4 command can span several phases (2 or 4), so the commands cannot overlap.
    # This should be guaranteed by the controller based on module timings, but we also include
    # an overlaps check in PHY logic.
    # Basic check will make sure that no command will be sent to DRAM if there was any command
    # sent by the controller on DFI during 3 previous cycles. The extended version will instead
    # make sure no command is sent to DRAM if there was any command _actually sent to DRAM_
    # during 3 previous cycles. This is more expensive in terms of resources and generally not
    # needed.

    # Create a history of valid adapters used for masking overlapping ones
    valids = ConstBitSlip(dw=nphases, cycles=1, slp=0)
    self.submodules += valids
    self.comb += valids.i.eq(Cat(a.valid for a in adapters))
    valids_hist = valids.r
    if extended_overlaps_check:
        valids_hist = Signal.like(valids.r)
        for i in range(len(valids_hist)):
            # A history bit stays set only if none of the (up to) 3 previous
            # history bits is set, i.e. the command was not itself masked.
            was_valid_before = reduce(or_, valids_hist[max(0, i - 3):i], 0)
            self.comb += valids_hist[i].eq(valids.r[i] & ~was_valid_before)

    cs_per_adapter = []
    ca_per_adapter = defaultdict(list)
    for phase, adapter in enumerate(adapters):
        # The signals from an adapter can be used if there were no commands on 3 previous cycles
        allowed = ~reduce(or_, valids_hist[nphases + phase - 3:nphases + phase])

        # Use CS and CA of given adapter slipped by `phase` bits
        cs_bs = ConstBitSlip(dw=nphases, cycles=1, slp=phase)
        self.submodules += cs_bs
        self.comb += cs_bs.i.eq(Cat(adapter.cs))
        cs_mask = Replicate(allowed, len(cs_bs.o))
        cs = cs_bs.o & cs_mask
        cs_per_adapter.append(cs)

        # For CA we need to do the same for each bit
        for bit in range(6):
            ca_bs = ConstBitSlip(dw=nphases, cycles=1, slp=phase)
            self.submodules += ca_bs
            ca_bit_hist = [adapter.ca[i][bit] for i in range(4)]
            self.comb += ca_bs.i.eq(Cat(*ca_bit_hist))
            ca_mask = Replicate(allowed, len(ca_bs.o))
            ca = ca_bs.o & ca_mask
            ca_per_adapter[bit].append(ca)

    # OR all the masked signals
    self.comb += self.out.cs.eq(reduce(or_, cs_per_adapter))
    for bit in range(6):
        self.comb += self.out.ca[bit].eq(reduce(or_, ca_per_adapter[bit]))

    # DQ ---------------------------------------------------------------------------------------
    dq_oe = Signal()
    self.comb += self.out.dq_oe.eq(delayed(self, dq_oe, cycles=1))

    for bit in range(self.databits):
        # output
        # Each DFI phase carries two data words; interleave them into a
        # 2*nphases-long sequence for this DQ bit.
        wrdata = [
            self.dfi.phases[i // 2].wrdata[i % 2 * self.databits + bit]
            for i in range(2 * nphases)
        ]
        self.submodules += BitSlip(
            dw=2 * nphases,
            cycles=bitslip_cycles,
            rst=self.get_rst(bit // 8, self._wdly_dq_bitslip_rst.re),
            slp=self.get_inc(bit // 8, self._wdly_dq_bitslip.re),
            i=Cat(*wrdata),
            o=self.out.dq_o[bit],
        )

        # input
        dq_i_bs = Signal(2 * nphases)
        self.submodules += BitSlip(
            dw=2 * nphases,
            cycles=bitslip_cycles,
            rst=self.get_rst(bit // 8, self._rdly_dq_bitslip_rst.re),
            slp=self.get_inc(bit // 8, self._rdly_dq_bitslip.re),
            i=self.out.dq_i[bit],
            o=dq_i_bs,
        )
        for i in range(2 * nphases):
            self.comb += self.dfi.phases[i // 2].rddata[i % 2 * self.databits + bit].eq(dq_i_bs[i])

    # DQS --------------------------------------------------------------------------------------
    dqs_oe = Signal()
    dqs_preamble = Signal()
    dqs_postamble = Signal()
    dqs_pattern = DQSPattern(
        preamble=dqs_preamble,
        postamble=dqs_postamble,
        wlevel_en=self._wlevel_en.storage,
        wlevel_strobe=self._wlevel_strobe.re)
    self.submodules += dqs_pattern
    self.comb += [
        self.out.dqs_oe.eq(delayed(self, dqs_oe, cycles=1)),
    ]

    for byte in range(self.databits // 8):
        # output
        # DQS shares the write bitslip controls of its DQ byte.
        self.submodules += BitSlip(
            dw=2 * nphases,
            cycles=bitslip_cycles,
            rst=self.get_rst(byte, self._wdly_dq_bitslip_rst.re),
            slp=self.get_inc(byte, self._wdly_dq_bitslip.re),
            i=dqs_pattern.o,
            o=self.out.dqs_o[byte],
        )

    # DMI --------------------------------------------------------------------------------------
    # DMI signal is used for Data Mask or Data Bus Inversion depending on Mode Registers values.
    # With DM and DBI disabled, this signal is a Don't Care.
    # With DM enabled, masking is performed only when the command used is WRITE-MASKED.
    # We don't support DBI, DM support is configured statically with `masked_write`.
    for byte in range(self.databits // 8):
        if isinstance(masked_write, Signal) or masked_write:
            self.comb += self.out.dmi_oe.eq(self.out.dq_oe)
            wrdata_mask = [
                self.dfi.phases[i // 2].wrdata_mask[i % 2 * self.databits // 8 + byte]
                for i in range(2 * nphases)
            ]
            self.submodules += BitSlip(
                dw=2 * nphases,
                cycles=bitslip_cycles,
                rst=self.get_rst(byte, self._wdly_dq_bitslip_rst.re),
                slp=self.get_inc(byte, self._wdly_dq_bitslip.re),
                i=Cat(*wrdata_mask),
                o=self.out.dmi_o[byte],
            )
        else:
            self.comb += self.out.dmi_o[byte].eq(0)
            self.comb += self.out.dmi_oe.eq(0)

    # Read Control Path ------------------------------------------------------------------------
    # Creates a delay line of read commands coming from the DFI interface. The output is used to
    # signal a valid read data to the DFI interface.
    #
    # The read data valid is asserted for 1 sys_clk cycle when the data is available on the DFI
    # interface, the latency is the sum of the OSERDESE2, CAS, ISERDESE2 and Bitslip latencies.
    rddata_en = TappedDelayLine(
        signal=reduce(or_, [dfi.phases[i].rddata_en for i in range(nphases)]),
        ntaps=self.settings.read_latency)
    self.submodules += rddata_en

    self.comb += [
        phase.rddata_valid.eq(rddata_en.output | self._wlevel_en.storage)
        for phase in dfi.phases
    ]

    # Write Control Path -----------------------------------------------------------------------
    wrtap = cwl_sys_latency - 1
    assert wrtap >= 0

    # Create a delay line of write commands coming from the DFI interface. These taps are used
    # to control DQ/DQS tristates.
    wrdata_en = TappedDelayLine(
        signal=reduce(or_, [dfi.phases[i].wrdata_en for i in range(nphases)]),
        ntaps=wrtap + 2)
    self.submodules += wrdata_en

    self.comb += dq_oe.eq(wrdata_en.taps[wrtap])
    # Always enabled in write leveling mode, else during transfers
    self.comb += dqs_oe.eq(self._wlevel_en.storage | (dqs_preamble | dq_oe | dqs_postamble))

    # Write DQS Postamble/Preamble Control Path ------------------------------------------------
    # Generates DQS Preamble 1 cycle before the first write and Postamble 1 cycle after the last
    # write. During writes, DQS tristate is configured as output for at least 3 sys_clk cycles:
    # 1 for Preamble, 1 for the Write and 1 for the Postamble.
    def wrdata_en_tap(i):
        # allows to have wrtap == 0
        return wrdata_en.input if i == -1 else wrdata_en.taps[i]

    self.comb += dqs_preamble.eq(
        wrdata_en_tap(wrtap - 1) & ~wrdata_en_tap(wrtap + 0))
    self.comb += dqs_postamble.eq(
        wrdata_en_tap(wrtap + 1) & ~wrdata_en_tap(wrtap + 0))
def __init__(self, pads, data_cdc, *, clk_freq, log_level, init_delays=False):
    """Build command decoding, init-timing checkers and the initialization FSM.

    Args:
        pads: Pads object; this method reads its ``cs``, ``ca``, ``cke`` and
            ``reset_n`` signals.
        data_cdc: Module stored as ``self.data`` and registered as a submodule.
        clk_freq: Clock frequency used to convert timings given in seconds
            into cycle counts; also passed to the logger.
        log_level: Passed to ``SimLogger``.
        init_delays: When falsy, the FSM does not wait for tINIT3/tINIT5 and
            the tZQCAL/tZQLAT violation warnings are disabled.
    """
    self.submodules.log = log = SimLogger(log_level=log_level, clk_freq=clk_freq)
    self.log.add_csrs()

    # Mode Registers storage
    self.mode_regs = Array([Signal(8) for _ in range(64)])
    # Active banks
    self.active_banks = Array([Signal() for _ in range(8)])
    self.active_rows = Array([Signal(17) for _ in range(8)])
    # Connection to DataSim
    self.data_en = TappedDelayLine(ntaps=20)
    self.data = data_cdc
    self.submodules += self.data, self.data_en

    # CS/CA shift registers
    cs = TappedDelayLine(pads.cs, ntaps=2)
    ca = TappedDelayLine(pads.ca, ntaps=2)
    self.submodules += cs, ca

    self.cs_low = Signal(6)
    self.cs_high = Signal(6)
    self.handle_cmd = Signal()
    self.mpc_op = Signal(7)

    # Driven by the FSM states in which commands are accepted.
    cmds_enabled = Signal()
    # One entry per supported command; each value is whatever the handler
    # method returns (used below as a "command matched" condition).
    cmd_handlers = OrderedDict(
        MRW = self.mrw_handler(),
        REF = self.refresh_handler(),
        ACT = self.activate_handler(),
        PRE = self.precharge_handler(),
        CAS = self.cas_handler(),
        MPC = self.mpc_handler(),
    )
    self.comb += [
        If(cmds_enabled,
            # A command is captured when the two CS taps read 0b10; the CA
            # value of tap 1 is stored as cs_high and that of tap 0 as cs_low.
            If(Cat(cs.taps) == 0b10,
                self.handle_cmd.eq(1),
                self.cs_high.eq(ca.taps[1]),
                self.cs_low.eq(ca.taps[0]),
            )
        ),
        # Report commands that no handler recognised.
        If(self.handle_cmd & ~reduce(or_, cmd_handlers.values()),
            self.log.error("Unexpected command: cs_high=0b%06b cs_low=0b%06b", self.cs_high, self.cs_low)
        ),
    ]

    def ck(t):
        # Convert a time in seconds into a number of clock cycles (rounded up).
        return math.ceil(t * clk_freq)

    # NOTE(review): tinit0 and tinit4 are instantiated but no trigger is
    # assigned to them in this method.
    self.submodules.tinit0 = PulseTiming(ck(20e-3))  # makes no sense in simulation
    self.submodules.tinit1 = PulseTiming(ck(200e-6))
    self.submodules.tinit2 = PulseTiming(ck(10e-9))
    self.submodules.tinit3 = PulseTiming(ck(2e-3))
    self.submodules.tinit4 = PulseTiming(5)  # TODO: would require counting pads.clk_p ticks
    self.submodules.tinit5 = PulseTiming(ck(2e-6))
    self.submodules.tzqcal = PulseTiming(ck(1e-6))
    self.submodules.tzqlat = PulseTiming(max(8, ck(30e-9)))
    self.submodules.tpw_reset = PulseTiming(ck(100e-9))

    self.comb += [
        self.tinit1.trigger.eq(1),
        self.tinit2.trigger.eq(~pads.cke),
        self.tinit3.trigger.eq(pads.reset_n),
        self.tpw_reset.trigger.eq(~pads.reset_n),
        # RESET_n rising edge
        If(~delayed(self, pads.reset_n) & pads.reset_n,
            self.log.info("RESET released"),
            If(~self.tinit1.ready,
                self.log.warn("tINIT1 violated: RESET deasserted too fast")
            ),
            If(~self.tinit2.ready,
                self.log.warn("tINIT2 violated: CKE LOW too short before RESET being released")
            ),
        ),
        # RESET_n falling edge
        If(delayed(self, pads.reset_n) & ~pads.reset_n,
            self.log.info("RESET asserted"),
        ),
        If(delayed(self, pads.cke) & ~pads.cke,
            self.log.info("CKE falling edge"),
        ),
        If(~delayed(self, pads.cke) & pads.cke,
            self.log.info("CKE rising edge"),
            If(~self.tinit3.ready,
                self.log.warn("tINIT3 violated: CKE set HIGH too fast after RESET being released")
            ),
        ),
    ]

    self.submodules.fsm = fsm = ResetInserter()(FSM())
    # The FSM is reset when the tpw_reset timer pulses ready_p.
    self.comb += [
        If(self.tpw_reset.ready_p,
            fsm.reset.eq(1),
            self.log.info("FSM reset")
        )
    ]
    fsm.act("RESET",
        If(self.tinit3.ready_p | (not init_delays),
            NextState("EXIT-PD")  # Td
        )
    )
    fsm.act("EXIT-PD",
        self.tinit5.trigger.eq(1),
        If(self.tinit5.ready_p | (not init_delays),
            NextState("MRW")  # Te
        )
    )
    fsm.act("MRW",
        cmds_enabled.eq(1),
        If(self.handle_cmd & ~cmd_handlers["MRW"] & ~cmd_handlers["MPC"],
            self.log.warn("Only MRW/MRR commands expected before ZQ calibration"),
            # Second message lists every handler name with its current value.
            self.log.warn(" ".join("{}=%d".format(cmd) for cmd in cmd_handlers.keys()), *cmd_handlers.values()),
        ),
        If(cmd_handlers["MPC"],
            If(self.mpc_op != MPC.ZQC_START,
                self.log.error("ZQC-START expected, got op=0b%07b", self.mpc_op)
            ).Else(
                NextState("ZQC")  # Tf
            )
        ),
    )
    fsm.act("ZQC",
        self.tzqcal.trigger.eq(1),
        cmds_enabled.eq(1),
        If(self.handle_cmd,
            If(~(cmd_handlers["MPC"] & (self.mpc_op == MPC.ZQC_LATCH)),
                self.log.error("Expected ZQC-LATCH")
            ).Else(
                If(init_delays & ~self.tzqcal.ready,
                    self.log.warn("tZQCAL violated")
                ),
                NextState("NORMAL")  # Tg
            )
        ),
    )
    fsm.act("NORMAL",
        cmds_enabled.eq(1),
        self.tzqlat.trigger.eq(1),
        If(init_delays & self.handle_cmd & ~self.tzqlat.ready,
            self.log.warn("tZQLAT violated")
        ),
    )

    # Log state transitions
    # finalize() is called before fsm.state and fsm.decoding are read below.
    fsm.finalize()
    prev_state = delayed(self, fsm.state)
    # Nested Case: outer selects on the previous state, inner on the current
    # one, so each (previous, current) pair gets its own log message.
    self.comb += If(prev_state != fsm.state,
        Case(prev_state, {
            state: Case(fsm.state, {
                next_state: self.log.info(f"FSM: {state_name} -> {next_state_name}")
                for next_state, next_state_name in fsm.decoding.items()
            })
            for state, state_name in fsm.decoding.items()
        })
    )