Exemple #1
0
    def __init__(self,
                 pads,
                 *,
                 ser_latency,
                 des_latency,
                 serdes_reset_cnt=0,
                 **kwargs):
        """Wrap the parent PHY outputs in an extra sys -> sys2x (de)serialization stage.

        The parent constructor is called with the given latencies increased by one
        ``Serializer``/``Deserializer`` latency (expressed in ``sys`` cycles).  The output
        bundle built by the parent is then kept as ``self._out`` and ``self.out`` is
        replaced by an ``LPDDR4Output`` with half as many phases; every attribute of the
        new bundle is tied to the attribute of the same name in the old one.

        Args:
            pads: Forwarded to the parent constructor.
            ser_latency: Serialization latency before this stage is accounted for.
            des_latency: Deserialization latency before this stage is accounted for.
            serdes_reset_cnt: Passed as ``reset_cnt`` to every Serializer/Deserializer.
            **kwargs: Forwarded unchanged to the parent constructor.
        """
        super().__init__(
            pads,
            ser_latency=ser_latency + Latency(sys=Serializer.LATENCY),
            des_latency=des_latency + Latency(sys=Deserializer.LATENCY),
            **kwargs)

        # Keep the parent's bundle reachable and publish the half-phase bundle instead.
        self._out = self.out
        self.out = LPDDR4Output(nphases=self.nphases // 2,
                                databits=self.databits)

        def add_serializer(wide, narrow):
            # The serialized side must be exactly half the (floored) width of the input.
            assert len(wide) // 2 == len(narrow)
            self.submodules += Serializer(
                i=wide,
                i_dw=len(wide),
                o=narrow,
                o_dw=len(narrow),
                clk="sys2x",
                clkdiv="sys",
                reset_cnt=serdes_reset_cnt,
            )

        def add_deserializer(narrow, wide):
            # Mirror image of the serializer: input is half the width of the output.
            assert len(wide) // 2 == len(narrow)
            self.submodules += Deserializer(
                i=narrow,
                i_dw=len(narrow),
                o=wide,
                o_dw=len(wide),
                clk="sys2x",
                clkdiv="sys",
                reset_cnt=serdes_reset_cnt,
            )

        def connect(make, src, dst):
            # Bundle attributes are either a list of signals (like dq) or a single
            # signal (like cs); treat the latter as a one-element list.
            pairs = zip(src, dst) if isinstance(src, list) else [(src, dst)]
            for src_n, dst_n in pairs:
                make(src_n, dst_n)

        for name in vars(self.out):
            old, new = getattr(self._out, name), getattr(self.out, name)
            if name.endswith("_oe"):
                # Output enables are not serialized, only delayed by the serializer latency.
                self.comb += new.eq(
                    delayed(self, old, cycles=Serializer.LATENCY))
            elif name.endswith("_i"):
                # Inputs travel from the new (narrow) bundle back to the old (wide) one.
                connect(add_deserializer, new, old)
            else:
                # Everything else is an output: old (wide) bundle -> new (narrow) bundle.
                connect(add_serializer, old, new)
Exemple #2
0
    def __init__(self, aligned_reset_zero=False, **kwargs):
        """Set up a simulation PHY whose serialization is clocked from "sys".

        The PHY creates its own ``LPDDR4SimulationPads`` and registers them as a
        submodule, initializes the parent with one ``Serializer`` and one
        ``Deserializer`` latency, and finally calls ``do_serialization``.

        Args:
            aligned_reset_zero: Forwarded unchanged to ``do_serialization``.
            **kwargs: Forwarded unchanged to the parent constructor.
        """
        pads = LPDDR4SimulationPads()
        self.submodules += pads

        super().__init__(
            pads,
            ser_latency=Latency(Serializer.LATENCY),
            des_latency=Latency(Deserializer.LATENCY),
            phytype="LPDDR4SimPHY",
            **kwargs)

        def delay(sig, cycles):
            # delayed() is handed this module itself as its first argument.
            return delayed(self, sig, cycles=cycles)

        self.do_serialization(
            clkdiv="sys",
            delay=delay,
            aligned_reset_zero=aligned_reset_zero,
        )
Exemple #3
0
    def __init__(self, aligned_reset_zero=False, **kwargs):
        """Set up a simulation PHY whose serialization is clocked from "sys2x".

        The PHY creates its own ``LPDDR4SimulationPads`` and registers them as a
        submodule.  The latencies given to the parent are zero ``sys`` cycles plus
        four ``Serializer``/``Deserializer`` latencies counted in ``sys8x`` cycles.
        An empty helper module renamed into the "sys2x" domain is attached as
        ``half_delay`` and is the module handed to ``delayed``.

        Args:
            aligned_reset_zero: Forwarded unchanged to ``do_serialization``.
            **kwargs: Forwarded unchanged to the parent constructor.
        """
        pads = LPDDR4SimulationPads()
        self.submodules += pads

        super().__init__(
            pads,
            ser_latency=Latency(sys=0, sys8x=4*Serializer.LATENCY),
            des_latency=Latency(sys=0, sys8x=4*Deserializer.LATENCY),
            phytype="LPDDR4SimPHY",
            **kwargs)

        # Container for the delay logic, renamed to the "sys2x" clock domain.
        self.submodules.half_delay = ClockDomainsRenamer("sys2x")(Module())

        def delay(sig, cycles):
            # Looked up at call time, so it always targets the half_delay submodule.
            return delayed(self.half_delay, sig, cycles=cycles)

        self.do_serialization(
            clkdiv="sys2x",
            delay=delay,
            aligned_reset_zero=aligned_reset_zero,
        )
Exemple #4
0
    def __init__(self,
                 pads,
                 *,
                 sys_clk_freq,
                 ser_latency,
                 des_latency,
                 phytype,
                 cmd_delay=None,
                 masked_write=True,
                 extended_overlaps_check=False):
        """Build the LPDDR4 PHY core: timing parameters, CSRs, DFI interface and outputs.

        The constructor derives CL/CWL and the read/write phases and latencies from
        ``sys_clk_freq``, declares the control CSRs, publishes ``self.settings`` and
        ``self.dfi``, and drives ``self.out`` (an ``LPDDR4Output``) from the DFI phases.

        Args:
            pads: Pad object; ``len(pads.dq)`` sets the data width and, when the
                attribute exists, ``len(pads.cs_n)`` sets the number of ranks
                (otherwise 1).
            sys_clk_freq: System clock frequency (presumably in Hz, matching the
                CL/CWL table below); with 8 phases ``tck = 1 / (8 * sys_clk_freq)``.
            ser_latency: Serialization latency; its ``sys`` and ``sys8x`` attributes
                are read here.
            des_latency: Deserialization latency; its ``sys`` and ``sys8x`` attributes
                are read here.
            phytype: Forwarded to ``PhySettings``.
            cmd_delay: Forwarded to ``PhySettings``.
            masked_write: Forwarded to every ``DFIPhaseAdapter``.  When it is a
                ``Signal`` or truthy, DMI is driven from the DFI write mask; otherwise
                DMI output and output-enable are tied to 0.
            extended_overlaps_check: When true, the overlap mask is built from a
                filtered history of valid commands instead of the raw history
                (see the "LPDDR4 Commands" section below).

        Raises:
            ValueError: If no CL/CWL table entry matches the computed ``tck``.
            AssertionError: If the data width is not a multiple of 8, or if the
                write tap index computed from CWL would be negative.
        """
        self.pads = pads
        self.memtype = memtype = "LPDDR4"
        self.nranks = nranks = 1 if not hasattr(pads, "cs_n") else len(
            pads.cs_n)
        self.databits = databits = len(pads.dq)
        self.addressbits = addressbits = 17  # for activate row address
        self.bankbits = bankbits = 6  # 3 bankbits, but we use 6 for Mode Register address in MRS
        self.nphases = nphases = 8
        self.tck = tck = 1 / (nphases * sys_clk_freq)
        # Per-byte logic below (dly_sel, DQS, DMI) indexes with `databits // 8`.
        assert databits % 8 == 0

        # Parameters -------------------------------------------------------------------------------
        def get_cl_cw(memtype, tck):
            """Return ``(cl, cwl)`` of the first table entry ``f`` with ``tck >= 2 / f``.

            ``memtype`` is accepted but not used.  Raises ``ValueError`` when no entry
            matches.
            """
            # MT53E256M16D1, No DBI, Set A
            f_to_cl_cwl = OrderedDict()
            f_to_cl_cwl[532e6] = (6, 4)
            f_to_cl_cwl[1066e6] = (10, 6)
            f_to_cl_cwl[1600e6] = (14, 8)
            f_to_cl_cwl[2132e6] = (20, 10)
            f_to_cl_cwl[2666e6] = (24, 12)
            f_to_cl_cwl[3200e6] = (28, 14)
            f_to_cl_cwl[3732e6] = (32, 16)
            f_to_cl_cwl[4266e6] = (36, 18)
            # Entries are visited in insertion order (ascending frequency), so the
            # first match is the lowest table frequency that satisfies the condition.
            for f, (cl, cwl) in f_to_cl_cwl.items():
                if tck >= 2 / f:
                    return cl, cwl
            raise ValueError

        # Bitslip introduces latency from 1 up to `cycles + 1`
        # FIXME: (check if True) from tests on hardware it seems we need 1 more cycle
        #   of read_latency, probably to have space for manipulating bitslip values
        bitslip_cycles = 1
        bitslip_range = 1
        # Commands are sent over 4 DRAM clocks (sys8x) and we count cl/cwl from last bit
        cmd_latency = 4
        # Commands read from adapters are delayed on ConstBitSlips
        ca_latency = 1

        cl, cwl = get_cl_cw(memtype, tck)
        cl_sys_latency = get_sys_latency(nphases, cl)
        cwl_sys_latency = get_sys_latency(nphases, cwl)
        # For reads we need to account for ser+des latency to make sure we get the data in-phase with sys clock
        rdphase = get_sys_phase(
            nphases, cl_sys_latency,
            cl + cmd_latency + ser_latency.sys8x + des_latency.sys8x)
        # No need to modify wrphase, because ser_latency applies the same to both CA and DQ
        wrphase = get_sys_phase(nphases, cwl_sys_latency, cwl + cmd_latency)

        # When the calculated phase is negative, it means that we need to increase sys latency
        def updated_latency(phase, sys_latency):
            # Each added sys cycle is worth `nphases` phases.
            while phase < 0:
                phase += nphases
                sys_latency += 1
            return phase, sys_latency

        wrphase, cwl_sys_latency = updated_latency(wrphase, cwl_sys_latency)
        rdphase, cl_sys_latency = updated_latency(rdphase, cl_sys_latency)

        # Read latency
        read_data_delay = ca_latency + ser_latency.sys + cl_sys_latency  # DFI cmd -> read data on DQ
        read_des_delay = des_latency.sys + bitslip_cycles + bitslip_range  # data on DQ -> data on DFI rddata
        read_latency = read_data_delay + read_des_delay

        # Write latency
        write_latency = cwl_sys_latency

        # Registers --------------------------------------------------------------------------------
        self._rst = CSRStorage()

        self._wlevel_en = CSRStorage()
        self._wlevel_strobe = CSR()

        # One select bit per data byte.
        self._dly_sel = CSRStorage(databits // 8)

        self._rdly_dq_bitslip_rst = CSR()
        self._rdly_dq_bitslip = CSR()

        self._wdly_dq_bitslip_rst = CSR()
        self._wdly_dq_bitslip = CSR()

        # The computed phases are only reset values; the CSR storage is what is
        # handed to PhySettings below.
        self._rdphase = CSRStorage(log2_int(nphases), reset=rdphase)
        self._wrphase = CSRStorage(log2_int(nphases), reset=wrphase)

        # PHY settings -----------------------------------------------------------------------------
        self.settings = PhySettings(
            phytype=phytype,
            memtype=memtype,
            databits=databits,
            dfi_databits=2 * databits,
            nranks=nranks,
            nphases=nphases,
            rdphase=self._rdphase.storage,
            wrphase=self._wrphase.storage,
            cl=cl,
            cwl=cwl,
            read_latency=read_latency,
            write_latency=write_latency,
            cmd_latency=cmd_latency,
            cmd_delay=cmd_delay,
        )

        # DFI Interface ----------------------------------------------------------------------------
        # Due to the fact that LPDDR4 has 16n prefetch we use 8 phases to be able to read/write a
        # whole burst during a single controller clock cycle. PHY should use sys8x clock.
        self.dfi = dfi = Interface(addressbits,
                                   bankbits,
                                   nranks,
                                   2 * databits,
                                   nphases=8)

        # # #

        # One adapter per DFI phase; each exposes `valid`, `cs` and `ca` used below.
        adapters = [
            DFIPhaseAdapter(phase, masked_write=masked_write)
            for phase in self.dfi.phases
        ]
        self.submodules += adapters

        # Now prepare the data by converting the sequences on adapters into sequences on the pads.
        # We have to ignore overlapping commands, and module timings have to ensure that there are
        # no overlapping commands anyway.
        self.out = LPDDR4Output(nphases, databits)

        # Clocks -----------------------------------------------------------------------------------
        self.comb += self.out.clk.eq(bitpattern("-_-_-_-_" * 2))

        # Simple commands --------------------------------------------------------------------------
        # CKE, ODT and RESET_N are taken per phase from DFI, each passed through delayed().
        self.comb += [
            self.out.cke.eq(
                Cat(delayed(self, phase.cke) for phase in self.dfi.phases)),
            self.out.odt.eq(
                Cat(delayed(self, phase.odt) for phase in self.dfi.phases)),
            self.out.reset_n.eq(
                Cat(delayed(self, phase.reset_n)
                    for phase in self.dfi.phases)),
        ]

        # LPDDR4 Commands --------------------------------------------------------------------------
        # Each LPDDR4 command can span several phases (2 or 4), so the commands cannot overlap.
        # This should be guaranteed by the controller based on module timings, but we also include
        # an overlaps check in PHY logic.
        # Basic check will make sure that no command will be sent to DRAM if there was any command
        # sent by the controller on DFI during 3 previous cycles. The extended version will instead
        # make sure no command is sent to DRAM if there was any command _actually sent to DRAM_
        # during 3 previous cycles. This is more expensive in terms of resources and generally not
        # needed.

        # Create a history of valid adapters used for masking overlapping ones
        valids = ConstBitSlip(dw=nphases, cycles=1, slp=0)
        self.submodules += valids
        self.comb += valids.i.eq(Cat(a.valid for a in adapters))
        valids_hist = valids.r
        if extended_overlaps_check:
            # Replace the raw history by a filtered one: bit i is kept only when none
            # of the (up to) 3 preceding filtered bits is set.
            valids_hist = Signal.like(valids.r)
            for i in range(len(valids_hist)):
                was_valid_before = reduce(or_, valids_hist[max(0, i - 3):i], 0)
                self.comb += valids_hist[i].eq(valids.r[i] & ~was_valid_before)

        cs_per_adapter = []
        ca_per_adapter = defaultdict(list)
        for phase, adapter in enumerate(adapters):
            # The signals from an adapter can be used if there were no commands on 3 previous cycles
            # NOTE(review): the `nphases +` offset presumably skips the older half of the
            # history register so that index `nphases + phase` is the current phase - confirm
            # against ConstBitSlip.
            allowed = ~reduce(or_,
                              valids_hist[nphases + phase - 3:nphases + phase])

            # Use CS and CA of given adapter slipped by `phase` bits
            cs_bs = ConstBitSlip(dw=nphases, cycles=1, slp=phase)
            self.submodules += cs_bs
            # NOTE(review): the trailing comma makes the right-hand side a 1-tuple;
            # presumably harmless for `self.comb +=` - confirm.
            self.comb += cs_bs.i.eq(Cat(adapter.cs)),
            cs_mask = Replicate(allowed, len(cs_bs.o))
            cs = cs_bs.o & cs_mask
            cs_per_adapter.append(cs)

            # For CA we need to do the same for each bit
            ca_bits = []
            for bit in range(6):
                ca_bs = ConstBitSlip(dw=nphases, cycles=1, slp=phase)
                self.submodules += ca_bs
                # Bit `bit` of each of the adapter's 4 CA words, in order.
                ca_bit_hist = [adapter.ca[i][bit] for i in range(4)]
                self.comb += ca_bs.i.eq(Cat(*ca_bit_hist)),
                ca_mask = Replicate(allowed, len(ca_bs.o))
                ca = ca_bs.o & ca_mask
                ca_per_adapter[bit].append(ca)

        # OR all the masked signals
        self.comb += self.out.cs.eq(reduce(or_, cs_per_adapter))
        for bit in range(6):
            self.comb += self.out.ca[bit].eq(reduce(or_, ca_per_adapter[bit]))

        # DQ ---------------------------------------------------------------------------------------
        # dq_oe is driven in the "Write Control Path" section below.
        dq_oe = Signal()
        self.comb += self.out.dq_oe.eq(delayed(self, dq_oe, cycles=1))

        for bit in range(self.databits):
            # output
            # Each DFI phase carries two values for this DQ line, at offsets
            # `bit` and `databits + bit` of its wrdata.
            wrdata = [
                self.dfi.phases[i // 2].wrdata[i % 2 * self.databits + bit]
                for i in range(2 * nphases)
            ]
            # Bitslip reset/increment are selected per byte (bit // 8).
            self.submodules += BitSlip(
                dw=2 * nphases,
                cycles=bitslip_cycles,
                rst=self.get_rst(bit // 8, self._wdly_dq_bitslip_rst.re),
                slp=self.get_inc(bit // 8, self._wdly_dq_bitslip.re),
                i=Cat(*wrdata),
                o=self.out.dq_o[bit],
            )

            # input
            dq_i_bs = Signal(2 * nphases)
            self.submodules += BitSlip(
                dw=2 * nphases,
                cycles=bitslip_cycles,
                rst=self.get_rst(bit // 8, self._rdly_dq_bitslip_rst.re),
                slp=self.get_inc(bit // 8, self._rdly_dq_bitslip.re),
                i=self.out.dq_i[bit],
                o=dq_i_bs,
            )
            # Same bit layout as for wrdata, in the opposite direction.
            for i in range(2 * nphases):
                self.comb += self.dfi.phases[i //
                                             2].rddata[i % 2 * self.databits +
                                                       bit].eq(dq_i_bs[i])

        # DQS --------------------------------------------------------------------------------------
        # dqs_oe, dqs_preamble and dqs_postamble are driven at the end of this constructor.
        dqs_oe = Signal()
        dqs_preamble = Signal()
        dqs_postamble = Signal()
        dqs_pattern = DQSPattern(preamble=dqs_preamble,
                                 postamble=dqs_postamble,
                                 wlevel_en=self._wlevel_en.storage,
                                 wlevel_strobe=self._wlevel_strobe.re)
        self.submodules += dqs_pattern
        self.comb += [
            self.out.dqs_oe.eq(delayed(self, dqs_oe, cycles=1)),
        ]

        for byte in range(self.databits // 8):
            # output
            # DQS shares the write bitslip controls with the DQ bits of the same byte.
            self.submodules += BitSlip(
                dw=2 * nphases,
                cycles=bitslip_cycles,
                rst=self.get_rst(byte, self._wdly_dq_bitslip_rst.re),
                slp=self.get_inc(byte, self._wdly_dq_bitslip.re),
                i=dqs_pattern.o,
                o=self.out.dqs_o[byte],
            )

        # DMI --------------------------------------------------------------------------------------
        # DMI signal is used for Data Mask or Data Bus Inversion depending on Mode Registers values.
        # With DM and DBI disabled, this signal is a Don't Care.
        # With DM enabled, masking is performed only when the command used is WRITE-MASKED.
        # We don't support DBI, DM support is configured statically with `masked_write`.
        for byte in range(self.databits // 8):
            if isinstance(masked_write, Signal) or masked_write:
                # The same dmi_oe assignment is added once per byte.
                self.comb += self.out.dmi_oe.eq(self.out.dq_oe)
                # Two mask bits per DFI phase for this byte, at offsets
                # `byte` and `databits // 8 + byte` of wrdata_mask.
                wrdata_mask = [
                    self.dfi.phases[i //
                                    2].wrdata_mask[i % 2 * self.databits // 8 +
                                                   byte]
                    for i in range(2 * nphases)
                ]
                self.submodules += BitSlip(
                    dw=2 * nphases,
                    cycles=bitslip_cycles,
                    rst=self.get_rst(byte, self._wdly_dq_bitslip_rst.re),
                    slp=self.get_inc(byte, self._wdly_dq_bitslip.re),
                    i=Cat(*wrdata_mask),
                    o=self.out.dmi_o[byte],
                )
            else:
                self.comb += self.out.dmi_o[byte].eq(0)
                self.comb += self.out.dmi_oe.eq(0)

        # Read Control Path ------------------------------------------------------------------------
        # Creates a delay line of read commands coming from the DFI interface. The output is used to
        # signal a valid read data to the DFI interface.
        #
        # The read data valid is asserted for 1 sys_clk cycle when the data is available on the DFI
        # interface, the latency is the sum of the OSERDESE2, CAS, ISERDESE2 and Bitslip latencies.
        rddata_en = TappedDelayLine(signal=reduce(
            or_, [dfi.phases[i].rddata_en for i in range(nphases)]),
                                    ntaps=self.settings.read_latency)
        self.submodules += rddata_en

        # rddata_valid is also forced high on every phase while write leveling is enabled.
        self.comb += [
            phase.rddata_valid.eq(rddata_en.output | self._wlevel_en.storage)
            for phase in dfi.phases
        ]

        # Write Control Path -----------------------------------------------------------------------
        wrtap = cwl_sys_latency - 1
        assert wrtap >= 0

        # Create a delay line of write commands coming from the DFI interface. These taps are used to
        # control DQ/DQS tristates.
        # ntaps = wrtap + 2 so that tap `wrtap + 1` (used for the postamble) exists.
        wrdata_en = TappedDelayLine(signal=reduce(
            or_, [dfi.phases[i].wrdata_en for i in range(nphases)]),
                                    ntaps=wrtap + 2)
        self.submodules += wrdata_en

        self.comb += dq_oe.eq(wrdata_en.taps[wrtap])
        # Always enabled in write leveling mode, else during transfers
        self.comb += dqs_oe.eq(self._wlevel_en.storage
                               | (dqs_preamble | dq_oe | dqs_postamble))

        # Write DQS Postamble/Preamble Control Path ------------------------------------------------
        # Generates DQS Preamble 1 cycle before the first write and Postamble 1 cycle after the last
        # write. During writes, DQS tristate is configured as output for at least 3 sys_clk cycles:
        # 1 for Preamble, 1 for the Write and 1 for the Postamble.
        def wrdata_en_tap(i):  # allows to have wrtap == 0
            # Index -1 refers to the delay line input, i.e. the undelayed signal.
            return wrdata_en.input if i == -1 else wrdata_en.taps[i]

        # Preamble: the earlier tap is set but the write tap is not yet.
        self.comb += dqs_preamble.eq(
            wrdata_en_tap(wrtap - 1) & ~wrdata_en_tap(wrtap + 0))
        # Postamble: the later tap is still set but the write tap no longer is.
        self.comb += dqs_postamble.eq(
            wrdata_en_tap(wrtap + 1) & ~wrdata_en_tap(wrtap + 0))
Exemple #5
0
    def __init__(self, pads, data_cdc, *, clk_freq, log_level, init_delays=False):
        """Build the command-side simulation model: command decoding, timing checks and init FSM.

        Args:
            pads: Pad object; ``cs``, ``ca``, ``cke`` and ``reset_n`` are read here.
            data_cdc: Module stored as ``self.data`` and added as a submodule.
            clk_freq: Clock frequency used to convert timing values in seconds to
                cycle counts; also passed to ``SimLogger``.
            log_level: Passed to ``SimLogger``.
            init_delays: When false, the "RESET" and "EXIT-PD" states advance without
                waiting for tINIT3/tINIT5, and the tZQCAL/tZQLAT warnings are
                suppressed.
        """
        self.submodules.log = log = SimLogger(log_level=log_level, clk_freq=clk_freq)
        self.log.add_csrs()

        # Mode Registers storage
        self.mode_regs = Array([Signal(8) for _ in range(64)])
        # Active banks
        self.active_banks = Array([Signal() for _ in range(8)])
        self.active_rows = Array([Signal(17) for _ in range(8)])
        # Connection to DataSim
        self.data_en = TappedDelayLine(ntaps=20)
        self.data = data_cdc
        self.submodules += self.data, self.data_en

        # CS/CA shift registers
        cs = TappedDelayLine(pads.cs, ntaps=2)
        ca = TappedDelayLine(pads.ca, ntaps=2)
        self.submodules += cs, ca

        # Latched halves of the current command and the decoded command strobe.
        self.cs_low     = Signal(6)
        self.cs_high    = Signal(6)
        self.handle_cmd = Signal()
        self.mpc_op     = Signal(7)

        # Driven by the FSM states below; gates command decoding.
        cmds_enabled = Signal()
        # Handlers are created in this order; each returned value is used below as a
        # condition that is true when that handler recognizes the command.
        cmd_handlers = OrderedDict(
            MRW = self.mrw_handler(),
            REF = self.refresh_handler(),
            ACT = self.activate_handler(),
            PRE = self.precharge_handler(),
            CAS = self.cas_handler(),
            MPC = self.mpc_handler(),
        )
        self.comb += [
            If(cmds_enabled,
                # A command is handled when the two CS taps read 0b10; the two CA taps
                # are then exposed as cs_high (tap 1) and cs_low (tap 0).
                If(Cat(cs.taps) == 0b10,
                    self.handle_cmd.eq(1),
                    self.cs_high.eq(ca.taps[1]),
                    self.cs_low.eq(ca.taps[0]),
                )
            ),
            # Report a command that none of the handlers recognized.
            If(self.handle_cmd & ~reduce(or_, cmd_handlers.values()),
                self.log.error("Unexpected command: cs_high=0b%06b cs_low=0b%06b", self.cs_high, self.cs_low)
            ),
        ]

        def ck(t):
            # Convert a time in seconds to a whole number of clock cycles, rounding up.
            return math.ceil(t * clk_freq)

        self.submodules.tinit0 = PulseTiming(ck(20e-3))  # makes no sense in simulation
        self.submodules.tinit1 = PulseTiming(ck(200e-6))
        self.submodules.tinit2 = PulseTiming(ck(10e-9))
        self.submodules.tinit3 = PulseTiming(ck(2e-3))
        self.submodules.tinit4 = PulseTiming(5)  # TODO: would require counting pads.clk_p ticks
        self.submodules.tinit5 = PulseTiming(ck(2e-6))
        self.submodules.tzqcal = PulseTiming(ck(1e-6))
        self.submodules.tzqlat = PulseTiming(max(8, ck(30e-9)))
        self.submodules.tpw_reset = PulseTiming(ck(100e-9))

        self.comb += [
            self.tinit1.trigger.eq(1),
            self.tinit2.trigger.eq(~pads.cke),
            self.tinit3.trigger.eq(pads.reset_n),
            self.tpw_reset.trigger.eq(~pads.reset_n),
            # Edge detection compares each pad with its value passed through delayed().
            If(~delayed(self, pads.reset_n) & pads.reset_n,
                self.log.info("RESET released"),
                If(~self.tinit1.ready,
                    self.log.warn("tINIT1 violated: RESET deasserted too fast")
                ),
                If(~self.tinit2.ready,
                    self.log.warn("tINIT2 violated: CKE LOW too short before RESET being released")
                ),
            ),
            If(delayed(self, pads.reset_n) & ~pads.reset_n,
                self.log.info("RESET asserted"),
            ),
            If(delayed(self, pads.cke) & ~pads.cke,
                self.log.info("CKE falling edge"),
            ),
            If(~delayed(self, pads.cke) & pads.cke,
                self.log.info("CKE rising edge"),
                If(~self.tinit3.ready,
                    self.log.warn("tINIT3 violated: CKE set HIGH too fast after RESET being released")
                ),
            ),
        ]

        # Initialization FSM; it is reset when the tpw_reset timer (triggered while
        # reset_n is low) pulses ready_p.
        self.submodules.fsm = fsm = ResetInserter()(FSM())
        self.comb += [
            If(self.tpw_reset.ready_p,
                fsm.reset.eq(1),
                self.log.info("FSM reset")
            )
        ]
        fsm.act("RESET",
            # `not init_delays` is a Python constant: with delays disabled the
            # transition condition is always true.
            If(self.tinit3.ready_p | (not init_delays),
                NextState("EXIT-PD")  # Td
            )
        )
        fsm.act("EXIT-PD",
            self.tinit5.trigger.eq(1),
            If(self.tinit5.ready_p | (not init_delays),
                NextState("MRW")  # Te
            )
        )
        fsm.act("MRW",
            cmds_enabled.eq(1),
            # Anything other than MRW or MPC only produces warnings in this state.
            If(self.handle_cmd & ~cmd_handlers["MRW"] & ~cmd_handlers["MPC"],
                self.log.warn("Only MRW/MRR commands expected before ZQ calibration"),
                self.log.warn(" ".join("{}=%d".format(cmd) for cmd in cmd_handlers.keys()), *cmd_handlers.values()),
            ),
            If(cmd_handlers["MPC"],
                If(self.mpc_op != MPC.ZQC_START,
                    self.log.error("ZQC-START expected, got op=0b%07b", self.mpc_op)
                ).Else(
                    NextState("ZQC")  # Tf
                )
            ),
        )
        fsm.act("ZQC",
            self.tzqcal.trigger.eq(1),
            cmds_enabled.eq(1),
            If(self.handle_cmd,
                # Only an MPC command with the ZQC_LATCH op leaves this state.
                If(~(cmd_handlers["MPC"] & (self.mpc_op == MPC.ZQC_LATCH)),
                    self.log.error("Expected ZQC-LATCH")
                ).Else(
                    If(init_delays & ~self.tzqcal.ready,
                        self.log.warn("tZQCAL violated")
                    ),
                    NextState("NORMAL")  # Tg
                )
            ),
        )
        fsm.act("NORMAL",
            cmds_enabled.eq(1),
            self.tzqlat.trigger.eq(1),
            If(init_delays & self.handle_cmd & ~self.tzqlat.ready,
                self.log.warn("tZQLAT violated")
            ),
        )

        # Log state transitions
        # finalize() is called first because fsm.state and fsm.decoding are used
        # right below.
        fsm.finalize()
        prev_state = delayed(self, fsm.state)
        # One log statement per (previous state, next state) pair, selected by a
        # nested Case on the delayed and the current state.
        self.comb += If(prev_state != fsm.state,
            Case(prev_state, {
                state: Case(fsm.state, {
                    next_state: self.log.info(f"FSM: {state_name} -> {next_state_name}")
                    for next_state, next_state_name in fsm.decoding.items()
                })
                for state, state_name in fsm.decoding.items()
            })
        )