async def dma_mem_read(self, addr, length, timeout=0, timeout_unit='ns'):
        """Read ``length`` bytes starting at ``addr`` with memory read TLPs.

        The transfer is split into one or more read requests.  Each request
        is capped by ``128 << self.dev_max_read_req`` (less the misalignment
        of ``addr`` within its dword) and never crosses a 4 KiB address
        boundary.  A 64-bit request format is selected when ``addr`` does not
        fit in 32 bits.  At least one request is always issued, even when
        ``length`` is 0.

        Args:
            addr: Byte address to start reading from.
            length: Number of bytes to read.
            timeout: Forwarded to ``self.recv_cpl`` for every completion wait.
            timeout_unit: Forwarded to ``self.recv_cpl`` alongside ``timeout``.

        Returns:
            The collected completion payload, truncated to ``length`` bytes.

        Raises:
            Exception: ``"Timeout"`` when ``self.recv_cpl`` returns a falsy
                value, or ``"Unsuccessful completion"`` when a completion
                status is not ``CplStatus.SC``.
            AssertionError: When a completion's byte count is inconsistent
                with its length field or with the bytes still outstanding.
        """
        data = b''
        n = 0  # bytes requested so far across all read requests

        while True:
            tlp = Tlp()
            if addr > 0xffffffff:
                tlp.fmt_type = TlpType.MEM_READ_64
            else:
                tlp.fmt_type = TlpType.MEM_READ
            # NOTE(review): third PcieId argument is presumably the function
            # number (always 0 here) - confirm against PcieId.
            tlp.requester_id = PcieId(self.dev_bus_num, self.dev_device_num, 0)

            first_pad = addr % 4
            byte_length = length - n
            # max read request size
            byte_length = min(byte_length,
                              (128 << self.dev_max_read_req) - first_pad)
            # 4k address align
            byte_length = min(byte_length, 0x1000 - (addr & 0xfff))
            tlp.set_addr_be(addr, byte_length)

            tlp.tag = await self.alloc_tag()

            # Release the tag on every exit path.  Without the finally, a
            # timeout, an unsuccessful completion or a failed assertion would
            # leave the tag allocated forever.
            try:
                await self.tx_source.send(S10PcieFrame.from_tlp(tlp))

                m = 0  # bytes accounted for by completions of this request

                while True:
                    cpl = await self.recv_cpl(tlp.tag, timeout, timeout_unit)

                    if not cpl:
                        raise Exception("Timeout")

                    if cpl.status != CplStatus.SC:
                        raise Exception("Unsuccessful completion")

                    assert cpl.byte_count + 3 + (cpl.lower_address
                                                 & 3) >= cpl.length * 4
                    # byte_count must equal what is still outstanding; the
                    # max(..., 1) presumably covers zero-length reads, which
                    # would report a byte count of 1 - confirm.
                    assert cpl.byte_count == max(byte_length - m, 1)

                    d = cpl.get_data()

                    # Skip the bytes that precede the addressed byte inside
                    # the first dword of the payload.
                    offset = cpl.lower_address & 3
                    data += d[offset:offset + cpl.byte_count]

                    m += len(d) - offset

                    if m >= byte_length:
                        break
            finally:
                self.release_tag(tlp.tag)

            n += byte_length
            addr += byte_length

            if n >= length:
                break

        return data[:length]
    async def dma_io_read(self, addr, length, timeout=0, timeout_unit='ns'):
        """Read ``length`` bytes starting at ``addr`` with I/O read TLPs.

        One request is issued per loop iteration, and each request covers at
        most the rest of the 4-byte-aligned word containing the current
        address.  At least one request is always sent, even when ``length``
        is 0.

        Args:
            addr: Byte address to start reading from.
            length: Number of bytes to read.
            timeout: Forwarded to ``self.recv_cpl``.
            timeout_unit: Forwarded to ``self.recv_cpl``.

        Returns:
            The collected completion payload, truncated to ``length`` bytes.

        Raises:
            Exception: ``"Timeout"`` when ``self.recv_cpl`` returns a falsy
                value, or ``"Unsuccessful completion"`` when the completion
                status is not ``CplStatus.SC``.
        """
        pieces = []
        remaining = length

        while True:
            request = Tlp()
            request.fmt_type = TlpType.IO_READ
            request.requester_id = PcieId(
                self.dev_bus_num, self.dev_device_num, 0)

            # Byte offset of addr inside its 4-byte word; a single request
            # never reaches past the end of that word.
            lead = addr % 4
            chunk_len = min(remaining, 4 - lead)
            request.set_addr_be(addr, chunk_len)

            request.tag = await self.alloc_tag()

            await self.tx_source.send(S10PcieFrame.from_tlp(request))
            completion = await self.recv_cpl(
                request.tag, timeout, timeout_unit)

            # The tag is handed back before the completion is inspected, so
            # it is free again even when one of the errors below is raised.
            self.release_tag(request.tag)

            if not completion:
                raise Exception("Timeout")

            if completion.status != CplStatus.SC:
                raise Exception("Unsuccessful completion")

            # Drop the bytes that precede addr in the returned payload; any
            # surplus at the tail is trimmed by the final slice.
            pieces.append(completion.get_data()[lead:])

            remaining -= chunk_len
            addr += chunk_len

            if remaining <= 0:
                break

        return b''.join(pieces)[:length]