Example #1
0
 def __init__(self, width_i, shape):
     """Create the two matrix sinks and the scalar result source.

     Parameters
     ----------
     width_i
         Forwarded as ``width`` to both input MatrixStreams.
     shape
         Forwarded as ``shape`` to both input MatrixStreams.
     """
     # Both sinks are built from the same arguments; only the name differs.
     sink_kwargs = dict(width=width_i, shape=shape, direction='sink')
     self.input_a = MatrixStream(name='input_a', **sink_kwargs)
     self.input_b = MatrixStream(name='input_b', **sink_kwargs)

     # Geometry is read back from the created stream, not from the arguments.
     port_a = self.input_a.dataport
     self.input_w = port_a.width
     self.n_inputs = port_a.n_elements
     self.shape = port_a.shape

     # The output width is derived from the element width and element count.
     self.output_w = calculate_output_width(self.input_w, self.n_inputs)
     self.output = DataStream(self.output_w,
                              direction='source',
                              name='output')
Example #2
0
def test_main_wrapper(latency):
    """Wrap an ExampleCore in a StreamWrapper and hand it to ``run``.

    ``latency`` is passed both to the wrapped core and to the wrapper.
    The 16-bit input/output streams are mapped onto the core's ``data_i``
    and ``data_o`` names.
    """
    data_w = 16
    wrapped = ExampleCore(data_w, latency)
    sink = DataStream(data_w, direction='sink', name='input')
    source = DataStream(data_w, direction='source', name='output')

    core = StreamWrapper(
        wrapped_core=wrapped,
        input_stream=sink,
        output_stream=source,
        input_map={'data': 'data_i'},
        output_map={'data': 'data_o'},
        latency=latency,
    )

    run(core,
        'cnn.tests.test_stream_wrapper',
        ports=core.get_ports(),
        vcd_file='./test_stream_wrapper.vcd')
Example #3
0
 def __init__(self, input_w, row_length, N, invert=False):
     """Create the scalar input stream and the ``(N,)``-shaped output stream.

     Parameters
     ----------
     input_w
         Width passed to both the input DataStream and the output
         MatrixStream.
     row_length
         Stored as ``self.row_length``.
     N
         Number of elements of the output vector.
     invert
         Stored as ``self.invert``.
     """
     self.row_length, self.invert = row_length, invert

     self.input = DataStream(width=input_w, direction='sink', name='input')
     self.output = MatrixStream(width=input_w,
                                shape=(N, ),
                                direction='source',
                                name='output')

     # Widths and shape are read back from the streams just created.
     out_port = self.output.dataport
     self.input_w = len(self.input.data)
     self.output_w = out_port.width
     self.shape = out_port.shape
     self.N = out_port.shape[0]
Example #4
0
 def __init__(self, width_i, width_c, width_acc=None, shift=None):
     """Create the accumulator, the data streams and the coefficient port.

     Parameters
     ----------
     width_i
         Width of the input DataStream.
     width_c
         Width of the signed ``r_data`` coefficient signal.
     width_acc
         Width of the signed accumulator; 48 when omitted.
     shift
         Stored as ``self.shift``; 0 when omitted. The output stream is
         ``width_acc - shift`` bits wide.
     """
     # Resolve the optional arguments to their defaults.
     width_acc = 48 if width_acc is None else width_acc
     shift = 0 if shift is None else shift

     self.shift = shift
     self.accumulator = Signal(signed(width_acc))

     self.input = DataStream(width=width_i, direction='sink', name='input')
     self.output = DataStream(width=width_acc - shift,
                              direction='source',
                              name='output')

     # Coefficient read interface: data, enable and ready.
     self.r_data = Signal(signed(width_c))
     self.r_en = Signal()
     self.r_rdy = Signal()

     self.latency = 5
Example #5
0
 def __init__(self, width, input_shape, N, n_cores):
     """Instantiate the MatrixFeeder, the Farm and the external streams.

     Parameters
     ----------
     width
         Width used for the feeder, the farm, the coefficient stream and
         the input stream.
     input_shape
         Stored as ``self.input_shape`` and forwarded to the MatrixFeeder.
     N
         Side of the ``(N, N)`` coefficient matrix and of the farm shape.
     n_cores
         Stored as ``self.n_cores`` and forwarded to the Farm.
     """
     kernel_shape = (N, N)

     self.input_shape = input_shape
     self.n_cores = n_cores

     # Sub-cores.
     self.matrix_feeder = MatrixFeeder(data_w=width,
                                       input_shape=input_shape,
                                       N=N,
                                       invert=False)
     self.farm = Farm(width=width, shape=kernel_shape, n_cores=n_cores)

     # External interfaces; the output is as wide as the farm's output data.
     self.coeff = MatrixStream(width=width,
                               shape=kernel_shape,
                               direction='sink',
                               name='coeff')
     self.input = DataStream(width=width, direction='sink', name='input')
     farm_output_w = len(self.farm.output.data)
     self.output = DataStream(width=farm_output_w,
                              direction='source',
                              name='output')

     # Widths and shape are read back from the created streams.
     self.input_w = len(self.input.data)
     self.output_w = len(self.output.data)
     coeff_port = self.coeff.dataport
     self.shape = coeff_port.shape
     self.N = coeff_port.shape[0]
Example #6
0
 def __init__(self, width, shape, n_cores):
     """Create ``n_cores`` DotProduct cores and the shared streams.

     Parameters
     ----------
     width
         Width of both input MatrixStreams; also passed to every core.
     shape
         Shape of both input MatrixStreams; also passed to every core.
     n_cores
         Number of DotProduct instances to create.
     """
     self.cores = [DotProduct(width, shape) for _ in range(n_cores)]

     # The two sinks differ only by name.
     sink_kwargs = dict(width=width, shape=shape, direction='sink')
     self.input_a = MatrixStream(name='input_a', **sink_kwargs)
     self.input_b = MatrixStream(name='input_b', **sink_kwargs)

     # The output is as wide as the first core's output.
     self.output_w = self.cores[0].output_w
     self.output = DataStream(self.output_w,
                              direction='source',
                              name='output')

     port_a = self.input_a.dataport
     self.input_w = port_a.width
     self.n_inputs = port_a.n_elements
     self.shape = port_a.shape
     self.n_cores = len(self.cores)
Example #7
0
 def __init__(self, data_w, input_shape, N, invert=False):
     """Build the input/output streams and the inner MatrixFeeder.

     Parameters
     ----------
     data_w
         Width used for the input stream, the output matrix stream and
         the inner MatrixFeeder.
     input_shape
         Image shape; ``input_shape[0]`` (height) and ``input_shape[1]``
         (width) must both be multiples of ``N``.
     N
         Side of the ``(N, N)`` output matrix.
     invert
         Forwarded unchanged to the inner MatrixFeeder.

     Raises
     ------
     AssertionError
         If either image dimension is not a multiple of ``N``.
     """
     # Plain string literals: the messages contain no placeholders.
     assert input_shape[0] % N == 0, (
         'image height must be a multiple of N. Psss, you can use Padder() to append zeros!'
     )
     assert input_shape[1] % N == 0, (
         'image width must be a multiple of N. Psss, you can use Padder() to append zeros!'
     )
     self.input = DataStream(width=data_w, direction='sink', name='input')
     self.output = MatrixStream(width=data_w,
                                shape=(N, N),
                                direction='source',
                                name='output')
     self.matrix_feeder = MatrixFeeder(data_w,
                                       input_shape,
                                       N,
                                       invert=invert)
     # Floor division keeps the quotient exact (no round trip through
     # float); int() still guarantees plain Python ints in the tuple.
     self.output_shape = (int(input_shape[0] // N),
                          int(input_shape[1] // N))
     self.N = N
Example #8
0
 def __init__(self, data_w, input_shape, N, mode):
     """Validate the arguments and build the feeder and the data streams.

     Parameters
     ----------
     data_w
         Width of the input stream, the output stream and the inner
         MatrixFeederSkip.
     input_shape
         Image shape; ``input_shape[0]`` (height) and ``input_shape[1]``
         (width) must both be multiples of ``N``.
     N
         Window side passed to the MatrixFeederSkip; every input
         dimension is divided by it to obtain ``output_shape``.
     mode
         Must be one of ``self._modes``; stored as ``self.mode``.

     Raises
     ------
     AssertionError
         If a dimension is not a multiple of ``N`` or ``mode`` is not
         supported.
     """
     # Plain string literals: the messages contain no placeholders.
     assert input_shape[0] % N == 0, (
         'image height must be a multiple of N. Psss, you can use Padder() to append zeros!'
     )
     assert input_shape[1] % N == 0, (
         'image width must be a multiple of N. Psss, you can use Padder() to append zeros!'
     )
     assert mode in self._modes, 'Unsupported mode'
     self.mode = mode
     self.matrix_feeder = MatrixFeederSkip(data_w=data_w,
                                           input_shape=input_shape,
                                           N=N,
                                           invert=False)
     self.input = DataStream(width=data_w, direction='sink', name='input')
     self.output = DataStream(width=data_w,
                              direction='source',
                              name='output')
     self.N = N
     # Floor division keeps the quotient exact (no round trip through
     # float); int() still guarantees plain Python ints in the list.
     self.output_shape = [int(x // N) for x in input_shape]
Example #9
0
 def __init__(self, data_w, input_shape, N, invert=False):
     """Create the scalar input stream and the ``(N, N)`` output stream.

     Parameters
     ----------
     data_w
         Width of the input DataStream and of the output MatrixStream.
     input_shape
         Stored as ``self.input_shape``; its first two entries, each
         reduced by ``N - 1``, form ``self.output_shape``.
     N
         Side of the output matrix.
     invert
         Stored as ``self.invert``.
     """
     dim0, dim1 = input_shape[0], input_shape[1]

     self.input_shape = input_shape
     self.output_shape = (dim0 + 1 - N, dim1 + 1 - N)
     self.invert = invert

     self.input = DataStream(width=data_w, direction='sink', name='input')
     self.output = MatrixStream(width=data_w,
                                shape=(N, N),
                                direction='source',
                                name='output')

     # Width and shape are read back from the streams just created.
     window_port = self.output.dataport
     self.data_w = len(self.input.data)
     self.shape = window_port.shape
     self.N = window_port.shape[0]
Example #10
0
    def __init__(self, width_i, width_w, n_inputs, rom_init):
        """Create the weight ROM, the StreamMacc and the data streams.

        Parameters
        ----------
        width_i
            Width of the input stream and of the macc data input.
        width_w
            Width of the ROM words and of the macc coefficients.
        n_inputs
            Number of inputs per neuron; ``len(rom_init)`` must be a
            multiple of ``n_inputs + 1``.
        rom_init
            Initial contents of the CircularROM.
        """
        weights_per_neuron = n_inputs + 1  # +1 bias
        assert len(rom_init) % weights_per_neuron == 0

        accum_w = accum_req_bits(width_i, width_w, weights_per_neuron)
        shift = width_w - 1  # compensate weights gain

        self.n_inputs = n_inputs
        self.rom = CircularROM(width=width_w, init=rom_init)
        self.macc = StreamMacc(width_i=width_i,
                               width_c=width_w,
                               width_acc=accum_w,
                               shift=shift)

        # The macc decides its own output width; check it is the expected one.
        output_w = len(self.macc.output.data)
        expected_w = accum_w - shift
        assert output_w == expected_w, f'{output_w} == {accum_w} - {shift}'

        self.input = DataStream(width=width_i, direction='sink', name='input')
        self.output = DataStream(width=output_w,
                                 direction='source',
                                 name='output')
Example #11
0
 def __init__(self, data_w, input_shape, output_shape, fill_value=0):
     """Select the elaboration routine and create the data streams.

     ``self.elaborate`` is bound to one of three methods depending on how
     the two shapes compare:

     * equal in both dimensions: ``elaborate_nop``
     * input no larger in both dimensions: ``elaborate_padder``
     * input no smaller in both dimensions: ``elaborate_cropper``

     Parameters
     ----------
     data_w
         Width of the input and output DataStreams.
     input_shape
         Input shape; only entries 0 and 1 are compared.
     output_shape
         Output shape; only entries 0 and 1 are compared.
     fill_value
         Stored as ``self.fill_value``.

     Raises
     ------
     RuntimeError
         If the output is bigger in one dimension and smaller in the
         other one.
     """
     in_0, in_1 = input_shape[0], input_shape[1]
     out_0, out_1 = output_shape[0], output_shape[1]

     if in_0 == out_0 and in_1 == out_1:
         # no operation, just last generation
         self.elaborate = self.elaborate_nop
     elif in_0 <= out_0 and in_1 <= out_1:
         self.elaborate = self.elaborate_padder
     elif in_0 >= out_0 and in_1 >= out_1:
         self.elaborate = self.elaborate_cropper
     else:
         raise RuntimeError(
             'Output image cannot be bigger in one dimension and smaller in the other one'
         )

     self.input_shape = input_shape
     self.output_shape = output_shape
     self.fill_value = fill_value
     self.input = DataStream(width=data_w, direction='sink', name='input')
     self.output = DataStream(width=data_w,
                              direction='source',
                              name='output')
Example #12
0
def TreeHighestUnsignedWrapped(width_i, n_stages, reg_in, reg_out):
    """Return a TreeHighestUnsigned core wrapped in a StreamWrapper.

    All four arguments are forwarded to TreeHighestUnsigned. The wrapper
    gets a MatrixStream sink with one element per core input and a
    DataStream source as wide as the core output. Stream field ``data_<i>``
    is mapped to the name of ``core.inputs[i]`` and ``data`` to ``output``.
    The wrapper latency is taken from the core.
    """
    core = TreeHighestUnsigned(width_i=width_i,
                               n_stages=n_stages,
                               reg_in=reg_in,
                               reg_out=reg_out)

    sink = MatrixStream(width_i,
                        shape=(len(core.inputs), ),
                        direction='sink',
                        name='input')
    source = DataStream(core.output.width,
                        direction='source',
                        name='output')

    # One stream field per core input, keyed by position.
    field_to_port = {
        f'data_{idx}': port.name
        for idx, port in enumerate(core.inputs)
    }

    return StreamWrapper(wrapped_core=core,
                         input_stream=sink,
                         output_stream=source,
                         input_map=field_to_port,
                         output_map={'data': 'output'},
                         latency=core.latency)
Example #13
0
class MatrixFeederSkip(MatrixFeeder):
    """Matrix feeder that forwards only a subset of the inner feeder's windows.

    An inner MatrixFeeder produces ``(N, N)`` windows; this core passes a
    window downstream only while both internal pooling counters are zero and
    silently consumes every other window.

    Note that although this class derives from MatrixFeeder, ``__init__``
    does not call the parent constructor: it owns a separate MatrixFeeder
    instance as a submodule instead.
    """

    def __init__(self, data_w, input_shape, N, invert=False):
        """Build the input/output streams and the inner MatrixFeeder.

        Parameters
        ----------
        data_w
            Width used for the input stream, the output matrix stream and
            the inner MatrixFeeder.
        input_shape
            Image shape; ``input_shape[0]`` (height) and ``input_shape[1]``
            (width) must both be multiples of ``N``.
        N
            Side of the ``(N, N)`` output matrix.
        invert
            Forwarded unchanged to the inner MatrixFeeder.

        Raises
        ------
        AssertionError
            If either image dimension is not a multiple of ``N``.
        """
        # Plain string literals: the messages contain no placeholders.
        assert input_shape[0] % N == 0, (
            'image height must be a multiple of N. Psss, you can use Padder() to append zeros!'
        )
        assert input_shape[1] % N == 0, (
            'image width must be a multiple of N. Psss, you can use Padder() to append zeros!'
        )
        self.input = DataStream(width=data_w, direction='sink', name='input')
        self.output = MatrixStream(width=data_w,
                                   shape=(N, N),
                                   direction='source',
                                   name='output')
        self.matrix_feeder = MatrixFeeder(data_w,
                                          input_shape,
                                          N,
                                          invert=invert)
        # Floor division keeps the quotient exact (no round trip through
        # float); int() still guarantees plain Python ints in the tuple.
        self.output_shape = (int(input_shape[0] // N),
                             int(input_shape[1] // N))
        self.N = N

    def get_ports(self):
        """Return every field of the input stream followed by the output's."""
        ports = [self.input[f] for f in self.input.fields]
        ports += [self.output[f] for f in self.output.fields]
        return ports

    def elaborate(self, platform):
        """Build the module: inner feeder, pooling counters and gating FSM."""
        m = Module()
        sync = m.d.sync
        comb = m.d.comb

        # Counters that decide which feeder windows are forwarded.
        pooling_counter_row = Signal(range(self.N))
        pooling_counter_col = Signal(range(self.N))

        m.submodules.matrix_feeder = matrix_feeder = self.matrix_feeder

        # Position counters for this core's output and for the feeder's output.
        row, col = img_position_counter(m, sync, self.output,
                                        self.output_shape)
        feeder_row, feeder_col = img_position_counter(
            m, sync, matrix_feeder.output, matrix_feeder.output_shape)

        # input --> matrix_feeder
        comb += [
            matrix_feeder.input.valid.eq(self.input.valid),
            matrix_feeder.input.last.eq(self.input.last),
            matrix_feeder.input.data.eq(self.input.data),
            self.input.ready.eq(matrix_feeder.input.ready),
        ]

        # Data always comes from the feeder; `last` is recomputed from this
        # core's own output position.
        comb += self.output.dataport.eq(matrix_feeder.output.dataport)
        comb += self.output.last.eq(is_last(row, col, self.output_shape))

        # Advance the pooling counters on every window the feeder hands over.
        # Later assignments override earlier ones, so the resets below win.
        with m.If(matrix_feeder.output.accepted()):
            sync += pooling_counter_row.eq(_incr(pooling_counter_row, self.N))
            # NOTE(review): this compares the feeder *row* counter against
            # output_shape[1] (the second dimension) - confirm it is intended.
            with m.If(feeder_row == matrix_feeder.output_shape[1] - 1):
                sync += pooling_counter_row.eq(0)
                sync += pooling_counter_col.eq(
                    _incr(pooling_counter_col, self.N))
            with m.If(matrix_feeder.output.last):
                sync += [
                    pooling_counter_row.eq(0),
                    pooling_counter_col.eq(0),
                ]

        with m.FSM():
            with m.State("normal"):
                # Forward the window only when both counters are zero;
                # otherwise drain it from the feeder without emitting it.
                with m.If((pooling_counter_row == 0)
                          & (pooling_counter_col == 0)):
                    comb += [
                        self.output.valid.eq(matrix_feeder.output.valid),
                        matrix_feeder.output.ready.eq(self.output.ready),
                    ]
                with m.Else():
                    comb += [
                        self.output.valid.eq(0),
                        matrix_feeder.output.ready.eq(1),
                    ]
                with m.If(self.output.accepted() & self.output.last):
                    m.next = "last"

            with m.State("last"):
                # After the last output, keep draining the feeder until the
                # last input sample has been accepted.
                comb += [
                    self.output.valid.eq(0),
                    matrix_feeder.output.ready.eq(1),
                ]
                with m.If(self.input.accepted() & self.input.last):
                    m.next = "normal"

        return m
Example #14
0
 def __init__(self, width, leak=0):
     """Create equally wide input and output streams and store ``leak``.

     Parameters
     ----------
     width
         Width of both the input and the output DataStream.
     leak
         Stored as ``self.leak``; defaults to 0.
     """
     self.input = DataStream(width=width, direction='sink', name='input')
     self.output = DataStream(width=width, direction='source', name='output')
     self.leak = leak
Example #15
0
class mlpNode(Elaboratable):
    """MLP node: a StreamMacc whose coefficients come from a CircularROM.

    The full description and parameter list live in the ``_doc_`` class
    attribute below (it is a plain attribute, not the class docstring).
    """
    _doc_ = """
    MLP Node instantiates a Stream Macc and a Circular ROM
    to store the corresponding weights. Both input and output
    are Stream interfaces.
    This MLP Node can actually do the job of N neurons serially
    where each neuron will require (n_inputs + 1) weights stored.

    Parameters
    ----------
    width_i : int
        Bit width of data in stream interface.

    width_w : int
        Bit width of data in the ROM.

    n_inputs : int
        Number of inputs for each neuron.

    rom_init : list
        List with weights to initialize the ROM. It should
        have the form
        [N0_W0, N0_W1, ..., N0_Wn-1, N0_Wbias,
         N1_W0, N1_W1, ..., N1_Wn-1, N1_Wbias,
         ...
        ]
        where Nx_Wy refers to the weight of the sample 'y'
        of neuron 'x'.
    """

    def __init__(self, width_i, width_w, n_inputs, rom_init):
        """Create the ROM, the macc and the input/output streams.

        Raises AssertionError when ``len(rom_init)`` is not a multiple of
        ``n_inputs + 1`` or when the macc output width is not
        ``accum_w - shift``.
        """
        assert len(rom_init) % (n_inputs + 1) == 0
        accum_w = accum_req_bits(width_i, width_w, n_inputs + 1)  # +1 bias
        shift = width_w - 1  # compensate weights gain

        self.n_inputs = n_inputs

        self.rom = CircularROM(width=width_w, init=rom_init)

        self.macc = StreamMacc(width_i=width_i,
                               width_c=width_w,
                               width_acc=accum_w,
                               shift=shift)

        # The output stream takes its width from the macc's output data.
        output_w = len(self.macc.output.data)
        assert output_w == accum_w - shift, (
            f'{output_w} == {accum_w} - {shift}')
        self.input = DataStream(width=width_i, direction='sink', name='input')
        self.output = DataStream(width=output_w,
                                 direction='source',
                                 name='output')

    def get_ports(self):
        """Return every field of the input stream followed by the output's."""
        ports = []
        ports += [self.input[f] for f in self.input.fields]
        ports += [self.output[f] for f in self.output.fields]
        return ports

    def elaborate(self, platform):
        """Wire ROM and macc together and sequence inputs and bias."""
        m = Module()
        sync = m.d.sync
        comb = m.d.comb

        m.submodules.rom = rom = self.rom
        m.submodules.macc = macc = self.macc

        # Counts the input samples accepted so far.
        cnt = Signal(range(self.n_inputs))
        output_data = Signal(signed(len(self.output.data)))

        # ROM read port <--> macc coefficient port.
        comb += macc.r_data.eq(rom.r_data)
        comb += macc.r_rdy.eq(rom.r_rdy)
        comb += rom.r_en.eq(macc.r_en)
        comb += rom.restart.eq(0)  # should be unnecessary if the
        # inputs are correct.

        # presumably _incr wraps back to 0 after n_inputs - 1 - TODO confirm
        nxt_cnt = Signal.like(cnt)
        comb += nxt_cnt.eq(_incr(cnt, self.n_inputs))
        with m.If(self.input.accepted()):
            sync += cnt.eq(nxt_cnt)

        # NOTE(review): `fsm` is bound but never used.
        with m.FSM() as fsm:

            with m.State("INPUT"):
                # Pass input samples straight to the macc, never as `last`.
                comb += self.input.ready.eq(macc.input.ready)
                comb += macc.input.valid.eq(self.input.valid)
                comb += macc.input.data.eq(self.input.data)
                comb += macc.input.last.eq(0)
                # Leave once the sample that makes the counter return to 0
                # has been accepted.
                with m.If(macc.input.accepted() & (nxt_cnt == 0)):
                    m.next = "BIAS"

            with m.State("BIAS"):
                # Block the input and feed a constant 1 marked as `last`, so
                # the macc multiplies it by the next coefficient it reads.
                comb += self.input.ready.eq(0)
                comb += macc.input.valid.eq(1)
                comb += macc.input.data.eq(
                    1
                )  # should it be bigger? what granularity should the bias have?
                comb += macc.input.last.eq(1)
                with m.If(macc.input.accepted()):
                    m.next = "INPUT"

        # macc output --> node output, through a signed intermediate signal.
        comb += output_data.eq(macc.output.data)
        comb += self.output.valid.eq(macc.output.valid)
        comb += self.output.data.eq(output_data)
        comb += self.output.last.eq(0)  # self.input.last + delay?
        comb += macc.output.ready.eq(self.output.ready)

        return m
Example #16
0
class Farm(Elaboratable):
    """Set of DotProduct cores that are fed and read in round-robin order.

    The full description lives in the ``_doc_`` class attribute below (it
    is a plain attribute, not the class docstring).
    """
    _doc_ = """
    "Farm" of DotProduct cores, for parallel computation.
    The performed operation is the dot product of two NxM
    matrixes.

    Keep in mind that since throughput will never be higher
    than one output per clock, it doesn't make sense to use
    a higher number of DotProduct cores than the latency of
    each one of them.

    The dataflow is controlled ONLY by the input_a Stream interface.
    The input_b stream interface is DUMMY, and should always
    have valid values in the input. The ready of the input_b
    interface will be attached to input_a.accepted(), and a valid=1
    will be assumed. Why?
    I want to avoid a combinational path between the valid of input_b
    and the ready of input_a.

    Interfaces
    ----------
    input_a : Matrix Stream, input
        Input a matrix data.

    input_b : Matrix Stream, input
        Input b matrix data.
        TO DO: should not be a stream, but plain "matrix shaped" values.

    output : Data Stream, output
        Dot product computated value.

    Parameters
    ----------
    width : int
        Bit width of both inputs.

    shape : tuple
        Input shape (N, M).

    n_cores : int
        Number of paralell computations of dot product.
    """

    def __init__(self, width, shape, n_cores):
        """Create ``n_cores`` DotProduct cores and the shared streams."""
        self.cores = [DotProduct(width, shape) for _ in range(n_cores)]
        self.input_a = MatrixStream(width=width,
                                    shape=shape,
                                    direction='sink',
                                    name='input_a')
        self.input_b = MatrixStream(width=width,
                                    shape=shape,
                                    direction='sink',
                                    name='input_b')
        # The output is as wide as the first core's output.
        self.output_w = self.cores[0].output_w
        self.output = DataStream(self.output_w,
                                 direction='source',
                                 name='output')
        self.input_w = self.input_a.dataport.width
        self.n_inputs = self.input_a.dataport.n_elements
        self.shape = self.input_a.dataport.shape
        self.n_cores = len(self.cores)

    def get_ports(self):
        """Return the fields of input_a, input_b and output, in that order."""
        ports = []
        ports += [self.input_a[f] for f in self.input_a.fields]
        ports += [self.input_b[f] for f in self.input_b.fields]
        ports += [self.output[f] for f in self.output.fields]
        return ports

    def elaborate(self, platform):
        """Instantiate the cores and the two round-robin selectors."""
        m = Module()
        sync = m.d.sync
        comb = m.d.comb

        # Index of the core currently receiving input / providing output.
        current_core_sink = Signal(range(self.n_cores))
        current_core_source = Signal(range(self.n_cores))

        # DUMMY input_b interface
        # comb += [self.input_b.ready.eq(self.input_a.accepted())]
        # NOTE(review): with the line above commented out, input_b.ready is
        # forwarded from the selected core below, which differs from what
        # `_doc_` describes - confirm which one is intended.

        for i, core in enumerate(self.cores):
            m.submodules['core_' + str(i)] = core
            comb += core.input_b.dataport.eq(
                self.input_b.dataport)  # same coefficients for everybody
            # Input side: only the selected core sees valid data.
            with m.If(current_core_sink == i):
                comb += [
                    self.input_a.ready.eq(core.input_a.ready),
                    self.input_b.ready.eq(core.input_b.ready),
                ]
                comb += [
                    core.input_a.valid.eq(self.input_a.valid),
                    core.input_b.valid.eq(self.input_b.valid),
                    core.input_a.dataport.eq(self.input_a.dataport),
                ]
            with m.Else():
                comb += [
                    core.input_a.valid.eq(0),
                    core.input_b.valid.eq(0),
                    core.input_a.dataport.eq_const(0),
                ]
            # Output side: only the selected core drives the output and
            # receives the downstream ready.
            with m.If(current_core_source == i):
                comb += [
                    self.output.valid.eq(core.output.valid),
                    self.output.data.eq(core.output.data),
                ]
                comb += [
                    core.output.ready.eq(self.output.ready),
                ]
            with m.Else():
                comb += [
                    core.output.ready.eq(0),
                ]

        # Move to the next core after each accepted input / output.
        # presumably _incr wraps back to 0 after n_cores - 1 - TODO confirm
        with m.If(self.input_a.accepted()):
            sync += current_core_sink.eq(_incr(current_core_sink,
                                               self.n_cores))

        with m.If(self.output.accepted()):
            sync += current_core_source.eq(
                _incr(current_core_source, self.n_cores))

        return m
Example #17
0
class RowFifos(Elaboratable):
    """ N fifos that work synchronized to provide Nx1 (N=row)
    vector of data.

    The FIFOs are chained: the input stream writes into the first one and
    every other FIFO is written with the data read from the previous one.
    The output vector is built from the read data of all N FIFOs.
    """
    def __init__(self, input_w, row_length, N, invert=False):
        """Create the scalar input stream and the ``(N,)`` output stream.

        ``row_length`` sizes the FIFOs in ``elaborate`` and ``invert``
        selects the order of the output elements.
        """
        self.row_length = row_length
        self.invert = invert
        self.input = DataStream(width=input_w, direction='sink', name='input')
        self.output = MatrixStream(width=input_w,
                                   shape=(N, ),
                                   direction='source',
                                   name='output')
        # Widths and shape are read back from the created streams.
        self.input_w = len(self.input.data)
        self.output_w = self.output.dataport.width
        self.shape = self.output.dataport.shape
        self.N = self.output.dataport.shape[0]

    def get_ports(self):
        """Return every field of the input stream followed by the output's."""
        ports = [self.input[f] for f in self.input.fields]
        ports += [self.output[f] for f in self.output.fields]
        return ports

    def elaborate(self, platform):
        """Instantiate the N chained FIFOs and drive the output stream."""
        m = Module()
        # NOTE(review): `sync` is never used in this method.
        sync = m.d.sync
        comb = m.d.comb

        # One FIFO per row, each with room for a row plus 4 extra entries.
        fifo = [
            SyncFIFOBuffered(width=self.input_w, depth=self.row_length + 4)
            for _ in range(self.N)
        ]

        fifo_r_rdy = [Signal() for _ in range(self.N)]
        fifo_r_valid = [Signal() for _ in range(self.N)]

        # NOTE(review): `w_en` is created but never referenced afterwards.
        w_en = [Signal() for _ in range(self.N - 1)]

        for n in range(self.N):
            m.submodules['fifo_' + str(n)] = fifo[n]
            # NOTE(review): fifo_r_rdy is driven here but not read anywhere
            # else in this method.
            comb += [
                fifo_r_rdy[n].eq((fifo[n].level < self.row_length)
                                 | self.output.accepted()),
            ]

        # first fifo
        comb += [
            self.input.ready.eq(fifo[0].w_rdy),
            fifo[0].w_en.eq(self.input.accepted()),
            fifo[0].w_data.eq(self.input.data),
        ]

        # Every FIFO but the last: its data counts as valid only once the
        # next FIFO holds exactly row_length entries; until then it is read
        # continuously so that its data moves on into the next FIFO.
        for n in range(self.N - 1):
            comb += [
                fifo_r_valid[n].eq((fifo[n + 1].level == self.row_length)
                                   & (fifo[n].r_rdy)),
                fifo[n].r_en.eq((self.output.accepted() | ~fifo_r_valid[n])),
                fifo[n + 1].w_en.eq(fifo[n].r_rdy & fifo[n].r_en),
                fifo[n + 1].w_data.eq(fifo[n].r_data),
            ]

        # last fifo
        n = self.N - 1
        comb += [
            fifo_r_valid[n].eq(fifo[n].r_rdy),
            fifo[n].r_en.eq(self.output.accepted()),
        ]

        # output
        # The vector is valid only when every FIFO reports valid data.
        comb += [
            self.output.valid.eq(_and(fifo_r_valid)),
        ]

        # Element n comes from FIFO n when inverted, otherwise from FIFO
        # N - 1 - n.
        for n in range(self.N):
            if self.invert:
                comb += self.output.dataport.matrix[n].eq(fifo[n].r_data)
            else:
                comb += self.output.dataport.matrix[n].eq(fifo[self.N - 1 -
                                                               n].r_data)

        return m
Example #18
0
class DotProduct(Elaboratable):
    """Serial multiply-accumulate over the elements of two matrix streams.

    Both input matrices are captured when input_a is accepted and then fed
    to a single MAC one element pair per clock. The result is presented on
    the output stream once ``n_inputs`` valid MAC cycles have been counted.
    """
    #
    # WARNING:
    # The dataflow is controlled ONLY by the input_a AXIS interface.
    # The input_b AXIS interface is DUMMY, and should always have valid values in the input.
    # The ready of the input_b interface will be attached to input_a.accepted(), and a valid=1
    # will be assumed.
    #
    # Why?
    # I want to avoid a combinational path between the valid of input_b and the ready of input_a.
    #
    def __init__(self, width_i, shape):
        """Create the two matrix sinks and the scalar result source."""
        self.input_a = MatrixStream(width=width_i, shape=shape, direction='sink', name='input_a')
        self.input_b = MatrixStream(width=width_i, shape=shape, direction='sink', name='input_b')
        # Geometry is read back from the created stream.
        self.input_w = self.input_a.dataport.width
        self.n_inputs = self.input_a.dataport.n_elements
        self.output_w = calculate_output_width(self.input_w, self.n_inputs)
        self.output = DataStream(self.output_w, direction='source', name='output')
        self.shape = self.input_a.dataport.shape

    def get_ports(self):
        """Return the fields of input_a, input_b and output, in that order."""
        ports = []
        ports += [self.input_a[f] for f in self.input_a.fields]
        ports += [self.input_b[f] for f in self.input_b.fields]
        ports += [self.output[f] for f in self.output.fields]
        return ports

    def elaborate(self, platform):
        """Build the MAC, the two shift registers and the IDLE/BUSY FSM."""
        m = Module()
        sync = m.d.sync
        comb = m.d.comb

        # Shift registers holding all elements of each matrix, flattened.
        tmp_input_a = Signal(self.input_w * self.n_inputs)
        tmp_input_b = Signal(self.input_w * self.n_inputs)
        counter = Signal(range(self.n_inputs))
        
        # The MAC always sees the lowest input_w bits of each register.
        m.submodules['mac'] = mac = MAC(input_w=self.input_w, output_w=self.output_w)
        comb += [mac.input_a.eq(tmp_input_a[0:self.input_w]),
                 mac.input_b.eq(tmp_input_b[0:self.input_w]),]
        
        # DUMMY input_b interface
        comb += [self.input_b.ready.eq(self.input_a.accepted())]
    
        # NOTE(review): `fsm` is bound but never used.
        with m.FSM() as fsm:
            
            with m.State("IDLE"):
            
                # Accept new data when no result is pending or the pending
                # one is being taken; hold the MAC cleared and disabled.
                comb += [self.input_a.ready.eq(self.output.accepted() | ~self.output.valid),
                         mac.clr.eq(1),
                         mac.clken.eq(0),]
            
                # Capture both matrices (input_b is sampled without checking
                # its valid) and restart the element counter.
                with m.If(self.input_a.accepted()):
                    m.next = "BUSY"
                    sync += [tmp_input_a.eq(Cat(*self.input_a.flat)), #self.input_a.data),
                             tmp_input_b.eq(Cat(*self.input_b.flat)), #Cat(*self.input_b)),
                             counter.eq(0),]
            
                # Drop valid once the result has been accepted.
                with m.If(self.output.accepted()):
                    sync += self.output.valid.eq(0)
            
            with m.State("BUSY"):
            
                comb += [self.input_a.ready.eq(0),
                         mac.clr.eq(0),
                         mac.clken.eq(1),]
            
                # Shift the next element pair into the MAC position every clock.
                sync += [tmp_input_b.eq(tmp_input_b >> self.input_w),
                         tmp_input_a.eq(tmp_input_a >> self.input_w),]
            
                # Count the cycles in which the MAC flags valid_o; on the
                # n_inputs-th one, latch the result and return to IDLE.
                with m.If(mac.valid_o):
                    sync += counter.eq(counter + 1)
                    with m.If(counter == self.n_inputs - 1):
                        m.next = "IDLE"
                        sync += [self.output.data.eq(mac.output),
                                 self.output.valid.eq(1),]

        return m
Example #19
0
class StreamMacc(Elaboratable):
    """Pipelined multiply-accumulate with a stream input and output.

    The full description lives in the ``_doc_`` class attribute below (it
    is a plain attribute, not the class docstring).
    """
    _doc_ = """
    Multiplier and accumulator with selectable output division to scale
    down result and a Stream interface.
    The main input data interface is a Stream interface, while
    for the coefficients a memory read port interface is used.
    While the memory has r_rdy==1, the dataflow control will be
    done in the input data stream interface. If the coefficients
    come from the same interface as the input data, just assign
    the ports of the Stream Macc you should just:
    * ignore r_en
    * tell the core that the memory is ready to be read when
      the input data is valid by assigning:
        core.r_rdy <-- core.input.valid

    Interfaces
    ----------
    input : Data Stream, input
        Input data.
        Each product between the input data and the coeff data
        will be accumulated, until a last is asserted. A last
        will flush the pipeline and output the valid result,
        clearing the accumulator afterwards.

    coeff : {r_data, r_en, r_rdy}, input
        Input coefficients.
        TO DO: Implement ReadportInterface

    output : Data Stream, output
        Output data. Will output valid data after a last is
        asserted in the input interface. Otherwise, it will
        keep accumulating.

    Parameters
    ----------
    width_i : int
        Bit width of data in stream interface.

    width_c : int
        Bit width of the coefficients.

    width_acc : int
        Bit width of the accumulator.

    shift : int
        The accumulator result will be shifted to the right by
        this number, so the output will be (accumulator / 2**shift).
    """

    def __init__(self, width_i, width_c, width_acc=None, shift=None):
        """Create the accumulator, the streams and the coefficient port.

        ``width_acc`` defaults to 48 and ``shift`` to 0; the output stream
        is ``width_acc - shift`` bits wide.
        """
        if width_acc is None:
            width_acc = 48
        if shift is None:
            shift = 0
        output_w = width_acc - shift
        self.shift = shift
        self.accumulator = Signal(signed(width_acc))
        self.input = DataStream(width=width_i, direction='sink', name='input')
        self.output = DataStream(width=output_w,
                                 direction='source',
                                 name='output')
        # Coefficient read interface: data, enable and ready.
        self.r_data = Signal(signed(width_c))
        self.r_en = Signal()
        self.r_rdy = Signal()
        # Used in elaborate() as the delay applied to the accepted `last`.
        self.latency = 5

    def get_ports(self):
        """Return the coefficient signals followed by all stream fields."""
        ports = [self.r_data, self.r_en, self.r_rdy]
        ports += [self.input[f] for f in self.input.fields]
        ports += [self.output[f] for f in self.output.fields]
        return ports

    def elaborate(self, platform):
        """Build the control FSM and the multiply-accumulate pipeline."""
        m = Module()
        comb = m.d.comb
        # NOTE(review): `sync` is never used in this method.
        sync = m.d.sync

        # Clock enable shared by the delay line and the pipeline.
        clken = Signal()
        accepted_last = self.input.accepted() & self.input.last
        last_delayed = signal_delay(m, accepted_last, self.latency, ce=clken)

        accum_shifted = Signal(signed(self.output.dataport.width))

        # NOTE(review): `fsm` is bound but never used.
        with m.FSM() as fsm:

            with m.State("ACCUM"):
                # Input flow follows the coefficient source's readiness; the
                # pipeline advances only when a sample is accepted.
                comb += self.input.ready.eq(self.r_rdy)
                comb += self.output.valid.eq(0)
                comb += self.r_en.eq(self.input.accepted())
                comb += clken.eq(self.input.accepted())
                with m.If(self.input.accepted() & self.input.last):
                    m.next = "LAST"
            with m.State("LAST"):
                # No more input: keep the pipeline moving until the delayed
                # `last` makes the output valid, then wait for acceptance.
                comb += self.input.ready.eq(0)
                comb += self.output.valid.eq(last_delayed)
                comb += self.r_en.eq(0)
                comb += clken.eq(~self.output.valid | self.output.accepted())
                with m.If(self.output.accepted()):
                    m.next = "ACCUM"

        # Feed zeros when no input is accepted, and restart the accumulation
        # from zero when the output is being accepted.
        _get_input = lambda x: Mux(self.input.accepted(), x, 0)
        _get_accum = lambda x: Mux(self.output.accepted(), 0, x)

        # Five add_stage calls: two for the operands, the product, one more
        # for the product, then the accumulation.
        # presumably each add_stage is one register stage, which would match
        # self.latency = 5 - TODO confirm against Pipeline.
        pipeline = Pipeline()
        a0, b0 = pipeline.add_stage([
            _get_input(self.input.data.as_signed()),
            _get_input(self.r_data.as_signed())
        ])
        a1, b1 = pipeline.add_stage([a0, b0])
        m2, = pipeline.add_stage([a1 * b1])
        m3, = pipeline.add_stage([m2])
        out, = pipeline.add_stage([_get_accum(self.accumulator) + m3])
        pipeline.generate(m=m, ce=clken, domain='sync')

        # The accumulator mirrors the last pipeline stage.
        comb += self.accumulator.eq(out)

        # Drop the `shift` lowest bits and keep the result signed.
        comb += accum_shifted.eq(out[self.shift:].as_signed())

        comb += self.output.data.eq(accum_shifted)
        comb += self.output.last.eq(self.output.valid)

        return m