class MatrixFeederSkip(MatrixFeeder):
    """
    Same as MatrixFeeder, but only the NxN submatrices aligned to multiples
    of N (non-overlapping windows) are forwarded, e.g. for NxN pooling.
    """

    def __init__(self, data_w, input_shape, N, invert=False):
        assert input_shape[0] % N == 0, (
            'image height must be a multiple of N. Psst, you can use Padder() to append zeros!')
        assert input_shape[1] % N == 0, (
            'image width must be a multiple of N. Psst, you can use Padder() to append zeros!')
        self.input = DataStream(width=data_w, direction='sink', name='input')
        self.output = MatrixStream(width=data_w, shape=(N, N), direction='source', name='output')
        self.matrix_feeder = MatrixFeeder(data_w, input_shape, N, invert=invert)
        self.output_shape = (input_shape[0] // N, input_shape[1] // N)
        self.N = N

    def get_ports(self):
        ports = [self.input[f] for f in self.input.fields]
        ports += [self.output[f] for f in self.output.fields]
        return ports

    def elaborate(self, platform):
        m = Module()
        sync = m.d.sync
        comb = m.d.comb

        pooling_counter_row = Signal(range(self.N))
        pooling_counter_col = Signal(range(self.N))

        m.submodules.matrix_feeder = matrix_feeder = self.matrix_feeder

        row, col = img_position_counter(m, sync, self.output, self.output_shape)
        feeder_row, feeder_col = img_position_counter(
            m, sync, matrix_feeder.output, matrix_feeder.output_shape)

        # input --> matrix_feeder
        comb += [
            matrix_feeder.input.valid.eq(self.input.valid),
            matrix_feeder.input.last.eq(self.input.last),
            matrix_feeder.input.data.eq(self.input.data),
            self.input.ready.eq(matrix_feeder.input.ready),
        ]

        comb += self.output.dataport.eq(matrix_feeder.output.dataport)
        comb += self.output.last.eq(is_last(row, col, self.output_shape))

        with m.If(matrix_feeder.output.accepted()):
            sync += pooling_counter_row.eq(_incr(pooling_counter_row, self.N))
            with m.If(feeder_row == matrix_feeder.output_shape[1] - 1):
                sync += pooling_counter_row.eq(0)
                sync += pooling_counter_col.eq(
                    _incr(pooling_counter_col, self.N))
            with m.If(matrix_feeder.output.last):
                sync += [
                    pooling_counter_row.eq(0),
                    pooling_counter_col.eq(0),
                ]

        with m.FSM() as fsm:

            with m.State("normal"):
                with m.If((pooling_counter_row == 0) & (pooling_counter_col == 0)):
                    comb += [
                        self.output.valid.eq(matrix_feeder.output.valid),
                        matrix_feeder.output.ready.eq(self.output.ready),
                    ]
                with m.Else():
                    comb += [
                        self.output.valid.eq(0),
                        matrix_feeder.output.ready.eq(1),
                    ]
                with m.If(self.output.accepted() & self.output.last):
                    m.next = "last"

            with m.State("last"):
                comb += [
                    self.output.valid.eq(0),
                    matrix_feeder.output.ready.eq(1),
                ]
                with m.If(self.input.accepted() & self.input.last):
                    m.next = "normal"

        return m
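
# --- Usage sketch (not part of the core) ----------------------------------
# A minimal software reference for MatrixFeederSkip, assuming (from the
# pooling counters and output_shape above) that the core forwards only the
# non-overlapping NxN windows of a raster-scanned image, row-major. It can
# serve as a golden model in a testbench; numpy is only used for the example.
import numpy as np

def matrix_feeder_skip_model(img, N):
    H, W = img.shape
    assert H % N == 0 and W % N == 0, 'use Padder() to append zeros first'
    windows = []
    for r in range(0, H, N):          # skip N rows at a time
        for c in range(0, W, N):      # skip N columns at a time
            windows.append(img[r:r + N, c:c + N])
    return windows

# An 8x8 image with N=2 yields 16 windows, matching output_shape == (4, 4).
assert len(matrix_feeder_skip_model(np.arange(64).reshape(8, 8), 2)) == 16
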
class mlpNode(Elaboratable):
    __doc__ = """
    MLP Node

    Instantiates a Stream Macc and a Circular ROM to store the corresponding
    weights. Both input and output are Stream interfaces.

    This MLP Node can actually do the job of N neurons serially, where each
    neuron will require (n_inputs + 1) weights stored.

    Parameters
    ----------
    width_i : int
        Bit width of data in stream interface.
    width_w : int
        Bit width of data in the ROM.
    n_inputs : int
        Number of inputs for each neuron.
    rom_init : list
        List with weights to initialize the ROM. It should have the form
        [N0_W0, N0_W1, ..., N0_Wn-1, N0_Wbias,
         N1_W0, N1_W1, ..., N1_Wn-1, N1_Wbias,
         ...]
        where Nx_Wy refers to the weight of the sample 'y' of neuron 'x'.
    """

    def __init__(self, width_i, width_w, n_inputs, rom_init):
        assert len(rom_init) % (n_inputs + 1) == 0
        accum_w = accum_req_bits(width_i, width_w, n_inputs + 1)  # +1 bias
        shift = width_w - 1  # compensate weights gain
        self.n_inputs = n_inputs
        self.rom = CircularROM(width=width_w, init=rom_init)
        self.macc = StreamMacc(width_i=width_i, width_c=width_w,
                               width_acc=accum_w, shift=shift)
        output_w = len(self.macc.output.data)
        assert output_w == accum_w - shift, (
            f'{output_w} == {accum_w} - {shift}')
        self.input = DataStream(width=width_i, direction='sink', name='input')
        self.output = DataStream(width=output_w, direction='source', name='output')

    def get_ports(self):
        ports = []
        ports += [self.input[f] for f in self.input.fields]
        ports += [self.output[f] for f in self.output.fields]
        return ports

    def elaborate(self, platform):
        m = Module()
        sync = m.d.sync
        comb = m.d.comb

        m.submodules.rom = rom = self.rom
        m.submodules.macc = macc = self.macc

        cnt = Signal(range(self.n_inputs))
        output_data = Signal(signed(len(self.output.data)))

        comb += macc.r_data.eq(rom.r_data)
        comb += macc.r_rdy.eq(rom.r_rdy)
        comb += rom.r_en.eq(macc.r_en)
        comb += rom.restart.eq(0)  # should be unnecessary if the inputs are correct.

        nxt_cnt = Signal.like(cnt)
        comb += nxt_cnt.eq(_incr(cnt, self.n_inputs))

        with m.If(self.input.accepted()):
            sync += cnt.eq(nxt_cnt)

        with m.FSM() as fsm:

            with m.State("INPUT"):
                comb += self.input.ready.eq(macc.input.ready)
                comb += macc.input.valid.eq(self.input.valid)
                comb += macc.input.data.eq(self.input.data)
                comb += macc.input.last.eq(0)
                with m.If(macc.input.accepted() & (nxt_cnt == 0)):
                    m.next = "BIAS"

            with m.State("BIAS"):
                comb += self.input.ready.eq(0)
                comb += macc.input.valid.eq(1)
                comb += macc.input.data.eq(1)  # should it be bigger? what granularity should the bias have?
                comb += macc.input.last.eq(1)
                with m.If(macc.input.accepted()):
                    m.next = "INPUT"

        comb += output_data.eq(macc.output.data)
        comb += self.output.valid.eq(macc.output.valid)
        comb += self.output.data.eq(output_data)
        comb += self.output.last.eq(0)  # self.input.last + delay?
        comb += macc.output.ready.eq(self.output.ready)

        return m
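
# --- Usage sketch (not part of the core) ----------------------------------
# A software reference for mlpNode under two assumptions flagged here: the
# same input samples are streamed once per neuron, and the BIAS beat adds
# 1 * Wbias before the result is arithmetically shifted right by
# (width_w - 1), mirroring 'shift = width_w - 1' above. It also illustrates
# the rom_init layout from the docstring.
def mlp_node_model(n_inputs, rom_init, samples, width_w):
    assert len(rom_init) % (n_inputs + 1) == 0
    outputs = []
    for k in range(len(rom_init) // (n_inputs + 1)):
        w = rom_init[k * (n_inputs + 1):(k + 1) * (n_inputs + 1)]
        acc = sum(x * wi for x, wi in zip(samples, w[:-1])) + w[-1]  # bias weight times the constant 1
        outputs.append(acc >> (width_w - 1))  # like out[shift:].as_signed() in StreamMacc
    return outputs

# rom_init for 2 neurons with 3 inputs each:
# [N0_W0, N0_W1, N0_W2, N0_Wbias, N1_W0, N1_W1, N1_W2, N1_Wbias]
print(mlp_node_model(3, [10, -20, 30, 5, -1, 2, -3, 0], [100, 50, 25], width_w=8))
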
class Farm(Elaboratable):
    __doc__ = """
    "Farm" of DotProduct cores, for parallel computation.
    The performed operation is the dot product of two NxM matrices.

    Keep in mind that, since the throughput will never be higher than one
    output per clock, it doesn't make sense to use more DotProduct cores
    than the latency of each one of them.

    The dataflow is controlled ONLY by the input_a Stream interface.
    The input_b stream interface is DUMMY, and should always have valid
    values at the input. The ready of the input_b interface will be attached
    to input_a.accepted(), and valid=1 will be assumed.

    Why? I want to avoid a combinational path between the valid of input_b
    and the ready of input_a.

    Interfaces
    ----------
    input_a : Matrix Stream, input
        Input a matrix data.
    input_b : Matrix Stream, input
        Input b matrix data.
        TO DO: should not be a stream, but plain "matrix shaped" values.
    output : Data Stream, output
        Dot product computed value.

    Parameters
    ----------
    width : int
        Bit width of both inputs.
    shape : tuple
        Input shape (N, M).
    n_cores : int
        Number of parallel computations of the dot product.
    """

    def __init__(self, width, shape, n_cores):
        self.cores = [DotProduct(width, shape) for _ in range(n_cores)]
        self.input_a = MatrixStream(width=width, shape=shape, direction='sink', name='input_a')
        self.input_b = MatrixStream(width=width, shape=shape, direction='sink', name='input_b')
        self.output_w = self.cores[0].output_w
        self.output = DataStream(self.output_w, direction='source', name='output')
        self.input_w = self.input_a.dataport.width
        self.n_inputs = self.input_a.dataport.n_elements
        self.shape = self.input_a.dataport.shape
        self.n_cores = len(self.cores)

    def get_ports(self):
        ports = []
        ports += [self.input_a[f] for f in self.input_a.fields]
        ports += [self.input_b[f] for f in self.input_b.fields]
        ports += [self.output[f] for f in self.output.fields]
        return ports

    def elaborate(self, platform):
        m = Module()
        sync = m.d.sync
        comb = m.d.comb

        current_core_sink = Signal(range(self.n_cores))
        current_core_source = Signal(range(self.n_cores))

        # DUMMY input_b interface
        # comb += [self.input_b.ready.eq(self.input_a.accepted())]

        for i, core in enumerate(self.cores):
            m.submodules['core_' + str(i)] = core
            comb += core.input_b.dataport.eq(
                self.input_b.dataport)  # same coefficients for everybody
            with m.If(current_core_sink == i):
                comb += [
                    self.input_a.ready.eq(core.input_a.ready),
                    self.input_b.ready.eq(core.input_b.ready),
                ]
                comb += [
                    core.input_a.valid.eq(self.input_a.valid),
                    core.input_b.valid.eq(self.input_b.valid),
                    core.input_a.dataport.eq(self.input_a.dataport),
                ]
            with m.Else():
                comb += [
                    core.input_a.valid.eq(0),
                    core.input_b.valid.eq(0),
                    core.input_a.dataport.eq_const(0),
                ]
            with m.If(current_core_source == i):
                comb += [
                    self.output.valid.eq(core.output.valid),
                    self.output.data.eq(core.output.data),
                ]
                comb += [
                    core.output.ready.eq(self.output.ready),
                ]
            with m.Else():
                comb += [
                    core.output.ready.eq(0),
                ]

        with m.If(self.input_a.accepted()):
            sync += current_core_sink.eq(_incr(current_core_sink, self.n_cores))

        with m.If(self.output.accepted()):
            sync += current_core_source.eq(
                _incr(current_core_source, self.n_cores))

        return m
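
# --- Usage sketch (not part of the core) ----------------------------------
# Instantiation example. The import path is hypothetical; adjust it to
# wherever Farm lives in this repository. Since at most one input_a matrix
# is accepted per clock, choosing n_cores larger than the per-core latency
# (roughly n_inputs cycles in DotProduct's BUSY state plus the MAC pipeline)
# would only leave cores idle, as the docstring warns.
from cnn.farm import Farm  # hypothetical module path

farm = Farm(width=8, shape=(3, 3), n_cores=4)
print(farm.output_w)     # output width chosen by DotProduct for a 3x3, 8-bit product
print(farm.get_ports())  # flattened stream ports, e.g. for simulation or CLI generation
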
class StreamMacc(Elaboratable):
    __doc__ = """
    Multiplier and accumulator with a selectable output division to scale
    down the result, and a Stream interface.

    The main input data interface is a Stream interface, while a memory read
    port interface is used for the coefficients. While the memory has
    r_rdy==1, the dataflow control will be done in the input data stream
    interface.

    If the coefficients come from the same interface as the input data,
    you should just:
    * ignore r_en
    * tell the core that the memory is ready to be read when the input data
      is valid, by assigning: core.r_rdy <-- core.input.valid

    Interfaces
    ----------
    input : Data Stream, input
        Input data. Each product between the input data and the coeff data
        will be accumulated, until a last is asserted. A last will flush the
        pipeline and output the valid result, clearing the accumulator
        afterwards.
    coeff : {r_data, r_en, r_rdy}, input
        Input coefficients.
        TO DO: Implement ReadportInterface
    output : Data Stream, output
        Output data. Will output valid data after a last is asserted in the
        input interface. Otherwise, it will keep accumulating.

    Parameters
    ----------
    width_i : int
        Bit width of data in stream interface.
    width_c : int
        Bit width of the coefficients.
    width_acc : int
        Bit width of the accumulator.
    shift : int
        The accumulator result will be shifted to the right by this number,
        so the output will be (accumulator / 2**shift).
    """

    def __init__(self, width_i, width_c, width_acc=None, shift=None):
        if width_acc is None:
            width_acc = 48
        if shift is None:
            shift = 0
        output_w = width_acc - shift
        self.shift = shift
        self.accumulator = Signal(signed(width_acc))
        self.input = DataStream(width=width_i, direction='sink', name='input')
        self.output = DataStream(width=output_w, direction='source', name='output')
        self.r_data = Signal(signed(width_c))
        self.r_en = Signal()
        self.r_rdy = Signal()
        self.latency = 5

    def get_ports(self):
        ports = [self.r_data, self.r_en, self.r_rdy]
        ports += [self.input[f] for f in self.input.fields]
        ports += [self.output[f] for f in self.output.fields]
        return ports

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        clken = Signal()
        accepted_last = self.input.accepted() & self.input.last
        last_delayed = signal_delay(m, accepted_last, self.latency, ce=clken)
        accum_shifted = Signal(signed(self.output.dataport.width))

        with m.FSM() as fsm:

            with m.State("ACCUM"):
                comb += self.input.ready.eq(self.r_rdy)
                comb += self.output.valid.eq(0)
                comb += self.r_en.eq(self.input.accepted())
                comb += clken.eq(self.input.accepted())
                with m.If(self.input.accepted() & self.input.last):
                    m.next = "LAST"

            with m.State("LAST"):
                comb += self.input.ready.eq(0)
                comb += self.output.valid.eq(last_delayed)
                comb += self.r_en.eq(0)
                comb += clken.eq(~self.output.valid | self.output.accepted())
                with m.If(self.output.accepted()):
                    m.next = "ACCUM"

        _get_input = lambda x: Mux(self.input.accepted(), x, 0)
        _get_accum = lambda x: Mux(self.output.accepted(), 0, x)

        pipeline = Pipeline()
        a0, b0 = pipeline.add_stage([
            _get_input(self.input.data.as_signed()),
            _get_input(self.r_data.as_signed())
        ])
        a1, b1 = pipeline.add_stage([a0, b0])
        m2, = pipeline.add_stage([a1 * b1])
        m3, = pipeline.add_stage([m2])
        out, = pipeline.add_stage([_get_accum(self.accumulator) + m3])
        pipeline.generate(m=m, ce=clken, domain='sync')

        comb += self.accumulator.eq(out)
        comb += accum_shifted.eq(out[self.shift:].as_signed())
        comb += self.output.data.eq(accum_shifted)
        comb += self.output.last.eq(self.output.valid)

        return m
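
# --- Usage sketch (not part of the core) ----------------------------------
# Wiring sketch for the "coefficients come from the same interface as the
# data" case described in the docstring: ignore r_en and declare the memory
# ready whenever the input data is valid. The StreamMacc import path is
# hypothetical. With width_acc=24 and shift=7, the output is 17 bits wide
# and equals accumulator >> 7.
from nmigen import Module
from cnn.stream_macc import StreamMacc  # hypothetical module path

m = Module()
m.submodules.macc = macc = StreamMacc(width_i=8, width_c=8, width_acc=24, shift=7)
m.d.comb += macc.r_rdy.eq(macc.input.valid)  # memory "ready" follows data valid
# macc.r_data would be driven from the same source that drives macc.input.data;
# macc.r_en is simply left unconnected.
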
class DotProduct(Elaboratable):
    #
    # WARNING:
    # The dataflow is controlled ONLY by the input_a AXIS interface.
    # The input_b AXIS interface is DUMMY, and should always have valid values in the input.
    # The ready of the input_b interface will be attached to input_a.accepted(), and a valid=1
    # will be assumed.
    #
    # Why?
    # I want to avoid a combinational path between the valid of input_b and the ready of input_a.
    #

    def __init__(self, width_i, shape):
        self.input_a = MatrixStream(width=width_i, shape=shape, direction='sink', name='input_a')
        self.input_b = MatrixStream(width=width_i, shape=shape, direction='sink', name='input_b')
        self.input_w = self.input_a.dataport.width
        self.n_inputs = self.input_a.dataport.n_elements
        self.output_w = calculate_output_width(self.input_w, self.n_inputs)
        self.output = DataStream(self.output_w, direction='source', name='output')
        self.shape = self.input_a.dataport.shape

    def get_ports(self):
        ports = []
        ports += [self.input_a[f] for f in self.input_a.fields]
        ports += [self.input_b[f] for f in self.input_b.fields]
        ports += [self.output[f] for f in self.output.fields]
        return ports

    def elaborate(self, platform):
        m = Module()
        sync = m.d.sync
        comb = m.d.comb

        tmp_input_a = Signal(self.input_w * self.n_inputs)
        tmp_input_b = Signal(self.input_w * self.n_inputs)
        counter = Signal(range(self.n_inputs))

        m.submodules['mac'] = mac = MAC(input_w=self.input_w,
                                        output_w=self.output_w)

        comb += [
            mac.input_a.eq(tmp_input_a[0:self.input_w]),
            mac.input_b.eq(tmp_input_b[0:self.input_w]),
        ]

        # DUMMY input_b interface
        comb += [self.input_b.ready.eq(self.input_a.accepted())]

        with m.FSM() as fsm:

            with m.State("IDLE"):
                comb += [
                    self.input_a.ready.eq(self.output.accepted() | ~self.output.valid),
                    mac.clr.eq(1),
                    mac.clken.eq(0),
                ]
                with m.If(self.input_a.accepted()):
                    m.next = "BUSY"
                    sync += [
                        tmp_input_a.eq(Cat(*self.input_a.flat)),  # self.input_a.data
                        tmp_input_b.eq(Cat(*self.input_b.flat)),  # Cat(*self.input_b)
                        counter.eq(0),
                    ]
                with m.If(self.output.accepted()):
                    sync += self.output.valid.eq(0)

            with m.State("BUSY"):
                comb += [
                    self.input_a.ready.eq(0),
                    mac.clr.eq(0),
                    mac.clken.eq(1),
                ]
                sync += [
                    tmp_input_b.eq(tmp_input_b >> self.input_w),
                    tmp_input_a.eq(tmp_input_a >> self.input_w),
                ]
                with m.If(mac.valid_o):
                    sync += counter.eq(counter + 1)
                    with m.If(counter == self.n_inputs - 1):
                        m.next = "IDLE"
                        sync += [
                            self.output.data.eq(mac.output),
                            self.output.valid.eq(1),
                        ]

        return m
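
# --- Usage sketch (not part of the core) ----------------------------------
# Software reference for DotProduct: multiply the two flattened matrices
# element by element and accumulate, as the MAC does one element per BUSY
# cycle. The element order coming out of MatrixStream.flat is assumed to be
# row-major, but it does not affect the sum.
import numpy as np

def dot_product_model(a, b):
    a = np.asarray(a, dtype=np.int64).flatten()
    b = np.asarray(b, dtype=np.int64).flatten()
    return int(np.sum(a * b))

a = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
b = [[1, 0, 0], [0, 1, 0], [0, 0, 1]]
print(dot_product_model(a, b))  # 15 == 1 + 5 + 9
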
class RowFifos(Elaboratable):
    """
    N fifos that work synchronized to provide an Nx1 (N=row) vector of data.
    """

    def __init__(self, input_w, row_length, N, invert=False):
        self.row_length = row_length
        self.invert = invert
        self.input = DataStream(width=input_w, direction='sink', name='input')
        self.output = MatrixStream(width=input_w, shape=(N,), direction='source', name='output')
        self.input_w = len(self.input.data)
        self.output_w = self.output.dataport.width
        self.shape = self.output.dataport.shape
        self.N = self.output.dataport.shape[0]

    def get_ports(self):
        ports = [self.input[f] for f in self.input.fields]
        ports += [self.output[f] for f in self.output.fields]
        return ports

    def elaborate(self, platform):
        m = Module()
        sync = m.d.sync
        comb = m.d.comb

        fifo = [
            SyncFIFOBuffered(width=self.input_w, depth=self.row_length + 4)
            for _ in range(self.N)
        ]
        fifo_r_rdy = [Signal() for _ in range(self.N)]
        fifo_r_valid = [Signal() for _ in range(self.N)]
        w_en = [Signal() for _ in range(self.N - 1)]

        for n in range(self.N):
            m.submodules['fifo_' + str(n)] = fifo[n]
            comb += [
                fifo_r_rdy[n].eq((fifo[n].level < self.row_length) |
                                 self.output.accepted()),
            ]

        # first fifo
        comb += [
            self.input.ready.eq(fifo[0].w_rdy),
            fifo[0].w_en.eq(self.input.accepted()),
            fifo[0].w_data.eq(self.input.data),
        ]

        for n in range(self.N - 1):
            comb += [
                fifo_r_valid[n].eq((fifo[n + 1].level == self.row_length) &
                                   (fifo[n].r_rdy)),
                fifo[n].r_en.eq(self.output.accepted() | ~fifo_r_valid[n]),
                fifo[n + 1].w_en.eq(fifo[n].r_rdy & fifo[n].r_en),
                fifo[n + 1].w_data.eq(fifo[n].r_data),
            ]

        # last fifo
        n = self.N - 1
        comb += [
            fifo_r_valid[n].eq(fifo[n].r_rdy),
            fifo[n].r_en.eq(self.output.accepted()),
        ]

        # output
        comb += [
            self.output.valid.eq(_and(fifo_r_valid)),
        ]
        for n in range(self.N):
            if self.invert:
                comb += self.output.dataport.matrix[n].eq(fifo[n].r_data)
            else:
                comb += self.output.dataport.matrix[n].eq(fifo[self.N - 1 - n].r_data)

        return m
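
# --- Usage sketch (not part of the core) ----------------------------------
# Software reference for RowFifos under the assumption (from the FIFO
# chaining above) that, for a raster-scanned image, the core outputs the
# column of N vertically adjacent pixels ending at the current pixel, and
# that 'invert' flips which end of that column is element 0.
import numpy as np

def row_fifos_model(img, N, invert=False):
    H, W = img.shape
    cols = []
    for r in range(N - 1, H):        # a full column exists after N - 1 complete rows
        for c in range(W):
            col = img[r - N + 1:r + 1, c]
            cols.append(col[::-1] if invert else col)
    return cols

img = np.arange(16).reshape(4, 4)
print(len(row_fifos_model(img, N=3)))  # (4 - 3 + 1) * 4 == 8 column vectors
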