class SubmatrixRegisters(Elaboratable):
    """Register bank that builds an N x N matrix out of N-element vectors.

    Every accepted ``input`` vector is written into one edge column of the
    ``output`` matrix, and the columns already stored move one position
    away from that edge.

    Parameters
    ----------
    data_w : int
        Width handed to both the input and the output ``MatrixStream``.
    N : int
        Length of the input vector and side of the square output matrix.
    invert : bool
        Column ordering. When true, new data enters at column 0 and older
        columns move towards higher indices. When false, new data enters
        at column ``N - 1`` and older columns move towards lower indices.
    """

    def __init__(self, data_w, N, invert=False):
        self.invert = invert
        self.input = MatrixStream(width=data_w,
                                  shape=(N, ),
                                  direction='sink',
                                  name='input')
        self.output = MatrixStream(width=data_w,
                                   shape=(N, N),
                                   direction='source',
                                   name='output')
        # Geometry is read back from the streams rather than from the
        # constructor arguments.
        self.data_w = self.input.dataport.width
        self.shape_i = self.input.dataport.shape
        self.shape_o = self.output.dataport.shape
        self.N = self.output.dataport.shape[0]

    def get_ports(self):
        """Return every field of the input stream followed by the output's."""
        ports = [self.input[f] for f in self.input.fields]
        ports += [self.output[f] for f in self.output.fields]
        return ports

    def elaborate(self, platform):
        """Build the shift registers and the valid/ready handshake."""
        m = Module()
        sync = m.d.sync
        comb = m.d.comb

        def physical_col(position):
            # position 0 is the newest column; larger positions are older.
            if self.invert:
                return position
            return self.N - 1 - position

        incoming = self.input.dataport.matrix
        stored = self.output.dataport.matrix

        with m.If(self.input.accepted()):
            for row in range(self.N):
                # The new vector becomes the newest column ...
                sync += stored[row, physical_col(0)].eq(incoming[row])
                # ... and every stored column takes the value of the one
                # that is one position newer.
                for position in range(1, self.N):
                    newer = stored[row, physical_col(position - 1)]
                    sync += stored[row, physical_col(position)].eq(newer)

        # The output becomes valid when a vector is taken, and is cleared
        # once the consumer accepts it without a new vector arriving.
        with m.If(self.input.accepted()):
            sync += self.output.valid.eq(1)
        with m.Elif(self.output.accepted()):
            sync += self.output.valid.eq(0)

        # New data can be taken whenever the output register is free or is
        # being emptied in this same cycle.
        comb += self.input.ready.eq(
            self.output.accepted() | ~self.output.valid)
        return m
class MatrixFeederSkip(MatrixFeeder):
    """Wrap a ``MatrixFeeder`` and forward only a subset of its submatrices.

    The inner feeder's output is passed through only while both internal
    pooling counters are zero; every other submatrix is consumed and
    dropped. ``output.last`` is regenerated for the reduced output shape.

    Parameters
    ----------
    data_w : int
        Data width of the input stream and of each output matrix element.
    input_shape : tuple
        Image shape (height, width); both must be multiples of ``N``.
    N : int
        Side of the square output matrix and period of the pooling counters.
    invert : bool
        Forwarded unchanged to the inner ``MatrixFeeder``.
    """

    def __init__(self, data_w, input_shape, N, invert=False):
        assert input_shape[0] % N == 0, (
            f'image height must be a multiple of N. Psss, you can use Padder() to append zeros!'
        )
        assert input_shape[1] % N == 0, (
            f'image width must be a multiple of N. Psss, you can use Padder() to append zeros!'
        )
        self.input = DataStream(width=data_w, direction='sink', name='input')
        self.output = MatrixStream(width=data_w,
                                   shape=(N, N),
                                   direction='source',
                                   name='output')
        self.matrix_feeder = MatrixFeeder(data_w,
                                          input_shape,
                                          N,
                                          invert=invert)
        # One output per N x N tile of the input image.
        self.output_shape = tuple(int(dim / N) for dim in input_shape[:2])
        self.N = N

    def get_ports(self):
        """Return every field of the input stream followed by the output's."""
        ports = [self.input[f] for f in self.input.fields]
        ports += [self.output[f] for f in self.output.fields]
        return ports

    def elaborate(self, platform):
        """Wire the inner feeder and gate its output with pooling counters."""
        m = Module()
        sync = m.d.sync
        comb = m.d.comb

        pooling_counter_row = Signal(range(self.N))
        pooling_counter_col = Signal(range(self.N))

        matrix_feeder = self.matrix_feeder
        m.submodules.matrix_feeder = matrix_feeder

        out_row, out_col = img_position_counter(m, sync, self.output,
                                                self.output_shape)
        feeder_row, _feeder_col = img_position_counter(
            m, sync, matrix_feeder.output, matrix_feeder.output_shape)

        # input --> matrix_feeder
        comb += matrix_feeder.input.valid.eq(self.input.valid)
        comb += matrix_feeder.input.last.eq(self.input.last)
        comb += matrix_feeder.input.data.eq(self.input.data)
        comb += self.input.ready.eq(matrix_feeder.input.ready)

        # The data path is a straight pass-through; only the handshake is
        # gated further down.
        comb += self.output.dataport.eq(matrix_feeder.output.dataport)
        comb += self.output.last.eq(
            is_last(out_row, out_col, self.output_shape))

        # Later assignments override earlier ones, so the order below is
        # significant: plain increment < end-of-line reset < end-of-frame
        # reset.
        with m.If(matrix_feeder.output.accepted()):
            sync += pooling_counter_row.eq(_incr(pooling_counter_row, self.N))
            # NOTE(review): a counter named "row" is compared against
            # output_shape[1]; whether that is intended depends on what
            # img_position_counter returns - confirm.
            with m.If(feeder_row == matrix_feeder.output_shape[1] - 1):
                sync += pooling_counter_row.eq(0)
                sync += pooling_counter_col.eq(
                    _incr(pooling_counter_col, self.N))
            with m.If(matrix_feeder.output.last):
                sync += pooling_counter_row.eq(0)
                sync += pooling_counter_col.eq(0)

        with m.FSM():
            with m.State("normal"):
                keep = (pooling_counter_row == 0) & (pooling_counter_col == 0)
                with m.If(keep):
                    # Forward the handshake untouched.
                    comb += self.output.valid.eq(matrix_feeder.output.valid)
                    comb += matrix_feeder.output.ready.eq(self.output.ready)
                with m.Else():
                    # Swallow the submatrix without presenting it downstream.
                    comb += self.output.valid.eq(0)
                    comb += matrix_feeder.output.ready.eq(1)
                with m.If(self.output.accepted() & self.output.last):
                    m.next = "last"
            with m.State("last"):
                # Drop whatever the feeder still produces until the final
                # input sample has been taken.
                comb += self.output.valid.eq(0)
                comb += matrix_feeder.output.ready.eq(1)
                with m.If(self.input.accepted() & self.input.last):
                    m.next = "normal"
        return m
class Farm(Elaboratable):
    """
    "Farm" of DotProduct cores, for parallel computation.

    The performed operation is the dot product of two NxM matrices.

    Keep in mind that since throughput will never be higher than one
    output per clock, it doesn't make sense to use a higher number of
    DotProduct cores than the latency of each one of them.

    The dataflow is controlled ONLY by the input_a Stream interface.
    The input_b stream interface is DUMMY, and should always have valid
    values in the input. The ready of the input_b interface will be
    attached to input_a.accepted(), and a valid=1 will be assumed.

    Why?
    I want to avoid a combinational path between the valid of input_b
    and the ready of input_a.

    Interfaces
    ----------
    input_a : Matrix Stream, input
        Input a matrix data.
    input_b : Matrix Stream, input
        Input b matrix data.
        TO DO: should not be a stream, but plain "matrix shaped" values.
    output : Data Stream, output
        Computed dot product value.

    Parameters
    ----------
    width : int
        Bit width of both inputs.
    shape : tuple
        Input shape (N, M).
    n_cores : int
        Number of parallel computations of dot product.
    """

    # Backward-compatible alias: this text used to be stored only in a
    # ``_doc_`` class attribute, which left the real docstring empty.
    _doc_ = __doc__

    def __init__(self, width, shape, n_cores):
        self.cores = [DotProduct(width, shape) for _ in range(n_cores)]
        self.input_a = MatrixStream(width=width,
                                    shape=shape,
                                    direction='sink',
                                    name='input_a')
        self.input_b = MatrixStream(width=width,
                                    shape=shape,
                                    direction='sink',
                                    name='input_b')
        # All cores are built with the same arguments, so the first one is
        # used as the reference for the output width.
        self.output_w = self.cores[0].output_w
        self.output = DataStream(self.output_w,
                                 direction='source',
                                 name='output')
        self.input_w = self.input_a.dataport.width
        self.n_inputs = self.input_a.dataport.n_elements
        self.shape = self.input_a.dataport.shape
        self.n_cores = len(self.cores)

    def get_ports(self):
        """Return the fields of input_a, input_b and output, in that order."""
        ports = []
        ports += [self.input_a[f] for f in self.input_a.fields]
        ports += [self.input_b[f] for f in self.input_b.fields]
        ports += [self.output[f] for f in self.output.fields]
        return ports

    def elaborate(self, platform):
        """Instantiate the cores and distribute work among them in turn.

        ``current_core_sink`` selects the core that receives the next input
        and ``current_core_source`` the core whose result is presented on
        ``output``. Each one advances (through ``_incr``) whenever the
        corresponding transfer is accepted.
        """
        m = Module()
        sync = m.d.sync
        comb = m.d.comb

        current_core_sink = Signal(range(self.n_cores))
        current_core_source = Signal(range(self.n_cores))

        for i, core in enumerate(self.cores):
            m.submodules['core_' + str(i)] = core

            # same coefficients for everybody
            comb += core.input_b.dataport.eq(self.input_b.dataport)

            # Input side: only the selected core sees the handshake. The
            # input_b handshake is forwarded to/from that core as well
            # instead of being tied to input_a.accepted() at this level.
            with m.If(current_core_sink == i):
                comb += [
                    self.input_a.ready.eq(core.input_a.ready),
                    self.input_b.ready.eq(core.input_b.ready),
                ]
                comb += [
                    core.input_a.valid.eq(self.input_a.valid),
                    core.input_b.valid.eq(self.input_b.valid),
                    core.input_a.dataport.eq(self.input_a.dataport),
                ]
            with m.Else():
                comb += [
                    core.input_a.valid.eq(0),
                    core.input_b.valid.eq(0),
                    core.input_a.dataport.eq_const(0),
                ]

            # Output side: only the selected core drives the output and
            # sees the downstream ready.
            with m.If(current_core_source == i):
                comb += [
                    self.output.valid.eq(core.output.valid),
                    self.output.data.eq(core.output.data),
                ]
                comb += [
                    core.output.ready.eq(self.output.ready),
                ]
            with m.Else():
                comb += [
                    core.output.ready.eq(0),
                ]

        with m.If(self.input_a.accepted()):
            sync += current_core_sink.eq(
                _incr(current_core_sink, self.n_cores))

        with m.If(self.output.accepted()):
            sync += current_core_source.eq(
                _incr(current_core_source, self.n_cores))

        return m
class DotProduct(Elaboratable):
    """Sequential dot product of two matrix-shaped inputs using one MAC.

    WARNING:
    The dataflow is controlled ONLY by the input_a AXIS interface.
    The input_b AXIS interface is DUMMY, and should always have valid
    values in the input. The ready of the input_b interface is attached to
    input_a.accepted(), and input_b.valid is never looked at (valid=1 is
    assumed).

    Why?
    To avoid a combinational path between the valid of input_b and the
    ready of input_a.

    Parameters
    ----------
    width_i : int
        Width handed to both input ``MatrixStream`` interfaces.
    shape : tuple
        Shape handed to both input ``MatrixStream`` interfaces.
    """

    def __init__(self, width_i, shape):
        self.input_a = MatrixStream(width=width_i,
                                    shape=shape,
                                    direction='sink',
                                    name='input_a')
        self.input_b = MatrixStream(width=width_i,
                                    shape=shape,
                                    direction='sink',
                                    name='input_b')
        self.input_w = self.input_a.dataport.width
        self.n_inputs = self.input_a.dataport.n_elements
        self.output_w = calculate_output_width(self.input_w, self.n_inputs)
        self.output = DataStream(self.output_w,
                                 direction='source',
                                 name='output')
        self.shape = self.input_a.dataport.shape

    def get_ports(self):
        """Return the fields of input_a, input_b and output, in that order."""
        ports = []
        ports += [self.input_a[f] for f in self.input_a.fields]
        ports += [self.input_b[f] for f in self.input_b.fields]
        ports += [self.output[f] for f in self.output.fields]
        return ports

    def elaborate(self, platform):
        """Build the two-state (IDLE/BUSY) controller around the MAC.

        Both operands are latched as flat bit vectors and shifted right by
        one element width per BUSY cycle, so the MAC always reads the
        lowest ``input_w`` bits.
        """
        m = Module()
        sync = m.d.sync
        comb = m.d.comb

        total_bits = self.input_w * self.n_inputs
        tmp_input_a = Signal(total_bits)
        tmp_input_b = Signal(total_bits)
        counter = Signal(range(self.n_inputs))

        mac = MAC(input_w=self.input_w, output_w=self.output_w)
        m.submodules['mac'] = mac

        # The MAC is fed from the low element of each shift register.
        comb += mac.input_a.eq(tmp_input_a[0:self.input_w])
        comb += mac.input_b.eq(tmp_input_b[0:self.input_w])

        # DUMMY input_b interface
        comb += self.input_b.ready.eq(self.input_a.accepted())

        with m.FSM():
            with m.State("IDLE"):
                # Accept new operands only if the previous result is gone
                # or is leaving in this cycle; hold the MAC cleared.
                comb += self.input_a.ready.eq(
                    self.output.accepted() | ~self.output.valid)
                comb += mac.clr.eq(1)
                comb += mac.clken.eq(0)

                with m.If(self.input_a.accepted()):
                    sync += tmp_input_a.eq(Cat(*self.input_a.flat))
                    sync += tmp_input_b.eq(Cat(*self.input_b.flat))
                    sync += counter.eq(0)
                    m.next = "BUSY"

                with m.If(self.output.accepted()):
                    sync += self.output.valid.eq(0)

            with m.State("BUSY"):
                comb += self.input_a.ready.eq(0)
                comb += mac.clr.eq(0)
                comb += mac.clken.eq(1)

                # Expose the next pair of elements to the MAC.
                sync += tmp_input_b.eq(tmp_input_b >> self.input_w)
                sync += tmp_input_a.eq(tmp_input_a >> self.input_w)

                with m.If(mac.valid_o):
                    sync += counter.eq(counter + 1)
                    # Last valid MAC result: publish it and go back to IDLE.
                    with m.If(counter == self.n_inputs - 1):
                        sync += self.output.data.eq(mac.output)
                        sync += self.output.valid.eq(1)
                        m.next = "IDLE"
        return m
class RowFifos(Elaboratable):
    """
    N fifos that work synchronized to provide Nx1 (N=row) vector of data.

    The FIFOs are chained: the input stream feeds FIFO 0 and every word
    read from FIFO ``n`` is written into FIFO ``n + 1``. The output vector
    is made of the read data of all FIFOs.

    Parameters
    ----------
    input_w : int
        Width of the input ``DataStream`` and of each output element.
    row_length : int
        Fill level a FIFO must reach before the previous FIFO in the chain
        is reported as valid; each FIFO is created ``row_length + 4`` deep.
    N : int
        Number of FIFOs and length of the output vector.
    invert : bool
        When true, output element ``n`` comes from FIFO ``n``; otherwise it
        comes from FIFO ``N - 1 - n``.
    """

    def __init__(self, input_w, row_length, N, invert=False):
        self.row_length = row_length
        self.invert = invert
        self.input = DataStream(width=input_w, direction='sink', name='input')
        self.output = MatrixStream(width=input_w,
                                   shape=(N, ),
                                   direction='source',
                                   name='output')
        self.input_w = len(self.input.data)
        self.output_w = self.output.dataport.width
        self.shape = self.output.dataport.shape
        self.N = self.output.dataport.shape[0]

    def get_ports(self):
        """Return every field of the input stream followed by the output's."""
        ports = [self.input[f] for f in self.input.fields]
        ports += [self.output[f] for f in self.output.fields]
        return ports

    def elaborate(self, platform):
        """Instantiate the FIFO chain and derive the output handshake."""
        m = Module()
        comb = m.d.comb

        fifo = [
            SyncFIFOBuffered(width=self.input_w, depth=self.row_length + 4)
            for _ in range(self.N)
        ]
        # NOTE(review): fifo_r_rdy is driven below but never read inside
        # this block - presumably kept for inspection; confirm.
        fifo_r_rdy = [Signal() for _ in range(self.N)]
        fifo_r_valid = [Signal() for _ in range(self.N)]

        for n in range(self.N):
            m.submodules['fifo_' + str(n)] = fifo[n]
            comb += [
                fifo_r_rdy[n].eq((fifo[n].level < self.row_length)
                                 | self.output.accepted()),
            ]

        # first fifo
        comb += [
            self.input.ready.eq(fifo[0].w_rdy),
            fifo[0].w_en.eq(self.input.accepted()),
            fifo[0].w_data.eq(self.input.data),
        ]

        # Every FIFO but the last: it counts as valid once the next FIFO
        # holds exactly row_length words and this one has data. Until then
        # (or whenever the output is accepted) it keeps being read, and
        # whatever is read is pushed into the next FIFO.
        for n in range(self.N - 1):
            comb += [
                fifo_r_valid[n].eq((fifo[n + 1].level == self.row_length)
                                   & (fifo[n].r_rdy)),
                fifo[n].r_en.eq((self.output.accepted() | ~fifo_r_valid[n])),
                fifo[n + 1].w_en.eq(fifo[n].r_rdy & fifo[n].r_en),
                fifo[n + 1].w_data.eq(fifo[n].r_data),
            ]

        # last fifo
        n = self.N - 1
        comb += [
            fifo_r_valid[n].eq(fifo[n].r_rdy),
            fifo[n].r_en.eq(self.output.accepted()),
        ]

        # output
        comb += [
            self.output.valid.eq(_and(fifo_r_valid)),
        ]
        for n in range(self.N):
            source = n if self.invert else self.N - 1 - n
            comb += self.output.dataport.matrix[n].eq(fifo[source].r_data)
        return m