def __init__(self, specialize_nx=False, a_size=Constants.SYS_ARRAY_HEIGHT,
             b_size=Constants.SYS_ARRAY_WIDTH, n=4, a_shape=signed(9),
             b_shape=signed(8), accumulator_shape=signed(32)):
    """Record the configuration and create the port signals.

    Parameters
    ----------
    specialize_nx:
        Stored as ``_specialize_nx``; not otherwise used here.
    a_size, b_size:
        Number of ``input_a`` and ``input_b`` signals to create. One
        accumulator (and one ``accumulator_new`` flag) is created for
        each of the ``a_size * b_size`` combinations.
    n:
        Multiplier applied to the operand widths: each ``input_a`` signal
        is ``n * a_shape.width`` bits, each ``input_b`` signal is
        ``n * b_shape.width`` bits.
    a_shape, b_shape:
        Shapes whose ``width`` determines the input signal widths.
    accumulator_shape:
        Shape of every accumulator signal.
    """
    # Keep the construction parameters for later use.
    self._specialize_nx = specialize_nx
    self._a_size, self._b_size = a_size, b_size
    self._n = n
    self._a_shape, self._b_shape = a_shape, b_shape
    self._accumulator_shape = accumulator_shape

    a_bits = n * a_shape.width
    b_bits = n * b_shape.width
    num_accumulators = a_size * b_size

    # Packed operand inputs, explicitly named by index.
    self.input_a = [
        Signal(unsigned(a_bits), name=f"input_a{idx}")
        for idx in range(a_size)
    ]
    self.input_b = [
        Signal(unsigned(b_bits), name=f"input_b{idx}")
        for idx in range(b_size)
    ]

    # Single-bit control signals.
    self.first = Signal()
    self.last = Signal()

    # One accumulator plus one single-bit flag per (a, b) combination.
    self.accumulator = [
        Signal(accumulator_shape) for _ in range(num_accumulators)
    ]
    self.accumulator_new = [Signal() for _ in range(num_accumulators)]
def __init__(self, valid_at_reset=True, ready_when_valid=True):
    """Create the input endpoint and the valid/value signals.

    Parameters
    ----------
    valid_at_reset:
        Reset value given to the ``valid`` signal.
    ready_when_valid:
        Stored unchanged as ``self.ready_when_valid``.
    """
    super().__init__()
    self.ready_when_valid = ready_when_valid

    word_shape = unsigned(32)
    # 32-bit stream feeding this component.
    self.input = Endpoint(word_shape)
    # Single-bit control input.
    self.invalidate = Signal()
    # Validity flag; its reset value is chosen by the caller.
    self.valid = Signal(reset=valid_at_reset)
    # 32-bit value.
    self.value = Signal(32)
def __init__(self):
    """Create one stream, value and write strobe per register ID.

    Each mapping is keyed by the entries of ``self.REGISTER_IDS``:
    ``output_streams`` holds a 32-bit ``Endpoint``, ``values`` a 32-bit
    ``Signal`` and ``write_strobes`` a 1-bit ``Signal``.
    """
    super().__init__()
    self.output_streams = {
        reg_id: Endpoint(unsigned(32)) for reg_id in self.REGISTER_IDS
    }
    self.values = {
        reg_id: Signal(32) for reg_id in self.REGISTER_IDS
    }
    self.write_strobes = {
        reg_id: Signal(1) for reg_id in self.REGISTER_IDS
    }
def __init__(self, n, a_shape, b_shape, accumulator_shape):
    """Record the configuration and create the port signals.

    Parameters
    ----------
    n:
        Multiplier applied to the operand widths: ``input_a`` is
        ``n * a_shape.width`` bits and ``input_b`` is
        ``n * b_shape.width`` bits.
    a_shape, b_shape:
        Shapes whose ``width`` determines the operand signal widths.
    accumulator_shape:
        Shape of ``output_accumulator``.
    """
    self._n = n
    self._a_shape, self._b_shape = a_shape, b_shape
    self._accumulator_shape = accumulator_shape

    a_bits = n * a_shape.width
    b_bits = n * b_shape.width

    # Operand signals; each output_* signal is shaped like its input.
    self.input_a = Signal(unsigned(a_bits))
    self.output_a = Signal.like(self.input_a)
    self.input_b = Signal(unsigned(b_bits))
    self.output_b = Signal.like(self.input_b)

    # Single-bit first/last markers, input and output.
    self.input_first = Signal()
    self.output_first = Signal()
    self.input_last = Signal()
    self.output_last = Signal()

    # Accumulator value and its single-bit companion flag.
    self.output_accumulator = Signal(accumulator_shape)
    self.output_accumulator_new = Signal()
def __init__(self):
    """Create one input stream, clear and read strobe per register ID.

    For every ID in ``self.REGISTER_IDS`` this adds a 32-bit ``Endpoint``
    named ``sink_XX``, a ``Signal`` named ``clear_XX`` and a ``Signal``
    named ``read_strobe_XX``, where ``XX`` is the ID as two hex digits.
    """
    super().__init__()
    self.input_streams = {}
    self.invalidates = {}
    self.read_strobes = {}
    for reg_id in self.REGISTER_IDS:
        self.input_streams[reg_id] = Endpoint(
            unsigned(32), name=f"sink_{reg_id:02x}")
        self.invalidates[reg_id] = Signal(name=f"clear_{reg_id:02x}")
        self.read_strobes[reg_id] = Signal(
            name=f"read_strobe_{reg_id:02x}")
def __init__(self, specialize_nx=False):
    """Create the control, configuration, memory and stream ports.

    Parameters
    ----------
    specialize_nx:
        Stored as ``_specialize_nx``; not otherwise used here.
    """
    self._specialize_nx = specialize_nx

    # Single-bit control signals.
    self.reset = Signal()
    self.start = Signal()

    # Configuration record and filter write command stream.
    self.config = Record(ACCELERATOR_CONFIGURATION_LAYOUT)
    self.write_filter_input = Endpoint(FILTER_WRITE_COMMAND)

    # Four LRAM address (14-bit) and data (32-bit) signals.
    self.lram_addr = [
        Signal(14, name=f"lram_addr{idx}") for idx in range(4)
    ]
    self.lram_data = [
        Signal(32, name=f"lram_data{idx}") for idx in range(4)
    ]

    # Post-process parameter stream and 32-bit output stream.
    self.post_process_params = Endpoint(POST_PROCESS_PARAMS)
    output_shape = unsigned(32)
    self.output = Endpoint(output_shape)
def connect_macc(self, m, set, input_store, filter_store, macc):
    """Wire the input and filter stores through to the macc.

    Each store's ``data_output`` is fed through its own
    ``FlowRestrictor(128)``, whose ``release`` endpoint is driven from the
    ``REG_FILTER_INPUT_NEXT`` output stream of ``set``. The two restricted
    streams are then joined by a ``ConcatenatingBuffer`` whose output is
    connected to ``macc.operands``.

    Parameters
    ----------
    m:
        Module to which submodules and combinatorial statements are added.
    set:
        Object providing ``values`` and ``output_streams`` keyed by
        register constants. (The name shadows the builtin but is part of
        the signature.)
    input_store, filter_store:
        Objects exposing a ``data_output`` stream.
    macc:
        Object exposing ``offset`` and an ``operands`` stream.
    """
    m.d.comb += macc.offset.eq(set.values[Constants.REG_INPUT_OFFSET])

    # Put a FlowRestrictor after each store; it lets through N values at
    # a time when signalled. Both stores get the same treatment.
    restrictors = {}
    for submodule_name, store in (
            ('input_flow_restrictor', input_store),
            ('filter_flow_restrictor', filter_store)):
        restrictor = FlowRestrictor(128)
        m.submodules[submodule_name] = restrictor
        m.d.comb += connect(store.data_output, restrictor.input)
        advance = set.output_streams[Constants.REG_FILTER_INPUT_NEXT]
        m.d.comb += restrictor.release.payload.eq(advance.payload)
        m.d.comb += restrictor.release.valid.eq(advance.valid)
        # TODO(dcallagh): can we connect this properly somehow?
        m.d.comb += advance.ready.eq(1)
        restrictors[submodule_name] = restrictor

    # Join both streams into a single stream holding all operands.
    # Then connect that to the macc.
    operand_fields = [
        ('inputs', unsigned(128)),
        ('filters', unsigned(128)),
    ]
    operands_buffer = ConcatenatingBuffer(operand_fields)
    m.submodules['operands_buffer'] = operands_buffer
    m.d.comb += [
        connect(restrictors['input_flow_restrictor'].output,
                operands_buffer.inputs['inputs']),
        connect(restrictors['filter_flow_restrictor'].output,
                operands_buffer.inputs['filters']),
        connect(operands_buffer.output, macc.operands),
    ]
def elab_instructions(self, m):
    """Instantiate the instructions, core and FIFO, and wire them up.

    Parameters
    ----------
    m:
        Module to which submodules and combinatorial statements are added.

    Returns
    -------
    dict
        Mapping from the ``Constants.INS_*`` identifiers to the get, set,
        pool and ping instruction instances.
    """
    ping = PingInstruction()
    m.submodules.ping = ping
    set_ = SetInstruction()
    m.submodules.set = set_
    get = GetInstruction()
    m.submodules.get = get
    pool = PoolInstruction()
    m.submodules.pool = pool
    core = AcceleratorCore(self._specialize_nx)
    m.submodules.core = core
    fifo = StreamFifo(depth=Constants.OUTPUT_FIFO_DEPTH, type=unsigned(32))
    m.submodules.fifo = fifo

    # Connect set_ and get instructions to accelerator core and FIFO
    m.d.comb += core.start.eq(set_.accelerator_start)
    m.d.comb += core.reset.eq(set_.accelerator_reset)
    m.d.comb += core.config.eq(set_.config)
    m.d.comb += get.reg_fifo_items_value.eq(fifo.r_level)
    m.d.comb += connect(set_.filter_output, core.write_filter_input)
    m.d.comb += connect(set_.post_process_params, core.post_process_params)
    m.d.comb += connect(core.output, fifo.input)
    m.d.comb += connect(fifo.output, get.output_words)

    # Connect accelerator LRAMs: addresses flow out of the core, data
    # flows into it.
    for idx in range(4):
        m.d.comb += [
            self.lram_addr[idx].eq(core.lram_addr[idx]),
            core.lram_data[idx].eq(self.lram_data[idx]),
        ]

    # Connect verify functionality: reading back yields the set value + 1.
    m.d.comb += get.reg_verify_value.eq(set_.reg_verify_value + 1)

    instructions = {}
    instructions[Constants.INS_GET] = get
    instructions[Constants.INS_SET] = set_
    instructions[Constants.INS_POOL] = pool
    instructions[Constants.INS_PING] = ping
    return instructions
def create_dut(self):
    """Return the device under test: a ``Buffer`` of ``unsigned(32)``."""
    payload_shape = unsigned(32)
    dut = Buffer(payload_shape)
    return dut
def create_dut(self):
    """Return the device under test.

    The DUT is a ``ConcatenatingBuffer`` with two fields, ``x`` and
    ``y``, each ``unsigned(8)``.
    """
    fields = [(field_name, unsigned(8)) for field_name in ('x', 'y')]
    dut = ConcatenatingBuffer(fields)
    return dut
# both operands are positive, so result always positive reg_ab = Signal(signed(63)) m.d.sync += reg_ab.eq(reg_a * reg_b) # Cycle 2: nudge, take high bits and sign positive_2 = self.delay(m, 2, a >= 0) # Whether input positive nudged = reg_ab + Mux(positive_2, (1 << 30), (1 << 30) - 1) high_bits = Signal(signed(32)) m.d.comb += high_bits.eq(nudged[31:]) with_sign = Mux(positive_2, high_bits, -high_bits) m.d.sync += out_value.eq(with_sign) RDBPOT_INPUT_LAYOUT = [ ('dividend', signed(32)), # The value to be divided ('shift', unsigned(4)), # The power of two by which to divide by ] class RoundingDivideByPowerOfTwo(BinaryPipelineActor): """Divides its input by a power of two, rounding appropriately. Attributes ---------- input: Endpoint(RDBPOT_INPUT_LAYOUT), in The calculation to perform. output: Endpoint(signed(32)), out The result.
from ..stream import connect, Endpoint from .constants import Constants from .filter import FilterStore, FILTER_WRITE_COMMAND from .mem import SinglePortMemory from .mode0_input import Mode0InputFetcher from .mode1_input import Mode1InputFetcher from .post_process import (AccumulatorReader, OutputWordAssembler, ParamWriter, POST_PROCESS_PARAMS, POST_PROCESS_PARAMS_WIDTH, PostProcessPipeline, ReadingProducer, StreamLimiter) from .ram_mux import RamMux from .sysarray import SystolicArray from .utils import unsigned_upto ACCELERATOR_CONFIGURATION_LAYOUT = [ # The mode of the accelerator - mode 0 for input, mode 1 for full speed ('mode', unsigned(1)), # Offset applied to each input activation value. ('input_offset', signed(9)), # Number of words of filter data, per filter store ('num_filter_words', unsigned_upto(Constants.FILTER_WORDS_PER_STORE)), # Offset applied to each output value. ('output_offset', signed(9)), # The minimum output value ('output_activation_min', signed(8)), # The maximum output value ('output_activation_max', signed(8)), # Address of start of input data, in bytes ('input_base_addr', 18), # How many pixels in output row ('num_pixels_x', 9), # Number of RAM blocks to advance to move to new pixel in X direction
def __init__(self, payload_type):
    """Initialise the base class and add the ``release`` endpoint.

    Parameters
    ----------
    payload_type:
        Passed to the base class as both of its constructor arguments.
    """
    super().__init__(payload_type, payload_type)
    release_shape = unsigned(32)
    # 32-bit stream.
    self.release = Endpoint(release_shape)
def __init__(self):
    """Create the register value signals and the output word stream."""
    super().__init__()
    # 32-bit register values.
    self.reg_fifo_items_value = Signal(32)
    self.reg_verify_value = Signal(32)
    # 32-bit word stream.
    word_shape = unsigned(32)
    self.output_words = Endpoint(word_shape)
# See the License for the specific language governing permissions and # limitations under the License. """Gateware for filter storage.""" from amaranth import Mux, Signal, unsigned from amaranth_cfu import SequentialMemoryReader, SimpleElaboratable from ..stream import Endpoint from .constants import Constants from .mem import SinglePortMemory from .utils import unsigned_upto FILTER_WRITE_COMMAND = [ ('store', range(Constants.NUM_FILTER_STORES)), ('addr', range(Constants.FILTER_WORDS_PER_STORE)), ('data', unsigned(32)), ] class FilterStore(SimpleElaboratable): """Stores words in a single port memory. The filter store contains multiple SinglePortMemories, each of which is written separately with values in the order required by the SystolicArray. Attributes ---------- write_input: Endpoint(FILTER_WRITE_COMMAND), in Commands to write to the filter store. Always ready.
def create_dut(self):
    """Return the device under test: a ``StandardMaccBlock``.

    It is constructed with 4, ``unsigned(8)``, ``signed(8)`` and
    ``signed(24)`` as its arguments.
    """
    # Can only use standard implementation in Amaranth simulator
    dut = StandardMaccBlock(4, unsigned(8), signed(8), signed(24))
    return dut
def data_shape(self) -> Shape:
    """Amaranth shape describing a data word.

    The shape is unsigned and ``self.width`` bits wide.
    """
    word_width = self.width
    return unsigned(word_width)
def __init__(self):
    """Create the data and word-count inputs and the data output.

    Both inputs are 32-bit streams; the output is a 128-bit stream.
    """
    word_shape = unsigned(32)
    wide_shape = unsigned(128)
    self.data_input = Endpoint(word_shape)
    self.num_words_input = Endpoint(word_shape)
    self.data_output = Endpoint(wide_shape)
def __init__(self):
    """Create the output stream and the signals used to update it."""
    super().__init__()
    word_shape = unsigned(32)
    # 32-bit output stream.
    self.output = Endpoint(word_shape)
    # ``value`` is an alias for the output stream's payload, not a new
    # signal.
    self.value = self.output.payload
    # Single-bit enable and 32-bit new value.
    self.new_en = Signal()
    self.new_value = Signal(32)
def __init__(self, depth=Constants.MAX_FILTER_WORDS):
    """Record the depth and create the input and output streams.

    Parameters
    ----------
    depth:
        Stored unchanged as ``self.depth``.
    """
    self.depth = depth
    word_shape = unsigned(32)
    wide_shape = unsigned(128)
    # Two 32-bit inputs and one 128-bit output.
    self.data_input = Endpoint(word_shape)
    self.num_words_input = Endpoint(word_shape)
    self.data_output = Endpoint(wide_shape)
def __init__(self, num_pixels=Constants.SYS_ARRAY_HEIGHT):
    """Record the pixel count and create the mode flag and streams.

    Parameters
    ----------
    num_pixels:
        Stored as ``_num_pixels``; not otherwise used here.
    """
    self._num_pixels = num_pixels
    # Single-bit mode flag.
    self.half_mode = Signal()
    # Signed 8-bit values in, unsigned 32-bit words out.
    byte_shape = signed(8)
    word_shape = unsigned(32)
    self.input = Endpoint(byte_shape)
    self.output = Endpoint(word_shape)
def create_dut(self):
    """Return the device under test: a ``StreamLimiter`` of ``unsigned(8)``."""
    payload_shape = unsigned(8)
    dut = StreamLimiter(payload_shape)
    return dut
def build_multipliers(self, m, accumulator):
    """Instantiate a ``MULTADDSUB9X9WIDE`` DSP macro for the multiplies.

    ``input_a`` and ``input_b`` are each split into four words (of
    ``_a_shape.width`` and ``_b_shape.width`` bits respectively) which
    feed the macro's A0..A3 and B0..B3 ports. The macro's ``Z`` output
    drives ``accumulator`` and ``input_first`` drives ``LOADC``.

    Parameters
    ----------
    m:
        Module that receives the instance as submodule ``dsp``.
    accumulator:
        Signal connected to the macro's ``Z`` output.
    """
    a_width = self._a_shape.width
    b_width = self._b_shape.width

    # Build the A/B operand ports in A0, B0, A1, B1, ... order.
    operand_ports = {}
    for lane in range(4):
        a_word = self.input_a.word_select(lane, a_width)
        b_word = self.input_b.word_select(lane, b_width)
        operand_ports[f"i_A{lane}"] = a_word
        # Bit 7 of each b word is repeated as an extra top bit
        # (presumably sign extension of an 8-bit word to the 9-bit port).
        operand_ports[f"i_B{lane}"] = Cat(b_word, b_word[7])

    # Explicitly instantiate the DSP macro
    m.submodules.dsp = Instance(
        "MULTADDSUB9X9WIDE",
        i_CLK=ClockSignal(),
        # All clock enables are tied high.
        i_CEA0A1=Const(1),
        i_CEA2A3=Const(1),
        i_CEB0B1=Const(1),
        i_CEB2B3=Const(1),
        i_CEC=Const(1),
        i_CEPIPE=Const(1),
        i_CEOUT=Const(1),
        i_CECTRL=Const(1),
        # All resets follow the domain reset.
        i_RSTA0A1=ResetSignal(),
        i_RSTA2A3=ResetSignal(),
        i_RSTB0B1=ResetSignal(),
        i_RSTB2B3=ResetSignal(),
        i_RSTC=ResetSignal(),
        i_RSTCTRL=ResetSignal(),
        i_RSTPIPE=ResetSignal(),
        i_RSTOUT=ResetSignal(),
        i_SIGNED=Const(1),
        i_ADDSUB=Const(0, unsigned(4)),
        **operand_ports,
        i_C=Const(0, unsigned(54)),
        i_LOADC=self.input_first,
        o_Z=accumulator,
        # Register configuration.
        p_REGINPUTAB0="BYPASS",
        p_REGINPUTAB1="BYPASS",
        p_REGINPUTAB2="BYPASS",
        p_REGINPUTAB3="BYPASS",
        p_REGINPUTC="BYPASS",
        p_REGADDSUB="BYPASS",
        p_REGLOADC="BYPASS",
        p_REGLOADC2="REGISTER",
        p_REGPIPELINE="REGISTER",
        p_REGOUTPUT="REGISTER",
        p_RESETMODE="SYNC",
        p_GSR="ENABLED",
    )
m.d.sync += self.value.eq(0) with m.Else(): value_p1 = Signal.like(self.count) next_value = Signal.like(self.value) m.d.comb += [ value_p1.eq(self.value + 1), self.last.eq(value_p1 == self.count), next_value.eq(Mux(self.last, 0, value_p1)), ] with m.If(self.next): m.d.sync += self.value.eq(next_value) LDR_PARAMS_LAYOUT = [ ('count', unsigned(16)), ('repeats', unsigned(4)), ] class LoopingAddressGenerator(SimpleElaboratable): """Generates addresses from a memory. Loops from address zero for a given count, with configurable number of repeats. Parameters ---------- depth: int Number of words in the memory being addressed. Maximum
def __init__(self):
    """Create the input and output streams.

    The input carries signed 8-bit values; the output carries unsigned
    32-bit words.
    """
    byte_shape = signed(8)
    word_shape = unsigned(32)
    self.input = Endpoint(byte_shape)
    self.output = Endpoint(word_shape)
def __init__(self):
    """Initialise the base class with two ``unsigned(32)`` shapes."""
    base_shapes = (unsigned(32), unsigned(32))
    super().__init__(*base_shapes)
def __init__(self):
    """Create the result-count, results and accumulated streams.

    ``num_results`` carries unsigned 32-bit values; ``results`` and
    ``accumulated`` carry signed 32-bit values.
    """
    count_shape = unsigned(32)
    self.num_results = Endpoint(count_shape)
    self.results = Endpoint(signed(32))
    self.accumulated = Endpoint(signed(32))
def unsigned_upto(maximum_value):
    """Creates a shape of a size to hold maximum_value.

    The result is an unsigned shape whose width is the bit length of
    ``maximum_value``.
    """
    bits_needed = maximum_value.bit_length()
    return unsigned(bits_needed)
def addr_shape(self) -> Shape:
    """Amaranth shape describing the address.

    The shape is unsigned and just wide enough to hold the highest
    address, ``self.depth - 1``.
    """
    highest_address = self.depth - 1
    address_bits = highest_address.bit_length()
    return unsigned(address_bits)