Esempio n. 1
0
    def __init__(self,
                 specialize_nx=False,
                 a_size=Constants.SYS_ARRAY_HEIGHT,
                 b_size=Constants.SYS_ARRAY_WIDTH,
                 n=4,
                 a_shape=signed(9),
                 b_shape=signed(8),
                 accumulator_shape=signed(32)):
        """Store the array configuration and create its port signals.

        Parameters
        ----------
        specialize_nx: bool
            Kept as ``_specialize_nx``; not otherwise used in this
            constructor.
        a_size: int
            Number of ``input_a`` signals to create.
        b_size: int
            Number of ``input_b`` signals to create.
        n: int
            Number of operand words packed into each input signal.
        a_shape, b_shape: Shape
            Shape of one A / B operand word; only ``width`` is read here.
        accumulator_shape: Shape
            Shape given to every accumulator signal.
        """
        # Configuration, kept in private attributes.
        self._specialize_nx = specialize_nx
        self._a_size, self._b_size = a_size, b_size
        self._n = n
        self._a_shape, self._b_shape = a_shape, b_shape
        self._accumulator_shape = accumulator_shape

        # Every input signal carries n operand words side by side.
        a_bits = n * a_shape.width
        b_bits = n * b_shape.width
        self.input_a = [
            Signal(unsigned(a_bits), name=f"input_a{idx}")
            for idx in range(a_size)
        ]
        self.input_b = [
            Signal(unsigned(b_bits), name=f"input_b{idx}")
            for idx in range(b_size)
        ]

        self.first = Signal()
        self.last = Signal()

        # One accumulator and one single-bit "new" signal per (a, b) pair.
        cell_count = a_size * b_size
        self.accumulator = [
            Signal(accumulator_shape) for _ in range(cell_count)
        ]
        self.accumulator_new = [Signal() for _ in range(cell_count)]
Esempio n. 2
0
 def __init__(self, valid_at_reset=True, ready_when_valid=True):
     """Create the input endpoint and the valid/value state signals.

     Parameters
     ----------
     valid_at_reset: bool
         Reset value given to the ``valid`` signal.
     ready_when_valid: bool
         Stored as ``self.ready_when_valid``; how it is used is not
         visible in this constructor.
     """
     super().__init__()
     self.ready_when_valid = ready_when_valid
     # Stream endpoint with a 32-bit unsigned payload.
     self.input = Endpoint(unsigned(32))
     self.invalidate = Signal()
     # Starts out at valid_at_reset after reset.
     self.valid = Signal(reset=valid_at_reset)
     # 32 bits wide.
     self.value = Signal(32)
Esempio n. 3
0
 def __init__(self):
     """Create one stream, one value and one strobe per register id.

     Each of the three dictionaries is keyed by the entries of
     ``self.REGISTER_IDS``.
     """
     super().__init__()
     # Per-register stream endpoints with 32-bit unsigned payloads.
     self.output_streams = {
         i: Endpoint(
             unsigned(32)) for i in self.REGISTER_IDS}
     # Per-register 32-bit value signals.
     self.values = {i: Signal(32) for i in self.REGISTER_IDS}
     # Per-register single-bit strobe signals.
     self.write_strobes = {i: Signal(1) for i in self.REGISTER_IDS}
Esempio n. 4
0
    def __init__(self, n, a_shape, b_shape, accumulator_shape):
        """Store the block configuration and create its port signals.

        Parameters
        ----------
        n: int
            Number of operand words packed into each input signal.
        a_shape, b_shape: Shape
            Shape of one A / B operand word; only ``width`` is read here.
        accumulator_shape: Shape
            Shape of ``output_accumulator``.
        """
        self._n = n
        self._a_shape, self._b_shape = a_shape, b_shape
        self._accumulator_shape = accumulator_shape

        # Packed operand widths: n words of each operand type.
        a_bits = n * a_shape.width
        b_bits = n * b_shape.width

        # Each operand has an input and an identically shaped output.
        self.input_a = Signal(unsigned(a_bits))
        self.output_a = Signal.like(self.input_a)
        self.input_b = Signal(unsigned(b_bits))
        self.output_b = Signal.like(self.input_b)

        # Single-bit first/last markers, each with an input and an output.
        self.input_first = Signal()
        self.output_first = Signal()
        self.input_last = Signal()
        self.output_last = Signal()

        # Accumulator result and its single-bit "new" indicator.
        self.output_accumulator = Signal(accumulator_shape)
        self.output_accumulator_new = Signal()
Esempio n. 5
0
 def __init__(self):
     """Create one input stream, invalidate and read strobe per register.

     The three dictionaries are keyed by the entries of
     ``self.REGISTER_IDS``; every object is named after its register id
     formatted as two hexadecimal digits.
     """
     super().__init__()
     self.input_streams = {}
     self.invalidates = {}
     self.read_strobes = {}
     for reg_id in self.REGISTER_IDS:
         hex_id = f"{reg_id:02x}"
         sink = Endpoint(unsigned(32), name=f"sink_{hex_id}")
         self.input_streams[reg_id] = sink
         self.invalidates[reg_id] = Signal(name=f"clear_{hex_id}")
         self.read_strobes[reg_id] = Signal(name=f"read_strobe_{hex_id}")
Esempio n. 6
0
    def __init__(self, specialize_nx=False):
        """Create the control signals, configuration record and ports.

        Parameters
        ----------
        specialize_nx: bool
            Stored as ``_specialize_nx``; not otherwise used in this
            constructor.
        """
        self._specialize_nx = specialize_nx
        self.reset = Signal()
        self.start = Signal()
        self.config = Record(ACCELERATOR_CONFIGURATION_LAYOUT)

        self.write_filter_input = Endpoint(FILTER_WRITE_COMMAND)
        # Four 14-bit address signals paired with four 32-bit data signals.
        self.lram_addr = [Signal(14, name=f"lram_addr{i}") for i in range(4)]
        self.lram_data = [Signal(32, name=f"lram_data{i}") for i in range(4)]
        self.post_process_params = Endpoint(POST_PROCESS_PARAMS)
        # Stream endpoint with a 32-bit unsigned payload.
        self.output = Endpoint(unsigned(32))
Esempio n. 7
0
    def connect_macc(self, m, set, input_store, filter_store, macc):
        """Wire the input and filter stores through to the macc operands.

        Each store's ``data_output`` feeds its own 128-bit FlowRestrictor.
        The two restrictor outputs are joined by a ConcatenatingBuffer
        whose output is connected to ``macc.operands``.

        Parameters
        ----------
        m: Module
            Module receiving the submodules and combinatorial statements.
        set:
            Object providing ``values`` and ``output_streams``, both
            indexed by register constants.
        input_store, filter_store:
            Objects whose ``data_output`` stream is consumed.
        macc:
            Object providing ``offset`` and the ``operands`` stream.
        """
        m.d.comb += macc.offset.eq(set.values[Constants.REG_INPUT_OFFSET])

        # Both restrictors are released by the same register stream. Each
        # lets through N values at a time when signalled.
        release_stream = set.output_streams[Constants.REG_FILTER_INPUT_NEXT]
        restrictors = {}
        for label, store in (('input', input_store),
                             ('filter', filter_store)):
            restrictor = FlowRestrictor(128)
            m.submodules[f'{label}_flow_restrictor'] = restrictor
            m.d.comb += connect(store.data_output, restrictor.input)
            m.d.comb += restrictor.release.payload.eq(release_stream.payload)
            m.d.comb += restrictor.release.valid.eq(release_stream.valid)
            # TODO(dcallagh): can we connect this properly somehow?
            m.d.comb += release_stream.ready.eq(1)
            restrictors[label] = restrictor

        # Join both streams into a single stream holding all operands.
        # Then connect that to the macc.
        operand_fields = [
            ('inputs', unsigned(128)),
            ('filters', unsigned(128)),
        ]
        operands_buffer = ConcatenatingBuffer(operand_fields)
        m.submodules['operands_buffer'] = operands_buffer
        m.d.comb += [
            connect(restrictors['input'].output,
                    operands_buffer.inputs['inputs']),
            connect(restrictors['filter'].output,
                    operands_buffer.inputs['filters']),
            connect(operands_buffer.output, macc.operands),
        ]
Esempio n. 8
0
    def elab_instructions(self, m):
        """Create the instruction and core submodules and wire them up.

        Parameters
        ----------
        m: Module
            Module receiving the submodules and combinatorial statements.

        Returns
        -------
        dict
            Maps the INS_GET, INS_SET, INS_POOL and INS_PING constants to
            the instruction object registered for each.
        """
        ping = PingInstruction()
        m.submodules['ping'] = ping
        set_ = SetInstruction()
        m.submodules['set'] = set_
        get = GetInstruction()
        m.submodules['get'] = get
        pool = PoolInstruction()
        m.submodules['pool'] = pool
        core = AcceleratorCore(self._specialize_nx)
        m.submodules['core'] = core
        fifo = StreamFifo(depth=Constants.OUTPUT_FIFO_DEPTH,
                          type=unsigned(32))
        m.submodules['fifo'] = fifo

        # Control and status between set_/get, the core and the FIFO.
        m.d.comb += [
            core.start.eq(set_.accelerator_start),
            core.reset.eq(set_.accelerator_reset),
            core.config.eq(set_.config),
            get.reg_fifo_items_value.eq(fifo.r_level),
        ]

        # Stream connections, listed as (source, sink) pairs.
        stream_links = (
            (set_.filter_output, core.write_filter_input),
            (set_.post_process_params, core.post_process_params),
            (core.output, fifo.input),
            (fifo.output, get.output_words),
        )
        for source, sink in stream_links:
            m.d.comb += connect(source, sink)

        # Accelerator LRAMs: addresses flow out of the core, data flows in.
        for bank in range(4):
            m.d.comb += [
                self.lram_addr[bank].eq(core.lram_addr[bank]),
                core.lram_data[bank].eq(self.lram_data[bank]),
            ]

        # Verify functionality: get reads back the value set, plus one.
        m.d.comb += get.reg_verify_value.eq(set_.reg_verify_value + 1)

        instructions = {}
        instructions[Constants.INS_GET] = get
        instructions[Constants.INS_SET] = set_
        instructions[Constants.INS_POOL] = pool
        instructions[Constants.INS_PING] = ping
        return instructions
Esempio n. 9
0
 def create_dut(self):
     """Return the device under test: a Buffer of 32-bit unsigned words."""
     payload_shape = unsigned(32)
     return Buffer(payload_shape)
Esempio n. 10
0
 def create_dut(self):
     """Return the device under test: a two-field ConcatenatingBuffer.

     The buffer has fields ``x`` and ``y``, each 8 bits unsigned.
     """
     fields = [
         ('x', unsigned(8)),
         ('y', unsigned(8)),
     ]
     return ConcatenatingBuffer(fields)
Esempio n. 11
0
        # both operands are positive, so result always positive
        reg_ab = Signal(signed(63))
        m.d.sync += reg_ab.eq(reg_a * reg_b)

        # Cycle 2: nudge, take high bits and sign
        positive_2 = self.delay(m, 2, a >= 0)  # Whether input positive
        nudged = reg_ab + Mux(positive_2, (1 << 30), (1 << 30) - 1)
        high_bits = Signal(signed(32))
        m.d.comb += high_bits.eq(nudged[31:])
        with_sign = Mux(positive_2, high_bits, -high_bits)
        m.d.sync += out_value.eq(with_sign)


# Payload layout for the input endpoint of RoundingDivideByPowerOfTwo.
RDBPOT_INPUT_LAYOUT = [
    ('dividend', signed(32)),  # The value to be divided
    ('shift', unsigned(4)),    # The power of two to divide by (4 bits)
]


class RoundingDivideByPowerOfTwo(BinaryPipelineActor):
    """Divides its input by a power of two, rounding appropriately.

    Attributes
    ----------

    input: Endpoint(RDBPOT_INPUT_LAYOUT), in
      The calculation to perform.

    output: Endpoint(signed(32)), out
      The result.
Esempio n. 12
0
from ..stream import connect, Endpoint
from .constants import Constants
from .filter import FilterStore, FILTER_WRITE_COMMAND
from .mem import SinglePortMemory
from .mode0_input import Mode0InputFetcher
from .mode1_input import Mode1InputFetcher
from .post_process import (AccumulatorReader, OutputWordAssembler, ParamWriter,
                           POST_PROCESS_PARAMS, POST_PROCESS_PARAMS_WIDTH,
                           PostProcessPipeline, ReadingProducer, StreamLimiter)
from .ram_mux import RamMux
from .sysarray import SystolicArray
from .utils import unsigned_upto

ACCELERATOR_CONFIGURATION_LAYOUT = [
    # The mode of the accelerator - mode 0 for input, mode 1 for full speed
    ('mode', unsigned(1)),
    # Offset applied to each input activation value.
    ('input_offset', signed(9)),
    # Number of words of filter data, per filter store
    ('num_filter_words', unsigned_upto(Constants.FILTER_WORDS_PER_STORE)),
    # Offset applied to each output value.
    ('output_offset', signed(9)),
    #  The minimum output value
    ('output_activation_min', signed(8)),
    #  The maximum output value
    ('output_activation_max', signed(8)),
    # Address of start of input data, in bytes
    ('input_base_addr', 18),
    # How many pixels in output row
    ('num_pixels_x', 9),
    # Number of RAM blocks to advance to move to new pixel in X direction
Esempio n. 13
0
 def __init__(self, payload_type):
     """Create the release endpoint; input and output share one type.

     Parameters
     ----------
     payload_type:
         Passed to the base class twice.
         NOTE(review): presumably as its input and its output payload
         type - confirm against the base class.
     """
     super().__init__(payload_type, payload_type)
     # Stream endpoint with a 32-bit unsigned payload.
     self.release = Endpoint(unsigned(32))
Esempio n. 14
0
 def __init__(self):
     """Create the two 32-bit register values and the output word stream."""
     super().__init__()
     # 32-bit value signals.
     self.reg_fifo_items_value = Signal(32)
     self.reg_verify_value = Signal(32)
     # Stream endpoint with a 32-bit unsigned payload.
     self.output_words = Endpoint(unsigned(32))
Esempio n. 15
0
# See the License for the specific language governing permissions and
# limitations under the License.
"""Gateware for filter storage."""

from amaranth import Mux, Signal, unsigned
from amaranth_cfu import SequentialMemoryReader, SimpleElaboratable

from ..stream import Endpoint
from .constants import Constants
from .mem import SinglePortMemory
from .utils import unsigned_upto

# Payload layout of a filter store write command.
# 'store' and 'addr' use range() shapes bounded by the Constants values.
# NOTE(review): presumably 'store' selects the filter store and 'addr' the
# word within it - confirm against the FilterStore implementation.
FILTER_WRITE_COMMAND = [
    ('store', range(Constants.NUM_FILTER_STORES)),
    ('addr', range(Constants.FILTER_WORDS_PER_STORE)),
    ('data', unsigned(32)),
]


class FilterStore(SimpleElaboratable):
    """Stores words in a single port memory.

    The filter store contains multiple SinglePortMemories, each of which is
    written separately with values in the order required by the SystolicArray.

    Attributes
    ----------

    write_input: Endpoint(FILTER_WRITE_COMMAND), in
        Commands to write to the filter store. Always ready.
Esempio n. 16
0
 def create_dut(self):
     """Return the device under test: a StandardMaccBlock.

     Built with 4 words, unsigned 8-bit and signed 8-bit operand shapes
     and a signed 24-bit accumulator shape.
     """
     # Can only use standard implementation in Amaranth simulator
     operand_shapes = (unsigned(8), signed(8))
     accumulator_shape = signed(24)
     return StandardMaccBlock(4, *operand_shapes, accumulator_shape)
Esempio n. 17
0
 def data_shape(self) -> Shape:
     """Return the unsigned Amaranth shape of one data word.

     The shape is ``self.width`` bits wide.
     """
     word_bits = self.width
     return unsigned(word_bits)
Esempio n. 18
0
 def __init__(self):
     """Create the two 32-bit input streams and the 128-bit output stream."""
     # Stream endpoints with 32-bit unsigned payloads.
     self.data_input = Endpoint(unsigned(32))
     self.num_words_input = Endpoint(unsigned(32))
     # Stream endpoint with a 128-bit unsigned payload.
     self.data_output = Endpoint(unsigned(128))
Esempio n. 19
0
 def __init__(self):
     """Create the output stream and the signals for loading a new value."""
     super().__init__()
     # Stream endpoint with a 32-bit unsigned payload.
     self.output = Endpoint(unsigned(32))
     # Alias for the payload of the output endpoint.
     self.value = self.output.payload
     self.new_en = Signal()
     # 32 bits wide.
     self.new_value = Signal(32)
Esempio n. 20
0
 def __init__(self, depth=Constants.MAX_FILTER_WORDS):
     """Record the depth and create the input and output streams.

     Parameters
     ----------
     depth: int
         Stored as ``self.depth``; defaults to
         ``Constants.MAX_FILTER_WORDS``.
     """
     self.depth = depth
     # Stream endpoints with 32-bit unsigned payloads.
     self.data_input = Endpoint(unsigned(32))
     self.num_words_input = Endpoint(unsigned(32))
     # Stream endpoint with a 128-bit unsigned payload.
     self.data_output = Endpoint(unsigned(128))
Esempio n. 21
0
 def __init__(self, num_pixels=Constants.SYS_ARRAY_HEIGHT):
     """Record the pixel count and create the mode signal and streams.

     Parameters
     ----------
     num_pixels: int
         Stored as ``_num_pixels``; defaults to
         ``Constants.SYS_ARRAY_HEIGHT``.
     """
     self._num_pixels = num_pixels
     self.half_mode = Signal()
     # Signed 8-bit payload in, unsigned 32-bit payload out.
     self.input = Endpoint(signed(8))
     self.output = Endpoint(unsigned(32))
Esempio n. 22
0
 def create_dut(self):
     """Return the device under test: a StreamLimiter of 8-bit words."""
     payload_shape = unsigned(8)
     return StreamLimiter(payload_shape)
Esempio n. 23
0
    def build_multipliers(self, m, accumulator):
        """Instantiate the MULTADDSUB9X9WIDE primitive as submodule ``dsp``.

        Four operand words are sliced from ``input_a`` and ``input_b`` and
        wired to ports A0-A3 and B0-B3. ``input_first`` drives LOADC and
        the Z output drives ``accumulator``.

        Parameters
        ----------
        m: Module
            Module that receives the ``dsp`` submodule.
        accumulator:
            Value connected to the primitive's Z output.
        """
        a_width = self._a_shape.width
        b_width = self._b_shape.width

        # Ports are collected in the same order they are passed on.
        ports = {"i_CLK": ClockSignal()}

        # Every clock enable is tied high.
        for enable in ("CEA0A1", "CEA2A3", "CEB0B1", "CEB2B3",
                       "CEC", "CEPIPE", "CEOUT", "CECTRL"):
            ports[f"i_{enable}"] = Const(1)

        # Every reset input follows the domain reset.
        for reset in ("RSTA0A1", "RSTA2A3", "RSTB0B1", "RSTB2B3",
                      "RSTC", "RSTCTRL", "RSTPIPE", "RSTOUT"):
            ports[f"i_{reset}"] = ResetSignal()

        ports["i_SIGNED"] = Const(1)
        ports["i_ADDSUB"] = Const(0, unsigned(4))

        for lane in range(4):
            a_word = self.input_a.word_select(lane, a_width)
            b_word = self.input_b.word_select(lane, b_width)
            ports[f"i_A{lane}"] = a_word
            # Bit 7 of the B word is repeated as one extra top bit.
            # NOTE(review): presumably sign extension of an 8-bit word to
            # a 9-bit port - confirm against the primitive's port widths.
            ports[f"i_B{lane}"] = Cat(b_word, b_word[7])

        ports["i_C"] = Const(0, unsigned(54))
        ports["i_LOADC"] = self.input_first
        ports["o_Z"] = accumulator

        params = {
            "p_REGINPUTAB0": "BYPASS",
            "p_REGINPUTAB1": "BYPASS",
            "p_REGINPUTAB2": "BYPASS",
            "p_REGINPUTAB3": "BYPASS",
            "p_REGINPUTC": "BYPASS",
            "p_REGADDSUB": "BYPASS",
            "p_REGLOADC": "BYPASS",
            "p_REGLOADC2": "REGISTER",
            "p_REGPIPELINE": "REGISTER",
            "p_REGOUTPUT": "REGISTER",
            "p_RESETMODE": "SYNC",
            "p_GSR": "ENABLED",
        }

        # Explicitly instantiate the DSP macro
        m.submodules.dsp = Instance("MULTADDSUB9X9WIDE", **ports, **params)
Esempio n. 24
0
            m.d.sync += self.value.eq(0)

        with m.Else():
            value_p1 = Signal.like(self.count)
            next_value = Signal.like(self.value)
            m.d.comb += [
                value_p1.eq(self.value + 1),
                self.last.eq(value_p1 == self.count),
                next_value.eq(Mux(self.last, 0, value_p1)),
            ]
            with m.If(self.next):
                m.d.sync += self.value.eq(next_value)


# Two-field layout: a 16-bit unsigned 'count' and a 4-bit unsigned 'repeats'.
# NOTE(review): presumably the count and number of repeats used by
# LoopingAddressGenerator - confirm against where this layout is used.
LDR_PARAMS_LAYOUT = [
    ('count', unsigned(16)),
    ('repeats', unsigned(4)),
]


class LoopingAddressGenerator(SimpleElaboratable):
    """Generates addresses from a memory.

    Loops from address zero for a given count, with configurable
    number of repeats.

    Parameters
    ----------

    depth: int
        Number of words in the memory being addressed. Maximum
Esempio n. 25
0
 def __init__(self):
     """Create the input and output stream endpoints."""
     # Signed 8-bit payload in, unsigned 32-bit payload out.
     self.input = Endpoint(signed(8))
     self.output = Endpoint(unsigned(32))
Esempio n. 26
0
 def __init__(self):
     """Initialise the base class with two 32-bit unsigned shapes."""
     word_shape = unsigned(32)
     super().__init__(word_shape, word_shape)
Esempio n. 27
0
 def __init__(self):
     """Create the three stream endpoints."""
     # Stream endpoint with a 32-bit unsigned payload.
     self.num_results = Endpoint(unsigned(32))
     # Stream endpoints with 32-bit signed payloads.
     self.results = Endpoint(signed(32))
     self.accumulated = Endpoint(signed(32))
Esempio n. 28
0
def unsigned_upto(maximum_value):
    """Return an unsigned shape just wide enough to hold maximum_value.

    The width is ``maximum_value.bit_length()``.
    """
    bits_needed = maximum_value.bit_length()
    return unsigned(bits_needed)
Esempio n. 29
0
 def addr_shape(self) -> Shape:
     """Return the unsigned Amaranth shape of an address.

     The width is the bit length of ``self.depth - 1``.
     """
     highest_address = self.depth - 1
     return unsigned(highest_address.bit_length())