コード例 #1
0
    def __init__(self,
                 specialize_nx=False,
                 a_size=Constants.SYS_ARRAY_HEIGHT,
                 b_size=Constants.SYS_ARRAY_WIDTH,
                 n=4,
                 a_shape=signed(9),
                 b_shape=signed(8),
                 accumulator_shape=signed(32)):
        """Record the array configuration and declare its interface signals.

        Args:
            specialize_nx: stored in ``_specialize_nx``; not otherwise used
                in this constructor.
            a_size: number of ``input_a`` signals created.
            b_size: number of ``input_b`` signals created.
            n: multiplier applied to the per-value width when sizing each
                input signal.
            a_shape: shape whose ``width`` sizes each ``input_a`` signal.
            b_shape: shape whose ``width`` sizes each ``input_b`` signal.
            accumulator_shape: shape of every accumulator signal.
        """
        self._specialize_nx = specialize_nx
        self._a_size = a_size
        self._b_size = b_size
        self._n = n
        self._a_shape = a_shape
        self._b_shape = b_shape
        self._accumulator_shape = accumulator_shape

        # One unsigned signal per row, n * a_shape.width bits wide.
        self.input_a = [
            Signal(unsigned(n * a_shape.width), name=f"input_a{i}")
            for i in range(a_size)
        ]
        # One unsigned signal per column, n * b_shape.width bits wide.
        self.input_b = [
            Signal(unsigned(n * b_shape.width), name=f"input_b{i}")
            for i in range(b_size)
        ]

        # Single-bit markers; presumably flag the first and last inputs of a
        # sequence -- confirm against the elaboration code.
        self.first = Signal()
        self.last = Signal()

        # a_size * b_size accumulators, each paired (by index) with a
        # single-bit signal in accumulator_new.
        self.accumulator = [
            Signal(accumulator_shape) for _ in range(a_size * b_size)
        ]
        self.accumulator_new = [Signal() for _ in range(a_size * b_size)]
コード例 #2
0
ファイル: post_process.py プロジェクト: tcal-x/CFU-Playground
 def __init__(self):
     """Declare the endpoints and configuration signals of this component."""
     # Endpoint built over a signed 32-bit payload.
     self.input = Endpoint(signed(32))
     # Endpoint built over the POST_PROCESS_PARAMS payload definition.
     self.params = Endpoint(POST_PROCESS_PARAMS)
     # Endpoint built over a signed 8-bit payload.
     self.output = Endpoint(signed(8))
     # Signed 9-bit offset and signed 8-bit activation bounds.
     self.offset = Signal(signed(9))
     self.activation_min = Signal(signed(8))
     self.activation_max = Signal(signed(8))
コード例 #3
0
 def __init__(self, n):
     """Declare the interface for an ``n``-element operand pair.

     Args:
         n: number of 8-bit elements in each of the 'inputs' and 'filters'
             payload fields (each field is ``8 * n`` bits wide).
     """
     self._n = n
     self.enable = Signal()
     # Signed 9-bit offset.
     self.offset = Signal(signed(9))
     # Operand endpoint whose payload has two fields of 8 * n bits each.
     self.operands = Endpoint(
         Layout([('inputs', Shape(8 * n)), ('filters', Shape(8 * n))]))
     # Result endpoint with a signed 32-bit payload.
     self.result = Endpoint(signed(32))
コード例 #4
0
ファイル: post_process.py プロジェクト: tcal-x/CFU-Playground
    def transform(self, m, in_value, out_value):
        """Divide ``in_value.dividend`` by ``2**in_value.shift`` with rounding.

        The work is spread over three register stages: inputs are registered,
        the rounded quotient is computed and registered, and the result is
        registered into ``out_value``.

        Args:
            m: module that receives the generated statements.
            in_value: record providing ``dividend`` and ``shift`` fields.
            out_value: signal that receives the rounded quotient.
        """
        # Cycle 0: register inputs
        dividend = Signal(signed(32))
        shift = Signal(4)
        m.d.sync += dividend.eq(in_value.dividend)
        m.d.sync += shift.eq(in_value.shift)

        # Cycle 1: calculate
        result = Signal(signed(32))
        remainder = Signal(signed(32))
        # Our threshold looks like 010, 0100, 01000 etc for positive values and
        # 011, 0101, 01001 etc for negative values.
        threshold = Signal(signed(32))
        quotient = Signal(signed(32))
        negative = Signal()
        m.d.comb += negative.eq(dividend < 0)
        # Only shift amounts 2..12 have a case; for any other value the
        # combinational signals assigned below keep their default values.
        with m.Switch(shift):
            for n in range(2, 13):
                with m.Case(n):
                    # Low n bits of the dividend are the remainder.
                    mask = (1 << n) - 1
                    m.d.comb += remainder.eq(dividend & mask)
                    # Bits [1:] hold 1 << (n - 2), i.e. the threshold is
                    # 1 << (n - 1) before bit 0 is filled in below.
                    m.d.comb += threshold[1:].eq(1 << (n - 2))
                    m.d.comb += quotient.eq(dividend >> n)
        # Bit 0 raises the threshold by one for negative dividends.
        m.d.comb += threshold[0].eq(negative)
        # Round up by one when the remainder reaches the threshold.
        m.d.sync += result.eq(quotient + Mux(remainder >= threshold, 1, 0))

        # Cycle 2: send output
        m.d.sync += out_value.eq(result)
コード例 #5
0
 def __init__(self, platform=None, top=False):
     '''Read configuration from the platform and declare interface signals.

         platform  -- pass test platform
         top       -- trigger synthesis of module

     NOTE(review): ``platform`` defaults to None but its attributes are
     read unconditionally below, so omitting it raises AttributeError.
     '''
     self.top = top
     self.platform = platform
     # Clock divider looked up from the platform's clock table.
     self.divider = platform.clks[platform.hfosc_div]
     self.order = platform.poldegree
     self.bit_shift = bit_shift(platform)
     self.motors = platform.motors
     self.max_steps = int(MOVE_TICKS / 2)  # Nyquist
     # inputs
     # For each motor, up to three signed coefficients of bit_shift + 1 bits;
     # the slice keeps only the first `order` of them.
     self.coeff = Array()
     for _ in range(self.motors):
         self.coeff.extend([
             Signal(signed(self.bit_shift + 1)),
             Signal(signed(self.bit_shift + 1)),
             Signal(signed(self.bit_shift + 1))
         ][:self.order])
     self.start = Signal()
     # Wide enough to hold MOVE_TICKS.
     self.ticklimit = Signal(MOVE_TICKS.bit_length())
     # output
     self.busy = Signal()
     # One single-bit direction and step signal per motor.
     self.dir = Array(Signal() for _ in range(self.motors))
     self.step = Array(Signal() for _ in range(self.motors))
コード例 #6
0
 def __init__(self):
     """Declare the operand, function-code, result and handshake signals."""
     # Two 32-bit unsigned operands and a 7-bit function code.
     self.in0 = Signal(32)
     self.in1 = Signal(32)
     self.funct7 = Signal(7)
     # 32-bit result with single-bit start/done handshake signals.
     self.output = Signal(32)
     self.start = Signal()
     self.done = Signal()
     # Signed 32-bit counterparts of the operands; presumably driven from
     # in0/in1 elsewhere -- confirm against the elaboration code.
     self.in0s = Signal(signed(32))
     self.in1s = Signal(signed(32))
コード例 #7
0
 def max_(word0, word1):
     """Return the byte-wise signed maximum of two 32-bit words.

     Each word is split into four 8-bit lanes; for every lane the byte that
     is larger when interpreted as signed is selected. The four selected
     bytes are concatenated, lowest lane first.

     Relies on a module ``m`` from the enclosing scope to hold the
     combinational statements.
     """
     result = [Signal(8, name=f"result{i}") for i in range(4)]
     bytes0 = [word0[i:i + 8] for i in range(0, 32, 8)]
     bytes1 = [word1[i:i + 8] for i in range(0, 32, 8)]
     for r, b0, b1 in zip(result, bytes0, bytes1):
         # Copy each byte into a signed signal so the comparison is signed.
         sb0 = Signal(signed(8))
         m.d.comb += sb0.eq(b0)
         sb1 = Signal(signed(8))
         m.d.comb += sb1.eq(b1)
         # Ties select b0.
         m.d.comb += r.eq(Mux(sb1 > sb0, b1, b0))
     return Cat(*result)
コード例 #8
0
ファイル: macc.py プロジェクト: tcal-x/CFU-Playground
    def elab(self, m):
        """Build a four-way multiply-accumulate step.

        Each 8-bit lane of ``i_data`` has ``offset`` added and is multiplied
        by the matching lane of ``f_data``; the sum of the four products is
        registered into ``result``.

        Args:
            m: module that receives the generated statements.
        """
        # Product is 17 bits: 8 bits * 9 bits = 17 bits
        products = [Signal(signed(17), name=f"product_{n}") for n in range(4)]
        for i_val, f_val, product in zip(all_words(self.i_data, 8),
                                         all_words(self.f_data, 8), products):
            # Register the sign-interpreted filter value.
            f_tmp = Signal(signed(9))
            m.d.sync += f_tmp.eq(f_val.as_signed())
            # Register the sign-interpreted input value plus the offset.
            i_tmp = Signal(signed(9))
            m.d.sync += i_tmp.eq(i_val.as_signed() + self.offset)
            # The multiplication itself is combinational.
            m.d.comb += product.eq(i_tmp * f_tmp)

        m.d.sync += self.result.eq(tree_sum(products))
コード例 #9
0
    def __init__(self):
        """Declare offset, accumulator-reset and output-parameter signals."""
        super().__init__()
        # Signed 32-bit input offset and a single-bit accumulator reset.
        self.input_offset = Signal(signed(32))
        self.reset_acc = Signal()

        self.output_offset = Signal(signed(32))

        # Each value below is paired with a single-bit `*_set` signal;
        # presumably a write strobe for the value -- confirm against users.
        self.out_depth_set = Signal()
        self.out_depth = Signal(32)
        self.out_mult_set = Signal()
        self.out_mult = Signal(signed(32))
        self.out_bias_shift_set = Signal()
        self.out_bias_shift = Signal(32)
コード例 #10
0
 def __init__(self, order=3, totalbits=16, fractionalbits=5):
     """Declare the fixed-point coefficient, time and result signals.

     Args:
         order: number of coefficient signals (and matching ``beta``
             signals) to create.
         totalbits: width in bits of every coefficient, ``t`` and ``beta``
             signal.
         fractionalbits: number of fractional bits in the fixed-point
             representation; stored for use during elaboration.
     """
     # Fixed point arithmetic
     # https://vha3.github.io/FixedPoint/FixedPoint.html
     self.totalbits = totalbits
     self.fractionalbits = fractionalbits
     self.signed = 1
     # Bernstein coefficients
     # `order` is a count, so iterate over range(order); iterating the
     # integer itself raises TypeError.
     self.coeff = Array()
     for _ in range(order):
         self.coeff.extend([Signal(signed(totalbits))])
     # time
     self.t = Signal(signed(totalbits))
     # In / out signals
     self.done = Signal()
     # Working copy with one signal per coefficient, same shape. Array has
     # no `like` method, so the signals are constructed explicitly.
     self.beta = Array()
     for _ in self.coeff:
         self.beta.extend([Signal(signed(totalbits))])
コード例 #11
0
ファイル: post_process.py プロジェクト: tcal-x/CFU-Playground
    def elab(self, m):
        """Build the post-processing datapath from accumulator to result.

        Stages, in order: add bias, apply a left shift (when ``shift`` is
        positive), multiply through the SRDHM submodule, apply a rounding
        right shift (when ``shift`` is not positive), add the offset, and
        clamp between ``activation_min`` and ``activation_max``.

        Args:
            m: module that receives the generated statements.
        """
        with_bias = Signal(signed(32))
        m.d.comb += with_bias.eq(self.accumulator + self.bias)

        # acc = cpp_math_mul_by_quantized_mul_software(
        #       acc, param_store_read(&output_multiplier),
        #       param_store_read(&output_shift));
        left_shift = Signal(5)
        right_sr = [Signal(5, name=f'right_sr_{n}') for n in range(4)]
        # A positive shift is applied as a left shift now; otherwise its
        # negation is queued as the right-shift amount.
        with m.If(self.shift > 0):
            m.d.comb += left_shift.eq(self.shift)
        with m.Else():
            m.d.comb += right_sr[0].eq(-self.shift)
        left_shifted = Signal(32)
        # NOTE(review): the trailing comma makes the right-hand side a
        # one-element tuple; it looks accidental.
        m.d.comb += left_shifted.eq(with_bias << left_shift),

        # Pass right shift value down through several cycles to where
        # it is needed
        for a, b in zip(right_sr, right_sr[1:]):
            m.d.sync += b.eq(a)

        # All logic is combinational up to the inputs to the SRDHM
        m.submodules['srdhm'] = srdhm = SRDHM()
        m.d.comb += [
            srdhm.a.eq(left_shifted),
            srdhm.b.eq(self.multiplier),
        ]

        # Output from SRDHM appears several cycles later
        right_shifted = Signal(signed(32))
        m.d.sync += right_shifted.eq(
            rounding_divide_by_pot(srdhm.result, right_sr[-1]))

        # This logic is combinational to output
        # acc += reg_output_offset
        # if (acc < reg_activation_min) {
        #     acc = reg_activation_min
        # } else if (acc > reg_activation_max) {
        #     acc = reg_activation_max
        # }
        # return acc
        with_offset = Signal(signed(32))
        m.d.comb += [
            with_offset.eq(right_shifted + self.offset),
            self.result.eq(
                clamped(with_offset, self.activation_min,
                        self.activation_max)),
        ]
コード例 #12
0
ファイル: post_process.py プロジェクト: tcal-x/CFU-Playground
 def __init__(self, payload_type=signed(32)):
     """Declare matching input/output endpoints and control signals.

     Args:
         payload_type: payload definition used for both endpoints;
             defaults to a signed 32-bit value.
     """
     self.stream_in = Endpoint(payload_type)
     self.stream_out = Endpoint(payload_type)
     # 18-bit count; presumably the number of items allowed through --
     # confirm against the elaboration code.
     self.num_allowed = Signal(18)
     # Single-bit control and status signals.
     self.start = Signal()
     self.running = Signal()
     self.finished = Signal()
コード例 #13
0
ファイル: shifter.py プロジェクト: bieganski/mtkcpu
 def __init__(self):
     """Declare the shifter's operand, shift amount, result and opcode signals."""
     # 32-bit operand, plus a signed 32-bit signal alongside it.
     self.src1 = Signal(32, name="shifter_src1")
     self.src1signed = Signal(signed(32))
     self.shift = Signal(5, name="shifter_shift")  # 5 lowest imm bits
     self.res = Signal(32, name="shifter_res")
     # Signals shaped by the Funct3 / Funct7 definitions.
     self.funct3 = Signal(Funct3)
     self.funct7 = Signal(Funct7)
コード例 #14
0
ファイル: post_process.py プロジェクト: tcal-x/CFU-Playground
 def __init__(self):
     """Declare the post-processing inputs and result, all signed 32-bit."""
     # Value to process and the bias associated with it.
     self.accumulator = Signal(signed(32))
     self.bias = Signal(signed(32))
     # Multiplier and shift parameters.
     self.multiplier = Signal(signed(32))
     self.shift = Signal(signed(32))
     # Offset and the activation bounds.
     self.offset = Signal(signed(32))
     self.activation_min = Signal(signed(32))
     self.activation_max = Signal(signed(32))
     self.result = Signal(signed(32))
コード例 #15
0
    def elab(self, m):
        """Wire the filter store, input fetcher, systolic array and pipeline.

        Activations (with the configured input offset added) and filter
        values feed the systolic array; its accumulators are read out,
        combined with stored parameters in the post-process pipeline, and
        assembled into output words.

        Args:
            m: module that receives the generated statements and submodules.
        """
        # Create filter store and input fetcher
        filter_values = self.build_filter_store(m)
        stop_input = Signal()
        first, last, activations = self.build_input_fetcher(m, stop_input)

        # Plumb in sysarray and its inputs
        m.submodules['sysarray'] = sa = SystolicArray(self._specialize_nx)
        for j, (in_a, activation) in enumerate(zip(sa.input_a, activations)):
            # Assign activation values with input offset
            for i in range(4):
                # Take byte i of the activation word as a signed value.
                raw_val = Signal(signed(8), name=f"raw_{j}_{i}")
                m.d.comb += raw_val.eq(activation[i * 8:i * 8 + 8])
                # Register the offset sum, then place it in 9-bit lane i.
                with_offset = Signal(signed(9), name=f"val_{j}_{i}")
                m.d.sync += with_offset.eq(raw_val + self.config.input_offset)
                m.d.comb += in_a[i * 9:i * 9 + 9].eq(with_offset)
        # Filter values and first/last are registered as well, matching the
        # one-cycle delay of the activation path above.
        for in_b, value in zip(sa.input_b, filter_values):
            m.d.sync += in_b.eq(value)
        m.d.sync += sa.first.eq(first)
        m.d.sync += sa.last.eq(last)

        # Get pipeline inputs from systolic array and parameters
        accumulator_stream, finished = self.build_accumulator_reader(
            m, sa.accumulator, sa.accumulator_new)
        param_stream = self.build_param_store(m)

        # When last accumulator read, stop input
        m.d.comb += stop_input.eq(finished)

        # Plumb in pipeline
        m.submodules['ppp'] = ppp = PostProcessPipeline()

        m.d.comb += connect(accumulator_stream, ppp.input)
        m.d.comb += connect(param_stream, ppp.params)
        m.d.comb += [
            ppp.offset.eq(self.config.output_offset),
            ppp.activation_min.eq(self.config.output_activation_min),
            ppp.activation_max.eq(self.config.output_activation_max),
        ]

        # Handle output
        # The assembler is wrapped so that self.reset resets it.
        m.submodules['owa'] = owa = ResetInserter(self.reset)(
            OutputWordAssembler())
        # half_mode is the inverse of the configured mode bit.
        m.d.comb += owa.half_mode.eq(~self.config.mode)
        m.d.comb += connect(ppp.output, owa.input)
        m.d.comb += connect(owa.output, self.output)
コード例 #16
0
ファイル: macc.py プロジェクト: tcal-x/CFU-Playground
 def elab(self, m):
     """Build a signed 32-bit accumulator with add-enable and clear.

     ``result`` shows the stored value, or the stored value plus
     ``in_value`` in the same cycle that ``add_en`` is high.

     Args:
         m: module that receives the generated statements.
     """
     accumulator = Signal(signed(32))
     # Default: result mirrors the stored accumulator.
     m.d.comb += self.result.eq(accumulator)
     with m.If(self.add_en):
         m.d.sync += accumulator.eq(accumulator + self.in_value)
         # Overrides the default above so the new sum is visible at once.
         m.d.comb += self.result.eq(accumulator + self.in_value)
     # clear always resets accumulator next cycle, even if add_en is high
     with m.If(self.clear):
         m.d.sync += accumulator.eq(0)
コード例 #17
0
    def elab(self, m):
        """Build a flow-controlled, pipelined n-way multiply-accumulate.

        Operands arrive on ``self.operands``; each 8-bit input has
        ``self.offset`` added and is multiplied by the matching 8-bit
        filter value, and the sum of products is presented on
        ``self.result``. Registers only advance while the pipe is flowing.

        Args:
            m: module that receives the generated statements.
        """
        # Pipeline flow control:
        pipe_flowing = Signal()
        # We have a sequence of valid signals for each stage in our pipeline.
        # When the pipe is flowing, the signals tick along through the pipe.
        valid = self.operands.valid
        for _ in range(self.PIPELINE_CYCLES):
            next_valid = Signal()
            with m.If(pipe_flowing):
                m.d.sync += next_valid.eq(valid)
            valid = next_valid
        # After the loop `valid` is the valid bit of the final stage.
        m.d.comb += self.result.valid.eq(self.enable & valid)
        # The pipe flows as long as we are transferring out the end this cycle,
        # or a valid value hasn't yet made it to the end.
        m.d.comb += pipe_flowing.eq(self.enable
                                    & (self.result.is_transferring() | ~valid))
        # We are ready to receive new values at the start of the pipe
        # as long as it's flowing.
        m.d.comb += self.operands.ready.eq(pipe_flowing)

        # Chop operands payload into 8-bit signed signals
        inputs = [
            self.operands.payload['inputs'][i:i + 8].as_signed()
            for i in range(0, 8 * self._n, 8)
        ]
        filters = [
            self.operands.payload['filters'][i:i + 8].as_signed()
            for i in range(0, 8 * self._n, 8)
        ]

        # Product is 17 bits: 8 bits * 9 bits = 17 bits
        products = [
            Signal(signed(17), name=f"product_{i:02x}") for i in range(self._n)
        ]
        # Everything below, including the combinational product assignment,
        # is conditional on the pipe flowing.
        with m.If(pipe_flowing):
            for i_val, f_val, product in zip(inputs, filters, products):
                f_tmp = Signal(signed(9))
                m.d.sync += f_tmp.eq(f_val)
                i_tmp = Signal(signed(9))
                m.d.sync += i_tmp.eq(i_val + self.offset)
                # TODO: consider whether to register output of multiplication
                m.d.comb += product.eq(i_tmp * f_tmp)

            m.d.sync += self.result.payload.eq(tree_sum(products))
コード例 #18
0
ファイル: post_process.py プロジェクト: tcal-x/CFU-Playground
 def transform(self, m, in_value, out_value):
     """Add ``self.offset`` to ``in_value`` and saturate to [min, max].

     Args:
         m: module that receives the generated statements.
         in_value: value to offset and saturate.
         out_value: signal registered with the saturated result.
     """
     # Cycle 0: add offset, saturate, register result into out_value
     with_offset = Signal(signed(32))
     m.d.comb += with_offset.eq(in_value + self.offset)
     # The upper bound is checked first, so it wins if both tests match.
     with m.If(with_offset > self.max):
         m.d.sync += out_value.eq(self.max)
     with m.Elif(with_offset < self.min):
         m.d.sync += out_value.eq(self.min)
     with m.Else():
         m.d.sync += out_value.eq(with_offset)
コード例 #19
0
    def __init__(self, mem_port: LoadStoreInterface):
        """Declare the load/store unit's signals around a memory port.

        Args:
            mem_port: memory interface, stored as ``self.loadstore``.
        """

        self.loadstore = mem_port

        # Input signals.
        self.store = Signal()  # assume 'load' if deasserted.
        self.funct3 = Signal(Funct3)
        self.src1 = Signal(32, name="LD_ST_src1")

        # 'src2' is used only for 'store' instructions.
        self.src2 = Signal(32, name="LD_ST_src2")
        # Signed 12-bit offset.
        self.offset = Signal(signed(12), name="LD_ST_offset")

        # NOTE(review): 'res' is declared under the input heading but its
        # name suggests a result -- confirm its direction.
        self.res = Signal(signed(32), name="LD_ST_res")
        self.en = Signal(
            name="LD_ST_en")  # TODO implement 'ready/valid' interface

        # Output signals.
        self.ack = Signal(name="LD_ST_ack")
コード例 #20
0
ファイル: post_process.py プロジェクト: tcal-x/CFU-Playground
    def transform(self, m, in_value, out_value):
        """Multiply ``in_value.a`` by ``in_value.b`` and keep the high bits.

        The absolute value of ``a`` is multiplied by ``b``, a nudge is
        added, bits 31 and above are taken, and the sign of ``a`` is
        re-applied before registering the result into ``out_value``.

        Args:
            m: module that receives the generated statements.
            in_value: record providing ``a`` and ``b`` fields.
            out_value: signal registered with the final value.
        """
        # Cycle 0: register inputs
        a = in_value.a
        reg_a = Signal(signed(32))
        reg_b = Signal(signed(32))
        # Register |a|; b is registered unchanged.
        m.d.sync += reg_a.eq(Mux(a >= 0, a, -a))
        m.d.sync += reg_b.eq(in_value.b)

        # Cycle 1: multiply to register
        # NOTE(review): only `a` is made non-negative above; reg_b keeps its
        # sign, so the product is not necessarily positive -- confirm intent.
        reg_ab = Signal(signed(63))
        m.d.sync += reg_ab.eq(reg_a * reg_b)

        # Cycle 2: nudge, take high bits and sign
        positive_2 = self.delay(m, 2, a >= 0)  # Whether input positive
        # The nudge is one smaller when `a` was negative.
        nudged = reg_ab + Mux(positive_2, (1 << 30), (1 << 30) - 1)
        high_bits = Signal(signed(32))
        m.d.comb += high_bits.eq(nudged[31:])
        # Negate the result when `a` was negative.
        with_sign = Mux(positive_2, high_bits, -high_bits)
        m.d.sync += out_value.eq(with_sign)
コード例 #21
0
 def elab(self, m):
     """Build a four-lane multiply-accumulate step with start/done.

     On ``start``, the four products of (input byte + ``input_offset``) and
     filter byte are summed into ``accumulator`` and ``done`` is raised for
     one cycle. ``reset_acc`` clears the accumulator when ``start`` is low.

     Args:
         m: module that receives the generated statements.
     """
     in_vals = [Signal(signed(8), name=f"in_val_{i}") for i in range(4)]
     filter_vals = [
         Signal(
             signed(8),
             name=f"filter_val_{i}") for i in range(4)]
     mults = [Signal(signed(19), name=f"mult_{i}") for i in range(4)]
     for i in range(4):
         # Byte i of in0 / in1, interpreted as signed, and their product
         # after adding the input offset.
         m.d.comb += [
             in_vals[i].eq(self.in0.word_select(i, 8).as_signed()),
             filter_vals[i].eq(self.in1.word_select(i, 8).as_signed()),
             mults[i].eq(
                 (in_vals[i] + self.input_offset) * filter_vals[i]),
         ]
     # done defaults to 0 and is overridden below when start is high.
     m.d.sync += self.done.eq(0)
     with m.If(self.start):
         m.d.sync += self.accumulator.eq(self.accumulator + sum(mults))
         # m.d.sync += self.accumulator.eq(self.accumulator + 72)
         m.d.sync += self.done.eq(1)
     # start takes priority over reset_acc.
     with m.Elif(self.reset_acc):
         m.d.sync += self.accumulator.eq(0)
コード例 #22
0
ファイル: post_process.py プロジェクト: tcal-x/CFU-Playground
 def __init__(self):
     """Declare the post-processing parameter signals."""
     super().__init__()
     # Each of bias, multiplier and shift is a signed 32-bit value paired
     # with a single-bit `*_next` signal; presumably a request to advance to
     # the next stored value -- confirm against the elaboration code.
     self.bias = Signal(signed(32))
     self.bias_next = Signal()
     self.multiplier = Signal(signed(32))
     self.multiplier_next = Signal()
     self.shift = Signal(signed(32))
     self.shift_next = Signal()
     # Offset and activation bounds, all signed 32-bit.
     self.offset = Signal(signed(32))
     self.activation_min = Signal(signed(32))
     self.activation_max = Signal(signed(32))
コード例 #23
0
    def __init__(self, platform, top=False):
        """Store the platform and declare the SPI bus and interface signals.

        platform  -- pass test platform
        top       -- trigger synthesis of module
        """
        self.platform = platform
        self.top = top

        self.spi = SPIBus()
        # One signed 64-bit position signal per motor on the platform.
        self.position = Array(
            Signal(signed(64)) for _ in range(platform.motors))
        self.pinstate = Signal(8)
        # Single-bit read-side control signals.
        self.read_commit = Signal()
        self.read_en = Signal()
        self.read_discard = Signal()
        self.dispatcherror = Signal()
        self.parse = Signal()
        # Read data is MEMWIDTH bits wide.
        self.read_data = Signal(MEMWIDTH)
        self.empty = Signal()
コード例 #24
0
    def elaborate(self, platform):
        """Build the FSM that repeatedly interpolates ``beta`` in place.

        States:
            INIT: copy ``coeff`` into ``beta`` and reset the counters.
            UPDATE: compute ``beta[k] * (1 - t) + beta[k + 1] * t`` into a
                double-width temporary.
            MULTIPLICATIONFIX: write the fixed-point-adjusted slice of the
                temporary back to ``beta[k]`` and advance ``k`` / ``j``.
            FINISH: hold ``done`` high.

        Args:
            platform: unused here.

        Returns:
            The constructed module.
        """
        m = Module()

        beta = self.beta
        # Twice the operand width, to hold a full product.
        temp = Signal(signed(self.totalbits * 2))

        n = len(self.coeff)
        # j is compared against n and counts up to it, so it must be able to
        # represent n itself; range(1, n) is too narrow when n is a power of
        # two and would wrap instead of terminating.
        j = Signal(range(1, n + 1))
        k = Signal.like(j)
        with m.FSM(reset='INIT'):
            with m.State('INIT'):
                m.d.sync += self.done.eq(0)
                for i in range(n):
                    m.d.sync += beta[i].eq(self.coeff[i])
                m.d.sync += [k.eq(0), j.eq(1)]
                m.next = 'UPDATE'
            # States must be opened with m.State; m.FSM here would try to
            # nest a new FSM inside this one.
            with m.State('UPDATE'):
                m.d.sync += temp.eq(beta[k] * (1 - self.t) +
                                    beta[k + 1] * self.t)
                m.next = 'MULTIPLICATIONFIX'
            # Fixed point arithmetic need fix
            # see multiplication as https://vha3.github.io/FixedPoint/FixedPoint.html
            with m.State('MULTIPLICATIONFIX'):
                # Drop the extra fractional bits produced by the multiply.
                m.d.sync += beta[k].eq(
                    temp[self.fractionalbits:self.fractionalbits +
                         self.totalbits])
                with m.If(k != n - j):
                    m.d.sync += k.eq(k + 1)
                    m.next = 'UPDATE'
                with m.Else():
                    with m.If(j != n):
                        m.d.sync += j.eq(j + 1)
                        m.d.sync += k.eq(0)
                        m.next = 'UPDATE'
                    with m.Else():
                        m.next = 'FINISH'
            with m.State('FINISH'):
                m.d.sync += self.done.eq(1)
                m.next = 'FINISH'
        return m
コード例 #25
0
    def elab(self, m):
        """Build a two-state multiply-and-take-high-bits unit.

        In "stage0", a ``start`` pulse either produces ``INT32_MAX``
        immediately (when both operands equal ``INT32_MIN``) or registers
        the product and moves to "stage1". "stage1" adds the nudge, takes
        bits 31 and above as the result, and raises ``done``.

        Args:
            m: module that receives the generated statements.
        """
        # done defaults to 0; the states below override it when finishing.
        m.d.sync += self.done.eq(0)

        ab = Signal(signed(64))
        nudge = 1 << 30  # for some reason negative nudge is not used
        with m.FSM():
            with m.State("stage0"):
                with m.If(self.start):
                    # Special case: both operands at the minimum value.
                    with m.If((self.a == INT32_MIN) & (self.b == INT32_MIN)):
                        m.d.sync += [
                            self.result.eq(INT32_MAX),
                            self.done.eq(1)
                        ]
                    with m.Else():
                        m.d.sync += ab.eq(self.a * self.b)
                        m.next = "stage1"
            with m.State("stage1"):
                m.d.sync += [
                    self.result.eq((ab + nudge)[31:]),
                    self.done.eq(1)
                ]
                m.next = "stage0"
コード例 #26
0
ファイル: post_process.py プロジェクト: tcal-x/CFU-Playground
    def elab(self, m):
        """Build a three-stage pipelined multiply-and-take-high-bits unit.

        Operands are registered, then multiplied (with the both-operands-
        equal-``INT32_MIN`` case detected in parallel), then the nudge is
        added and bits 31 and above become the result, or ``INT32_MAX`` in
        the detected overflow case.

        Args:
            m: module that receives the generated statements.
        """
        areg = Signal.like(self.a)
        breg = Signal.like(self.b)
        ab = Signal(signed(64))
        overflow = Signal()

        # for some reason negative nudge is not used
        nudge = 1 << 30

        # cycle 0, register a and b
        m.d.sync += [
            areg.eq(self.a),
            breg.eq(self.b),
        ]
        # cycle 1, decide if this is an overflow and multiply
        m.d.sync += [
            overflow.eq((areg == INT32_MIN) & (breg == INT32_MIN)),
            ab.eq(areg * breg),
        ]
        # cycle 2, apply nudge determine result
        m.d.sync += [
            self.result.eq(Mux(overflow, INT32_MAX, (ab + nudge)[31:])),
        ]
コード例 #27
0
 def __init__(self):
     """Declare two signed 32-bit operands, a result and start/done bits."""
     # Signed 32-bit operands.
     self.a = Signal(signed(32))
     self.b = Signal(signed(32))
     self.start = Signal()
     # Signed 32-bit result and its single-bit completion flag.
     self.result = Signal(signed(32))
     self.done = Signal()
コード例 #28
0
 def __init__(self):
     """Declare a signed 32-bit input, a 5-bit exponent and the result."""
     self.x = Signal(signed(32))
     # Unsigned 5-bit value.
     self.exponent = Signal(5)
     self.result = Signal(signed(32))
コード例 #29
0
 def __init__(self):
     """Declare the input offset, accumulator and accumulator-reset signals."""
     super().__init__()
     # Signed 32-bit offset and accumulator.
     self.input_offset = Signal(signed(32))
     self.accumulator = Signal(signed(32))
     # Single-bit reset request for the accumulator.
     self.reset_acc = Signal()
コード例 #30
0
from .filter import FilterStore, FILTER_WRITE_COMMAND
from .mem import SinglePortMemory
from .mode0_input import Mode0InputFetcher
from .mode1_input import Mode1InputFetcher
from .post_process import (AccumulatorReader, OutputWordAssembler, ParamWriter,
                           POST_PROCESS_PARAMS, POST_PROCESS_PARAMS_WIDTH,
                           PostProcessPipeline, ReadingProducer, StreamLimiter)
from .ram_mux import RamMux
from .sysarray import SystolicArray
from .utils import unsigned_upto

ACCELERATOR_CONFIGURATION_LAYOUT = [
    # The mode of the accelerator - mode 0 for input, mode 1 for full speed
    ('mode', unsigned(1)),
    # Offset applied to each input activation value.
    ('input_offset', signed(9)),
    # Number of words of filter data, per filter store
    ('num_filter_words', unsigned_upto(Constants.FILTER_WORDS_PER_STORE)),
    # Offset applied to each output value.
    ('output_offset', signed(9)),
    #  The minimum output value
    ('output_activation_min', signed(8)),
    #  The maximum output value
    ('output_activation_max', signed(8)),
    # Address of start of input data, in bytes
    ('input_base_addr', 18),
    # How many pixels in output row
    ('num_pixels_x', 9),
    # Number of RAM blocks to advance to move to new pixel in X direction
    ('pixel_advance_x', 4),
    # Number of RAM blocks  to advance between pixels in Y direction