def __init__(self,
             specialize_nx=False,
             a_size=Constants.SYS_ARRAY_HEIGHT,
             b_size=Constants.SYS_ARRAY_WIDTH,
             n=4,
             a_shape=signed(9),
             b_shape=signed(8),
             accumulator_shape=signed(32)):
    """Store the array configuration and create its port signals.

    Args:
        specialize_nx: stored as ``_specialize_nx``; not used further here.
        a_size: number of ``input_a`` signals created.
        b_size: number of ``input_b`` signals created.
        n: multiplier applied to the value width to size each input signal.
        a_shape: shape whose width, times ``n``, sizes each ``input_a``.
        b_shape: shape whose width, times ``n``, sizes each ``input_b``.
        accumulator_shape: shape of every accumulator signal.
    """
    # Configuration, kept for later use.
    self._specialize_nx = specialize_nx
    self._a_size = a_size
    self._b_size = b_size
    self._n = n
    self._a_shape = a_shape
    self._b_shape = b_shape
    self._accumulator_shape = accumulator_shape

    # Packed operand inputs: one unsigned signal per row / column.
    a_bits = n * a_shape.width
    self.input_a = [
        Signal(unsigned(a_bits), name=f"input_a{i}")
        for i in range(a_size)
    ]
    b_bits = n * b_shape.width
    self.input_b = [
        Signal(unsigned(b_bits), name=f"input_b{i}")
        for i in range(b_size)
    ]

    # Single-bit control inputs.
    self.first = Signal()
    self.last = Signal()

    # One accumulator and one single-bit "new" flag per (a, b) pair.
    cell_count = a_size * b_size
    self.accumulator = [
        Signal(accumulator_shape) for _ in range(cell_count)
    ]
    self.accumulator_new = [Signal() for _ in range(cell_count)]
def __init__(self):
    """Create the stream endpoints and the configuration signals."""
    # Streams: 32-bit signed values in, parameters in, 8-bit signed out.
    self.input = Endpoint(signed(32))
    self.params = Endpoint(POST_PROCESS_PARAMS)
    self.output = Endpoint(signed(8))

    # Configuration values.
    self.offset = Signal(signed(9))
    activation_shape = signed(8)
    self.activation_min = Signal(activation_shape)
    self.activation_max = Signal(activation_shape)
def __init__(self, n):
    """Create the control signals and stream endpoints.

    Args:
        n: stored as ``_n``; each operand field is ``8 * n`` bits wide.
    """
    self._n = n
    self.enable = Signal()
    self.offset = Signal(signed(9))

    # Both operand fields share the same packed width.
    field_bits = 8 * n
    operand_layout = Layout([
        ('inputs', Shape(field_bits)),
        ('filters', Shape(field_bits)),
    ])
    self.operands = Endpoint(operand_layout)
    self.result = Endpoint(signed(32))
def transform(self, m, in_value, out_value):
    """Shift ``in_value.dividend`` right by ``in_value.shift`` and round.

    The quotient is incremented by one when the bits shifted out are at
    least the threshold. Only shift values 2..12 have a case; for any
    other value the combinational signals keep their reset value.

    Args:
        m: module the logic is added to.
        in_value: source of the ``dividend`` and ``shift`` fields.
        out_value: signal that receives the registered result.
    """
    # Cycle 0: register inputs
    dividend = Signal(signed(32))
    shift = Signal(4)
    m.d.sync += [
        dividend.eq(in_value.dividend),
        shift.eq(in_value.shift),
    ]

    # Cycle 1: calculate
    result = Signal(signed(32))
    remainder = Signal(signed(32))
    # The threshold is 010, 0100, 01000, ... for positive dividends and
    # 011, 0101, 01001, ... for negative ones: bit 0 carries the sign flag.
    threshold = Signal(signed(32))
    quotient = Signal(signed(32))
    negative = Signal()
    m.d.comb += negative.eq(dividend < 0)
    with m.Switch(shift):
        for bits in range(2, 13):
            with m.Case(bits):
                low_bits_mask = (1 << bits) - 1
                m.d.comb += [
                    remainder.eq(dividend & low_bits_mask),
                    threshold[1:].eq(1 << (bits - 2)),
                    quotient.eq(dividend >> bits),
                ]
    m.d.comb += threshold[0].eq(negative)
    round_up = Mux(remainder >= threshold, 1, 0)
    m.d.sync += result.eq(quotient + round_up)

    # Cycle 2: send output
    m.d.sync += out_value.eq(result)
def __init__(self, platform=None, top=False):
    '''
    platform  -- pass test platform
    top       -- trigger synthesis of module

    NOTE: although platform defaults to None, attributes are read from
    it right away, so a platform object has to be supplied.
    '''
    self.top = top
    self.platform = platform
    self.divider = platform.clks[platform.hfosc_div]
    self.order = platform.poldegree
    self.bit_shift = bit_shift(platform)
    self.motors = platform.motors
    self.max_steps = int(MOVE_TICKS / 2)  # Nyquist

    # inputs
    # Three coefficient signals are built per motor; only the first
    # `order` of them are kept.
    coeff_width = self.bit_shift + 1
    self.coeff = Array()
    for _ in range(self.motors):
        candidates = [Signal(signed(coeff_width)) for _ in range(3)]
        self.coeff.extend(candidates[:self.order])
    self.start = Signal()
    self.ticklimit = Signal(MOVE_TICKS.bit_length())

    # output
    self.busy = Signal()
    self.dir = Array(Signal() for _ in range(self.motors))
    self.step = Array(Signal() for _ in range(self.motors))
def __init__(self):
    """Create the operand, control and result signals."""
    word_bits = 32

    # Raw operands and function selector.
    self.in0 = Signal(word_bits)
    self.in1 = Signal(word_bits)
    self.funct7 = Signal(7)

    # Result and handshake.
    self.output = Signal(word_bits)
    self.start = Signal()
    self.done = Signal()

    # Signed 32-bit counterparts of the operands.
    self.in0s = Signal(signed(word_bits))
    self.in1s = Signal(signed(word_bits))
def max_(word0, word1):
    """Return the byte-wise signed maximum of bits 0..31 of two words.

    Each of the four byte lanes is compared as a signed 8-bit value; the
    larger byte is selected, and the four results are concatenated with
    lane 0 first. The logic is added to ``m`` from the enclosing scope.
    """
    result = [Signal(8, name=f"result{i}") for i in range(4)]
    for lane, r in enumerate(result):
        low = 8 * lane
        b0 = word0[low:low + 8]
        b1 = word1[low:low + 8]
        # Signed copies are only used for the comparison.
        sb0 = Signal(signed(8))
        m.d.comb += sb0.eq(b0)
        sb1 = Signal(signed(8))
        m.d.comb += sb1.eq(b1)
        m.d.comb += r.eq(Mux(sb1 > sb0, b1, b0))
    return Cat(*result)
def elab(self, m):
    """Multiply four input/filter byte pairs and register their sum.

    Inputs (plus ``self.offset``) and filters are registered first, the
    products are combinational, and the sum is registered into
    ``self.result``.
    """
    # Product is 17 bits: 8 bits * 9 bits = 17 bits
    products = [
        Signal(signed(17), name=f"product_{n}") for n in range(4)
    ]
    input_words = all_words(self.i_data, 8)
    filter_words = all_words(self.f_data, 8)
    for i_val, f_val, product in zip(input_words, filter_words, products):
        f_tmp = Signal(signed(9))
        m.d.sync += f_tmp.eq(f_val.as_signed())
        i_tmp = Signal(signed(9))
        m.d.sync += i_tmp.eq(i_val.as_signed() + self.offset)
        m.d.comb += product.eq(i_tmp * f_tmp)

    total = tree_sum(products)
    m.d.sync += self.result.eq(total)
def __init__(self):
    """Initialise the base class and create the register signals.

    Each ``*_set`` signal is a single bit declared next to the value
    signal of the same name.
    """
    super().__init__()

    # Offsets and accumulator reset.
    self.input_offset = Signal(signed(32))
    self.reset_acc = Signal()
    self.output_offset = Signal(signed(32))

    # Output depth.
    self.out_depth_set = Signal()
    self.out_depth = Signal(32)

    # Output multiplier.
    self.out_mult_set = Signal()
    self.out_mult = Signal(signed(32))

    # Output bias shift.
    self.out_bias_shift_set = Signal()
    self.out_bias_shift = Signal(32)
def __init__(self, order=3, totalbits=16, fractionalbits=5):
    """Create the coefficient, time and result signals.

    Args:
        order: number of Bernstein coefficient signals to create.
        totalbits: width of every fixed-point signal.
        fractionalbits: number of fractional bits in the fixed-point
            format.
    """
    # Fixed point arithmetic
    # https://vha3.github.io/FixedPoint/FixedPoint.html
    self.totalbits = totalbits
    self.fractionalbits = fractionalbits
    self.signed = 1

    # Bernstein coefficients. `order` is an integer count, so it has to
    # be wrapped in range(); iterating over it directly raises TypeError.
    self.coeff = Array(Signal(signed(totalbits)) for _ in range(order))

    # time
    self.t = Signal(signed(totalbits))

    # In / out signals
    self.done = Signal()
    # Array has no `like` constructor; build one signal per coefficient
    # with the same shape instead.
    self.beta = Array(
        Signal.like(coefficient, name=f"beta{i}")
        for i, coefficient in enumerate(self.coeff)
    )
def elab(self, m):
    """Build the post-processing logic for one accumulator value.

    Adds the bias, applies the left shift, feeds the SRDHM submodule,
    applies the delayed right shift to its result, then adds the offset
    and clamps the value into ``self.result``.
    """
    with_bias = Signal(signed(32))
    m.d.comb += with_bias.eq(self.accumulator + self.bias)

    # acc = cpp_math_mul_by_quantized_mul_software(
    #       acc, param_store_read(&output_multiplier),
    #       param_store_read(&output_shift));
    # A positive shift is a left shift; otherwise the negated shift is
    # used as a right shift amount.
    left_shift = Signal(5)
    right_sr = [Signal(5, name=f'right_sr_{n}') for n in range(4)]
    with m.If(self.shift > 0):
        m.d.comb += left_shift.eq(self.shift)
    with m.Else():
        m.d.comb += right_sr[0].eq(-self.shift)
    left_shifted = Signal(32)
    m.d.comb += left_shifted.eq(with_bias << left_shift)

    # Pass the right shift value down through several cycles to where
    # it is needed.
    for stage in range(1, len(right_sr)):
        m.d.sync += right_sr[stage].eq(right_sr[stage - 1])

    # All logic is combinational up to the inputs to the SRDHM
    m.submodules['srdhm'] = srdhm = SRDHM()
    m.d.comb += srdhm.a.eq(left_shifted)
    m.d.comb += srdhm.b.eq(self.multiplier)

    # Output from SRDHM appears several cycles later
    right_shifted = Signal(signed(32))
    divided = rounding_divide_by_pot(srdhm.result, right_sr[-1])
    m.d.sync += right_shifted.eq(divided)

    # This logic is combinational to output
    # acc += reg_output_offset
    # if (acc < reg_activation_min) {
    #     acc = reg_activation_min
    # } else if (acc > reg_activation_max) {
    #     acc = reg_activation_max
    # }
    # return acc
    with_offset = Signal(signed(32))
    m.d.comb += with_offset.eq(right_shifted + self.offset)
    m.d.comb += self.result.eq(
        clamped(with_offset, self.activation_min, self.activation_max))
def __init__(self, payload_type=signed(32)):
    """Create the two stream endpoints and the control/status signals.

    Args:
        payload_type: payload type used for both endpoints.
    """
    # Streams.
    self.stream_in = Endpoint(payload_type)
    self.stream_out = Endpoint(payload_type)

    # 18-bit count plus single-bit control and status flags.
    self.num_allowed = Signal(18)
    self.start = Signal()
    self.running = Signal()
    self.finished = Signal()
def __init__(self):
    """Create the shifter's operand, result and selector signals."""
    word_bits = 32

    # Operand, as a raw word and as a signed value.
    self.src1 = Signal(word_bits, name="shifter_src1")
    self.src1signed = Signal(signed(word_bits))

    # Shift amount: 5 lowest imm bits.
    self.shift = Signal(5, name="shifter_shift")

    self.res = Signal(word_bits, name="shifter_res")

    # Operation selectors.
    self.funct3 = Signal(Funct3)
    self.funct7 = Signal(Funct7)
def __init__(self):
    """Create the input and result signals, all signed 32-bit."""
    word = signed(32)

    # Value to process and its per-channel parameters.
    self.accumulator = Signal(word)
    self.bias = Signal(word)
    self.multiplier = Signal(word)
    self.shift = Signal(word)

    # Output offset and clamping bounds.
    self.offset = Signal(word)
    self.activation_min = Signal(word)
    self.activation_max = Signal(word)

    self.result = Signal(word)
def elab(self, m):
    """Wire the filter store, input fetcher, systolic array and pipeline.

    Activations have the configured input offset added and are packed
    into the array's ``input_a`` signals; filter values drive
    ``input_b``. Accumulator results and parameters are streamed into
    the post-process pipeline, whose output passes through the output
    word assembler to ``self.output``.
    """
    # Create filter store and input fetcher
    filter_values = self.build_filter_store(m)
    stop_input = Signal()
    first, last, activations = self.build_input_fetcher(m, stop_input)

    # Plumb in sysarray and its inputs
    m.submodules['sysarray'] = sa = SystolicArray(self._specialize_nx)
    for j, (in_a, activation) in enumerate(zip(sa.input_a, activations)):
        # Assign activation values with input offset: each 8-bit slice of
        # the activation becomes a 9-bit slice of the array input.
        for i in range(4):
            src_lo = i * 8
            dst_lo = i * 9
            raw_val = Signal(signed(8), name=f"raw_{j}_{i}")
            m.d.comb += raw_val.eq(activation[src_lo:src_lo + 8])
            with_offset = Signal(signed(9), name=f"val_{j}_{i}")
            m.d.sync += with_offset.eq(raw_val + self.config.input_offset)
            m.d.comb += in_a[dst_lo:dst_lo + 9].eq(with_offset)
    for in_b, value in zip(sa.input_b, filter_values):
        m.d.sync += in_b.eq(value)
    m.d.sync += sa.first.eq(first)
    m.d.sync += sa.last.eq(last)

    # Get pipeline inputs from systolic array and parameters
    accumulator_stream, finished = self.build_accumulator_reader(
        m, sa.accumulator, sa.accumulator_new)
    param_stream = self.build_param_store(m)

    # When last accumulator read, stop input
    m.d.comb += stop_input.eq(finished)

    # Plumb in pipeline
    m.submodules['ppp'] = ppp = PostProcessPipeline()
    m.d.comb += connect(accumulator_stream, ppp.input)
    m.d.comb += connect(param_stream, ppp.params)
    m.d.comb += ppp.offset.eq(self.config.output_offset)
    m.d.comb += ppp.activation_min.eq(self.config.output_activation_min)
    m.d.comb += ppp.activation_max.eq(self.config.output_activation_max)

    # Handle output
    m.submodules['owa'] = owa = ResetInserter(self.reset)(
        OutputWordAssembler())
    m.d.comb += owa.half_mode.eq(~self.config.mode)
    m.d.comb += connect(ppp.output, owa.input)
    m.d.comb += connect(owa.output, self.output)
def elab(self, m):
    """Build an accumulator whose result includes the value being added.

    ``self.result`` shows the stored value, or the stored value plus
    ``self.in_value`` while ``self.add_en`` is high.
    """
    accumulator = Signal(signed(32))
    updated = accumulator + self.in_value

    # Default output is the stored value; it is overridden while adding.
    m.d.comb += self.result.eq(accumulator)
    with m.If(self.add_en):
        m.d.sync += accumulator.eq(updated)
        m.d.comb += self.result.eq(updated)

    # clear always resets accumulator next cycle, even if add_en is high
    with m.If(self.clear):
        m.d.sync += accumulator.eq(0)
def elab(self, m):
    """Build the pipelined multiply-accumulate with stream flow control.

    A chain of valid flags, one per pipeline cycle, follows the data
    through the pipe. All registers only advance while the pipe is
    flowing.
    """
    # Pipeline flow control:
    pipe_flowing = Signal()

    # We have a sequence of valid signals for each stage in our pipeline.
    # When the pipe is flowing, the signals tick along through the pipe.
    valid = self.operands.valid
    for _ in range(self.PIPELINE_CYCLES):
        next_valid = Signal()
        with m.If(pipe_flowing):
            m.d.sync += next_valid.eq(valid)
        valid = next_valid
    m.d.comb += self.result.valid.eq(self.enable & valid)

    # The pipe flows as long as we are transferring out the end this cycle,
    # or a valid value hasn't yet made it to the end.
    output_free = self.result.is_transferring() | ~valid
    m.d.comb += pipe_flowing.eq(self.enable & output_free)

    # We are ready to receive new values at the start of the pipe
    # as long as it's flowing.
    m.d.comb += self.operands.ready.eq(pipe_flowing)

    # Chop operands payload into 8-bit signed signals
    lane_starts = range(0, 8 * self._n, 8)
    inputs = [
        self.operands.payload['inputs'][lo:lo + 8].as_signed()
        for lo in lane_starts
    ]
    filters = [
        self.operands.payload['filters'][lo:lo + 8].as_signed()
        for lo in lane_starts
    ]

    # Product is 17 bits: 8 bits * 9 bits = 17 bits
    products = [
        Signal(signed(17), name=f"product_{i:02x}")
        for i in range(self._n)
    ]
    with m.If(pipe_flowing):
        for i_val, f_val, product in zip(inputs, filters, products):
            f_tmp = Signal(signed(9))
            m.d.sync += f_tmp.eq(f_val)
            i_tmp = Signal(signed(9))
            m.d.sync += i_tmp.eq(i_val + self.offset)
            # TODO: consider whether to register output of multiplication
            m.d.comb += product.eq(i_tmp * f_tmp)
        m.d.sync += self.result.payload.eq(tree_sum(products))
def transform(self, m, in_value, out_value):
    """Add ``self.offset`` to ``in_value`` and saturate the sum.

    The registered ``out_value`` is ``self.max`` when the sum exceeds
    it, ``self.min`` when the sum is below it, and the sum otherwise.
    The upper bound is checked first.
    """
    # Cycle 0: add offset, saturate, register result into out_value
    with_offset = Signal(signed(32))
    m.d.comb += with_offset.eq(in_value + self.offset)

    # Default to the unclamped sum; the bound checks below take
    # priority because later assignments override earlier ones.
    m.d.sync += out_value.eq(with_offset)
    with m.If(with_offset > self.max):
        m.d.sync += out_value.eq(self.max)
    with m.Elif(with_offset < self.min):
        m.d.sync += out_value.eq(self.min)
def __init__(self, mem_port: LoadStoreInterface):
    """Store the memory port and create the load/store unit's signals.

    Args:
        mem_port: memory interface, kept as ``self.loadstore``.
    """
    self.loadstore = mem_port

    # Input signals.
    # 'store' selects a store operation; 'load' is assumed if deasserted.
    self.store = Signal()
    self.funct3 = Signal(Funct3)
    self.src1 = Signal(32, name="LD_ST_src1")
    # 'src2' is used only for 'store' instructions.
    self.src2 = Signal(32, name="LD_ST_src2")
    self.offset = Signal(signed(12), name="LD_ST_offset")

    self.res = Signal(signed(32), name="LD_ST_res")

    # TODO implement 'ready/valid' interface
    self.en = Signal(name="LD_ST_en")

    # Output signals.
    self.ack = Signal(name="LD_ST_ack")
def transform(self, m, in_value, out_value):
    """Multiply ``in_value.a`` by ``in_value.b`` and keep the high bits.

    The magnitude of ``a`` is multiplied by ``b``, a nudge is added,
    bits 31 and up are taken, and the sign of ``a`` is re-applied before
    the result is registered into ``out_value``.
    """
    # Cycle 0: register inputs, replacing a with its magnitude
    a = in_value.a
    a_positive = a >= 0
    reg_a = Signal(signed(32))
    reg_b = Signal(signed(32))
    m.d.sync += reg_a.eq(Mux(a_positive, a, -a))
    m.d.sync += reg_b.eq(in_value.b)

    # Cycle 1: multiply to register
    # both operands are positive, so result always positive
    reg_ab = Signal(signed(63))
    m.d.sync += reg_ab.eq(reg_a * reg_b)

    # Cycle 2: nudge, take high bits and sign
    positive_2 = self.delay(m, 2, a >= 0)  # Whether input positive
    nudge = Mux(positive_2, (1 << 30), (1 << 30) - 1)
    nudged = reg_ab + nudge
    high_bits = Signal(signed(32))
    m.d.comb += high_bits.eq(nudged[31:])
    m.d.sync += out_value.eq(Mux(positive_2, high_bits, -high_bits))
def elab(self, m):
    """Build a four-lane multiply-accumulate into ``self.accumulator``.

    Each byte of ``in0`` has ``input_offset`` added and is multiplied by
    the matching byte of ``in1``. On ``start`` the sum of products is
    added to the accumulator and ``done`` pulses for one cycle;
    otherwise ``reset_acc`` clears the accumulator.
    """
    lanes = range(4)
    in_vals = [Signal(signed(8), name=f"in_val_{i}") for i in lanes]
    filter_vals = [
        Signal(signed(8), name=f"filter_val_{i}") for i in lanes
    ]
    mults = [Signal(signed(19), name=f"mult_{i}") for i in lanes]

    for i, (in_val, filter_val, mult) in enumerate(
            zip(in_vals, filter_vals, mults)):
        m.d.comb += in_val.eq(self.in0.word_select(i, 8).as_signed())
        m.d.comb += filter_val.eq(self.in1.word_select(i, 8).as_signed())
        m.d.comb += mult.eq((in_val + self.input_offset) * filter_val)

    # done is low unless start is handled this cycle.
    m.d.sync += self.done.eq(0)
    with m.If(self.start):
        m.d.sync += self.accumulator.eq(self.accumulator + sum(mults))
        m.d.sync += self.done.eq(1)
    with m.Elif(self.reset_acc):
        m.d.sync += self.accumulator.eq(0)
def __init__(self):
    """Initialise the base class and create the parameter signals.

    ``bias``, ``multiplier`` and ``shift`` each have a companion
    single-bit ``*_next`` signal.
    """
    super().__init__()
    word = signed(32)

    # Parameters with a companion "next" flag.
    self.bias = Signal(word)
    self.bias_next = Signal()
    self.multiplier = Signal(word)
    self.multiplier_next = Signal()
    self.shift = Signal(word)
    self.shift_next = Signal()

    # Plain parameters.
    self.offset = Signal(word)
    self.activation_min = Signal(word)
    self.activation_max = Signal(word)
def __init__(self, platform, top=False):
    """
    platform  -- pass test platform
    top       -- trigger synthesis of module
    """
    self.platform = platform
    self.top = top
    self.spi = SPIBus()

    # One signed 64-bit position signal per motor.
    motor_count = platform.motors
    self.position = Array(
        Signal(signed(64)) for _ in range(motor_count))
    self.pinstate = Signal(8)

    # Read-side control flags.
    self.read_commit = Signal()
    self.read_en = Signal()
    self.read_discard = Signal()

    self.dispatcherror = Signal()
    self.parse = Signal()

    # Read data word and empty flag.
    self.read_data = Signal(MEMWIDTH)
    self.empty = Signal()
def elaborate(self, platform):
    """Build the state machine that updates ``beta`` from ``coeff``.

    States:
        INIT              -- copy coeff into beta, reset the counters.
        UPDATE            -- blend beta[k] and beta[k + 1] using t.
        MULTIPLICATIONFIX -- rescale the product to the fixed-point
                             format, then advance the k / j counters.
        FINISH            -- assert done and stay here.

    Args:
        platform: not used by this method.

    Returns:
        The Module holding the logic.
    """
    m = Module()
    beta = self.beta
    # Double width so the fixed-point product is not truncated.
    temp = Signal(signed(self.totalbits * 2))
    n = len(self.coeff)
    # NOTE(review): j is declared with range(1, n) but is compared
    # against n below, and k runs up to n - j so beta[k + 1] can index
    # one past the last element -- confirm the intended loop bounds.
    j = Signal(range(1, n))
    k = Signal.like(j)
    with m.FSM(reset='INIT'):
        with m.State('INIT'):
            m.d.sync += self.done.eq(0)
            for i in range(n):
                m.d.sync += beta[i].eq(self.coeff[i])
            m.d.sync += [k.eq(0), j.eq(1)]
            m.next = 'UPDATE'
        # Each state must be opened with m.State; opening it with m.FSM
        # would nest a new FSM and leave the state undefined.
        with m.State('UPDATE'):
            m.d.sync += temp.eq(beta[k] * (1 - self.t) +
                                beta[k + 1] * self.t)
            m.next = 'MULTIPLICATIONFIX'
        # Fixed point arithmetic needs a fix after multiplication, see
        # https://vha3.github.io/FixedPoint/FixedPoint.html
        with m.State('MULTIPLICATIONFIX'):
            low = self.fractionalbits
            m.d.sync += beta[k].eq(temp[low:low + self.totalbits])
            with m.If(k != n - j):
                m.d.sync += k.eq(k + 1)
                m.next = 'UPDATE'
            with m.Else():
                with m.If(j != n):
                    m.d.sync += j.eq(j + 1)
                    m.d.sync += k.eq(0)
                    m.next = 'UPDATE'
                with m.Else():
                    m.next = 'FINISH'
        with m.State('FINISH'):
            m.d.sync += self.done.eq(1)
            m.next = 'FINISH'
    return m
def elab(self, m):
    """Build a two-state multiplier that keeps the product's high bits.

    On ``start``, the case where ``a`` and ``b`` both equal INT32_MIN
    yields INT32_MAX immediately. Otherwise the product is registered,
    and in the next state the nudge is added and bits 31 and up become
    the result. ``done`` is high for one cycle alongside the result.
    """
    # done is low unless a result is produced this cycle.
    m.d.sync += self.done.eq(0)
    ab = Signal(signed(64))
    nudge = 1 << 30  # for some reason negative nudge is not used
    both_min = (self.a == INT32_MIN) & (self.b == INT32_MIN)
    with m.FSM():
        with m.State("stage0"):
            with m.If(self.start):
                with m.If(both_min):
                    m.d.sync += self.result.eq(INT32_MAX)
                    m.d.sync += self.done.eq(1)
                with m.Else():
                    m.d.sync += ab.eq(self.a * self.b)
                    m.next = "stage1"
        with m.State("stage1"):
            m.d.sync += self.result.eq((ab + nudge)[31:])
            m.d.sync += self.done.eq(1)
            m.next = "stage0"
def elab(self, m):
    """Build a three-cycle pipelined multiplier keeping the high bits.

    The result is INT32_MAX when both registered operands equal
    INT32_MIN; otherwise it is bits 31 and up of the product plus the
    nudge.
    """
    areg = Signal.like(self.a)
    breg = Signal.like(self.b)
    ab = Signal(signed(64))
    overflow = Signal()
    # for some reason negative nudge is not used
    nudge = 1 << 30

    # cycle 0, register a and b
    m.d.sync += areg.eq(self.a)
    m.d.sync += breg.eq(self.b)

    # cycle 1, decide if this is an overflow and multiply
    both_min = (areg == INT32_MIN) & (breg == INT32_MIN)
    m.d.sync += overflow.eq(both_min)
    m.d.sync += ab.eq(areg * breg)

    # cycle 2, apply nudge determine result
    high_bits = (ab + nudge)[31:]
    m.d.sync += self.result.eq(Mux(overflow, INT32_MAX, high_bits))
def __init__(self):
    """Create the operand, handshake and result signals."""
    word = signed(32)

    # Operands.
    self.a = Signal(word)
    self.b = Signal(word)

    # Single-bit start flag.
    self.start = Signal()

    # Result and single-bit done flag.
    self.result = Signal(word)
    self.done = Signal()
def __init__(self):
    """Create the value, exponent and result signals."""
    word = signed(32)
    self.x = Signal(word)
    # The exponent is an unsigned 5-bit signal.
    self.exponent = Signal(5)
    self.result = Signal(word)
def __init__(self):
    """Initialise the base class and create the accumulator signals."""
    super().__init__()
    word = signed(32)
    self.input_offset = Signal(word)
    self.accumulator = Signal(word)
    # Single-bit accumulator reset flag.
    self.reset_acc = Signal()
from .filter import FilterStore, FILTER_WRITE_COMMAND from .mem import SinglePortMemory from .mode0_input import Mode0InputFetcher from .mode1_input import Mode1InputFetcher from .post_process import (AccumulatorReader, OutputWordAssembler, ParamWriter, POST_PROCESS_PARAMS, POST_PROCESS_PARAMS_WIDTH, PostProcessPipeline, ReadingProducer, StreamLimiter) from .ram_mux import RamMux from .sysarray import SystolicArray from .utils import unsigned_upto ACCELERATOR_CONFIGURATION_LAYOUT = [ # The mode of the accelerator - mode 0 for input, mode 1 for full speed ('mode', unsigned(1)), # Offset applied to each input activation value. ('input_offset', signed(9)), # Number of words of filter data, per filter store ('num_filter_words', unsigned_upto(Constants.FILTER_WORDS_PER_STORE)), # Offset applied to each output value. ('output_offset', signed(9)), # The minimum output value ('output_activation_min', signed(8)), # The maximum output value ('output_activation_max', signed(8)), # Address of start of input data, in bytes ('input_base_addr', 18), # How many pixels in output row ('num_pixels_x', 9), # Number of RAM blocks to advance to move to new pixel in X direction ('pixel_advance_x', 4), # Number of RAM blocks to advance between pixels in Y direction