def instantiate(self):
    self.name = 'tb'
    self.image_size = (4, 4)
    self.filter_size = (3, 3)
    self.in_chn = 4
    self.out_chn = 8
    self.chn_per_word = 4

    self.arr_x = self.out_chn
    self.arr_y = self.in_chn

    self.input_chn = Channel()
    self.output_chn = Channel()

    ifmap_glb_depth = self.image_size[0]*self.image_size[1]* \
            self.in_chn//self.chn_per_word
    print("ifmap glb depth:", ifmap_glb_depth)
    psum_glb_depth = self.image_size[0]*self.image_size[1]* \
            self.out_chn//self.chn_per_word
    print("psum glb depth:", psum_glb_depth)

    self.stimulus = Stimulus(self.arr_x, self.arr_y, self.chn_per_word,
                             self.input_chn, self.output_chn)
    self.dut = WSArch(self.arr_x, self.arr_y, self.input_chn,
                      self.output_chn, self.chn_per_word,
                      ifmap_glb_depth, psum_glb_depth)

    self.configuration_done = False
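# Quick standalone check of the GLB depth arithmetic above (a sketch using
# the same names as the testbench; each GLB row holds chn_per_word values):
image_size, in_chn, out_chn, chn_per_word = (4, 4), 4, 8, 4
assert image_size[0] * image_size[1] * in_chn // chn_per_word == 16   # ifmap rows
assert image_size[0] * image_size[1] * out_chn // chn_per_word == 32  # psum rows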
def instantiate(self):
    self.name = 'tb'
    self.image_size = (4, 4)
    self.filter_size = (3, 3)
    self.in_chn = 8
    self.out_chn = 16
    self.chn_per_word = 4

    self.arr_x = self.out_chn // 2
    self.arr_y = self.in_chn // 2

    self.input_chn = Channel()
    self.output_chn = Channel()
    self.psum_chn = Channel(128)
    self.curr_pass = 0
    self.tick_counter = 0

    ifmap_glb_depth = self.image_size[0]*self.image_size[1]* \
            (self.in_chn//2)//self.chn_per_word
    psum_glb_depth = self.image_size[0]*self.image_size[1]* \
            (self.out_chn//2)//self.chn_per_word

    self.stimulus = Stimulus(self.arr_x, self.arr_y, self.chn_per_word,
                             self.input_chn, self.output_chn, self.psum_chn)
    self.dut = WSArch(self.arr_x, self.arr_y, self.input_chn,
                      self.output_chn, self.chn_per_word,
                      ifmap_glb_depth, psum_glb_depth)

    self.configuration_done = False
def instantiate(self):
    self.name = 'tb'
    self.image_size = (4, 4)
    self.filter_size = (3, 3)
    self.in_chn = 4
    self.out_chn = 8
    self.chn_per_word = 4
    self.num_tiles = 4

    self.arr_x = self.out_chn
    self.arr_y = self.in_chn

    self.input_chn = Channel()
    self.output_chn = Channel()
    self.finish_signal_chn = Channel()

    self.stat_type = 'show'
    self.raw_stats = {}

    ifmap_glb_depth = self.image_size[0] * self.image_size[1] * \
            self.num_tiles * self.in_chn // self.chn_per_word
    # psum_glb_depth = self.image_size[0]*self.image_size[1]*self.out_chn//self.chn_per_word
    print("ifmap glb depth:", ifmap_glb_depth)
    print("weight glb depth: 0")

    self.stimulus = Stimulus(self.arr_x, self.arr_y, self.chn_per_word,
                             self.input_chn, self.output_chn,
                             self.finish_signal_chn)
    self.dut = WSArch(self.arr_x, self.arr_y, self.input_chn,
                      self.output_chn, self.chn_per_word, ifmap_glb_depth)

    self.configuration_done = False
def instantiate(self, dram_wr_chn, noc_wr_chn, rd_chn, glb_depth,
                block_size, num_nonzero):
    self.dram_wr_chn = dram_wr_chn
    self.noc_wr_chn = noc_wr_chn
    self.rd_chn = rd_chn
    self.name = 'psum_glb'
    self.block_size = block_size
    self.num_nonzero = num_nonzero

    self.stat_type = 'show'
    self.raw_stats = {'size': (glb_depth, block_size), 'rd': 0, 'wr': 0}

    self.sram = SRAM(glb_depth, block_size, nports=2, dtype=np.float16)
    self.last_read = Channel(3)

    self.filter_size = (0, 0)
    self.fmap_sets = 0
    self.fmap_per_iteration = 0

    self.rd_set = 0
    self.fmap_rd_idx = 0
    self.iteration = 0

    self.wr_set = 0
    self.fmap_wr_idx = 0
    self.wr_done = False
def instantiate(self, arr_x, arr_y, chn_per_word, done_chn,
                ifmap_glb_depth, psum_glb_depth, weight_glb_depth):
    self.name = 'conv_tb'
    self.image_size = None
    self.filter_size = None
    self.full_in_chn = None
    self.full_out_chn = None
    self.ceil_in_chn = None
    self.ceil_out_chn = None
    self.in_chn = arr_y
    self.out_chn = arr_x
    self.done_chn = done_chn
    self.chn_per_word = chn_per_word

    self.arr_x = self.out_chn
    self.arr_y = self.in_chn

    self.input_chn = Channel(name='arch_input_chn')
    self.output_chn = Channel(name='arch_output_chn')

    self.stimulus = Stimulus(self.arr_x, self.arr_y, self.chn_per_word,
                             self.input_chn, self.output_chn, self.done_chn)
    self.dut = WSArch(self.arr_x, self.arr_y, self.input_chn,
                      self.output_chn, self.chn_per_word,
                      ifmap_glb_depth, psum_glb_depth, weight_glb_depth)
class ConverterTB(Module):
    def instantiate(self):
        self.name = 'tb'
        self.input_size = 4
        self.block_size = 12
        self.in_sets = self.block_size // self.input_size
        self.num_nonzero = 5
        self.preserve_order = True

        self.in_chn = Channel()
        self.mid_chn = Channel()
        self.out_chn = Channel()

        self.converter = Converter(self.in_chn, self.mid_chn,
                                   self.input_size, self.block_size)
        # self.pruner = NaivePruner(self.mid_chn, self.out_chn,
        #                           self.num_nonzero, self.block_size,
        #                           self.preserve_order)
        self.pruner = ClusteredPruner(self.mid_chn, self.out_chn,
                                      self.num_nonzero, self.block_size,
                                      self.preserve_order)
        # self.pruner = ThresholdPruner(self.mid_chn, self.out_chn,
        #                               self.num_nonzero, self.block_size,
        #                               self.preserve_order)

        self.iterations = 10
        self.iteration = 0
        self.curr_set = 0
        self.out_counter = 0

        # Send in one extra iteration to flush out the last outputs.
        self.test_data = [[randint(1, 5) if randint(0, 3) > 1 else 0
                           for j in range(self.block_size)]
                          for i in range(self.iterations + 1)]

        print("Stimulus:")
        print("[")
        for i in range(len(self.test_data) - 1):
            print(self.test_data[i])
        print("]")

    def tick(self):
        if self.in_chn.vacancy() and not self.iteration == self.iterations + 1:
            imin = self.curr_set * self.input_size
            imax = imin + self.input_size
            data = [self.test_data[self.iteration][i]
                    for i in range(imin, imax)]
            self.in_chn.push(data)

            self.curr_set += 1
            if self.curr_set == self.in_sets:
                self.curr_set = 0
                self.iteration += 1

        if self.out_chn.valid():
            data = self.out_chn.pop()
            print(data)
            # print("out_counter: ", self.out_counter)
            self.out_counter += 1
            if self.out_counter == self.iterations:
                raise Finish("Check manually")
def instantiate(self, image_size, filter_size, in_chn, out_chn, block_size,
                ifmap, weights, bias, pruner_name, num_nonzero):
    self.name = 'tb'
    # if debug:
    #     self.image_size = (4, 4)
    #     self.filter_size = (3, 3)
    #     self.in_chn = 2
    #     self.out_chn = 4
    #     self.block_size = 2
    #     self.num_nonzero = 1
    # else:
    #     self.image_size = (16, 16)
    #     self.filter_size = (3, 3)
    #     self.in_chn = 16
    #     self.out_chn = 8
    #     self.block_size = 4
    #     self.num_nonzero = 4
    self.image_size = image_size
    self.filter_size = filter_size
    self.in_chn = in_chn
    self.out_chn = out_chn
    self.block_size = block_size
    # Number of non-zero values in each block; helps test the correctness
    # of the arch.
    self.num_nonzero = num_nonzero

    # The inputs to this specific layer.
    self.ifmap = ifmap
    self.weights = weights
    self.bias = bias
    self.pruner_name = pruner_name

    self.arr_y = self.out_chn

    self.input_chn = Channel()
    self.output_chn = Channel()

    ifmap_glb_depth = (self.filter_size[1] + (self.filter_size[0]-1)* \
            self.image_size[1]) * self.in_chn // self.block_size
    psum_glb_depth = self.out_chn // self.block_size
    weight_glb_depth = self.filter_size[0]*self.filter_size[1]* \
            self.in_chn*self.out_chn//self.block_size

    self.stimulus = Stimulus(self.arr_y, self.block_size, self.num_nonzero,
                             self.input_chn, self.output_chn,
                             self.pruner_name)
    self.dut = OSArch(self.arr_y, self.input_chn, self.output_chn,
                      self.block_size, self.num_nonzero, ifmap_glb_depth,
                      psum_glb_depth, weight_glb_depth)

    self.configuration_done = False
class ChannelTB(Module):
    def instantiate(self):
        self.channel = Channel(4)
        self.push_count = 0
        self.free_count = 0
        self.test_size = 100

    def tick(self):
        # Print the current state of the channel
        c, n = [], 0
        while self.channel.valid(n):
            d = self.channel.peek(n)
            assert d == (self.free_count + n)
            c.append(d)
            n += 1
        print("channel: %s" % c)

        # Possibly push a new element
        if random.random() < 0.5 and self.push_count < self.test_size and \
                self.channel.vacancy():
            self.channel.push(self.push_count)
            print("push: %d" % self.push_count)
            self.push_count += 1

        # Possibly free some elements
        if random.random() < 0.5 and self.free_count < self.test_size and \
                n != 0:
            num_free = random.randint(1, n)
            self.channel.free(num_free)
            self.free_count += num_free
            print("free: %d" % num_free)
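# A minimal, standalone reference model of the Channel interface that
# ChannelTB exercises (push / valid(n) / peek(n) / free(n) / vacancy()).
# This is a sketch inferred from the calls above, not the framework's
# actual Channel implementation.
from collections import deque

class ChannelModel:
    def __init__(self, depth=4):
        self.depth = depth
        self.q = deque()

    def vacancy(self, n=1):
        # True if n more pushes would still fit.
        return len(self.q) + n <= self.depth

    def push(self, x):
        assert self.vacancy()
        self.q.append(x)

    def valid(self, n=0):
        # True if the n-th oldest element is present.
        return n < len(self.q)

    def peek(self, n=0):
        return self.q[n]

    def free(self, n=1):
        # Retire the n oldest elements.
        for _ in range(n):
            self.q.popleft()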
def instantiate(self, wr_chn, rd_chn, glb_depth, block_size):
    self.wr_chn = wr_chn
    self.rd_chn = rd_chn
    self.name = 'weight_glb'

    self.filter_size = (0, 0)
    self.image_size = (0, 0)
    self.wr_done = False
    self.iteration = 0
    self.addr = 0
    self.in_chn = 0
    self.out_chn = 0
    # self.arr_y = 0
    # self.out_sets = 0
    self.block_size = block_size

    self.sram = SRAM(glb_depth, block_size)
    self.last_read = Channel(3)

    self.stat_type = 'show'
    self.raw_stats = {'size': (glb_depth, block_size), 'rd': 0, 'wr': 0}
def instantiate(self, wr_chn, rd_chn, arr_y, glb_depth, block_size,
                num_nonzero):
    self.wr_chn = wr_chn
    self.rd_chn = rd_chn
    self.arr_y = arr_y
    self.block_size = block_size
    self.num_nonzero = num_nonzero
    self.name = 'ifmap_glb'

    self.stat_type = 'show'
    self.raw_stats = {'size': (glb_depth, num_nonzero), 'rd': 0, 'wr': 0}

    self.sram = SRAM(glb_depth, num_nonzero * 3)
    self.last_read = Channel(3)
    self.glb_depth = glb_depth

    self.image_size = (0, 0)
    self.filter_size = (0, 0)
    self.fmap_sets = 0
    self.fmap_per_iteration = 0

    self.curr_set = 0
    self.fmap_idx = 0
    self.iteration = 0
    self.wr_done = False

    # For managing convolution
    self.curr_x = 0
    self.curr_y = 0
    self.curr_chn = 0
    self.request_idx = 0
    self.send_idx = 0
    # self.curr_filt_x = 0
    # self.curr_filt_y = 0
    self.ifmap_done = False
    self.needed_addr = 0
    # Ready to output a filter_size block of inputs.
    self.ready_to_output = False

    self.curr_data = [0 for i in range(3 * num_nonzero)]
    # Block other operations while actively working through data.
    self.data_idx = num_nonzero
def instantiate(self, arr_x, arr_y, chn_per_word, done_chn,
                ifmap_glb_depth, psum_glb_depth, weight_glb_depth):
    self.name = 'fc_tb'
    self.arr_x = arr_x
    self.arr_y = arr_y
    self.chn_per_word = chn_per_word
    self.batch_size = None
    self.input_size = None
    self.output_size = None
    self.ceil_batch = None
    self.ceil_output = None

    self.input_chn = Channel(name='arch_input_chn')
    self.output_chn = Channel(name='arch_output_chn')
    self.done_chn = done_chn

    self.stimulus = Stimulus(self.arr_x, self.arr_y, self.chn_per_word,
                             self.input_chn, self.output_chn, self.done_chn)
    self.dut = OSArch(self.arr_x, self.arr_y, self.input_chn,
                      self.output_chn, self.chn_per_word,
                      ifmap_glb_depth, weight_glb_depth)
class MetaArchTB(Module):
    def instantiate(self, arr_x, arr_y, chn_per_word, layers, batch_size):
        self.arr_x = arr_x
        self.arr_y = arr_y
        self.chn_per_word = chn_per_word
        self.layers = layers
        self.batch_size = batch_size

        self.name = 'meta'
        self.started = False
        self.done_chn = Channel()

        self.ifmap_glb_depth = 0
        self.psum_glb_depth = 0
        self.weights_glb_depth = 0

        use_conv = False
        use_fc = False
        self.conv_tb = None
        self.fc_tb = None

        cur_image_size = None
        cur_in_chn = None
        is_conv = False
        num_convs = 0
        num_fc = 0

        for layer in self.layers:
            if isinstance(layer, Conv):
                if cur_image_size is None:
                    pass
                elif cur_image_size != layer.image_size or \
                        cur_in_chn != layer.in_chn:
                    raise Exception('Invalid conv image size for %s: %s %s' %
                                    (layer.name,
                                     (cur_image_size, cur_in_chn),
                                     (layer.image_size, layer.in_chn)))
                ifmap_glb_depth, psum_glb_depth, weights_glb_depth = \
                    WSArchTB.required_glb_depth(
                        self.arr_x, self.arr_y, self.chn_per_word,
                        layer.image_size, layer.filter_size,
                        layer.in_chn, layer.out_chn)
                use_conv = True
                output_shape = layer.new_shape(
                    (self.batch_size, ) + layer.image_size +
                    (layer.out_chn, ))
                cur_image_size = output_shape[1:3]
                cur_in_chn = output_shape[3]
                is_conv = True
                num_convs += 1
            elif isinstance(layer, FC):
                if cur_image_size is None:
                    pass
                elif not is_conv and cur_image_size != layer.input_size:
                    raise Exception('Invalid fc dimension transition for ' +
                                    layer.name)
                elif is_conv and cur_image_size[0] * cur_image_size[1] * \
                        cur_in_chn != layer.input_size:
                    raise Exception(
                        'Invalid conv to fc dimension transition to ' +
                        layer.name)
                ifmap_glb_depth, psum_glb_depth, weights_glb_depth = \
                    OSArchTB.required_glb_depth(
                        self.arr_x, self.arr_y, self.chn_per_word,
                        self.batch_size, layer.input_size,
                        layer.output_size)
                use_fc = True
                _, cur_image_size = layer.new_shape(
                    (self.batch_size, layer.output_size))
                is_conv = False
                num_fc += 1
            else:
                raise Exception('layer not valid')
            self.ifmap_glb_depth = max(self.ifmap_glb_depth, ifmap_glb_depth)
            self.psum_glb_depth = max(self.psum_glb_depth, psum_glb_depth)
            self.weights_glb_depth = max(self.weights_glb_depth,
                                         weights_glb_depth)

        if use_conv:
            self.conv_tb = WSArchTB(self.arr_x, self.arr_y,
                                    self.chn_per_word, self.done_chn,
                                    self.ifmap_glb_depth,
                                    self.psum_glb_depth,
                                    self.weights_glb_depth)
        if use_fc:
            self.fc_tb = OSArchTB(self.arr_x, self.arr_y, self.chn_per_word,
                                  self.done_chn, self.ifmap_glb_depth,
                                  self.psum_glb_depth,
                                  self.weights_glb_depth)

        self.layer_step = 0
        self.batch_step = 0
        self.conv_inputs = [None] * self.batch_size
        self.fc_input = None
        self.conv_weights = [None] * num_convs
        self.conv_bias = [None] * num_convs
        self.fc_weights = [None] * num_fc
        self.fc_bias = [None] * num_fc
        self.cur_conv = 0
        self.cur_fc = 0

    def tick(self):
        if not self.started or self.done_chn.valid():
            self.started = True
            old_layer = self.layers[self.layer_step]
            if self.done_chn.valid():
                valid = self.done_chn.pop()
                if not valid:
                    raise Finish('Validation Failed')
                if isinstance(old_layer, Conv):
                    self.conv_inputs[self.batch_step] = \
                        self.conv_tb.get_output()
                    self.batch_step += 1
                    if self.batch_step == self.batch_size:
                        self.conv_inputs = [
                            batch for batch in old_layer.activation(
                                np.array(self.conv_inputs))
                        ]
                        self.batch_step = 0
                        self.layer_step += 1
                        self.cur_conv += 1
                else:
                    self.fc_input = self.fc_tb.get_output()
                    self.fc_input = old_layer.activation(self.fc_input)
                    self.layer_step += 1
                    self.cur_fc += 1
                if self.layer_step == len(self.layers):
                    raise Finish('Success')

            layer = self.layers[self.layer_step]

            # Handle the conv to fc transition
            if isinstance(layer, FC) and self.fc_input is None and \
                    self.conv_inputs[0] is not None:
                if self.name is not None:
                    self.output_file.write("FC MODE\n")
                self.fc_input = np.zeros(
                    (self.batch_size, layer.input_size)).astype(np.int64)
                for i in range(self.batch_size):
                    self.fc_input[i] = self.conv_inputs[i].reshape(
                        layer.input_size)

            if isinstance(layer, Conv):
                if self.name is not None:
                    self.output_file.write("CONV MODE\n")
                if self.conv_inputs[self.batch_step] is None:
                    _, weights, bias = self.conv_tb.configure(
                        layer.image_size, layer.filter_size, layer.in_chn,
                        layer.out_chn)
                    self.conv_weights[self.cur_conv] = weights
                    self.conv_bias[self.cur_conv] = bias
                elif self.conv_weights[self.cur_conv] is None or \
                        self.conv_bias[self.cur_conv] is None:
                    weights, bias = self.conv_tb.configure_fixed_image(
                        self.conv_inputs[self.batch_step],
                        layer.filter_size, layer.in_chn, layer.out_chn)
                    self.conv_weights[self.cur_conv] = weights
                    self.conv_bias[self.cur_conv] = bias
                else:
                    self.conv_tb.configure_fixed(
                        self.conv_inputs[self.batch_step],
                        self.conv_weights[self.cur_conv],
                        self.conv_bias[self.cur_conv])
            elif isinstance(layer, FC):
                if self.fc_input is None:
                    _, weights, bias = self.fc_tb.configure(
                        self.batch_size, layer.input_size,
                        layer.output_size)
                    self.fc_weights[self.cur_fc] = weights
                    self.fc_bias[self.cur_fc] = bias
                elif self.fc_weights[self.cur_fc] is None or \
                        self.fc_bias[self.cur_fc] is None:
                    weights, bias = self.fc_tb.configure_fixed_image(
                        self.fc_input, layer.output_size)
                    self.fc_weights[self.cur_fc] = weights
                    self.fc_bias[self.cur_fc] = bias
                else:
                    self.fc_tb.configure_fixed(self.fc_input,
                                               self.fc_weights[self.cur_fc],
                                               self.fc_bias[self.cur_fc])
            else:
                raise Exception('layer not valid')
class InputSerializer(Module):
    def instantiate(self, arch_input_chn, arr_y, block_size, num_nonzero,
                    pruner_name):
        # PE static configuration (immutable)
        # self.arr_x = arr_x
        self.arr_y = arr_y
        # self.chn_per_word = chn_per_word
        self.block_size = block_size
        self.num_nonzero = num_nonzero

        self.convert_chn = Channel()
        self.prune_chn = Channel()
        self.arch_input_chn = arch_input_chn
        # Although both InputSerializer and the pruner push to
        # arch_input_chn, there is no conflict: all weights are pushed by
        # the serializer first, then all inputs by the pruner.
        self.converter = Converter(self.convert_chn, self.prune_chn,
                                   self.block_size, self.block_size)
        # self.pruner = NaivePruner(self.prune_chn, self.arch_input_chn,
        #                           self.num_nonzero, True)
        # User-defined pruner for this layer; defaults to the naive pruner.
        self.pruner = getattr(pruner, pruner_name)(
            self.prune_chn, self.arch_input_chn, self.num_nonzero,
            self.block_size, True)

        self.ifmap = None
        self.weights = None
        self.bias = None
        self.image_size = (0, 0)
        self.filter_size = (0, 0)

        self.ifmap_psum_done = True
        self.pass_done = Reg(False)

        # State counters
        self.curr_set = 0
        self.curr_filter = 0
        self.iteration = 0
        self.fmap_idx = 0
        self.curr_chn = 0
        self.curr_x = 0  # run through first two dimensions of input
        self.curr_y = 0
        self.bias_set = 0
        # self.send_bias = False

    def configure(self, ifmap, weights, bias, in_chn, out_chn, image_size,
                  filter_size):
        self.ifmap = ifmap
        self.weights = weights
        self.bias = bias
        self.in_chn = in_chn
        self.out_chn = out_chn
        self.image_size = image_size
        self.filter_size = filter_size

        self.ifmap_psum_done = False
        self.weights_done = False
        self.pass_done.wr(False)

        # State counters
        self.curr_set = 0
        self.curr_filter = 0
        self.iteration = 0
        self.curr_chn = 0
        self.curr_x = 0  # run through first two dimensions of input
        self.curr_y = 0
        self.bias_set = 0
        # self.send_bias = False

    def tick(self):
        if self.pass_done.rd():
            return

        if self.ifmap_psum_done:
            if self.convert_chn.vacancy():
                data = np.zeros(self.block_size)
                self.convert_chn.push(data)
            return

        in_sets = self.in_chn // self.block_size
        out_sets = self.out_chn // self.block_size
        num_iteration = self.filter_size[0] * self.filter_size[1]

        # Read and hold all weights at the beginning, for ease of
        # implementation.
        if not self.weights_done:
            f_x = self.iteration // self.filter_size[0]
            f_y = self.iteration % self.filter_size[0]

            # Push filters to PE columns. (The PE is responsible for the pop.)
            if self.arch_input_chn.vacancy() and \
                    self.iteration < num_iteration:
                cmin = self.curr_filter * self.block_size
                cmax = cmin + self.block_size
                data = np.array([self.weights[f_x, f_y, self.curr_chn, c]
                                 for c in range(cmin, cmax)])
                # print("{},{},{},{}-{}".format(f_x, f_y, self.curr_chn, cmin, cmax))
                # print(data)
                # Gives groups of four along the num_filters axis.
                self.arch_input_chn.push(data)

                self.curr_filter += 1
                if self.curr_filter == out_sets:  # Loop through blocks of filters
                    self.curr_filter = 0
                    self.curr_chn += 1
                    if self.curr_chn == self.in_chn:  # Loop through channels
                        self.curr_chn = 0
                        self.iteration += 1
                        if self.iteration == num_iteration:  # Loop through 2D filter support
                            self.iteration = 0
                            # print("Weights done")
                            self.weights_done = True
        elif self.arch_input_chn.vacancy() and self.bias_set < out_sets:
            cmin = self.bias_set * self.block_size
            cmax = cmin + self.block_size
            data = np.array([self.bias[c] for c in range(cmin, cmax)])
            # print("bias (input serializer):")
            # print(data)
            self.arch_input_chn.push(data)
            self.bias_set += 1
        elif not self.ifmap_psum_done:
            if self.convert_chn.vacancy():
                cmin = self.curr_set * self.block_size
                cmax = cmin + self.block_size
                # xmin = x
                # xmax = x + self.arr_x
                # Write ifmap to glb
                # data = np.array([self.ifmap[x, self.curr_y, self.curr_chn]
                #                  for x in range(xmin, xmax)])
                data = np.array([self.ifmap[self.curr_x, self.curr_y, c]
                                 for c in range(cmin, cmax)])
                # print("{},{},{}-{}".format(self.curr_x, self.curr_y, cmin, cmax))
                # print(data)
                self.curr_set += 1
                if self.curr_set == in_sets:
                    self.curr_set = 0
                    self.curr_y += 1
                    if self.curr_y == self.image_size[1]:
                        self.curr_y = 0
                        self.curr_x += 1
                self.convert_chn.push(data)
                if self.curr_x == self.image_size[0]:
                    self.curr_x = 0
                    self.ifmap_psum_done = True
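# The traversal order of the weight loop above, restated as a standalone
# generator (a sketch for reference; the function name is illustrative and
# not part of the simulator): filter tap, then input channel, then
# output-filter block, matching the curr_filter -> curr_chn -> iteration
# counter cascade.
def weight_stream_order(filter_size, in_chn, out_sets, block_size):
    num_iteration = filter_size[0] * filter_size[1]
    for iteration in range(num_iteration):
        f_x = iteration // filter_size[0]
        f_y = iteration % filter_size[0]
        for chn in range(in_chn):
            for filt in range(out_sets):
                cmin = filt * block_size
                yield (f_x, f_y, chn, cmin, cmin + block_size)

# e.g. a 3x3 filter with in_chn=2 and out_sets=2 emits 3*3*2*2 = 36 blocks:
assert len(list(weight_stream_order((3, 3), 2, 2, 4))) == 36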
class IFMapGLB(Module):
    def instantiate(self, wr_chn, rd_chn, glb_depth, chn_per_word):
        self.wr_chn = wr_chn
        self.rd_chn = rd_chn
        self.chn_per_word = chn_per_word
        self.name = 'ifmap_glb'

        self.stat_type = 'show'
        self.raw_stats = {'size': (glb_depth, chn_per_word), 'rd': 0, 'wr': 0}

        self.sram = SRAM(glb_depth, chn_per_word, name=self.name)
        self.last_read = Channel(3, name='last_read')

        self.image_size = (0, 0)
        self.filter_size = (0, 0)
        self.fmap_sets = 0
        self.fmap_per_iteration = 0

        self.curr_set = 0
        self.fmap_idx = 0
        self.iteration = 0
        self.wr_done = False

    def configure(self, image_size, filter_size, fmap_sets,
                  fmap_per_iteration):
        self.wr_done = False
        self.image_size = image_size
        self.filter_size = filter_size
        self.fmap_sets = fmap_sets
        self.fmap_per_iteration = fmap_per_iteration

    def tick(self):
        # self.iteration is which weight we are currently using. It's
        # weight stationary, so we fully use a set of filter weights before
        # continuing on (first weight in each filter, second weight in each
        # filter, etc.).
        num_iteration = self.filter_size[0] * self.filter_size[1]
        offset_x = (self.filter_size[0] - 1) // 2
        offset_y = (self.filter_size[1] - 1) // 2
        filter_x = self.iteration % self.filter_size[0] - offset_x
        filter_y = self.iteration // self.filter_size[0] - offset_y

        # This is the first tick since initializing:
        # write all ifmaps (and psums?) to SRAM.
        if not self.wr_done:
            # Write to GLB
            if self.wr_chn.valid():
                data = self.wr_chn.pop()
                # print("ifmap_glb wr")
                self.raw_stats['wr'] += 1
                # Write ifmap to glb
                # print("ifmap_to_glb: ", in_sets, self.fmap_idx, self.curr_set)
                addr = self.fmap_sets * self.fmap_idx + self.curr_set
                self.curr_set += 1
                self.sram.request(WR, addr, data)
                if self.curr_set == self.fmap_sets:
                    self.curr_set = 0
                    self.fmap_idx += 1
                    if self.fmap_idx == self.fmap_per_iteration:
                        # Done initializing ifmaps and psums
                        # self.sram.dump()
                        self.fmap_idx = 0
                        self.wr_done = True
        else:
            # Read from GLB and deal with SRAM latency
            if self.rd_chn.vacancy(1) and self.iteration < num_iteration:
                fmap_x = self.fmap_idx % self.image_size[0]
                fmap_y = self.fmap_idx // self.image_size[0]
                ifmap_x, ifmap_y = (fmap_x + filter_x, fmap_y + filter_y)
                if (ifmap_x < 0) or (ifmap_x >= self.image_size[0]) or \
                        (ifmap_y < 0) or (ifmap_y >= self.image_size[1]):
                    # print("ifmap req zero", self.iteration, self.fmap_idx)
                    self.last_read.push(True)
                else:
                    fmap_idx = (ifmap_y * self.image_size[0]) + ifmap_x
                    addr = self.fmap_sets * fmap_idx + self.curr_set
                    # print("ifmap req glb", self.iteration, self.fmap_idx)
                    self.sram.request(RD, addr)
                    self.last_read.push(False)
                self.curr_set += 1
                if self.curr_set == self.fmap_sets:
                    self.curr_set = 0
                    self.fmap_idx += 1
                    if self.fmap_idx == self.fmap_per_iteration:
                        self.fmap_idx = 0
                        self.iteration += 1

            # Process the last read sent to the GLB SRAM
            if self.last_read.valid():
                is_zero = self.last_read.pop()
                # self.raw_stats['rd'] += 1
                data = [0]*self.chn_per_word if is_zero else \
                    [e for e in self.sram.response()]
                # print("ifmap rd glb", data)
                self.rd_chn.push(data)
                self.raw_stats['rd'] += 1
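# The read-side address arithmetic above is easiest to sanity-check in
# isolation. A standalone sketch (plain Python, no simulator dependencies)
# reproducing the zero-padding test and the linear GLB address for one
# filter-tap iteration; the function name is illustrative:
def ifmap_read_addrs(image_size, filter_size, fmap_sets, iteration):
    # For one filter tap (`iteration`), list the GLB address for each
    # output pixel and fmap set, or None where the tap falls in the
    # zero-padding halo. Mirrors IFMapGLB.tick()'s read path.
    offset_x = (filter_size[0] - 1) // 2
    offset_y = (filter_size[1] - 1) // 2
    filter_x = iteration % filter_size[0] - offset_x
    filter_y = iteration // filter_size[0] - offset_y

    addrs = []
    for fmap_idx in range(image_size[0] * image_size[1]):
        fmap_x = fmap_idx % image_size[0]
        fmap_y = fmap_idx // image_size[0]
        ifmap_x, ifmap_y = fmap_x + filter_x, fmap_y + filter_y
        for curr_set in range(fmap_sets):
            if not (0 <= ifmap_x < image_size[0] and
                    0 <= ifmap_y < image_size[1]):
                addrs.append(None)  # halo: a zero word is pushed instead
            else:
                idx = ifmap_y * image_size[0] + ifmap_x
                addrs.append(fmap_sets * idx + curr_set)
    return addrs

# e.g. iteration 0 of a 3x3 filter reads tap (-1, -1); on a 4x4 image the
# first output pixel falls in the halo:
assert ifmap_read_addrs((4, 4), (3, 3), 1, 0)[0] is None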
class WeightsGLB(Module):
    def instantiate(self, wr_chn, rd_chn, glb_depth, chn_per_word):
        self.wr_chn = wr_chn
        self.rd_chn = rd_chn
        self.chn_per_word = chn_per_word
        self.name = 'weight_glb'

        self.stat_type = 'show'
        self.raw_stats = {'size': (glb_depth, chn_per_word), 'rd': 0, 'wr': 0}

        self.sram = SRAM(glb_depth, chn_per_word, name=self.name)
        self.last_read = Channel(3, name='last_read')

        self.filter_size = (0, 0)
        self.in_sets = 0
        self.out_sets = 0

        self.curr_set = 0
        self.fmap_idx = 0
        self.iteration = 0
        self.tile = 0
        self.wr_done = False

    def configure(self, filter_size, in_sets, out_sets):
        self.wr_done = False
        self.filter_size = filter_size
        self.in_sets = in_sets
        self.out_sets = out_sets
        self.tile = 0
        self.stuff = []

    def tick(self):
        num_iteration = self.filter_size[0] * self.filter_size[1]

        if not self.wr_done:
            # Write to GLB
            if self.wr_chn.valid():
                data = self.wr_chn.pop()
                # Write weights to glb
                addr = self.in_sets * (self.out_sets * self.iteration +
                                       self.fmap_idx) + self.curr_set
                self.stuff.append(data)
                self.curr_set += 1
                self.sram.request(WR, addr, data)
                self.raw_stats['wr'] += len(data)
                if self.curr_set == self.in_sets:
                    self.curr_set = 0
                    self.fmap_idx += 1
                    if self.fmap_idx == self.out_sets:
                        # Done initializing weights
                        # self.sram.dump()
                        self.fmap_idx = 0
                        self.iteration += 1
                        if self.iteration == num_iteration:
                            self.iteration = 0
                            self.wr_done = True
        else:
            did_read = False
            # Read from GLB and deal with SRAM latency
            if self.rd_chn.vacancy(1) and self.iteration < num_iteration:
                addr = self.in_sets * (self.out_sets * self.iteration +
                                       self.fmap_idx) + self.curr_set
                # print("weights req glb", self.iteration, self.fmap_idx)
                self.sram.request(RD, addr)
                self.raw_stats['rd'] += self.chn_per_word
                self.last_read.push(False)
                did_read = True
                self.curr_set += 1
                if self.curr_set == self.in_sets:
                    self.curr_set = 0
                    self.fmap_idx += 1
                    if self.fmap_idx == self.out_sets:
                        self.fmap_idx = 0
                        self.iteration += 1

            # Process the last read sent to the GLB SRAM
            if self.last_read.valid():
                is_zero = self.last_read.pop()
                data = [0]*self.chn_per_word if is_zero else \
                    [e for e in self.sram.response()]
                # print("weights rd glb", data)
                self.rd_chn.push(data)
            elif not did_read:
                if self.iteration == num_iteration:
                    self.iteration = 0
                    self.wr_done = False
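# The weight layout above is linear in (filter tap, output-filter set,
# input set). A tiny standalone restatement of that mapping (the function
# name is illustrative, not part of the simulator):
def weight_addr(iteration, fmap_idx, curr_set, in_sets, out_sets):
    # GLB address used by WeightsGLB for filter tap `iteration`,
    # output-filter set `fmap_idx`, and input set `curr_set`.
    return in_sets * (out_sets * iteration + fmap_idx) + curr_set

# With in_sets=2 and out_sets=3, tap 0 occupies addresses 0..5 and tap 1
# starts right after, so the layout is dense and non-overlapping:
assert weight_addr(1, 0, 0, in_sets=2, out_sets=3) == 6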
def instantiate(self, arr_x, arr_y, input_chn, output_chn, chn_per_word,
                ifmap_glb_depth, psum_glb_depth):
    # PE static configuration (immutable)
    self.name = 'chip'
    self.arr_x = arr_x
    self.arr_y = arr_y
    self.chn_per_word = chn_per_word

    self.stat_type = 'show'

    # Instantiate DRAM IO channels
    self.input_chn = input_chn
    self.output_chn = output_chn

    # Instantiate input deserializer and output serializer
    self.ifmap_wr_chn = Channel(name="ifmap_wr_chn")
    self.psum_wr_chn = Channel(name="psum_wr_chn")
    self.weights_wr_chn = Channel(name="weights_wr_chn")
    self.deserializer = InputDeserializer(self.input_chn, self.ifmap_wr_chn,
                                          self.weights_wr_chn,
                                          self.psum_wr_chn, arr_x, arr_y,
                                          chn_per_word)
    self.psum_output_chn = Channel(name="psum_output_chn")
    self.serializer = OutputSerializer(self.output_chn, self.psum_output_chn)

    # Instantiate GLB and GLB channels
    self.ifmap_rd_chn = Channel(3, name='ifmap_rd_chn')
    self.ifmap_glb = IFMapGLB(self.ifmap_wr_chn, self.ifmap_rd_chn,
                              ifmap_glb_depth, chn_per_word)

    self.psum_rd_chn = Channel(3, name='psum_rd_chn')
    self.psum_noc_wr_chn = Channel(name='psum_noc_wr_chn')
    self.psum_glb = PSumGLB(self.psum_wr_chn, self.psum_noc_wr_chn,
                            self.psum_rd_chn, psum_glb_depth, chn_per_word)

    self.weights_rd_chn = Channel(name='weights_rd_chn')
    self.weights_glb = WeightsGLB(self.weights_wr_chn, self.weights_rd_chn)

    # PE array and local channel declaration
    self.pe_array = ModuleList()
    self.pe_ifmap_chns = ModuleList()
    self.pe_filter_chns = ModuleList()
    self.pe_psum_chns = ModuleList()
    self.pe_psum_chns.append(ModuleList())
    for x in range(self.arr_x):
        self.pe_psum_chns[0].append(
            Channel(32, name='pe_psum_chns_{}_{}'.format(x, 0)))

    # Actual array instantiation
    for y in range(self.arr_y):
        self.pe_array.append(ModuleList())
        self.pe_ifmap_chns.append(ModuleList())
        self.pe_filter_chns.append(ModuleList())
        self.pe_psum_chns.append(ModuleList())
        for x in range(self.arr_x):
            self.pe_ifmap_chns[y].append(
                Channel(32, name='pe_ifmap_chns_{}_{}'.format(x, y)))
            self.pe_filter_chns[y].append(
                Channel(32, name='pe_filter_chns_{}_{}'.format(x, y)))
            self.pe_psum_chns[y + 1].append(
                Channel(32, name='pe_psum_chns_{}_{}'.format(x, y)))
            self.pe_array[y].append(
                PE(x, y, self.pe_ifmap_chns[y][x],
                   self.pe_filter_chns[y][x],
                   self.pe_psum_chns[y][x],
                   self.pe_psum_chns[y + 1][x]))

    # Setup NoC to deliver weights, ifmaps and psums
    self.filter_noc = WeightsNoC(self.weights_rd_chn, self.pe_filter_chns,
                                 self.chn_per_word)
    self.ifmap_noc = IFMapNoC(self.ifmap_rd_chn, self.pe_ifmap_chns,
                              self.arr_x, self.chn_per_word)
    self.psum_rd_noc = PSumRdNoC(self.psum_rd_chn, self.pe_psum_chns[0],
                                 self.chn_per_word)
    self.psum_wr_noc = PSumWrNoC(self.pe_psum_chns[-1],
                                 self.psum_noc_wr_chn,
                                 self.psum_output_chn, self.chn_per_word)
class IFMapGLB(Module):
    def instantiate(self, wr_chn, rd_chn, glb_depth, chn_per_word):
        self.wr_chn = wr_chn
        self.rd_chn = rd_chn
        self.chn_per_word = chn_per_word
        self.name = 'ifmap_glb'

        self.stat_type = 'show'
        self.raw_stats = {'size': (glb_depth, chn_per_word),
                          'ifmap_glb_rd': 0, 'ifmap_glb_wr': 0}

        self.sram = SRAM(glb_depth, chn_per_word)
        self.last_read = Channel(3)

        self.image_size = (0, 0)
        self.filter_size = (0, 0)
        self.fmap_sets = 0
        self.fmap_per_iteration = 0

        self.curr_set = 0
        self.fmap_idx = 0
        self.iteration = 0
        self.wr_done = False

    def configure(self, image_size, filter_size, fmap_sets,
                  fmap_per_iteration):
        self.wr_done = False
        self.image_size = image_size
        self.filter_size = filter_size
        self.fmap_sets = fmap_sets
        self.fmap_per_iteration = fmap_per_iteration
        self.read_ctr = 0

    def tick(self):
        num_iteration = self.filter_size[0] * self.filter_size[1]
        offset_x = (self.filter_size[0] - 1) // 2
        offset_y = (self.filter_size[1] - 1) // 2
        filter_x = self.iteration % self.filter_size[0] - offset_x
        filter_y = self.iteration // self.filter_size[0] - offset_y

        if not self.wr_done:
            # Write to GLB
            if self.wr_chn.valid():
                data = self.wr_chn.pop()
                self.raw_stats['ifmap_glb_wr'] += len(data)
                # print("ifmap_glb wr")
                # Write ifmap to glb
                addr = self.fmap_sets * self.fmap_idx + self.curr_set
                # print("ifmap_to_glb: fmap idx, curr set, addr ",
                #       self.fmap_idx, self.curr_set, addr)
                self.curr_set += 1
                self.sram.request(WR, addr, data)
                if self.curr_set == self.fmap_sets:
                    self.curr_set = 0
                    self.fmap_idx += 1
                    if self.fmap_idx == self.fmap_per_iteration:
                        # Done initializing ifmaps and psums
                        # self.sram.dump()
                        self.fmap_idx = 0
                        self.wr_done = True
        else:
            # Read from GLB and deal with SRAM latency
            if self.rd_chn.vacancy(1) and self.iteration < num_iteration:
                self.read_ctr += 1
                # print("ifmap glb read ctr ", self.read_ctr)
                fmap_x = self.fmap_idx % self.image_size[0]
                fmap_y = self.fmap_idx // self.image_size[0]
                ifmap_x, ifmap_y = (fmap_x + filter_x, fmap_y + filter_y)
                if (ifmap_x < 0) or (ifmap_x >= self.image_size[0]) or \
                        (ifmap_y < 0) or (ifmap_y >= self.image_size[1]):
                    # print("ifmap req zero: iter, fmap idx ",
                    #       self.iteration, self.fmap_idx)
                    self.last_read.push(True)
                else:
                    fmap_idx = (ifmap_y * self.image_size[0]) + ifmap_x
                    addr = self.fmap_sets * fmap_idx + self.curr_set
                    # print("addr fmap idx, addr: ", fmap_idx, addr)
                    # print("ifmap req glb: iter, fmap idx, addr ",
                    #       self.iteration, self.fmap_idx, addr)
                    self.sram.request(RD, addr)
                    self.last_read.push(False)
                self.curr_set += 1
                if self.curr_set == self.fmap_sets:
                    self.curr_set = 0
                    self.fmap_idx += 1
                    if self.fmap_idx == self.fmap_per_iteration:
                        # print("fmap idx, fmap per iter: ",
                        #       self.fmap_idx, self.fmap_per_iteration)
                        self.fmap_idx = 0
                        self.iteration += 1

            # Process the last read sent to the GLB SRAM
            if self.last_read.valid():
                is_zero = self.last_read.pop()
                data = [0]*self.chn_per_word if is_zero else \
                    [e for e in self.sram.response()]
                # print("ifmap rd glb", data, self.iteration)
                self.rd_chn.push(data)
                self.raw_stats['ifmap_glb_rd'] += len(data)
def instantiate(self, arr_y, input_chn, output_chn, block_size, num_nonzero,
                ifmap_glb_depth, psum_glb_depth, weight_glb_depth):
    # PE static configuration (immutable)
    self.name = 'chip'
    # self.arr_x = arr_x
    self.arr_y = arr_y
    self.block_size = block_size
    self.num_nonzero = num_nonzero

    self.stat_type = 'show'

    # Instantiate DRAM IO channels
    self.input_chn = input_chn
    self.output_chn = output_chn

    # Instantiate input deserializer and output serializer
    self.ifmap_wr_chn = Channel()
    self.psum_wr_chn = Channel()
    self.weights_wr_chn = Channel()
    self.deserializer = InputDeserializer(self.input_chn, self.ifmap_wr_chn,
                                          self.weights_wr_chn,
                                          self.psum_wr_chn, arr_y,
                                          block_size, num_nonzero)
    self.psum_output_chn = Channel()
    self.serializer = OutputSerializer(self.output_chn, self.psum_output_chn)

    # Instantiate GLB and GLB channels
    self.ifmap_rd_chn = Channel(3)
    # self.ifmap_glb = IFMapGLB(self.ifmap_wr_chn, self.ifmap_rd_chn, arr_y,
    #                           ifmap_glb_depth, block_size, num_nonzero)

    self.psum_rd_chn = Channel(3)
    self.psum_noc_wr_chn = Channel()
    self.psum_glb = PSumGLB(self.psum_wr_chn, self.psum_noc_wr_chn,
                            self.psum_rd_chn, psum_glb_depth, block_size,
                            num_nonzero)

    self.weights_rd_chn = Channel()
    # self.weights_glb = WeightsGLB(self.weights_wr_chn,
    #                               self.weights_rd_chn, weight_glb_depth,
    #                               block_size)
    self.ifmap_weights_glb = IFMapWeightsGLB(
        self.ifmap_wr_chn, self.ifmap_rd_chn, self.weights_wr_chn,
        self.weights_rd_chn, arr_y, ifmap_glb_depth, weight_glb_depth,
        block_size, num_nonzero)

    # PE array and local channel declaration
    self.pe_array = ModuleList()
    self.pe_ifmap_chns = ModuleList()
    self.pe_filter_chns = ModuleList()
    self.pe_psum_in_chns = ModuleList()
    self.pe_psum_out_chns = ModuleList()

    # Actual array instantiation
    for y in range(self.arr_y):
        self.pe_array.append(ModuleList())
        self.pe_ifmap_chns.append(ModuleList())
        self.pe_filter_chns.append(ModuleList())
        self.pe_psum_in_chns.append(ModuleList())
        self.pe_psum_out_chns.append(ModuleList())
        for x in range(1):
            self.pe_ifmap_chns[y].append(Channel(32))
            self.pe_filter_chns[y].append(Channel(32))
            self.pe_psum_in_chns[y].append(Channel(32))
            self.pe_psum_out_chns[y].append(Channel(32))
            self.pe_array[y].append(
                PE(x, y, self.pe_ifmap_chns[y][x],
                   self.pe_filter_chns[y][x],
                   self.pe_psum_in_chns[y][x],
                   self.pe_psum_out_chns[y][x]))

    # Setup NoC to deliver weights, ifmaps and psums
    self.filter_noc = WeightsNoC(self.weights_rd_chn, self.pe_filter_chns,
                                 block_size)
    self.ifmap_noc = IFMapNoC(self.ifmap_rd_chn, self.pe_ifmap_chns)
    self.psum_rd_noc = PSumRdNoC(self.psum_rd_chn, self.pe_psum_in_chns,
                                 self.arr_y, block_size)
    self.psum_wr_noc = PSumWrNoC(self.pe_psum_out_chns,
                                 self.psum_noc_wr_chn,
                                 self.psum_output_chn, self.arr_y,
                                 block_size)
class PSumGLB(Module):
    def instantiate(self, dram_wr_chn, noc_wr_chn, rd_chn, glb_depth,
                    chn_per_word):
        self.dram_wr_chn = dram_wr_chn
        self.noc_wr_chn = noc_wr_chn
        self.rd_chn = rd_chn
        self.chn_per_word = chn_per_word
        self.name = 'psum_glb'

        self.stat_type = 'show'
        self.raw_stats = {'size': (glb_depth, chn_per_word),
                          'psum_glb_rd': 0, 'psum_glb_wr': 0}

        self.sram = SRAM(glb_depth, chn_per_word, nports=2)
        self.last_read = Channel(3)

        self.filter_size = (0, 0)
        self.fmap_sets = 0
        self.fmap_per_iteration = 0

        self.rd_set = 0
        self.fmap_rd_idx = 0
        self.iteration = 0

        self.wr_set = 0
        self.fmap_wr_idx = 0
        self.wr_done = False

    def configure(self, filter_size, fmap_sets, fmap_per_iteration):
        self.wr_done = False
        self.filter_size = filter_size
        self.fmap_sets = fmap_sets
        self.fmap_per_iteration = fmap_per_iteration

        self.rd_set = 0
        self.fmap_rd_idx = 0
        self.iteration = 0

        self.wr_set = 0
        self.fmap_wr_idx = 0
        self.wr_done = False

    def tick(self):
        num_iteration = self.filter_size[0] * self.filter_size[1]

        if not self.wr_done:
            # Write to GLB
            if self.dram_wr_chn.valid():
                data = self.dram_wr_chn.pop()
                self.raw_stats['psum_glb_wr'] += len(data)
                # print("psum_glb wr")
                # Write psum to glb
                addr = self.fmap_sets * self.fmap_wr_idx + self.wr_set
                self.wr_set += 1
                self.sram.request(WR, addr, data, port=1)
                if self.wr_set == self.fmap_sets:
                    self.wr_set = 0
                    self.fmap_wr_idx += 1
                    if self.fmap_wr_idx == self.fmap_per_iteration:
                        # Done initializing ifmaps and psums
                        # self.sram.dump()
                        self.fmap_wr_idx = 0
                        self.wr_done = True
                # print("psum orig write, fmap_sets, fmap_wr_idx, wr_set, addr, data: ",
                #       self.fmap_sets, self.fmap_wr_idx, self.wr_set, addr, data)
        else:
            # Read from GLB and deal with SRAM latency
            # print(self.rd_chn.vacancy(1), self.rd_chn.rd_ptr.rd(), self.rd_chn.wr_ptr.rd())
            if self.rd_chn.vacancy(1) and self.iteration < num_iteration:
                addr = self.fmap_sets * self.fmap_rd_idx + self.rd_set
                # print("psum req glb", self.iteration, self.fmap_rd_idx, self.rd_set)
                self.sram.request(RD, addr, port=0)
                self.last_read.push(False)
                self.rd_set += 1
                if self.rd_set == self.fmap_sets:
                    self.rd_set = 0
                    self.fmap_rd_idx += 1
                    if self.fmap_rd_idx == self.fmap_per_iteration:
                        self.fmap_rd_idx = 0
                        self.iteration += 1

            # Process the last read sent to the GLB SRAM
            if self.last_read.valid():
                is_zero = self.last_read.pop()
                data = [0]*self.chn_per_word if is_zero else \
                    [e for e in self.sram.response()]
                self.rd_chn.push(data)
                self.raw_stats['psum_glb_rd'] += len(data)
                # print("psum rd glb: data", data)

            if self.noc_wr_chn.valid():
                data = self.noc_wr_chn.pop()
                # print("psum_to_glb: ", self.fmap_wr_idx, self.wr_set, data)
                self.raw_stats['psum_glb_wr'] += len(data)
                addr = self.fmap_sets * self.fmap_wr_idx + self.wr_set
                # print("noc psum wr glb", self.fmap_wr_idx, self.wr_set, data)
                self.wr_set += 1
                self.sram.request(WR, addr, data, port=1)
                if self.wr_set == self.fmap_sets:
                    self.wr_set = 0
                    self.fmap_wr_idx += 1
                    if self.fmap_wr_idx == self.fmap_per_iteration:
                        # Done writing this pass of psums
                        # self.sram.dump()
                        self.fmap_wr_idx = 0
class IFMapGLB(Module):
    def instantiate(self, wr_chn, rd_chn, glb_depth, chn_per_word):
        self.wr_chn = wr_chn
        self.rd_chn = rd_chn
        self.chn_per_word = chn_per_word
        self.glb_depth = glb_depth
        self.name = 'ifmap_glb'

        self.stat_type = 'show'
        self.raw_stats = {'size': (glb_depth, chn_per_word),
                          'ifmap_glb_rd': 0, 'ifmap_glb_wr': 0}

        self.sram = SRAM(glb_depth, chn_per_word)
        self.last_read = Channel(3)

        self.image_size = (0, 0)
        self.filter_size = (0, 0)
        self.fmap_sets = 0
        self.fmap_per_iteration = 0

        self.curr_set = 0
        self.fmap_idx = 0
        self.iteration = 0
        self.wr_done = False

    def configure(self, image_size, filter_size, fmap_sets,
                  fmap_per_iteration):
        self.wr_done = False
        self.image_size = image_size
        self.filter_size = filter_size
        self.fmap_sets = fmap_sets
        self.fmap_per_iteration = fmap_per_iteration
        self.curr_tile = 0
        self.num_tiles = 4
        self.addr = 0
        print("ifmap glb_size: ", self.glb_depth)

    def tick(self):
        num_iteration = self.filter_size[0] * self.filter_size[1]
        offset_x = (self.filter_size[0] - 1) // 2
        offset_y = (self.filter_size[1] - 1) // 2
        filter_x = self.iteration % self.filter_size[0] - offset_x
        filter_y = self.iteration // self.filter_size[0] - offset_y

        if not self.wr_done:
            # Write to GLB
            if self.wr_chn.valid():
                data = self.wr_chn.pop()
                self.raw_stats['ifmap_glb_wr'] += len(data)
                # print("ifmap_glb wr")
                # Write ifmap to glb
                addr = self.fmap_sets * self.curr_tile + self.curr_set + \
                        self.fmap_idx * self.num_tiles
                # print("ifmap_to_glb: ", self.curr_tile, self.fmap_idx, addr)
                self.curr_set += 1
                self.sram.request(WR, addr, data)
                if self.curr_set == self.fmap_sets:
                    self.curr_set = 0
                    self.curr_tile += 1
                    if self.curr_tile == self.num_tiles:
                        # Done initializing ifmaps and psums
                        # self.sram.dump()
                        self.curr_tile = 0
                        self.fmap_idx += 1
                        if self.fmap_idx == self.fmap_per_iteration:
                            self.wr_done = True
        else:
            if self.rd_chn.vacancy(1) and self.addr < self.glb_depth:
                # Read from GLB and deal with SRAM latency
                self.sram.request(RD, self.addr)
                # print("read_ifmap_glb: ", self.addr)
                self.addr += 1
                self.last_read.push(False)

            # Process the last read sent to the GLB SRAM
            if self.last_read.valid():
                # print("ifmap_glb_to_noc")
                is_zero = self.last_read.pop()
                data = [e for e in self.sram.response()]
                # print("ifmap rd glb", data)
                self.rd_chn.push(data)
                self.raw_stats['ifmap_glb_rd'] += len(data)
class PSumGLB(Module):
    def instantiate(self, dram_wr_chn, noc_wr_chn, rd_chn, glb_depth,
                    chn_per_word):
        self.dram_wr_chn = dram_wr_chn
        self.noc_wr_chn = noc_wr_chn
        self.rd_chn = rd_chn
        self.chn_per_word = chn_per_word
        self.name = 'psum_glb'

        self.stat_type = 'show'
        self.raw_stats = {'size': (glb_depth, chn_per_word), 'rd': 0, 'wr': 0}

        self.sram = SRAM(glb_depth, chn_per_word, nports=2, name=self.name)
        self.last_read = Channel(3, name='last_read')

        self.filter_size = (0, 0)
        self.fmap_sets = 0
        self.fmap_per_iteration = 0

        self.rd_set = 0
        self.fmap_rd_idx = 0
        self.iteration = 0

        self.wr_set = 0
        self.fmap_wr_idx = 0
        self.wr_done = False

    def configure(self, filter_size, fmap_sets, fmap_per_iteration):
        self.wr_done = False
        self.filter_size = filter_size
        self.fmap_sets = fmap_sets
        self.fmap_per_iteration = fmap_per_iteration

        self.rd_set = 0
        self.fmap_rd_idx = 0
        self.iteration = 0

        self.wr_set = 0
        self.fmap_wr_idx = 0
        self.wr_done = False

    def tick(self):
        num_iteration = self.filter_size[0] * self.filter_size[1]

        if not self.wr_done:
            # Write to GLB
            if self.dram_wr_chn.valid():
                data = self.dram_wr_chn.pop()
                self.raw_stats['wr'] += 1
                # print("psum_glb wr")
                # Write psum to glb
                # print("psum_to_glb: ", in_sets, self.fmap_wr_idx, self.wr_set)
                addr = self.fmap_sets * self.fmap_wr_idx + self.wr_set
                self.wr_set += 1
                self.sram.request(WR, addr, data, port=0)
                if self.wr_set == self.fmap_sets:
                    self.wr_set = 0
                    self.fmap_wr_idx += 1
                    if self.fmap_wr_idx == self.fmap_per_iteration:
                        # Done initializing ifmaps and psums
                        # self.sram.dump()
                        self.fmap_wr_idx = 0
                        self.wr_done = True
        else:
            # Read from GLB and deal with SRAM latency
            # print(self.rd_chn.vacancy(1), self.rd_chn.rd_ptr.rd(), self.rd_chn.wr_ptr.rd())
            if self.rd_chn.vacancy(1) and self.iteration < num_iteration:
                addr = self.fmap_sets * self.fmap_rd_idx + self.rd_set
                # print("psum req glb", self.iteration, self.fmap_rd_idx, self.rd_set)
                self.sram.request(RD, addr, port=0)
                self.last_read.push(False)
                self.rd_set += 1
                if self.rd_set == self.fmap_sets:
                    self.rd_set = 0
                    self.fmap_rd_idx += 1
                    if self.fmap_rd_idx == self.fmap_per_iteration:
                        self.fmap_rd_idx = 0
                        self.iteration += 1

            # Process the last read sent to the GLB SRAM
            if self.last_read.valid():
                is_zero = self.last_read.pop()
                data = [0]*self.chn_per_word if is_zero else \
                    [e for e in self.sram.response()]
                self.rd_chn.push(data)
                self.raw_stats['rd'] += 1
                # print("psum rd glb", data)

            # If we can pull an element off of the write channel, do it
            # and write it into the location specified by the current
            # fmap_sets, fmap_wr_idx, and wr_set!
            if self.noc_wr_chn.valid():
                # print("psum_to_glb: ", self.fmap_wr_idx, self.wr_set)
                data = self.noc_wr_chn.pop()
                self.raw_stats['wr'] += 1
                addr = self.fmap_sets * self.fmap_wr_idx + self.wr_set
                # print("psum wr glb", self.fmap_wr_idx, self.wr_set, data)
                self.wr_set += 1
                self.sram.request(WR, addr, data, port=1)
                if self.wr_set == self.fmap_sets:
                    self.wr_set = 0
                    self.fmap_wr_idx += 1
                    if self.fmap_wr_idx == self.fmap_per_iteration:
                        # Done writing this pass of psums
                        # self.sram.dump()
                        self.fmap_wr_idx = 0
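# Both PSumGLB variants lean on a multi-port SRAM with a one-cycle read
# latency (the last_read channel exists to remember what was requested).
# Below is a minimal, hypothetical model of the request/response behavior
# assumed here. It is a sketch for reference only, not the simulator's
# actual SRAM; the RD/WR encoding is also an assumption.
RD, WR = 0, 1

class SRAMModel:
    def __init__(self, depth, width, nports=2):
        self.mem = [[0] * width for _ in range(depth)]
        self.nports = nports
        self.pending = []   # reads captured this tick
        self.ready = []     # reads available to response()

    def request(self, mode, addr, data=None, port=0):
        assert port < self.nports
        if mode == WR:
            self.mem[addr] = list(data)
        else:
            self.pending.append(list(self.mem[addr]))

    def response(self):
        return self.ready.pop(0)

    def tick(self):
        # Advance one cycle: requested reads become visible.
        self.ready.extend(self.pending)
        self.pending = []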
def instantiate(self, arr_x, arr_y, input_chn, output_chn, chn_per_word,
                ifmap_glb_depth):
    # PE static configuration (immutable)
    self.name = 'chip'
    self.arr_x = arr_x
    self.arr_y = arr_y
    self.chn_per_word = chn_per_word
    self.post_tr_x = arr_x         # num output channels = 8
    self.post_tr_y = 4             # num tiles = 4
    self.pre_tr_ifmap_x = arr_y    # num input channels = 4
    self.pre_tr_ifmap_y = 4        # num tiles = 4
    self.pre_tr_weights_x = arr_y  # num input channels = 4
    self.pre_tr_weights_y = arr_x  # num output channels = 8

    self.stat_type = 'show'

    # Instantiate DRAM IO channels
    self.input_chn = input_chn
    self.output_chn = output_chn

    # Instantiate input deserializer and output serializer
    self.ifmap_wr_chn = Channel()
    self.weights_wr_chn = Channel()
    self.bias_wr_chn = Channel()
    self.deserializer = InputDeserializer(self.input_chn, self.ifmap_wr_chn,
                                          self.weights_wr_chn,
                                          self.bias_wr_chn, arr_x, arr_y,
                                          chn_per_word)
    self.psum_output_chn = Channel()
    self.serializer = OutputSerializer(self.output_chn, self.psum_output_chn)

    # Instantiate GLB and GLB channels
    self.ifmap_glb_wr_chn = Channel(3)
    self.ifmap_rd_chn = Channel(3)
    self.ifmap_glb = IFMapGLB(self.ifmap_glb_wr_chn, self.ifmap_rd_chn,
                              ifmap_glb_depth, chn_per_word)

    self.psum_rd_chn = Channel(3)
    self.psum_noc_wr_chn = Channel()
    # self.psum_glb = PSumGLB(self.psum_wr_chn, self.psum_noc_wr_chn,
    #                         self.psum_rd_chn, psum_glb_depth, chn_per_word)

    self.weights_glb_wr_chn = Channel(3)
    self.weights_rd_chn = Channel()
    self.weights_glb = WeightsGLB(self.weights_glb_wr_chn,
                                  self.weights_rd_chn)

    self.bias_rd_chn = Channel()
    self.bias_glb = BiasGLB(self.bias_wr_chn, self.bias_rd_chn)

    # PE array and local channel declaration
    self.pe_array = ModuleList()
    self.pe_ifmap_chns = ModuleList()
    self.pe_filter_chns = ModuleList()
    self.pe_psum_chns = ModuleList()
    self.pe_psum_chns.append(ModuleList())
    for x in range(self.arr_x):
        self.pe_psum_chns[0].append(Channel(32))

    # Actual PE array instantiation
    for y in range(self.arr_y):
        self.pe_array.append(ModuleList())
        self.pe_ifmap_chns.append(ModuleList())
        self.pe_filter_chns.append(ModuleList())
        self.pe_psum_chns.append(ModuleList())
        for x in range(self.arr_x):
            self.pe_ifmap_chns[y].append(Channel(32))
            self.pe_filter_chns[y].append(Channel(32))
            self.pe_psum_chns[y + 1].append(Channel(32))
            self.pe_array[y].append(
                PE(x, y, self.pe_ifmap_chns[y][x],
                   self.pe_filter_chns[y][x],
                   self.pe_psum_chns[y][x],
                   self.pe_psum_chns[y + 1][x]))

    # Pre-transform IFMap array and local channel declaration
    self.pre_tr_ifmap_array = ModuleList()
    self.pre_tr_ifmap_in_chns = ModuleList()
    self.pre_tr_ifmap_out_chns = ModuleList()

    # Actual pre-transform IFMap array instantiation
    for y in range(self.pre_tr_ifmap_y):
        self.pre_tr_ifmap_array.append(ModuleList())
        self.pre_tr_ifmap_in_chns.append(ModuleList())
        self.pre_tr_ifmap_out_chns.append(ModuleList())
        for x in range(self.pre_tr_ifmap_x):
            self.pre_tr_ifmap_in_chns[y].append(Channel(32))
            self.pre_tr_ifmap_out_chns[y].append(Channel(32))
            self.pre_tr_ifmap_array[y].append(
                PreTransformIFMap(x, y, self.pre_tr_ifmap_in_chns[y][x],
                                  self.pre_tr_ifmap_out_chns[y][x]))

    # Pre-transform weight array and local channel declaration
    self.pre_tr_weights_array = ModuleList()
    self.pre_tr_weights_in_chns = ModuleList()
    self.pre_tr_weights_out_chns = ModuleList()

    # Actual pre-transform weight array instantiation
    for y in range(self.pre_tr_weights_y):
        self.pre_tr_weights_array.append(ModuleList())
        self.pre_tr_weights_in_chns.append(ModuleList())
        self.pre_tr_weights_out_chns.append(ModuleList())
        for x in range(self.pre_tr_weights_x):
            self.pre_tr_weights_in_chns[y].append(Channel(32))
            self.pre_tr_weights_out_chns[y].append(Channel(32))
            self.pre_tr_weights_array[y].append(
                PreTransformWeights(x, y,
                                    self.pre_tr_weights_in_chns[y][x],
                                    self.pre_tr_weights_out_chns[y][x]))

    # Post-transform array and local channel declaration
    self.post_tr_array = ModuleList()
    self.post_tr_bias_chns = ModuleList()
    self.post_tr_ofmap_in_chns = ModuleList()
    self.post_tr_ofmap_out_chns = ModuleList()

    # Actual post-transform array instantiation
    for y in range(self.post_tr_y):
        self.post_tr_array.append(ModuleList())
        self.post_tr_bias_chns.append(ModuleList())
        self.post_tr_ofmap_in_chns.append(ModuleList())
        self.post_tr_ofmap_out_chns.append(ModuleList())
        for x in range(self.post_tr_x):
            self.post_tr_bias_chns[y].append(Channel(32))
            self.post_tr_ofmap_in_chns[y].append(Channel(32))
            self.post_tr_ofmap_out_chns[y].append(Channel(32))
            self.post_tr_array[y].append(
                PostTransform(x, y, self.post_tr_bias_chns[y][x],
                              self.post_tr_ofmap_in_chns[y][x],
                              self.post_tr_ofmap_out_chns[y][x]))

    # Setup NoC to deliver weights, ifmaps and psums
    self.filter_noc = WeightsNoC(self.weights_rd_chn, self.pe_filter_chns,
                                 self.chn_per_word)
    self.ifmap_noc = IFMapNoC(self.ifmap_rd_chn, self.pe_ifmap_chns,
                              self.arr_x, self.chn_per_word)
    self.psum_rd_noc = PSumRdNoC(self.pe_psum_chns[0], self.chn_per_word)
    # self.psum_wr_noc = PSumWrNoC(self.pe_psum_chns[-1],
    #                              self.psum_output_chn, self.chn_per_word)
    self.bias_noc = BiasNoC(self.bias_rd_chn, self.post_tr_bias_chns,
                            self.chn_per_word)

    # Setup NoC for post-transform blocks
    self.post_tr_wr_noc = PostTrWrNoC(self.pe_psum_chns[-1],
                                      self.post_tr_ofmap_in_chns,
                                      self.chn_per_word)
    self.post_tr_rd_noc = PostTrRdNoC(self.post_tr_ofmap_out_chns,
                                      self.psum_output_chn,
                                      self.chn_per_word)

    # Instantiate tiler for ifmaps
    self.ifmap_tiler = IFMapTiler(self.ifmap_wr_chn,
                                  self.pre_tr_ifmap_in_chns,
                                  self.chn_per_word)

    # Setup NoC for pre-transform blocks
    # self.pre_tr_ifmap_wr_noc = PreTrIFMapWrNoC(self.ifmap_wr_chn,
    #                                            self.pre_tr_ifmap_in_chns,
    #                                            self.chn_per_word)
    self.pre_tr_ifmap_rd_noc = PreTrIFMapRdNoC(self.pre_tr_ifmap_out_chns,
                                               self.ifmap_glb_wr_chn,
                                               self.chn_per_word)
    self.pre_tr_weights_wr_noc = PreTrWeightsWrNoC(
        self.weights_wr_chn, self.pre_tr_weights_in_chns, self.chn_per_word)
    self.pre_tr_weights_rd_noc = PreTrWeightsRdNoC(
        self.pre_tr_weights_out_chns, self.weights_glb_wr_chn,
        self.chn_per_word)
class IFMapGLB(Module):
    def instantiate(self, wr_chn, rd_chn, glb_depth, chn_per_word):
        self.wr_chn = wr_chn
        self.rd_chn = rd_chn
        self.chn_per_word = chn_per_word
        self.name = 'ifmap_glb'

        self.stat_type = 'show'
        self.raw_stats = {'size': (glb_depth, chn_per_word), 'rd': 0, 'wr': 0}

        self.sram = SRAM(glb_depth, chn_per_word, name=self.name)
        self.last_read = Channel(3, name='last_read')

        self.image_size = (0, 0)
        self.filter_size = (0, 0)
        self.fmap_sets = 0
        self.full_fmap_sets = 0
        self.fmap_per_iteration = 0

        self.curr_set = 0
        self.fmap_idx = 0
        self.iteration = 0
        self.tile_in = 0
        self.tile_out = 0
        self.wr_done = False
        self.task_done = True

    def configure(self, image_size, filter_size, fmap_sets, full_fmap_sets,
                  tiles_out, fmap_per_iteration):
        self.wr_done = False
        self.curr_set = 0
        self.fmap_idx = 0
        self.iteration = 0
        self.image_size = image_size
        self.filter_size = filter_size
        self.fmap_sets = fmap_sets
        self.full_fmap_sets = full_fmap_sets
        self.fmap_per_iteration = fmap_per_iteration
        self.tiles_out = tiles_out
        self.tile_in = 0
        self.tile_out = 0
        self.task_done = False

    def tick(self):
        num_iteration = self.filter_size[0] * self.filter_size[1]
        offset_x = (self.filter_size[0] - 1) // 2
        offset_y = (self.filter_size[1] - 1) // 2
        filter_x = self.iteration % self.filter_size[0] - offset_x
        filter_y = self.iteration // self.filter_size[0] - offset_y
        tiles_in = self.full_fmap_sets // self.fmap_sets

        if self.task_done:
            return

        if not self.wr_done:
            # Write to GLB
            if self.wr_chn.valid():
                data = self.wr_chn.pop()
                # print("ifmap_glb wr")
                # Write ifmap to glb
                # print("ifmap_to_glb: ", in_sets, self.fmap_idx, self.curr_set)
                addr = self.full_fmap_sets * self.fmap_idx + self.curr_set
                self.curr_set += 1
                self.sram.request(WR, addr, data)
                self.raw_stats['wr'] += len(data)
                if self.curr_set == self.full_fmap_sets:
                    self.curr_set = 0
                    self.fmap_idx += 1
                    if self.fmap_idx == self.fmap_per_iteration:
                        # Done initializing ifmaps and psums
                        # self.sram.dump()
                        self.fmap_idx = 0
                        self.wr_done = True
        else:
            did_read = False
            # Read from GLB and deal with SRAM latency
            if self.rd_chn.vacancy(1) and self.iteration < num_iteration \
                    and self.tile_in < tiles_in:
                fmap_x = self.fmap_idx % self.image_size[0]
                fmap_y = self.fmap_idx // self.image_size[0]
                ifmap_x, ifmap_y = (fmap_x + filter_x, fmap_y + filter_y)
                if (ifmap_x < 0) or (ifmap_x >= self.image_size[0]) or \
                        (ifmap_y < 0) or (ifmap_y >= self.image_size[1]):
                    # print("ifmap req zero", self.iteration, self.fmap_idx)
                    self.last_read.push(True)
                else:
                    fmap_idx = (ifmap_y * self.image_size[0]) + ifmap_x
                    addr = self.fmap_sets * (fmap_idx * tiles_in +
                                             self.tile_in) + self.curr_set
                    # print("ifmap req glb", self.iteration, self.fmap_idx)
                    self.sram.request(RD, addr)
                    self.raw_stats['rd'] += self.chn_per_word
                    self.last_read.push(False)
                did_read = True
                self.curr_set += 1
                if self.curr_set == self.fmap_sets:
                    self.curr_set = 0
                    self.fmap_idx += 1
                    if self.fmap_idx == self.fmap_per_iteration:
                        self.fmap_idx = 0
                        self.iteration += 1

            # Process the last read sent to the GLB SRAM
            if self.last_read.valid():
                is_zero = self.last_read.pop()
                data = [0]*self.chn_per_word if is_zero else \
                    [e for e in self.sram.response()]
                # print("ifmap rd glb", data)
                self.rd_chn.push(data)
            elif not did_read:
                if self.iteration == num_iteration:
                    self.iteration = 0
                    self.tile_in += 1
                    if self.tile_in == tiles_in:
                        self.tile_in = 0
                        self.tile_out += 1
                        if self.tile_out == self.tiles_out:
                            self.tile_out = 0
                            self.task_done = True
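# In this tiled variant, writes lay all input tiles of a pixel out
# contiguously, while reads index one input tile's slice at a time. A
# standalone restatement of the two mappings (function names are
# illustrative, not part of the simulator):
def tiled_wr_addr(fmap_idx, curr_set, full_fmap_sets):
    # Write side: curr_set sweeps the full tile width per pixel.
    return full_fmap_sets * fmap_idx + curr_set

def tiled_rd_addr(fmap_idx, tile_in, curr_set, fmap_sets, tiles_in):
    # Read side: select one input tile's fmap sets for that pixel.
    return fmap_sets * (fmap_idx * tiles_in + tile_in) + curr_set

# With fmap_sets=2 and tiles_in=3 (so full_fmap_sets=6), pixel 1 / tile 0
# starts exactly where pixel 0's last tile ends:
assert tiled_rd_addr(1, 0, 0, 2, 3) == tiled_wr_addr(1, 0, 6) == 6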