class ChannelTB(Module):
    """Testbench that randomly pushes to and frees from a Channel(4),
    checking that the occupants always equal their expected sequence
    numbers (FIFO order is preserved)."""

    def instantiate(self):
        self.channel = Channel(4)
        self.push_count = 0   # total values pushed so far
        self.free_count = 0   # total values freed so far
        self.test_size = 100  # stop pushing/freeing after this many

    def tick(self):
        # Snapshot the channel contents, asserting that the value at each
        # occupied slot equals free_count + slot (i.e. nothing was lost
        # or reordered).
        contents = []
        occupancy = 0
        while self.channel.valid(occupancy):
            item = self.channel.peek(occupancy)
            assert item == self.free_count + occupancy
            contents.append(item)
            occupancy += 1
        print("channel: %s" % contents)

        # With probability 1/2, push the next sequence number if the
        # channel has room and the budget is not exhausted.
        # (random.random() is drawn unconditionally to keep the RNG
        # stream identical across ticks.)
        roll_push = random.random()
        if roll_push < 0.5 and self.push_count < self.test_size \
                and self.channel.vacancy():
            self.channel.push(self.push_count)
            print("push: %d" % self.push_count)
            self.push_count += 1

        # With probability 1/2, free a random prefix of the occupied slots.
        roll_free = random.random()
        if roll_free < 0.5 and self.free_count < self.test_size \
                and occupancy != 0:
            freed = random.randint(1, occupancy)
            self.channel.free(freed)
            self.free_count += freed
            print("free: %d" % freed)
class ConverterTB(Module):
    """Testbench driving a Converter followed by a pruner.

    Pushes randomly generated blocks of ifmap data (input_size words at a
    time) into in_chn, lets Converter regroup them into block_size blocks on
    mid_chn, and prints whatever the pruner emits on out_chn.  Output must
    be checked by hand (see the Finish message).
    """

    def instantiate(self):
        self.name = 'tb'
        self.input_size = 4                                 # words pushed per tick
        self.block_size = 12                                # words per converted block
        self.in_sets = self.block_size // self.input_size   # pushes per block
        self.num_nonzero = 5                                # nonzeros kept by the pruner
        self.preserve_order = True

        self.in_chn = Channel()
        self.mid_chn = Channel()
        self.out_chn = Channel()

        self.converter = Converter(self.in_chn, self.mid_chn, self.input_size, self.block_size)
        # Alternative pruners kept for experimentation; ClusteredPruner is active.
        #self.pruner = NaivePruner(self.mid_chn,self.out_chn,self.num_nonzero, self.block_size, self.preserve_order)
        self.pruner = ClusteredPruner(self.mid_chn,self.out_chn,self.num_nonzero, self.block_size, self.preserve_order)
        #self.pruner = ThresholdPruner(self.mid_chn,self.out_chn,self.num_nonzero, self.block_size, self.preserve_order)

        self.iterations = 10
        self.iteration = 0     # which stimulus row is being pushed
        self.curr_set = 0      # which input_size chunk of the row is next
        self.out_counter = 0   # outputs seen so far

        # Each element is 0 with probability 1/2, else uniform in [1, 5].
        self.test_data = [[randint(1,5) if randint(0,3)>1 else 0\
                for j in range(self.block_size)]\
                for i in range(self.iterations+1)] # send in one extra iteration to flush out last outputs
        print("Stimulus:")
        print("[")
        for i in range(len(self.test_data)-1):
            print(self.test_data[i])
        print("]")

    def tick(self):
        # Feed the next input_size-wide slice of the current stimulus row.
        if (self.in_chn.vacancy() and not self.iteration == self.iterations+1):
            imin = self.curr_set*self.input_size
            imax = imin+self.input_size
            data = [self.test_data[self.iteration][i] for i in range(imin, imax)]
            self.in_chn.push(data)

            self.curr_set += 1
            if (self.curr_set == self.in_sets):
                self.curr_set = 0
                self.iteration += 1

        # Drain and print pruner output; stop after `iterations` outputs.
        if (self.out_chn.valid()):
            data = self.out_chn.pop()
            print(data)
            #print("out_counter: ", self.out_counter)
            self.out_counter += 1
            if (self.out_counter == self.iterations):
                raise Finish("Check manually")
class IFMapGLB(Module):
    """Input-feature-map global buffer (tiled-write variant).

    Write phase: pops words from wr_chn and stores them in SRAM with an
    address interleaved across num_tiles tiles.  Read phase: streams the
    whole SRAM out linearly (addresses 0..glb_depth-1) to rd_chn, using
    the last_read channel to model SRAM read latency.
    """

    def instantiate(self, wr_chn, rd_chn, glb_depth, chn_per_word):
        self.wr_chn = wr_chn
        self.rd_chn = rd_chn
        self.chn_per_word = chn_per_word
        self.glb_depth = glb_depth
        self.name = 'ifmap_glb'

        self.stat_type = 'show'
        self.raw_stats = {
            'size': (glb_depth, chn_per_word),
            'ifmap_glb_rd': 0,
            'ifmap_glb_wr': 0
        }

        self.sram = SRAM(glb_depth, chn_per_word)
        # In-flight read tracker; depth 3 bounds outstanding SRAM reads.
        self.last_read = Channel(3)

        self.image_size = (0, 0)
        self.filter_size = (0, 0)
        self.fmap_sets = 0
        self.fmap_per_iteration = 0

        self.curr_set = 0
        self.fmap_idx = 0
        self.iteration = 0
        self.wr_done = False

    def configure(self, image_size, filter_size, fmap_sets, fmap_per_iteration):
        """Reset state for a new pass.  NOTE: num_tiles is hard-coded to 4."""
        self.wr_done = False
        self.image_size = image_size
        self.filter_size = filter_size
        self.fmap_sets = fmap_sets
        self.fmap_per_iteration = fmap_per_iteration
        self.curr_tile = 0
        self.num_tiles = 4   # hard-coded tile count for the interleaved layout
        self.addr = 0        # linear read pointer for the read phase
        print("ifmap glb_size: ", self.glb_depth)

    def tick(self):
        # Filter-offset bookkeeping; unused by this variant's read phase
        # (reads are linear), kept for parity with the other GLB variants.
        num_iteration = self.filter_size[0] * self.filter_size[1]
        offset_x = (self.filter_size[0] - 1) // 2
        offset_y = (self.filter_size[1] - 1) // 2
        filter_x = self.iteration % self.filter_size[0] - offset_x
        filter_y = self.iteration // self.filter_size[0] - offset_y

        if not self.wr_done:
            # Write to GLB
            if self.wr_chn.valid():
                data = self.wr_chn.pop()
                self.raw_stats['ifmap_glb_wr'] += len(data)
                # print "ifmap_glb wr"
                # Write ifmap to glb, tile-interleaved:
                # tile-major within a fmap row, fmap rows strided by num_tiles.
                addr = self.fmap_sets * self.curr_tile + self.curr_set + self.fmap_idx * self.num_tiles
                #print ("ifmap_to_glb: ", self.curr_tile, self.fmap_idx, addr)
                self.curr_set += 1
                self.sram.request(WR, addr, data)
                # Advance set -> tile -> fmap counters (carry-chain style).
                if self.curr_set == self.fmap_sets:
                    self.curr_set = 0
                    self.curr_tile += 1
                    if self.curr_tile == self.num_tiles:
                        # Done initializing ifmaps and psums
                        # self.sram.dump()
                        self.curr_tile = 0
                        self.fmap_idx += 1
                        if self.fmap_idx == self.fmap_per_iteration:
                            self.wr_done = True
        else:
            if self.rd_chn.vacancy(1) and self.addr < self.glb_depth:
                # Read from GLB and deal with SRAM latency
                self.sram.request(RD, self.addr)
                #print ("read_ifmap_glb: ", self.addr)
                self.addr += 1
                self.last_read.push(False)

            # Process the last read sent to the GLB SRAM
            if self.last_read.valid():
                #print ("ifmap_glb_to_noc")
                is_zero = self.last_read.pop()
                data = [e for e in self.sram.response()]
                # print "ifmap rd glb", data
                self.rd_chn.push(data)
                self.raw_stats['ifmap_glb_rd'] += len(data)
class IFMapGLB(Module):
    """Input-feature-map global buffer (weight-stationary, zero-padding variant).

    Write phase: buffers ifmap words from wr_chn into SRAM, addressed as
    fmap_sets * fmap_idx + curr_set.  Read phase: for each filter-weight
    position (self.iteration), walks every fmap pixel, offsets it by the
    current filter tap, and either reads the corresponding SRAM word or
    emits a block of zeros when the tap falls outside the image (padding).
    """

    def instantiate(self, wr_chn, rd_chn, glb_depth, chn_per_word):
        self.wr_chn = wr_chn
        self.rd_chn = rd_chn
        self.chn_per_word = chn_per_word
        self.name = 'ifmap_glb'

        self.stat_type = 'show'
        self.raw_stats = {'size': (glb_depth, chn_per_word), 'rd': 0, 'wr': 0}

        self.sram = SRAM(glb_depth, chn_per_word, name=self.name)
        # Per-read flag: True means "this read is a padding zero, no SRAM
        # response to consume".  Depth 3 bounds outstanding reads.
        self.last_read = Channel(3, name='last_read')

        self.image_size = (0, 0)
        self.filter_size = (0, 0)
        self.fmap_sets = 0
        self.fmap_per_iteration = 0

        self.curr_set = 0
        self.fmap_idx = 0
        self.iteration = 0
        self.wr_done = False

    def configure(self, image_size, filter_size, fmap_sets, fmap_per_iteration):
        self.wr_done = False
        self.image_size = image_size
        self.filter_size = filter_size
        self.fmap_sets = fmap_sets
        self.fmap_per_iteration = fmap_per_iteration

    def tick(self):
        # self.iteration is which weight we are currently using
        # It's weight stationary so we fully use a set of filter weights
        # before continuing on.
        # (first weight in each filter, second weight in each filter, etc...)
        num_iteration = self.filter_size[0] * self.filter_size[1]
        offset_x = (self.filter_size[0] - 1) // 2
        offset_y = (self.filter_size[1] - 1) // 2
        # Signed tap offset of the current filter position from center.
        filter_x = self.iteration % self.filter_size[0] - offset_x
        filter_y = self.iteration // self.filter_size[0] - offset_y

        # This is the first tick since initializing
        # INITIALIZATION CODE
        # Write all ifmaps and psums? to sram
        if not self.wr_done:
            # Write to GLB
            if self.wr_chn.valid():
                data = self.wr_chn.pop()
                # print "ifmap_glb wr"
                self.raw_stats['wr'] += 1
                # Write ifmap to glb
                # print "ifmap_to_glb: ", in_sets, self.fmap_idx, self.curr_set
                addr = self.fmap_sets * self.fmap_idx + self.curr_set
                self.curr_set += 1
                self.sram.request(WR, addr, data)
                if self.curr_set == self.fmap_sets:
                    self.curr_set = 0
                    self.fmap_idx += 1
                    if self.fmap_idx == self.fmap_per_iteration:
                        # Done initializing ifmaps and psums
                        # self.sram.dump()
                        self.fmap_idx = 0
                        self.wr_done = True
        else:
            # Read from GLB and deal with SRAM latency
            if self.rd_chn.vacancy(1) and self.iteration < num_iteration:
                fmap_x = self.fmap_idx % self.image_size[0]
                fmap_y = self.fmap_idx // self.image_size[0]
                # Pixel actually needed for this filter tap.
                ifmap_x, ifmap_y = (fmap_x + filter_x, fmap_y + filter_y)
                if (ifmap_x < 0) or (ifmap_x >= self.image_size[0]) or \
                        (ifmap_y < 0) or (ifmap_y >= self.image_size[1]):
                    # Out of bounds: record a padding-zero read.
                    # print "ifmap req zero", self.iteration, self.fmap_idx
                    self.last_read.push(True)
                else:
                    fmap_idx = (ifmap_y * self.image_size[0]) + ifmap_x
                    addr = self.fmap_sets * fmap_idx + self.curr_set
                    # print "ifmap req glb", self.iteration, self.fmap_idx
                    self.sram.request(RD, addr)
                    self.last_read.push(False)

                # Advance set -> pixel -> filter-tap counters.
                self.curr_set += 1
                if self.curr_set == self.fmap_sets:
                    self.curr_set = 0
                    self.fmap_idx += 1
                    if self.fmap_idx == self.fmap_per_iteration:
                        self.fmap_idx = 0
                        self.iteration += 1

            # Process the last read sent to the GLB SRAM
            if self.last_read.valid():
                is_zero = self.last_read.pop()
                # self.raw_stats['rd'] += 1
                # Padding reads become a zero block; real reads consume the
                # SRAM response.
                data = [0]*self.chn_per_word if is_zero else \
                    [e for e in self.sram.response()]
                # print "ifmap rd glb", data
                self.rd_chn.push(data)
                self.raw_stats['rd'] += 1
class PSumGLB(Module):
    """Partial-sum global buffer with a dual-port SRAM.

    Initialization phase: fills the SRAM from dram_wr_chn (port 0).
    Steady state, per tick: (a) issues psum reads toward rd_chn (port 0),
    modeling SRAM latency through last_read; (b) writes updated psums
    arriving from the NoC on noc_wr_chn (port 1).  Read and write sides
    keep independent set/fmap counters.
    """

    def instantiate(self, dram_wr_chn, noc_wr_chn, rd_chn, glb_depth, chn_per_word):
        self.dram_wr_chn = dram_wr_chn
        self.noc_wr_chn = noc_wr_chn
        self.rd_chn = rd_chn
        self.chn_per_word = chn_per_word
        self.name = 'psum_glb'

        self.stat_type = 'show'
        self.raw_stats = {'size': (glb_depth, chn_per_word), 'rd': 0, 'wr': 0}

        # Two ports so a read (port 0) and a NoC write (port 1) can be
        # requested in the same tick.
        self.sram = SRAM(glb_depth, chn_per_word, nports=2, name=self.name)
        self.last_read = Channel(3, name='last_read')

        self.filter_size = (0, 0)
        self.fmap_sets = 0
        self.fmap_per_iteration = 0

        # Read-side counters.
        self.rd_set = 0
        self.fmap_rd_idx = 0
        self.iteration = 0
        # Write-side counters.
        self.wr_set = 0
        self.fmap_wr_idx = 0
        self.wr_done = False

    def configure(self, filter_size, fmap_sets, fmap_per_iteration):
        self.wr_done = False
        self.filter_size = filter_size
        self.fmap_sets = fmap_sets
        self.fmap_per_iteration = fmap_per_iteration

        self.rd_set = 0
        self.fmap_rd_idx = 0
        self.iteration = 0
        self.wr_set = 0
        self.fmap_wr_idx = 0
        self.wr_done = False

    def tick(self):
        num_iteration = self.filter_size[0] * self.filter_size[1]

        if not self.wr_done:
            # Write to GLB (initial psum load from DRAM).
            if self.dram_wr_chn.valid():
                data = self.dram_wr_chn.pop()
                self.raw_stats['wr'] += 1
                # print "psum_glb wr"
                # Write ifmap to glb
                # print "ifmap_to_glb: ", in_sets, self.fmap_idx, self.curr_set
                addr = self.fmap_sets * self.fmap_wr_idx + self.wr_set
                self.wr_set += 1
                self.sram.request(WR, addr, data, port=0)
                if self.wr_set == self.fmap_sets:
                    self.wr_set = 0
                    self.fmap_wr_idx += 1
                    if self.fmap_wr_idx == self.fmap_per_iteration:
                        # Done initializing ifmaps and psums
                        # self.sram.dump()
                        self.fmap_wr_idx = 0
                        self.wr_done = True
        else:
            # Read from GLB and deal with SRAM latency
            # print self.rd_chn.vacancy(1), self.rd_chn.rd_ptr.rd(), self.rd_chn.wr_ptr.rd()
            if self.rd_chn.vacancy(1) and self.iteration < num_iteration:
                addr = self.fmap_sets * self.fmap_rd_idx + self.rd_set
                # print "psum req glb", self.iteration, self.fmap_rd_idx, self.rd_set
                self.sram.request(RD, addr, port=0)
                self.last_read.push(False)

                self.rd_set += 1
                if self.rd_set == self.fmap_sets:
                    self.rd_set = 0
                    self.fmap_rd_idx += 1
                    if self.fmap_rd_idx == self.fmap_per_iteration:
                        self.fmap_rd_idx = 0
                        self.iteration += 1

            # Process the last read sent to the GLB SRAM
            if self.last_read.valid():
                is_zero = self.last_read.pop()
                data = [0]*self.chn_per_word if is_zero else \
                    [e for e in self.sram.response()]
                self.rd_chn.push(data)
                self.raw_stats['rd'] += 1
                # print "psum rd glb", data

            # If we can pull an element off of the write channel, do it
            # and write it into the location specified by the current
            # fmap_sets, fmap_wr_idx, and wr_set.
            if self.noc_wr_chn.valid():
                # print "psum_to_glb: ", self.fmap_wr_idx, self.wr_set
                data = self.noc_wr_chn.pop()
                self.raw_stats['wr'] += 1
                addr = self.fmap_sets * self.fmap_wr_idx + self.wr_set
                # print "psum wr glb", self.fmap_wr_idx, self.wr_set, data
                self.wr_set += 1
                self.sram.request(WR, addr, data, port=1)
                if self.wr_set == self.fmap_sets:
                    self.wr_set = 0
                    self.fmap_wr_idx += 1
                    if self.fmap_wr_idx == self.fmap_per_iteration:
                        # Done initializing ifmaps and psums
                        # self.sram.dump()
                        self.fmap_wr_idx = 0
class IFMapGLB(Module):
    """Input-feature-map global buffer with input/output tiling.

    Like the zero-padding variant, but the stored fmap is split into
    tiles_in = full_fmap_sets // fmap_sets input tiles; the whole read
    sweep is replayed once per (tile_in, tile_out) combination, and
    task_done goes high after all tiles_out sweeps complete.
    """

    def instantiate(self, wr_chn, rd_chn, glb_depth, chn_per_word):
        self.wr_chn = wr_chn
        self.rd_chn = rd_chn
        self.chn_per_word = chn_per_word
        self.name = 'ifmap_glb'

        self.stat_type = 'show'
        self.raw_stats = {'size' : (glb_depth, chn_per_word), 'rd': 0, 'wr': 0}

        self.sram = SRAM(glb_depth, chn_per_word, name=self.name)
        # True entries mark padding-zero reads with no SRAM response.
        self.last_read = Channel(3, name='last_read')

        self.image_size = (0, 0)
        self.filter_size = (0, 0)
        self.fmap_sets = 0        # sets streamed per tile
        self.full_fmap_sets = 0   # sets stored per pixel (all tiles)
        self.fmap_per_iteration = 0

        self.curr_set = 0
        self.fmap_idx = 0
        self.iteration = 0
        self.tile_in = 0
        self.tile_out = 0
        self.wr_done = False
        self.task_done = True   # idle until configure() is called

    def configure(self, image_size, filter_size, fmap_sets, full_fmap_sets, tiles_out, fmap_per_iteration):
        self.wr_done = False
        self.curr_set = 0
        self.fmap_idx = 0
        self.iteration = 0
        self.image_size = image_size
        self.filter_size = filter_size
        self.fmap_sets = fmap_sets
        self.full_fmap_sets = full_fmap_sets
        self.fmap_per_iteration = fmap_per_iteration
        self.tiles_out = tiles_out
        self.tile_in = 0
        self.tile_out = 0
        self.task_done = False

    def tick(self):
        num_iteration = self.filter_size[0]*self.filter_size[1]
        offset_x = (self.filter_size[0] - 1)//2
        offset_y = (self.filter_size[1] - 1)//2
        # Signed tap offset of the current filter position from center.
        filter_x = self.iteration % self.filter_size[0] - offset_x
        filter_y = self.iteration // self.filter_size[0] - offset_y
        tiles_in = self.full_fmap_sets // self.fmap_sets
        if self.task_done:
            return
        if not self.wr_done:
            # Write to GLB
            if self.wr_chn.valid():
                data = self.wr_chn.pop()
                # print "ifmap_glb wr"
                # Write ifmap to glb
                # print "ifmap_to_glb: ", in_sets, self.fmap_idx, self.curr_set
                addr = self.full_fmap_sets*self.fmap_idx + self.curr_set
                self.curr_set += 1
                self.sram.request(WR, addr, data)
                self.raw_stats['wr'] += len(data)
                if self.curr_set == self.full_fmap_sets:
                    self.curr_set = 0
                    self.fmap_idx += 1
                    if self.fmap_idx == self.fmap_per_iteration:
                        # Done initializing ifmaps and psums
                        # self.sram.dump()
                        self.fmap_idx = 0
                        self.wr_done = True
        else:
            did_read = False
            # Read from GLB and deal with SRAM latency
            if self.rd_chn.vacancy(1) and self.iteration < num_iteration and self.tile_in < tiles_in:
                fmap_x = self.fmap_idx % self.image_size[0]
                fmap_y = self.fmap_idx // self.image_size[0]
                ifmap_x, ifmap_y = (fmap_x + filter_x, fmap_y + filter_y)
                if (ifmap_x < 0) or (ifmap_x >= self.image_size[0]) or \
                        (ifmap_y < 0) or (ifmap_y >= self.image_size[1]):
                    # Out of bounds: padding-zero read.
                    # print "ifmap req zero", self.iteration, self.fmap_idx
                    self.last_read.push(True)
                else:
                    fmap_idx = (ifmap_y*self.image_size[0]) + ifmap_x
                    # Address within the current input tile.
                    addr = self.fmap_sets*(fmap_idx*tiles_in+self.tile_in) + self.curr_set
                    # print "ifmap req glb", self.iteration, self.fmap_idx
                    self.sram.request(RD, addr)
                    self.raw_stats['rd'] += self.chn_per_word
                    self.last_read.push(False)
                did_read = True

                self.curr_set += 1
                if self.curr_set == self.fmap_sets:
                    self.curr_set = 0
                    self.fmap_idx += 1
                    if self.fmap_idx == self.fmap_per_iteration:
                        self.fmap_idx = 0
                        self.iteration += 1

            # Process the last read sent to the GLB SRAM
            if self.last_read.valid():
                is_zero = self.last_read.pop()
                data = [0]*self.chn_per_word if is_zero else \
                    [e for e in self.sram.response()]
                # print "ifmap rd glb", data
                self.rd_chn.push(data)
            elif not did_read:
                # Pipeline fully drained and nothing issued this tick:
                # advance to the next tile, or finish the task.
                if self.iteration == num_iteration:
                    self.iteration = 0
                    self.tile_in += 1
                    if self.tile_in == tiles_in:
                        self.tile_in = 0
                        self.tile_out += 1
                        if self.tile_out == self.tiles_out:
                            self.tile_out = 0
                            self.task_done = True
class IFMapGLB(Module):
    """Input-feature-map global buffer (zero-padding variant with a
    read-counter debug hook).

    Same structure as the other padding variant: write phase buffers wr_chn
    words into SRAM at fmap_sets*fmap_idx + curr_set; read phase walks every
    pixel for each filter tap and pushes either the SRAM word or a zero
    block (padding) to rd_chn.  Stats count words (len(data)) rather than
    transactions.
    """

    def instantiate(self, wr_chn, rd_chn, glb_depth, chn_per_word):
        self.wr_chn = wr_chn
        self.rd_chn = rd_chn
        self.chn_per_word = chn_per_word
        self.name = 'ifmap_glb'

        self.stat_type = 'show'
        self.raw_stats = {'size' : (glb_depth, chn_per_word), 'ifmap_glb_rd': 0, 'ifmap_glb_wr': 0}

        self.sram = SRAM(glb_depth, chn_per_word)
        # True entries mark padding-zero reads with no SRAM response.
        self.last_read = Channel(3)

        self.image_size = (0, 0)
        self.filter_size = (0, 0)
        self.fmap_sets = 0
        self.fmap_per_iteration = 0

        self.curr_set = 0
        self.fmap_idx = 0
        self.iteration = 0
        self.wr_done = False

    def configure(self, image_size, filter_size, fmap_sets, fmap_per_iteration):
        self.wr_done = False
        self.image_size = image_size
        self.filter_size = filter_size
        self.fmap_sets = fmap_sets
        self.fmap_per_iteration = fmap_per_iteration
        self.read_ctr = 0   # debug: total read requests issued

    def tick(self):
        num_iteration = self.filter_size[0]*self.filter_size[1]
        offset_x = (self.filter_size[0] - 1)//2
        offset_y = (self.filter_size[1] - 1)//2
        # Signed tap offset of the current filter position from center.
        filter_x = self.iteration % self.filter_size[0] - offset_x
        filter_y = self.iteration // self.filter_size[0] - offset_y

        if not self.wr_done:
            # Write to GLB
            if self.wr_chn.valid():
                data = self.wr_chn.pop()
                self.raw_stats['ifmap_glb_wr'] += len(data)
                # print "ifmap_glb wr"
                # Write ifmap to glb
                addr = self.fmap_sets*self.fmap_idx + self.curr_set
                # print("ifmap_to_glb: fmap idx, curr set, addr ", self.fmap_idx, self.curr_set, addr)
                self.curr_set += 1
                self.sram.request(WR, addr, data)
                if self.curr_set == self.fmap_sets:
                    self.curr_set = 0
                    self.fmap_idx += 1
                    if self.fmap_idx == self.fmap_per_iteration:
                        # Done initializing ifmaps and psums
                        # self.sram.dump()
                        self.fmap_idx = 0
                        self.wr_done = True
        else:
            # Read from GLB and deal with SRAM latency
            if self.rd_chn.vacancy(1) and self.iteration < num_iteration:
                self.read_ctr += 1
                #print("ifmap glb read ctr ", self.read_ctr)
                fmap_x = self.fmap_idx % self.image_size[0]
                fmap_y = self.fmap_idx // self.image_size[0]
                ifmap_x, ifmap_y = (fmap_x + filter_x, fmap_y + filter_y)
                if (ifmap_x < 0) or (ifmap_x >= self.image_size[0]) or \
                        (ifmap_y < 0) or (ifmap_y >= self.image_size[1]):
                    # Out of bounds: padding-zero read.
                    # print("ifmap req zero: iter, fmap idx ", self.iteration, self.fmap_idx)
                    self.last_read.push(True)
                else:
                    fmap_idx = (ifmap_y*self.image_size[0]) + ifmap_x
                    addr = self.fmap_sets*fmap_idx + self.curr_set
                    # print("addr fmap idx, addr: ", fmap_idx, addr)
                    #print("ifmap req glb: iter, fmap idx, addr ", self.iteration, self.fmap_idx, addr)
                    self.sram.request(RD, addr)
                    self.last_read.push(False)

                # Advance set -> pixel -> filter-tap counters.
                self.curr_set += 1
                if self.curr_set == self.fmap_sets:
                    self.curr_set = 0
                    self.fmap_idx += 1
                    if self.fmap_idx == self.fmap_per_iteration:
                        # print("fmap idx, fmap per iter: ", self.fmap_idx, self.fmap_per_iteration)
                        self.fmap_idx = 0
                        self.iteration += 1

            # Process the last read sent to the GLB SRAM
            if self.last_read.valid():
                is_zero = self.last_read.pop()
                data = [0]*self.chn_per_word if is_zero else \
                    [e for e in self.sram.response()]
                #print("ifmap rd glb", data, self.iteration)
                self.rd_chn.push(data)
                self.raw_stats['ifmap_glb_rd'] += len(data)
class WeightsGLB(Module):
    """Weights global buffer.

    Write phase: buffers filter weights from wr_chn into SRAM, addressed
    by (iteration, output set, input set).  Read phase: streams the weights
    back out in the same order to rd_chn.  When the read sweep finishes and
    the pipeline drains, wr_done is cleared so the whole write/read cycle
    can repeat.
    """

    def instantiate(self, wr_chn, rd_chn, glb_depth, chn_per_word):
        self.wr_chn = wr_chn
        self.rd_chn = rd_chn
        self.chn_per_word = chn_per_word
        self.name = 'weight_glb'

        self.stat_type = 'show'
        self.raw_stats = {'size' : (glb_depth, chn_per_word), 'rd': 0, 'wr': 0}

        self.sram = SRAM(glb_depth, chn_per_word, name=self.name)
        self.last_read = Channel(3, name='last_read')

        self.filter_size = (0, 0)
        self.in_sets = 0
        self.out_sets = 0

        self.curr_set = 0
        self.fmap_idx = 0
        self.iteration = 0
        self.tile = 0
        self.wr_done = False

    def configure(self, filter_size, in_sets, out_sets):
        self.wr_done = False
        self.filter_size = filter_size
        self.in_sets = in_sets
        self.out_sets = out_sets
        self.tile = 0
        # Debug leftover: mirrors every written block (grows unboundedly;
        # never read back).
        self.stuff = []

    def tick(self):
        num_iteration = self.filter_size[0]*self.filter_size[1]

        if not self.wr_done:
            # Write to GLB
            if self.wr_chn.valid():
                data = self.wr_chn.pop()
                # print "ifmap_glb wr"
                # Write ifmap to glb
                # print "ifmap_to_glb: ", in_sets, self.fmap_idx, self.curr_set
                # Address layout: iteration-major, then output set, then input set.
                addr = self.in_sets*(self.out_sets*self.iteration+self.fmap_idx) + self.curr_set
                self.stuff.append(data)
                self.curr_set += 1
                self.sram.request(WR, addr, data)
                self.raw_stats['wr'] += len(data)
                if self.curr_set == self.in_sets:
                    self.curr_set = 0
                    self.fmap_idx += 1
                    if self.fmap_idx == self.out_sets:
                        # Done initializing ifmaps and psums
                        # self.sram.dump()
                        self.fmap_idx = 0
                        self.iteration += 1
                        if self.iteration == num_iteration:
                            self.iteration = 0
                            self.wr_done = True
        else:
            did_read = False
            # Read from GLB and deal with SRAM latency
            if self.rd_chn.vacancy(1) and self.iteration < num_iteration:
                addr = self.in_sets*(self.out_sets*self.iteration+self.fmap_idx) + self.curr_set
                # print "ifmap req glb", self.iteration, self.fmap_idx
                self.sram.request(RD, addr)
                self.raw_stats['rd'] += self.chn_per_word
                self.last_read.push(False)
                did_read = True

                self.curr_set += 1
                if self.curr_set == self.in_sets:
                    self.curr_set = 0
                    self.fmap_idx += 1
                    if self.fmap_idx == self.out_sets:
                        self.fmap_idx = 0
                        self.iteration += 1

            # Process the last read sent to the GLB SRAM
            if self.last_read.valid():
                is_zero = self.last_read.pop()
                data = [0]*self.chn_per_word if is_zero else \
                    [e for e in self.sram.response()]
                # print "ifmap rd glb", data
                self.rd_chn.push(data)
            elif not did_read:
                # Read sweep done and pipeline drained: re-arm the write
                # phase (deliberately clears wr_done, not task completion).
                if self.iteration == num_iteration:
                    self.iteration = 0
                    self.wr_done = False
class InputSerializer(Module):
    """Serializes weights, biases, and ifmap activations into the
    architecture input channel.

    Order per pass: (1) all filter weights pushed directly to
    arch_input_chn, (2) all biases, (3) ifmap pixels routed through a
    Converter and a configurable pruner (which itself pushes to
    arch_input_chn).  After the ifmap is exhausted, zero blocks are fed to
    the converter to flush the pipeline.
    """

    def instantiate(self, arch_input_chn, arr_y, block_size, num_nonzero, pruner_name):
        # PE static configuration (immutable)
        #self.arr_x = arr_x
        self.arr_y = arr_y
        #self.chn_per_word = chn_per_word
        self.block_size = block_size
        self.num_nonzero = num_nonzero

        self.convert_chn = Channel()
        self.prune_chn = Channel()
        self.arch_input_chn = arch_input_chn
        # Although both InputSerializer and pruner will be pushing to arch_input_chn
        # There is no conflict issue because all weights will be pushed by IS first
        # then all inputs by pruner
        self.converter = Converter(self.convert_chn, self.prune_chn, \
                self.block_size, self.block_size)
        # self.pruner = NaivePruner(self.prune_chn,self.arch_input_chn, \
        #         self.num_nonzero,True)
        # User-defined pruner class for this layer, looked up by name on the
        # `pruner` module (default naive pruner).
        self.pruner = getattr(pruner, pruner_name)(self.prune_chn,self.arch_input_chn, \
                self.num_nonzero, self.block_size, True)

        self.ifmap = None
        self.weights = None
        self.bias = None
        self.image_size = (0, 0)
        self.filter_size = (0, 0)

        self.ifmap_psum_done = True
        self.pass_done = Reg(False)

        # State Counters
        self.curr_set = 0
        self.curr_filter = 0
        self.iteration = 0
        self.fmap_idx = 0
        self.curr_chn = 0
        self.curr_x = 0  # run through first two dimensions of input
        self.curr_y = 0
        self.bias_set = 0
        #self.send_bias = False

    def configure(self, ifmap, weights, bias, in_chn, out_chn, image_size, filter_size):
        """Load the arrays for a new pass and reset all counters.

        NOTE(review): weights appears to be indexed [f_x, f_y, channel,
        filter] and ifmap [x, y, channel] — confirm against the caller.
        """
        self.ifmap = ifmap
        self.weights = weights
        self.bias = bias

        self.in_chn = in_chn
        self.out_chn = out_chn

        self.image_size = image_size
        self.filter_size = filter_size

        self.ifmap_psum_done = False
        self.weights_done = False
        self.pass_done.wr(False)

        # State Counters
        self.curr_set = 0
        self.curr_filter = 0
        self.iteration = 0
        self.curr_chn = 0
        self.curr_x = 0  # run through first two dimensions of input
        self.curr_y = 0
        self.bias_set = 0
        #self.send_bias = False

    def tick(self):
        if self.pass_done.rd():
            return

        # Ifmap exhausted: keep feeding zero blocks to flush the
        # converter/pruner pipeline.
        if self.ifmap_psum_done:
            if self.convert_chn.vacancy():
                data = np.zeros(self.block_size)
                self.convert_chn.push(data)
            return

        in_sets = self.in_chn // self.block_size
        out_sets = self.out_chn // self.block_size
        num_iteration = self.filter_size[0] * self.filter_size[1]

        # read and hold all weights at the beginning for ease of implementation
        if not self.weights_done:
            f_x = self.iteration // self.filter_size[0]
            f_y = self.iteration % self.filter_size[0]

            # Push filters to PE columns. (PE is responsible for pop)
            if self.arch_input_chn.vacancy() and self.iteration < num_iteration:
                cmin = self.curr_filter * self.block_size
                cmax = cmin + self.block_size
                data = np.array([self.weights[f_x, f_y, self.curr_chn, c] \
                        for c in range(cmin, cmax)])
                #print("{},{},{},{}-{}".format(f_x,f_y,self.curr_chn,cmin,cmax))
                #print(data)
                self.arch_input_chn.push(data)  # Gives groups of four along num_filters axis
                self.curr_filter += 1
                if (self.curr_filter == out_sets):  # Loop through blocks of filters
                    self.curr_filter = 0
                    self.curr_chn += 1
                    if (self.curr_chn == self.in_chn):  # Loop through channels
                        self.curr_chn = 0
                        self.iteration += 1
                        if (self.iteration == num_iteration):  # Loop through 2D filter support
                            self.iteration = 0
                            #print("Weights done")
                            self.weights_done = True
        elif self.arch_input_chn.vacancy() and self.bias_set < out_sets:
            # Weights done: push one block of biases per tick.
            cmin = self.bias_set * self.block_size
            cmax = cmin + self.block_size
            data = np.array([self.bias[c] for c in range(cmin, cmax)])
            #print("bias (input serializer):")
            #print(data)
            self.arch_input_chn.push(data)
            self.bias_set += 1
        elif not self.ifmap_psum_done:
            # Biases done: stream ifmap blocks (one pixel's channel slice
            # at a time) into the converter.
            if self.convert_chn.vacancy():
                cmin = self.curr_set * self.block_size
                cmax = cmin + self.block_size
                #xmin = x
                #xmax = x+self.arr_x
                # Write ifmap to glb
                #data = np.array([ self.ifmap[x, self.curr_y, self.curr_chn] for x in range(xmin, xmax) ])
                data = np.array([ self.ifmap[self.curr_x, self.curr_y, c] for c in range(cmin, cmax) ])
                #print("{},{},{}-{}".format(self.curr_x, self.curr_y, cmin, cmax))
                #print(data)
                self.curr_set += 1
                if (self.curr_set == in_sets):
                    self.curr_set = 0
                    self.curr_y += 1
                    if (self.curr_y == self.image_size[1]):
                        self.curr_y = 0
                        self.curr_x += 1
                self.convert_chn.push(data)
                if (self.curr_x == self.image_size[0]):
                    self.curr_x = 0
                    self.ifmap_psum_done = True
class PSumGLB(Module):
    """Partial-sum global buffer (word-count stats variant).

    Initialization phase: fills the SRAM from dram_wr_chn.  Steady state,
    per tick: issues psum reads toward rd_chn (port 0, latency modeled via
    last_read) and writes updated psums from noc_wr_chn (port 1).  Both the
    DRAM and NoC writes use SRAM port 1 here; stats count words
    (len(data)), not transactions.
    """

    def instantiate(self, dram_wr_chn, noc_wr_chn, rd_chn, glb_depth, chn_per_word):
        self.dram_wr_chn = dram_wr_chn
        self.noc_wr_chn = noc_wr_chn
        self.rd_chn = rd_chn
        self.chn_per_word = chn_per_word
        self.name = 'psum_glb'

        self.stat_type = 'show'
        self.raw_stats = {'size' : (glb_depth, chn_per_word), 'psum_glb_rd': 0, 'psum_glb_wr': 0}

        # Two ports so a read (port 0) and a write (port 1) can be
        # requested in the same tick.
        self.sram = SRAM(glb_depth, chn_per_word, nports=2)
        self.last_read = Channel(3)

        self.filter_size = (0, 0)
        self.fmap_sets = 0
        self.fmap_per_iteration = 0

        # Read-side counters.
        self.rd_set = 0
        self.fmap_rd_idx = 0
        self.iteration = 0
        # Write-side counters.
        self.wr_set = 0
        self.fmap_wr_idx = 0
        self.wr_done = False

    def configure(self, filter_size, fmap_sets, fmap_per_iteration):
        self.wr_done = False
        self.filter_size = filter_size
        self.fmap_sets = fmap_sets
        self.fmap_per_iteration = fmap_per_iteration

        self.rd_set = 0
        self.fmap_rd_idx = 0
        self.iteration = 0
        self.wr_set = 0
        self.fmap_wr_idx = 0
        self.wr_done = False

    def tick(self):
        num_iteration = self.filter_size[0]*self.filter_size[1]

        if not self.wr_done:
            # Write to GLB (initial psum load from DRAM).
            if self.dram_wr_chn.valid():
                data = self.dram_wr_chn.pop()
                self.raw_stats['psum_glb_wr'] += len(data)
                # print "psum_glb wr"
                # Write ifmap to glb
                addr = self.fmap_sets*self.fmap_wr_idx + self.wr_set
                self.wr_set += 1
                self.sram.request(WR, addr, data, port=1)
                if self.wr_set == self.fmap_sets:
                    self.wr_set = 0
                    self.fmap_wr_idx += 1
                    if self.fmap_wr_idx == self.fmap_per_iteration:
                        # Done initializing ifmaps and psums
                        # self.sram.dump()
                        self.fmap_wr_idx = 0
                        self.wr_done = True
                #print ("psum orig write, fmap_sets, fmap_wr_idx, wr_set, addr, data: ",self.fmap_sets, self.fmap_wr_idx, self.wr_set, addr, data)
        else:
            # Read from GLB and deal with SRAM latency
            # print self.rd_chn.vacancy(1), self.rd_chn.rd_ptr.rd(), self.rd_chn.wr_ptr.rd()
            if self.rd_chn.vacancy(1) and self.iteration < num_iteration:
                addr = self.fmap_sets*self.fmap_rd_idx + self.rd_set
                #print("psum req glb", self.iteration, self.fmap_rd_idx, self.rd_set)
                self.sram.request(RD, addr, port=0)
                self.last_read.push(False)

                self.rd_set += 1
                if self.rd_set == self.fmap_sets:
                    self.rd_set = 0
                    self.fmap_rd_idx += 1
                    if self.fmap_rd_idx == self.fmap_per_iteration:
                        self.fmap_rd_idx = 0
                        self.iteration += 1

            # Process the last read sent to the GLB SRAM
            if self.last_read.valid():
                is_zero = self.last_read.pop()
                data = [0]*self.chn_per_word if is_zero else \
                    [e for e in self.sram.response()]
                self.rd_chn.push(data)
                self.raw_stats['psum_glb_rd'] += len(data)
                #print("psum rd glb: data", data)

            # Accept an updated psum block from the NoC, if any.
            if self.noc_wr_chn.valid():
                data = self.noc_wr_chn.pop()
                #print("psum_to_glb: ", self.fmap_wr_idx, self.wr_set, data)
                self.raw_stats['psum_glb_wr'] += len(data)
                addr = self.fmap_sets*self.fmap_wr_idx + self.wr_set
                #print("noc psum wr glb", self.fmap_wr_idx, self.wr_set, data)
                self.wr_set += 1
                self.sram.request(WR, addr, data, port=1)
                if self.wr_set == self.fmap_sets:
                    self.wr_set = 0
                    self.fmap_wr_idx += 1
                    if self.fmap_wr_idx == self.fmap_per_iteration:
                        # Done initializing ifmaps and psums
                        #self.sram.dump()
                        self.fmap_wr_idx = 0
class IFMapGLB(Module):
    """Input-feature-map global buffer — PROTOTYPE with a hard-coded
    read schedule.

    The general per-filter-tap read logic is commented out below; instead,
    reads follow the hard-coded fmap_indices table (marked "TODO: fix
    this"), and the write phase stops after a hard-coded 16 fmap rows.
    Presumably this is wired for one specific 4x4 fmap / 3x3 filter
    configuration — verify before reuse.
    """

    def instantiate(self, wr_chn, rd_chn, glb_depth, chn_per_word):
        self.wr_chn = wr_chn
        self.rd_chn = rd_chn
        self.chn_per_word = chn_per_word
        self.name = 'ifmap_glb'

        self.stat_type = 'show'
        self.raw_stats = {'size': (glb_depth, chn_per_word), 'rd': 0, 'wr': 0}

        self.sram = SRAM(glb_depth, chn_per_word)
        self.last_read = Channel(3)

        self.image_size = (0, 0)
        self.filter_size = (0, 0)
        self.fmap_sets = 0
        self.fmap_per_iteration = 0

        self.curr_set = 0
        self.fmap_idx = 0
        self.iteration = 0
        self.wr_done = False

    def configure(self, image_size, filter_size, fmap_sets, fmap_per_iteration):
        self.wr_done = False
        self.image_size = image_size
        self.filter_size = filter_size
        self.fmap_sets = fmap_sets
        self.fmap_per_iteration = fmap_per_iteration
        self.fmap_idx_ctr = 0   # index into the hard-coded fmap_indices table

    def tick(self):
        num_iteration = self.filter_size[0] * self.filter_size[1]
        offset_x = (self.filter_size[0] - 1) // 2
        offset_y = (self.filter_size[1] - 1) // 2
        # Center offsets deliberately disabled in this prototype.
        filter_x = self.iteration % self.filter_size[0] #- offset_x
        filter_y = self.iteration // self.filter_size[0] #- offset_y

        if not self.wr_done:
            # Write to GLB
            if self.wr_chn.valid():
                data = self.wr_chn.pop()
                self.raw_stats['wr'] += len(data)
                # print "ifmap_glb wr"
                # Write ifmap to glb
                addr = self.fmap_sets * self.fmap_idx + self.curr_set
                #print("ifmap_to_glb: fmap idx, curr set, addr ", self.fmap_idx, self.curr_set, addr)
                self.curr_set += 1
                self.sram.request(WR, addr, data)
                if self.curr_set == self.fmap_sets:
                    self.curr_set = 0
                    self.fmap_idx += 1
                    if self.fmap_idx == 16:  # self.fmap_per_iteration:  (hard-coded row count)
                        # Done initializing ifmaps and psums
                        # self.sram.dump()
                        self.fmap_idx = 0
                        self.wr_done = True
        else:
            # General read path, disabled in favor of the table below:
            # Read from GLB and deal with SRAM latency
            #if self.rd_chn.vacancy(1) and self.iteration < num_iteration:
            #    fmap_x = self.fmap_idx % self.image_size[0]
            #    fmap_y = self.fmap_idx // self.image_size[0]
            #    ifmap_x, ifmap_y = (fmap_x + filter_x, fmap_y + filter_y)
            #    if (ifmap_x < 0) or (ifmap_x >= self.image_size[0]) or \
            #            (ifmap_y < 0) or (ifmap_y >= self.image_size[1]):
            #        # print("ifmap req zero: iter, fmap idx ", self.iteration, self.fmap_idx)
            #        self.last_read.push(True)
            #    else:
            #        fmap_idx = (ifmap_y*self.image_size[0]) + ifmap_x
            #        # addr = self.fmap_sets*fmap_idx + self.curr_set
            #        #print("addr fmap idx, addr: ", fmap_idx, addr)
            #        print("ifmap req glb: iter, fmap idx, addr ", self.iteration, self.fmap_idx, addr)
            #        self.sram.request(RD, addr)
            #        self.last_read.push(False)
            #    self.curr_set += 1
            #    if self.curr_set == self.fmap_sets:
            #        self.curr_set = 0
            #        self.fmap_idx += 1
            #    if self.fmap_idx == self.fmap_per_iteration:
            #        # print("fmap idx, fmap per iter: ", self.fmap_idx, self.fmap_per_iteration)
            #        self.fmap_idx = 0
            #        self.iteration += 1

            # TODO: fix this
            # Hard-coded read order: groups of 4 fmap addresses per
            # iteration (one 2x2 output window each).
            fmap_indices = [
                0, 1, 4, 5, 1, 2, 5, 6, 2, 3, 6, 7,
                4, 5, 8, 9, 5, 6, 9, 10, 6, 7, 10, 11,
                8, 9, 12, 13, 9, 10, 13, 14, 10, 11, 14, 15
            ]
            if self.rd_chn.vacancy(1) and self.iteration < num_iteration:  # 9 iterations
                fmap_idx = fmap_indices[self.fmap_idx_ctr]
                addr = fmap_idx
                #print("addr fmap idx, addr: ", fmap_idx, addr)
                print("ifmap req glb: fmap_idx_ctr, addr ", self.fmap_idx_ctr, addr)
                self.sram.request(RD, addr)
                self.last_read.push(False)
                self.fmap_idx_ctr += 1
                # Every 4 table entries completes one iteration.
                if (self.fmap_idx_ctr % 4) == 0:
                    self.iteration += 1

            # Process the last read sent to the GLB SRAM
            if self.last_read.valid():
                if self.last_read.pop():
                    pass  # do nothing (padding-zero marker; never pushed here)
                else:
                    # push data to ifmap NOC
                    data = [e for e in self.sram.response()]
                    print("ifmap rd glb", data)
                    self.rd_chn.push(data)
                    self.raw_stats['rd'] += len(data)
class IFMapWeightsGLB(Module):
    """Combined global buffer for sparse ifmap activations and weights.

    Two SRAMs live in one module: ``isram`` holds compressed ifmap words
    (``num_nonzero`` triples of three values each) and ``wsram`` holds dense
    weight blocks. For every activation triple sent to the ifmap NoC, the
    address of the matching weight block is queued on ``weights_to_send`` so
    the weights NoC receives exactly the blocks the nonzero activations need.

    NOTE(review): the field layout of a triple is not visible here — data[0]
    appears to select the weight block and data[2] == 1 marks the last
    nonzero of a word; confirm against the producer of ifmap_wr_chn.
    """

    def instantiate(self, ifmap_wr_chn, ifmap_rd_chn, weights_wr_chn, weights_rd_chn,\
            arr_y, ifmap_glb_depth, weights_glb_depth, \
            block_size, num_nonzero):
        # Handshake channels (DRAM-side writes, NoC-side reads).
        self.ifmap_wr_chn = ifmap_wr_chn
        self.ifmap_rd_chn = ifmap_rd_chn
        self.weights_wr_chn = weights_wr_chn
        self.weights_rd_chn = weights_rd_chn
        self.arr_y = arr_y
        self.block_size = block_size
        self.num_nonzero = num_nonzero
        self.name = 'ifmap_weights_glb'
        self.in_chn = 0
        self.out_chn = 0
        self.stat_type = 'show'
        self.raw_stats = {
            'size': (ifmap_glb_depth, num_nonzero * 3),
            'rd': 0,
            'wr': 0
        }
        # Ifmap SRAM: one word = num_nonzero triples.
        self.isram = SRAM(ifmap_glb_depth, num_nonzero * 3, dtype=np.float16)
        self.ilast_read = Channel(3)
        self.ifmap_glb_depth = ifmap_glb_depth
        # Weights SRAM: one word = one dense block of weights.
        self.wsram = SRAM(weights_glb_depth, block_size, dtype=np.float16)
        self.wlast_read = Channel(1)
        # Channel depth of one here prevents SRAM reads from colliding:
        # was having issues with a later read 'replacing' an earlier one
        # and thus getting the wrong data; having only one extant access on
        # an SRAM at a time prevents this.
        self.weights_glb_depth = weights_glb_depth
        # Channel to hold addresses of weight blocks that still need to be
        # sent to the NoC.
        self.weights_to_send = Channel(3)
        self.image_size = (0, 0)
        self.filter_size = (0, 0)
        self.fmap_sets = 0
        self.fmap_per_iteration = 0
        self.curr_set = 0
        self.fmap_idx = 0
        self.iteration = 0
        # Separate done flags for the ifmap and weight write phases.
        self.iwr_done = False
        self.wwr_done = False
        # For managing convolution: current output pixel / channel walk.
        self.curr_x = 0
        self.curr_y = 0
        self.curr_chn = 0
        self.request_idx = 0
        self.send_idx = 0
        #self.curr_filt_x = 0
        #self.curr_filt_y = 0
        self.ifmap_done = False
        # For weights: self.addr doubles as the write pointer during fill
        # and as a channel-set counter afterwards (out_chn // block_size is
        # the "idle" sentinel).
        self.addr = 0
        self.base_addr = 0  # to store values from self.weights_to_send
        self.base_addr_wo_chn = -1  # current position within the 3x3 filter
        # Invalid weights and inputs to use at the end to flush out the last
        # outputs from the PE array.
        self.weights_to_flush = 0
        self.inputs_to_flush = 0
        self.needed_addr = 0
        self.ready_to_output = False  # ready to output a filter_size block of inputs
        self.curr_data = [0 for i in range(3 * num_nonzero)]
        self.curr_weights = [0 for i in range(block_size)]
        # data_idx == num_nonzero blocks other operations while actively
        # working through data; triples are sent one at a time.
        self.data_idx = num_nonzero

    def configure(self, image_size, filter_size, in_chn, out_chn,
                  fmap_per_iteration):
        """Reset per-layer state.

        NOTE(review): this sets self.wr_done, but tick() only tests
        self.iwr_done / self.wwr_done (set in instantiate) — likely a stale
        leftover; confirm configure is only called once per instantiate.
        """
        self.wr_done = False
        self.image_size = image_size
        self.filter_size = filter_size
        self.in_chn = in_chn
        self.out_chn = out_chn
        self.fmap_per_iteration = fmap_per_iteration
        # For managing convolution.
        self.curr_x = 0
        self.curr_y = 0
        self.curr_chn = 0
        self.request_idx = 0
        self.send_idx = 0
        self.curr_filt_x = 0
        self.curr_filt_y = 0
        self.curr_filt_set = 0
        self.ifmap_done = False
        offset_x = (self.filter_size[0] - 1) // 2
        offset_y = (self.filter_size[1] - 1) // 2
        # The first address needed to be filled in order to start sending.
        #self.needed_addr = (self.image_size[0]*(1+offset_y) + 1+offset_x) *\
        #        (self.in_chn // self.block_size) - 1
        self.needed_addr = (self.image_size[0]*(offset_y) + 1+offset_x) *\
                (self.in_chn // self.block_size) - 1
        # Goes high to transfer sram control to output.
        # Doing them synchronously would be better, but complicates things.
        self.ready_to_output = False

    def tick(self):
        """One simulation cycle: advance the weights path, then the ifmap path."""
        # WEIGHTS-------------------------------------------------------------------
        num_iterations = self.image_size[0] * self.image_size[
            1] * self.in_chn // self.block_size
        # Slots needed to hold every weight block.
        max_addr = self.filter_size[0] * self.filter_size[
            1] * self.in_chn * self.out_chn // self.block_size
        verbose = False
        if not self.wwr_done:
            # Weight fill phase: one incoming block per cycle.
            if self.weights_wr_chn.valid():
                data = self.weights_wr_chn.pop()
                self.raw_stats['wr'] += len(data)
                self.wsram.request(WR, self.addr, np.asarray(data))
                self.addr += 1
                if (self.addr == max_addr):
                    # Park self.addr at the idle sentinel for the read phase.
                    self.addr = self.out_chn // self.block_size
                    self.wwr_done = True
        else:
            # Within this block self.addr is re-used; here it is more
            # analogous to curr_set and counts through the current block of
            # filters being read.
            if (self.weights_to_send.valid()
                    and self.addr == self.out_chn // self.block_size):
                # Start servicing the next queued weight-block address
                # (queued below where self.weights_to_send.push(waddr) runs).
                self.base_addr = self.weights_to_send.pop()
                self.addr = 0
            elif (self.wlast_read.vacancy()
                  and not self.addr == self.out_chn // self.block_size):
                # Cycle through channel sets with self.addr, issuing SRAM
                # reads; responses are picked up in the next if statement.
                full_addr = self.base_addr + self.addr
                self.wsram.request(RD, full_addr)
                self.wlast_read.push(False)
                self.addr += 1
        # Catch SRAM responses; forward results to the WeightsNoC.
        if self.wlast_read.valid() and self.weights_rd_chn.vacancy(1):
            is_zero = self.wlast_read.pop()
            data = [e for e in self.wsram.response()]
            self.weights_rd_chn.push(data)
            self.raw_stats['rd'] += len(data)
        # These two if statements handle an end-of-computation issue: the PEs
        # do not detect the end without further inputs, so dummy weights and
        # inputs are sent to flush out the last outputs.
        if self.weights_rd_chn.vacancy(1) and not self.wlast_read.valid() and \
                not self.weights_to_send.valid() and self.addr == self.out_chn // self.block_size\
                and self.weights_to_flush > 0:
            self.weights_to_flush -= 1
            self.weights_rd_chn.push([0 for i in range(self.block_size)])
        if self.ifmap_done and self.inputs_to_flush > 0 and self.ifmap_rd_chn.vacancy(
                1):
            self.inputs_to_flush -= 1
            self.ifmap_rd_chn.push([-1, 0, 0])
        # IFMAP-------------------------------------------------------------------
        # Skip the ifmap path entirely once sending is done and drained.
        if not (self.ifmap_done and not self.ilast_read.valid()
                and not self.ready_to_output):
            verbose = False
            # Shorthand values that will be useful later.
            num_iteration = self.filter_size[0] * self.filter_size[1]
            offset_x = (self.filter_size[0] - 1) // 2
            offset_y = (self.filter_size[1] - 1) // 2
            filter_x = self.iteration % self.filter_size[0] - offset_x
            filter_y = self.iteration // self.filter_size[0] - offset_y
            in_sets = self.in_chn // self.block_size
            out_sets = self.out_chn // self.block_size
            if not self.iwr_done and not self.ready_to_output:
                # Ifmap fill phase: write one compressed word per cycle into
                # the circular buffer.
                if self.ifmap_wr_chn.valid():
                    data = self.ifmap_wr_chn.pop()
                    data = np.reshape(np.asarray(data), (-1))
                    full_addr = in_sets * self.fmap_idx + self.curr_set
                    self.curr_set += 1
                    # Circular buffer: wrap into the physical depth.
                    addr = full_addr % self.ifmap_glb_depth
                    # Once enough inputs are buffered, start sending.
                    if (full_addr == self.needed_addr):
                        self.ready_to_output = True
                        self.needed_addr += in_sets
                    self.isram.request(WR, addr, data)
                    self.raw_stats['wr'] += len(data)
                    if self.curr_set == self.fmap_sets:
                        self.curr_set = 0
                        self.fmap_idx += 1
                    if self.fmap_idx == self.fmap_per_iteration:
                        # Done initializing ifmaps and psums.
                        print("iw glb: Finished filling ifmap buffer")
                        self.fmap_idx = 0
                        self.iwr_done = True
            elif self.ready_to_output:
                increment_vals = False
                # Send data to the NoC.
                if (self.ilast_read.valid() and self.ifmap_rd_chn.vacancy(1)):
                    is_zero = self.ilast_read.pop()
                    if (not is_zero):
                        # Fresh SRAM word: begin walking its triples.
                        self.curr_data = [e for e in self.isram.response()]
                        self.data_idx = 0
                    else:
                        # Zero-padded position: nothing to send, just advance.
                        increment_vals = True
                elif (not self.data_idx == self.num_nonzero
                      and self.weights_to_send.vacancy()
                      and self.base_addr_wo_chn >= 0):
                    # Send the next triple of the current word.
                    data = [self.curr_data[i] for i in \
                            range(self.data_idx*3, self.data_idx*3 + 3)]
                    # Replace the first field with the flat output-pixel index.
                    data_mod = [self.curr_x*self.image_size[1]+self.curr_y,\
                            data[1], data[2]]
                    self.ifmap_rd_chn.push(data_mod)
                    self.raw_stats['rd'] += 1
                    # Queue the matching weight-block address. The assertion
                    # checks that we will not attempt to read weights that
                    # have not yet been stored in memory.
                    waddr = self.base_addr_wo_chn + int(data[0]) * out_sets
                    assert (self.wwr_done or waddr < self.addr)
                    self.weights_to_send.push(waddr)
                    self.data_idx += 1
                    # data[2] == 1 marks the last nonzero in this word.
                    if (data[2] == 1):
                        self.data_idx = self.num_nonzero
                        increment_vals = True
                    if (self.data_idx == self.num_nonzero):
                        self.base_addr_wo_chn = -1
                if (increment_vals):
                    # Advance channel set -> filter position -> output pixel.
                    self.curr_chn += 1
                    if (self.curr_chn == in_sets):
                        self.curr_chn = 0
                        self.send_idx += 1
                        if (self.send_idx ==
                                self.filter_size[0] * self.filter_size[1]):
                            self.send_idx = 0
                            self.curr_y += 1
                            if (self.curr_y == self.image_size[1]):
                                self.curr_y = 0
                                self.curr_x += 1
                                if (self.curr_x == self.image_size[0]):
                                    # Whole image sent: arm the flush path.
                                    self.curr_x = 0
                                    self.ifmap_done = True
                                    self.ready_to_output = False
                                    self.inputs_to_flush = 1
                                    self.weights_to_flush = self.arr_y // self.block_size
                            elif (not self.iwr_done):
                                # Still filling: hand the SRAM back to writes.
                                self.ready_to_output = False
            # Stage one of these at a time: request the next compressed ifmap
            # word from SRAM (or a zero-pad marker for out-of-bounds pixels).
            if (not self.ifmap_done and self.ilast_read.vacancy(1) and \
                    self.data_idx == self.num_nonzero and self.weights_to_send.vacancy()\
                    and self.base_addr_wo_chn == -1):
                x_adj = (self.curr_x + self.curr_filt_x - offset_x)
                y_adj = self.curr_y + self.curr_filt_y - offset_y
                idx = x_adj * self.image_size[1] + y_adj
                if (x_adj < 0 or x_adj >= self.image_size[0] or y_adj < 0
                        or y_adj >= self.image_size[1]):
                    # Outside the image: mark a skipped (zero) read.
                    self.ilast_read.push(True)
                else:
                    addr = (idx * in_sets +
                            self.curr_filt_set) % self.ifmap_glb_depth
                    self.isram.request(RD, addr)
                    self.ilast_read.push(False)
                    # Set up for the corresponding weights to be sent later.
                    self.base_addr_wo_chn = self.curr_filt_x*self.filter_size[1]\
                            *self.in_chn*out_sets + \
                            self.curr_filt_y*self.in_chn*out_sets + \
                            self.curr_filt_set*self.block_size*out_sets
                # Advance channel set -> filter y -> filter x.
                self.curr_filt_set += 1
                if (self.curr_filt_set == in_sets):
                    self.curr_filt_set = 0
                    self.curr_filt_y += 1
                    if (self.curr_filt_y == self.filter_size[1]):
                        self.curr_filt_y = 0
                        self.curr_filt_x += 1
                        if (self.curr_filt_x == self.filter_size[0]):
                            self.curr_filt_x = 0
class WeightsGLB(Module):
    """Global buffer for filter weights.

    Operates in two phases: first every weight block arriving on ``wr_chn``
    is stored into the SRAM, then the full set of stored blocks is replayed
    to ``rd_chn`` once per read pass (one pass per output-pixel/channel-set
    combination).
    """

    def instantiate(self, wr_chn, rd_chn, glb_depth, block_size):
        self.wr_chn = wr_chn
        self.rd_chn = rd_chn
        self.name = 'weight_glb'
        # Layer geometry; overwritten by configure().
        self.filter_size = (0, 0)
        self.image_size = (0, 0)
        self.wr_done = False
        self.iteration = 0
        self.addr = 0
        self.in_chn = 0
        self.out_chn = 0
        self.block_size = block_size
        self.sram = SRAM(glb_depth, block_size)
        # Bookkeeping for in-flight SRAM reads.
        self.last_read = Channel(3)
        self.stat_type = 'show'
        self.raw_stats = {'size': (glb_depth, block_size), 'rd': 0, 'wr': 0}

    def configure(self, filter_size, image_size, in_chn, out_chn):
        """Reset the address and pass counters for a new layer."""
        self.filter_size = filter_size
        self.image_size = image_size
        self.iteration = 0
        self.addr = 0
        self.in_chn = in_chn
        self.out_chn = out_chn
        self.wr_done = False

    def tick(self):
        """One simulation cycle: fill phase, then replay phase."""
        # Number of complete replays of the weight set.
        total_passes = self.image_size[0] * self.image_size[
            1] * self.in_chn // self.block_size
        # Number of SRAM slots holding all weight blocks.
        capacity = self.filter_size[0] * self.filter_size[
            1] * self.in_chn * self.out_chn // self.block_size

        if not self.wr_done:
            # Fill phase: store at most one incoming block per cycle.
            if self.wr_chn.valid():
                block = self.wr_chn.pop()
                self.raw_stats['wr'] += len(block)
                self.sram.request(WR, self.addr, np.asarray(block))
                self.addr += 1
                if self.addr == capacity:
                    self.addr = 0
                    self.wr_done = True
        elif self.rd_chn.vacancy(1):
            # Replay phase: issue the next read request, sweeping the whole
            # address range once per pass.
            if self.iteration < total_passes:
                self.sram.request(RD, self.addr)
                self.last_read.push(False)
                self.addr += 1
                if self.addr == capacity:
                    self.addr = 0
                    self.iteration += 1
            # Forward the previously requested block to the NoC.
            if self.last_read.valid():
                self.last_read.pop()
                block = list(self.sram.response())
                self.rd_chn.push(block)
                self.raw_stats['rd'] += len(block)
class PSumGLB(Module):
    """Global buffer for partial sums.

    Writes the initial psum biases arriving from DRAM into a dual-port SRAM,
    then streams them out to the PE array via ``rd_chn``.

    NOTE(review): ``noc_wr_chn`` is stored but never used in this block —
    confirm whether NoC write-back was meant to be handled here.
    """

    def instantiate(self, dram_wr_chn, noc_wr_chn, rd_chn, glb_depth,
                    block_size, num_nonzero):
        self.dram_wr_chn = dram_wr_chn
        self.noc_wr_chn = noc_wr_chn
        self.rd_chn = rd_chn
        self.name = 'psum_glb'
        self.block_size = block_size
        self.num_nonzero = num_nonzero
        self.stat_type = 'show'
        self.raw_stats = {'size': (glb_depth, block_size), 'rd': 0, 'wr': 0}
        # Dual-port SRAM; only port 0 is exercised in the code below.
        self.sram = SRAM(glb_depth, block_size, nports=2, dtype=np.float16)
        # Tracks in-flight reads; True marks a skipped read whose response
        # should be replaced by a zero block.
        self.last_read = Channel(3)
        self.filter_size = (0, 0)
        self.fmap_sets = 0
        self.fmap_per_iteration = 0
        # Read-side and write-side sequencing state.
        self.rd_set = 0
        self.fmap_rd_idx = 0
        self.iteration = 0
        self.wr_set = 0
        self.fmap_wr_idx = 0
        self.wr_done = False

    def configure(self, filter_size, out_chn, fmap_per_iteration):
        """Reset sequencing state for a new layer."""
        # NOTE(review): wr_done is assigned False twice here — harmless but
        # redundant.
        self.wr_done = False
        self.filter_size = filter_size
        self.out_chn = out_chn
        self.fmap_per_iteration = fmap_per_iteration
        self.rd_set = 0
        self.fmap_rd_idx = 0
        self.iteration = 0
        self.wr_set = 0
        self.fmap_wr_idx = 0
        self.wr_done = False

    def tick(self):
        """One simulation cycle: fill the psum SRAM, then stream it out once."""
        num_iteration = 1  #self.filter_size[0]*self.filter_size[1]
        if not self.wr_done:
            # Write phase: store incoming psum blocks from DRAM.
            if self.dram_wr_chn.valid():
                data = self.dram_wr_chn.pop()
                # Write psums to glb; one block per channel set.
                #addr = self.fmap_sets*self.fmap_wr_idx + self.wr_set
                addr = self.wr_set
                self.wr_set += 1
                self.sram.request(WR, addr, data, port=0)
                self.raw_stats['wr'] += len(data)
                if self.wr_set == self.out_chn // self.block_size:
                    self.wr_set = 0
                    self.wr_done = True
                #self.fmap_wr_idx += 1
                #if self.fmap_wr_idx == self.fmap_per_iteration:
                #    # Done initializing ifmaps and psums
                #    self.fmap_wr_idx = 0
                #    self.wr_done = True
        else:
            # Read phase: request one block per cycle while the consumer has
            # room and passes remain.
            if self.rd_chn.vacancy(1) and self.iteration < num_iteration:
                #addr = self.fmap_sets*self.fmap_rd_idx + self.rd_set
                addr = self.rd_set
                self.sram.request(RD, addr, port=0)
                self.last_read.push(False)
                self.rd_set += 1
                if self.rd_set == self.out_chn // self.block_size:
                    self.rd_set = 0
                    self.fmap_rd_idx += 1
                    if self.fmap_rd_idx == self.fmap_per_iteration:
                        self.fmap_rd_idx = 0
                        self.iteration += 1
        # Process the last read sent to the GLB SRAM: forward the response
        # (or an all-zero block for skipped reads) to the consumer.
        if self.last_read.valid():
            is_zero = self.last_read.pop()
            data = [0]*self.block_size if is_zero else \
                    [e for e in self.sram.response()]
            self.rd_chn.push(data)
            self.raw_stats['rd'] += len(data)
class IFMapGLB(Module):
    """Global buffer for sparse (compressed) input feature maps.

    Streams compressed ifmap words into a circular SRAM buffer and replays
    them to the NoC one triple at a time, walking output pixel, filter
    position, and channel set. Out-of-bounds (padding) positions are tracked
    through ``last_read`` as skipped reads.
    """

    def instantiate(self, wr_chn, rd_chn, arr_y, glb_depth, block_size,
                    num_nonzero):
        self.wr_chn = wr_chn
        self.rd_chn = rd_chn
        self.arr_y = arr_y
        self.block_size = block_size
        self.num_nonzero = num_nonzero
        self.name = 'ifmap_glb'
        self.stat_type = 'show'
        self.raw_stats = {'size': (glb_depth, num_nonzero), 'rd': 0, 'wr': 0}
        # One SRAM word = num_nonzero triples of three values.
        self.sram = SRAM(glb_depth, num_nonzero * 3)
        # True entries mark skipped (zero-pad) reads with no SRAM response.
        self.last_read = Channel(3)
        self.glb_depth = glb_depth
        self.image_size = (0, 0)
        self.filter_size = (0, 0)
        # NOTE(review): fmap_sets is used in the write rollover below but is
        # never set by configure(), so it stays 0 — confirm intended.
        self.fmap_sets = 0
        self.fmap_per_iteration = 0
        self.curr_set = 0
        self.fmap_idx = 0
        self.iteration = 0
        self.wr_done = False
        # For managing convolution: current output pixel / channel walk.
        self.curr_x = 0
        self.curr_y = 0
        self.curr_chn = 0
        self.request_idx = 0
        self.send_idx = 0
        #self.curr_filt_x = 0
        #self.curr_filt_y = 0
        self.ifmap_done = False
        self.needed_addr = 0
        self.ready_to_output = False  # ready to output a filter_size block of inputs
        self.curr_data = [0 for i in range(3 * num_nonzero)]
        # data_idx == num_nonzero blocks other operations while actively
        # working through data; triples are sent one at a time.
        self.data_idx = num_nonzero

    def configure(self, image_size, filter_size, in_chn, fmap_per_iteration):
        """Reset per-layer state and compute the first required write address."""
        self.wr_done = False
        self.image_size = image_size
        self.filter_size = filter_size
        self.in_chn = in_chn
        self.fmap_per_iteration = fmap_per_iteration
        # For managing convolution.
        self.curr_x = 0
        self.curr_y = 0
        self.curr_chn = 0
        self.request_idx = 0
        self.send_idx = 0
        self.curr_filt_x = 0
        self.curr_filt_y = 0
        self.curr_filt_set = 0
        self.ifmap_done = False
        offset_x = (self.filter_size[0] - 1) // 2
        offset_y = (self.filter_size[1] - 1) // 2
        # The first address needed to be filled in order to start sending.
        self.needed_addr = (self.image_size[0]*(1+offset_y) + 1+offset_x) *\
                (self.in_chn // self.block_size) - 1
        # Goes high to transfer sram control to output.
        # Doing them synchronously would be better, but complicates things.
        self.ready_to_output = False

    def tick(self):
        """One simulation cycle: fill, send, and stage the next SRAM read."""
        # Nothing left to do once sending is finished and drained.
        if (self.ifmap_done and not self.last_read.valid()
                and not self.ready_to_output):
            return
        verbose = False
        num_iteration = self.filter_size[0] * self.filter_size[1]
        offset_x = (self.filter_size[0] - 1) // 2
        offset_y = (self.filter_size[1] - 1) // 2
        filter_x = self.iteration % self.filter_size[0] - offset_x
        filter_y = self.iteration // self.filter_size[0] - offset_y
        in_sets = self.in_chn // self.block_size
        if not self.wr_done and not self.ready_to_output:
            # Write phase: store one compressed word into the circular buffer.
            if self.wr_chn.valid():
                data = self.wr_chn.pop()
                data = np.reshape(np.asarray(data), (-1))
                full_addr = in_sets * self.fmap_idx + self.curr_set
                self.curr_set += 1
                addr = full_addr % self.glb_depth
                # Once enough inputs are buffered, hand control to sending.
                if (full_addr == self.needed_addr):
                    self.ready_to_output = True
                    self.needed_addr += in_sets
                self.sram.request(WR, addr, data)
                self.raw_stats['wr'] += len(data)
                if self.curr_set == self.fmap_sets:
                    self.curr_set = 0
                    self.fmap_idx += 1
                if self.fmap_idx == self.fmap_per_iteration:
                    # Done initializing ifmaps and psums.
                    self.fmap_idx = 0
                    self.wr_done = True
        elif self.ready_to_output:
            # Send data to NoC.
            # NOTE(review): this gate requires data_idx == 0, yet data_idx is
            # initialized to num_nonzero and reset to 0 after each word; the
            # sentinel usage is inconsistent with the request stage below
            # (data_idx == num_nonzero) — compare IFMapWeightsGLB, which uses
            # the num_nonzero sentinel consistently. Confirm before reuse.
            if (self.last_read.valid() and self.rd_chn.vacancy(1)
                    and self.data_idx == 0):
                xmin = self.curr_filt_x
                xmax = xmin + self.arr_y
                is_zero = self.last_read.pop()
                if (not is_zero):
                    # Fresh SRAM word: begin walking its triples.
                    self.curr_data = [e for e in self.sram.response()]
                    self.data_idx = 0
            if (not self.data_idx == self.num_nonzero):
                # Send the next triple of the current word.
                data = [self.curr_data[i] for i in \
                        range(self.data_idx*3, self.data_idx*3 + 3)]
                self.rd_chn.push(data)
                self.raw_stats['rd'] += len(data)
                self.data_idx += 1
                # NOTE(review): bare `num_nonzero` here (and in the request
                # stage below) — would raise NameError unless a module-level
                # global exists; probably meant self.num_nonzero.
                if (self.data_idx == num_nonzero):
                    self.data_idx = 0
                    # Advance channel -> filter position -> output pixel.
                    self.curr_chn += 1
                    if (self.curr_chn == self.arr_y):
                        self.curr_chn = 0
                        self.send_idx += 1
                        if (self.send_idx ==
                                self.filter_size[0] * self.filter_size[1]):
                            if (verbose):
                                print("Ready to shift input glb frame")
                            self.send_idx = 0
                            self.curr_y += 1
                            if (self.curr_y == self.image_size[1]):
                                self.curr_y = 0
                                self.curr_x += 1
                                if (self.curr_x == self.image_size[0]):
                                    self.curr_x = 0
                                    self.ifmap_done = True
                                self.ready_to_output = False
                            elif (not self.wr_done):
                                # Still filling: return control to writes.
                                self.ready_to_output = False
        # Stage one of these at a time: request the next compressed word from
        # SRAM (or push a zero-pad marker for out-of-bounds pixels).
        if (not self.ifmap_done and self.last_read.vacancy(1)
                and self.data_idx == num_nonzero):
            # and not (self.curr_x == self.image_size[0]):
            idx = (self.curr_x + self.curr_filt_x - offset_x) * self.image_size[
                1] + self.curr_y + self.curr_filt_y - offset_y
            if (idx >= self.image_size[0] * self.image_size[1] or idx < 0):
                self.last_read.push(True)
            else:
                addr = idx * in_sets + self.curr_filt_set
                self.sram.request(RD, addr)
                self.last_read.push(False)
            # Advance channel set -> filter y -> filter x.
            self.curr_filt_set += 1
            if (self.curr_filt_set == in_sets):
                self.curr_filt_set = 0
                self.curr_filt_y += 1
                if (self.curr_filt_y == self.filter_size[1]):
                    self.curr_filt_y = 0
                    self.curr_filt_x += 1
                    if (self.curr_filt_x == self.filter_size[0]):
                        self.curr_filt_x = 0