def get_mem_write_cycles(self, src, size):
    """
    Cycles required by a memory-write instruction.

    Args:
        src: destination address for the write (unused by this simple
            bandwidth model, which only depends on the transfer size)
        size: size of data in bits

    Returns:
        Number of cycles: ceil(size / memory-interface width).
    """
    return ceil_a_by_b(size, self.mem_if_width)
def get_mem_read_cycles(self, dst, size):
    """
    Cycles required by a memory-read instruction.

    Args:
        dst: destination address for the read (unused by this simple
            bandwidth model, which only depends on the transfer size)
        size: size of data in bits

    Returns:
        Number of cycles: ceil(size / memory-interface width).
    """
    return ceil_a_by_b(size, self.mem_if_width)
def get_compute_cycles(self, ic, oc, ow, oh, b, kw, kh, iprec, wprec, im2col=False):
    """
    Cycles required by a compute instruction for one convolution tile.

    Args:
        ic: Input Channels
        oc: Output Channels
        ow: Output Width
        oh: Output Height
        b: Batch Size
        kw: Kernel Width
        kh: Kernel Height
        iprec: activation precision in bits
        wprec: weight precision in bits
        im2col: boolean. If true, we assume the cpu does im2col.
            Otherwise, we do convolutions channel-wise

    Returns:
        Number of compute cycles, tiling the MACs over the MxN array and
        scaling the N dimension by the precision-dependent perf factor.
    """
    overhead = 0
    if im2col:
        # im2col flattens the kernel window into the input dimension:
        # ni inputs per output, no outputs, over every output position.
        ni = kw * kh * ic
        no = oc
        batch = b * oh * ow
        compute_cycles = batch * ceil_a_by_b(no, self.M) * \
            (ceil_a_by_b(ni, self.N * self.get_perf_factor(iprec, wprec)) + overhead)
    else:
        # Channel-wise: iterate the kernel window explicitly.
        compute_cycles = b * ceil_a_by_b(oc, self.M) * \
            ow * oh * kw * kh * \
            (ceil_a_by_b(ic, self.N * self.get_perf_factor(iprec, wprec)) + overhead)
    return compute_cycles
def _optimize_for_order(conv_params, order_type, verbose=False):
    """
    For a given loop ordering, exhaustively searches power-of-two tile
    sizes for every loop dimension and keeps the best schedule.

    Args:
        conv_params: tuple
            (acc_obj, K, O, S, IC, OC, B, iprec, wprec, im2col, energy_cost)
        order_type: loop ordering to evaluate
        verbose: unused; kept for interface compatibility

    Returns:
        (best_tiling, order_type, best_cycles, best_energy).
        best_tiling is None when no candidate tiling fits on chip.
    """
    acc_obj, K, O, S, IC, OC, B, iprec, wprec, im2col, energy_cost = conv_params

    # We do not tile the "K" dimension and compute an entire 2-D conv at a
    # time. Candidate tile sizes are powers of two up to the dimension size.
    num_O_tiles = int(math.ceil(log2(O))) + 1
    num_IC_tiles = int(math.ceil(log2(IC))) + 1
    # TODO: Fix?
    if im2col:
        num_OC_tiles = int(math.ceil(log2(OC))) + 1
    else:
        num_OC_tiles = int(math.ceil(log2(math.ceil(float(OC) / acc_obj.M)))) + 1
    num_B_tiles = int(math.ceil(log2(B))) + 1

    best_cycles = None
    best_energy = None
    best_tiling = None

    for _b in range(num_B_tiles):
        b = min(1 << _b, B)
        num_b = ceil_a_by_b(B, b)
        for _o in range(num_O_tiles):
            # Output tiles are kept square (ow == oh).
            ow = min(1 << _o, O)
            oh = ow
            num_ow = ceil_a_by_b(O, ow)
            num_oh = ceil_a_by_b(O, oh)
            for _ic in range(num_IC_tiles):
                ic = min(1 << _ic, IC)
                num_ic = ceil_a_by_b(IC, ic)
                for _oc in range(num_OC_tiles):
                    if im2col:
                        oc = min((1 << _oc), OC)
                    else:
                        # Without im2col, OC tiles are multiples of the
                        # array height M.
                        oc = min((1 << _oc) * acc_obj.M, OC)
                    num_oc = ceil_a_by_b(OC, oc)

                    tiling = {
                        'B/b': (num_b, b),
                        'OW/ow': (num_ow, ow),
                        'OH/oh': (num_oh, oh),
                        'IC/ic': (num_ic, ic),
                        'OC/oc': (num_oc, oc),
                    }

                    stats = get_stats_fast(conv_params, tiling, order_type,
                                           verbose=False)
                    # None means this tiling overflows the on-chip buffers.
                    if stats is None:
                        continue

                    cycles = stats.total_cycles
                    energy = stats.get_energy(energy_cost)
                    # Minimize cycles first; break ties on lower energy.
                    if best_cycles is None or best_cycles > cycles or (
                            best_cycles == cycles and best_energy > energy):
                        best_energy = energy
                        best_cycles = cycles
                        best_tiling = tiling

    return (best_tiling, order_type, best_cycles, best_energy)
def get_stats_fast(conv_params, tiling, order_type, verbose=False):
    """
    Returns cycles and memory accesses to DRAM, IBUF, OBUF, and WBUF for
    one (tiling, loop-order) candidate, analytically (no loop simulation).

    Args:
        conv_params: tuple
            (acc_obj, K, O, S, IC, OC, B, iprec, wprec, im2col, energy_cost)
        tiling: dict mapping loop name ('B/b', 'OW/ow', 'OH/oh', 'IC/ic',
            'OC/oc') -> (num_tiles, tile_size)
        order_type: sequence of loop names, outermost first
        verbose: emit debug logging

    Returns:
        Stats object, or None when a tile overflows the (double-buffered,
        hence "/ 2") SRAM buffers.

    TODOs: Without im2col, the calculation of weight and act size is inexact
    """
    acc_obj, K, O, S, IC, OC, B, iprec, wprec, im2col, energy_cost = conv_params

    num_b, b = tiling['B/b']
    num_ow, ow = tiling['OW/ow']
    num_oh, oh = tiling['OH/oh']
    num_ic, ic = tiling['IC/ic']
    num_oc, oc = tiling['OC/oc']

    kw = kh = K
    perf_factor = acc_obj.get_perf_factor(iprec, wprec)

    writes = {}
    reads = {}

    # Bits written into WBUF per tile.
    if im2col:
        writes['wgt'] = \
            ceil_a_by_b(K * K * ic, acc_obj.N * perf_factor) * acc_obj.N * perf_factor * \
            oc * \
            wprec
    else:
        # TODO: Figure this out (currently identical to the im2col case).
        writes['wgt'] = \
            ceil_a_by_b(K * K * ic, acc_obj.N * perf_factor) * acc_obj.N * perf_factor * \
            oc * \
            wprec

    # Bits written into IBUF per tile.
    if im2col:
        writes['act'] = ow * oh * \
            K * K * ic * \
            b * iprec
    else:
        # TODO: Figure this out
        iw = K + (ow - 1) * S
        ih = K + (oh - 1) * S
        writes['act'] = iw * ih * ic * b * iprec

    oprec = 32  # partial sums are accumulated at 32 bits
    writes['out'] = ow * oh * ceil_a_by_b(oc, acc_obj.M) * acc_obj.M * b * oprec
    reads['out'] = ow * oh * ceil_a_by_b(oc, acc_obj.M) * acc_obj.M * b * oprec

    # Skip if overutilizing resources (double buffering => half the SRAM)
    # TODO check bytes/bits
    overflow = False
    if writes['wgt'] > acc_obj.sram['wgt'] * 8 / 2:
        if verbose:
            print('wgt overflow: {}'.format(writes['wgt']))
            print(b, ow, oh, ic, oc)
        overflow = True
    if writes['act'] > acc_obj.sram['act'] * 8 / 2:
        if verbose:
            print('act overflow')
            print(b, ow, oh, ic, oc)
        overflow = True
    if writes['out'] > acc_obj.sram['out'] * 8 / 2:
        if verbose:
            print('out overflow')
            print(b, ow, oh, ic, oc)
        overflow = True
    if overflow:
        if verbose:
            print('Activation size: {} bytes'.format(writes['act'] / 8.))
            print('Weights size: {} bytes'.format(writes['wgt'] / 8.))
            print('Output size: {} bytes'.format(writes['out'] / 8.))
        return None

    # Largest promoted transfer sizes; start at the single-tile sizes.
    max_write_size = {}
    max_read_size = {}
    for namespace in writes:
        max_write_size[namespace] = writes[namespace]
    for namespace in reads:
        max_read_size[namespace] = reads[namespace]

    # First the loop block optimizations: walk the loop nest from the
    # innermost loop outwards, promoting buffer residency while it fits.
    stats = Stats()
    write_promote = {'wgt': True, 'act': True, 'out': True}
    read_promote = {'out': True}
    if verbose:
        logger.debug('Initialize reads/writes')
        logger.debug('\tim2col: {}'.format(im2col))
        logger.debug('\tTiling: {}'.format(tiling))
        logger.debug('\tReads : {}'.format(reads))
        logger.debug('\tWrites: {}'.format(writes))

    for loop in reversed(order_type):
        num_tiles, tile_size = tiling[loop]

        # promote all writes
        for namespace in writes:
            if write_promote[namespace]:
                # If this loop iterates over the namespace index, the data
                # grows by the loop's trip count.
                if tile_deps[loop][namespace]:
                    writes[namespace] *= num_tiles
                    # Promoted size no longer fits in SRAM: stop promoting.
                    if writes[namespace] > acc_obj.sram[namespace] * 8. / 2:
                        write_promote[namespace] = False
                    else:
                        max_write_size[namespace] = writes[namespace]
            else:
                writes[namespace] *= num_tiles

        # promote all reads
        for namespace in reads:
            if read_promote[namespace]:
                if tile_deps[loop][namespace]:
                    reads[namespace] *= num_tiles
                    if reads[namespace] > acc_obj.sram[namespace] * 8. / 2:
                        read_promote[namespace] = False
                    else:
                        # BUGFIX: was "= writes[namespace]", which recorded
                        # the write counter as the max promoted READ size.
                        max_read_size[namespace] = reads[namespace]
            else:
                reads[namespace] *= num_tiles

        if verbose:
            logger.debug('Loop: {}'.format(loop))
            logger.debug('\tLoop range: {}'.format(tiling[loop]))
            logger.debug('\tMax write size: {}'.format(max_write_size))
            logger.debug('\tMax read size: {}'.format(max_read_size))
            logger.debug('\tLoop Dependencies: {}'.format(tile_deps[loop]))
            logger.debug('\tLoop Promote: {}'.format(write_promote))
            logger.debug('\tReads : {}'.format(reads))
            logger.debug('\tWrites: {}'.format(writes))

    # Every buffer write is fed by a DRAM read, and every buffer read that
    # leaves the chip becomes a DRAM write.
    for namespace in writes:
        stats.writes[namespace] = writes[namespace]
        stats.reads['dram'] += writes[namespace]
    for namespace in reads:
        stats.reads[namespace] = reads[namespace]
        stats.writes['dram'] += reads[namespace]

    # Next the inner loop optimizations: pick the cheapest SRAM access
    # order (input/output/weight stationary) by modeled energy.
    if im2col:
        # With im2col, loops are:
        # (os_loop: ic x kh x kw): Wgt: True, Out: False, Act: True
        # (ws_loop: b x oh x ow): Wgt: False, Out: True, Act: True
        # (is_loop: oc): Wgt: True, Out: True, Act: False
        is_loop = ceil_a_by_b(oc, acc_obj.M) * acc_obj.M
        os_loop = ceil_a_by_b(
            ic * kh * kw,
            acc_obj.N * acc_obj.get_perf_factor(iprec, wprec)
        ) * acc_obj.N * acc_obj.get_perf_factor(iprec, wprec)
        ws_loop = b * oh * ow
    else:
        is_loop = ceil_a_by_b(oc, acc_obj.M) * acc_obj.M
        os_loop = ceil_a_by_b(
            ic,
            acc_obj.N * acc_obj.get_perf_factor(iprec, wprec)
        ) * acc_obj.N * acc_obj.get_perf_factor(iprec, wprec) * kh * kw
        ws_loop = b * oh * ow

    # Input Stationary energy: kw * kh * ic * oh * ow * b -> oc
    is_energy = (os_loop * ws_loop) * (iprec + is_loop * (wprec + oprec))
    # Output Stationary energy: oc * oh * ow * b -> kw * kh * ic
    os_energy = (is_loop * ws_loop) * (oprec + os_loop * (iprec + wprec))
    # Weight Stationary energy: kw * kh * ic * oc -> b * ow * oh
    ws_energy = (os_loop * is_loop) * (wprec + ws_loop * (iprec + oprec))

    min_energy = min(is_energy, ws_energy, os_energy)
    num_tiles = num_b * num_ow * num_oh * num_ic * num_oc

    if is_energy == min_energy:
        if verbose:
            logger.debug('SRAM access order: Input Stationary')
        stats.reads['act'] += num_tiles * (kw * kh * ic * oh * ow * b) * iprec
        stats.reads['out'] += num_tiles * (kw * kh * ic * oh * ow * b) * oc * oprec
        stats.writes['out'] += num_tiles * (kw * kh * ic * oh * ow * b) * oc * oprec
        stats.reads['wgt'] += num_tiles * (kw * kh * ic * oh * ow * b) * oc * wprec
    elif os_energy == min_energy:
        if verbose:
            logger.debug('SRAM access order: Output Stationary')
        stats.reads['act'] += num_tiles * (oc * oh * ow * b) * (kw * kh * ic) * iprec
        stats.reads['out'] += num_tiles * (oc * oh * ow * b) * oprec
        stats.writes['out'] += num_tiles * (oc * oh * ow * b) * oprec
        stats.reads['wgt'] += num_tiles * (oc * oh * ow * b) * (kw * kh * ic) * wprec
    else:
        if verbose:
            logger.debug('SRAM access order: Weight Stationary')
        stats.reads['act'] += num_tiles * (kw * kh * ic * oc) * (b * ow * oh) * iprec
        stats.reads['out'] += num_tiles * (kw * kh * ic * oc) * (b * ow * oh) * oprec
        stats.writes['out'] += num_tiles * (kw * kh * ic * oc) * (b * ow * oh) * oprec
        stats.reads['wgt'] += num_tiles * (kw * kh * ic * oc) * wprec

    # TODO: update
    # The first fill and the final drain cannot overlap with compute.
    initial_dram_reads = 0
    final_dram_writes = 0
    for namespace in max_write_size:
        initial_dram_reads += max_write_size[namespace]
    for namespace in max_read_size:
        final_dram_writes += max_read_size[namespace]
    latency = acc_obj.get_mem_read_cycles('dram', initial_dram_reads) + \
        acc_obj.get_mem_write_cycles('dram', final_dram_writes)

    total_dram_accesses = stats.reads['dram'] + stats.writes['dram']
    middle_dram_accesses = total_dram_accesses - initial_dram_reads - final_dram_writes

    compute_cycles = num_tiles * acc_obj.get_compute_cycles(
        ic, oc, ow, oh, b, kw, kh, iprec, wprec, im2col)
    memory_cycles_required = ceil_a_by_b(middle_dram_accesses, acc_obj.mem_if_width)

    # Steady-state transfers hide behind compute; only the excess stalls.
    memory_stalls = max(0, memory_cycles_required - compute_cycles) + latency
    stats.total_cycles = compute_cycles + memory_stalls
    stats.mem_stall_cycles = memory_stalls

    if verbose:
        logger.debug('Compute cycles : {:>20,}'.format(compute_cycles))
        logger.debug('Memory cycles : {:>20,}'.format(memory_cycles_required + latency))
        logger.debug('Memory stalls : {:>20,}'.format(memory_stalls))

    return stats
def get_loop_instructions(conv_params, tiling, order_type):
    """
    Builds the LoopStack of loop / memory / compute instructions for a
    given tiling and loop ordering.

    Args:
        conv_params: tuple
            (acc_obj, K, O, S, IC, OC, B, iprec, wprec, im2col, energy_cost)
        tiling: dict mapping loop name -> (num_tiles, tile_size)
        order_type: sequence of loop names, outermost first

    Returns:
        LoopStack with memory ops promoted, or None when a tile does not
        fit in the (double-buffered) SRAM buffers.
    """
    acc_obj, K, O, S, IC, OC, B, iprec, wprec, im2col, energy_cost = conv_params
    I = (O - 1) * S + K

    num_b, b = tiling['B/b']
    num_ow, ow = tiling['OW/ow']
    num_oh, oh = tiling['OH/oh']
    num_ic, ic = tiling['IC/ic']
    num_oc, oc = tiling['OC/oc']

    # Per loop: [iteration count, act stride, wgt stride, out stride]
    instructions = {}
    instructions['B/b'] = [num_b, I * I * IC * b, 0, O * O * OC * b]
    instructions['OW/ow'] = [num_ow, ow * S, 0, ow]
    instructions['OH/oh'] = [num_oh, I * S, 0, O]
    instructions['IC/ic'] = [num_ic, I * I * ic, K * K * ic, 0]
    instructions['OC/oc'] = [num_oc, 0, K * K * IC * oc, O * O * oc]

    instruction_ordered = LoopStack()
    wgt_stride = []
    act_stride = []
    out_stride = []
    count = 0
    # Insert only non-trivial loops (trip count > 1), outermost first.
    for o in order_type:
        ins = instructions[o]
        if ins[0] > 1:
            stride = {'wgt': ins[2], 'act': ins[1], 'out': ins[3]}
            instruction_ordered.insert_loop(ins[0], stride=stride,
                                            level=count, name=o)
            wgt_stride.append(stride['wgt'])
            act_stride.append(stride['act'])
            out_stride.append(stride['out'])
            count += 1
    # Ensure at least one loop exists even when every dimension has a
    # single tile (uses the last loop name from the iteration above).
    if count == 0:
        ins = instructions[o]
        stride = {'wgt': ins[2], 'act': ins[1], 'out': ins[3]}
        instruction_ordered.insert_loop(ins[0], stride=stride,
                                        level=count, name=o)
        wgt_stride.append(stride['wgt'])
        act_stride.append(stride['act'])
        out_stride.append(stride['out'])
        count += 1

    iw = K + (ow - 1) * S
    ih = K + (oh - 1) * S
    I = K + (O - 1) * S

    # Per-tile read sizes and whole-layer maximum sizes, in bits.
    if im2col:
        wgt_read_size = \
            ceil_a_by_b(K * K * ic, acc_obj.N) * acc_obj.N * oc * \
            wprec
        max_wgt_size = \
            ceil_a_by_b(K * K * IC, acc_obj.N) * acc_obj.N * OC * wprec
    else:
        wgt_read_size = \
            ceil_a_by_b(K * K * ic, acc_obj.N) * acc_obj.N * \
            ceil_a_by_b(oc, acc_obj.M) * acc_obj.M * \
            wprec
        max_wgt_size = \
            ceil_a_by_b(K * K * IC, acc_obj.N) * acc_obj.N * \
            ceil_a_by_b(OC, acc_obj.M) * acc_obj.M * wprec

    if im2col:
        act_read_size = ow * oh * \
            ceil_a_by_b(K * K, acc_obj.N) * \
            b * iprec * acc_obj.N
        max_act_size = B * O * O * \
            ceil_a_by_b(K * K, acc_obj.N) * acc_obj.N * \
            iprec
    else:
        act_read_size = iw * ih * ic * b * iprec
        max_act_size = B * I * I * IC * iprec

    oprec = 32
    out_read_size = ow * oh * oc * b * oprec
    max_out_size = O * O * OC * B * oprec

    # Skip if overutilizing resources (consider double buffering)
    if wgt_read_size > acc_obj.sram['wgt'] * 8 / 2.0:
        print('error')
        return
    if act_read_size > acc_obj.sram['act'] * 8 / 2.0:
        return
    if out_read_size > acc_obj.sram['out'] * 8 / 2.0:
        return

    # Skip tiling if underutilizing resources
    # underutilization_count = 0
    # if act_read_size < 0.5 * acc_obj.sram['act'] and max_act_size >= 0.5 * acc_obj.sram['act']:
    #     underutilization_count += 1
    # if out_read_size < 0.5 * acc_obj.sram['out'] and max_out_size >= 0.5 * acc_obj.sram['out']:
    #     underutilization_count += 1
    # if wgt_read_size < 0.5 * acc_obj.sram['wgt'] and max_wgt_size >= 0.5 * acc_obj.sram['wgt']:
    #     underutilization_count += 1
    # if underutilization_count > 1:
    #     return

    # Memory Instructions (innermost level: count - 0)
    instruction_ordered.insert_mem_read(name='Wgt RD', namespace='wgt', addr=0,
                                        size=wgt_read_size, stride=wgt_stride,
                                        level=count - 0)
    instruction_ordered.insert_mem_read(name='Act RD', namespace='act', addr=0,
                                        size=act_read_size, stride=act_stride,
                                        level=count - 0)
    instruction_ordered.insert_mem_read(name='Out RD', namespace='out', addr=0,
                                        size=out_read_size, stride=out_stride,
                                        level=count - 0)
    instruction_ordered.insert_mem_write(name='Out WR', namespace='out', addr=0,
                                         size=out_read_size, stride=out_stride,
                                         level=count - 0)

    instruction_ordered.insert_compute(acc_obj.get_compute_stats, ic, oc, ow,
                                       oh, b, K, K, iprec, wprec, im2col)

    # Hoist memory operations as far out of the loop nest as SRAM allows.
    instruction_ordered.promote_mem_ops(acc_obj.sram)
    return instruction_ordered
def get_conv_cycles(self, K, O, S, IC, OC, iprec, wprec, batch_size=1, im2col=False):
    """
    Get number of cycles required for a Convolution layer.

    description: This function does an exhaustive search for finding the
        optimal Tiling and Ordering parameters

    Args:
        K: kernel width/height (square kernels)
        O: output width/height (square outputs)
        S: stride
        IC: input channels
        OC: output channels
        iprec: activation precision in bits
        wprec: weight precision in bits
        batch_size: batch size
        im2col: whether the CPU performs im2col before the convolution

    Returns:
        (stats, best_instructions) for the best schedule found.
    """
    B = batch_size
    I = (O - 1) * S + K

    # We do not tile the "K" dimension and compute an entire 2-D conv at a
    # time; tile-count bounds are only computed here for logging.
    num_O_tiles = int(math.ceil(log2(O))) + 1
    num_IC_tiles = int(math.ceil(log2(IC))) + 1
    num_OC_tiles = int(
        math.ceil(log2(math.ceil(float(OC) / self.accelerator.M)))) + 1
    num_B_tiles = int(math.ceil(log2(B))) + 1

    self.logger.debug('Number of O Tiles: {}'.format(num_O_tiles))
    self.logger.debug('Number of IC Tiles: {}'.format(num_IC_tiles))
    self.logger.debug('Number of OC Tiles: {}'.format(num_OC_tiles))
    self.logger.debug('Number of B Tiles: {}'.format(num_B_tiles))

    conv_params = self.accelerator, K, O, S, IC, OC, B, iprec, wprec, \
        im2col, self.get_energy_cost()

    best_instructions, best_tiling, best_order = optimize_for_order(
        conv_params)
    stats = get_stats_fast(conv_params, best_tiling, best_order,
                           verbose=False)

    best_cycles = stats.total_cycles
    num_ops = O * O * K * K * IC * OC * B

    self.logger.debug('Conv Layer')
    self.logger.debug('Num of ops: {}'.format(num_ops))
    self.logger.debug('Kernel Size: {}x{}x{}x{}'.format(K, K, IC, OC))
    self.logger.debug('Output Size: {}x{}x{}'.format(O, O, OC))
    self.logger.debug('Stride Size: {}x{}'.format(S, S))
    self.logger.debug('Input Size: {}x{}x{}'.format(I, I, IC))
    self.logger.debug('Max Precision: {}'.format(self.accelerator.pmax))
    self.logger.debug('Min Precision: {}'.format(self.accelerator.pmin))
    self.logger.debug('Activation Precision: {}'.format(iprec))
    self.logger.debug('Weight Precision: {}'.format(wprec))
    self.logger.debug('Performance Factor: {}'.format(
        self.get_perf_factor(iprec, wprec)))
    self.logger.debug('Total Cycles: {:,}'.format(best_cycles))

    cycles_per_batch = ceil_a_by_b(best_cycles, B)
    self.logger.debug(
        'Total Cycles per batch: {:,}'.format(cycles_per_batch))

    ops_per_cycle = float(num_ops) / best_cycles
    self.logger.debug('Ops/Cycle: {:,.2f}'.format(ops_per_cycle))
    ops_per_cycle_per_pe = float(ops_per_cycle) / (self.accelerator.N *
                                                   self.accelerator.M)
    self.logger.debug('Ops/Cycle/PE: {:,.4}'.format(ops_per_cycle_per_pe))

    return stats, best_instructions
def get_energy_cost(self):
    """
    Returns per-access energy costs (EnergyTuple) for the compute core and
    the WBUF / IBUF / OBUF SRAMs, computing them on first use and caching
    the result on self.energy_costs.
    """
    # Memoized: costs depend only on the accelerator configuration.
    if self.energy_costs is not None:
        return self.energy_costs

    ##################################################
    N = self.accelerator.N
    M = self.accelerator.M
    pmax = self.accelerator.pmax
    pmin = self.accelerator.pmin

    # Buffer sizes in bits (sram[...] is in bytes).
    wbuf_size = self.accelerator.sram['wgt'] * 8
    ibuf_size = self.accelerator.sram['act'] * 8
    obuf_size = self.accelerator.sram['out'] * 8

    wbuf_bank = N * M
    ibuf_bank = N
    obuf_bank = M

    wbuf_bits = (pmax * pmax / pmin)
    ibuf_bits = (pmax * pmax / pmin)
    obuf_bits = 32

    wbuf_word = ceil_a_by_b(wbuf_size, wbuf_bank * wbuf_bits)
    ibuf_word = ceil_a_by_b(ibuf_size, ibuf_bank * ibuf_bits)
    obuf_word = ceil_a_by_b(obuf_size, obuf_bank * obuf_bits)

    wbuf_bank_size = wbuf_word * wbuf_bits
    ibuf_bank_size = ibuf_word * ibuf_bits
    obuf_bank_size = obuf_word * obuf_bits

    # Banks must exactly tile the buffer.
    assert wbuf_bank_size * wbuf_bank == wbuf_size
    assert ibuf_bank_size * ibuf_bank == ibuf_size
    assert obuf_bank_size * obuf_bank == obuf_size

    ##################################################
    # WBUF: query the SRAM model for one bank, scale leakage/area by banks.
    cfg_dict = {
        'size (bytes)': wbuf_bank_size / 8.,
        'block size (bytes)': wbuf_bits / 8.,
        'read-write port': 0
    }
    wbuf_data = self.sram_obj.get_data_clean(cfg_dict)
    wbuf_read_energy = float(wbuf_data['read_energy_nJ']) / wbuf_bits
    wbuf_write_energy = float(wbuf_data['write_energy_nJ']) / wbuf_bits
    wbuf_leak_power = float(wbuf_data['leak_power_mW']) * wbuf_bank
    wbuf_area = float(wbuf_data['area_mm^2']) * wbuf_bank

    self.logger.debug('WBUF :')
    self.logger.debug(
        '\tBanks : {0:>8}'.format(wbuf_bank))
    self.logger.debug(
        '\tBitWidth : {0:>8} bits'.format(wbuf_bits))
    self.logger.debug(
        '\tWords : {0:>8}'.format(wbuf_word))
    self.logger.debug(
        '\tTotal Size : {0:>8} kBytes'.format(wbuf_size / 8. / 1024.))
    self.logger.debug(
        '\tTotal Area : {0:>8.2f} mm^2'.format(wbuf_area))
    self.logger.debug(
        '\tLeak Energy (per clock) : {0:>8.4f} mWatt'.format(
            wbuf_leak_power))
    self.logger.debug(
        '\tRead Energy : {0:>8.4f} pJ/bit'.format(
            wbuf_read_energy * 1.e3))
    self.logger.debug(
        '\tWrite Energy : {0:>8.4f} pJ/bit'.format(
            wbuf_write_energy * 1.e3))

    ##################################################
    # IBUF
    cfg_dict = {
        'size (bytes)': ibuf_bank_size / 8.,
        'block size (bytes)': ibuf_bits / 8.,
        'read-write port': 0
    }
    ibuf_data = self.sram_obj.get_data_clean(cfg_dict)
    ibuf_read_energy = float(ibuf_data['read_energy_nJ']) / ibuf_bits
    ibuf_write_energy = float(ibuf_data['write_energy_nJ']) / ibuf_bits
    ibuf_leak_power = float(ibuf_data['leak_power_mW']) * ibuf_bank
    ibuf_area = float(ibuf_data['area_mm^2']) * ibuf_bank

    self.logger.debug('IBUF :')
    self.logger.debug(
        '\tBanks : {0:>8}'.format(ibuf_bank))
    self.logger.debug(
        '\tBitWidth : {0:>8} bits'.format(ibuf_bits))
    self.logger.debug(
        '\tWords : {0:>8}'.format(ibuf_word))
    self.logger.debug(
        '\tTotal Size : {0:>8} kBytes'.format(ibuf_size / 8. / 1024.))
    self.logger.debug(
        '\tTotal Area : {0:>8.2f} mm^2'.format(ibuf_area))
    self.logger.debug(
        '\tLeak Energy (per clock) : {0:>8.4f} mWatt'.format(
            ibuf_leak_power))
    self.logger.debug(
        '\tRead Energy : {0:>8.4f} pJ/bit'.format(
            ibuf_read_energy * 1.e3))
    self.logger.debug(
        '\tWrite Energy : {0:>8.4f} pJ/bit'.format(
            ibuf_write_energy * 1.e3))

    ##################################################
    # OBUF (read-write ported for accumulation)
    cfg_dict = {
        'size (bytes)': obuf_bank_size / 8.,
        'block size (bytes)': obuf_bits / 8.,
        'read-write port': 1
    }
    obuf_data = self.sram_obj.get_data_clean(cfg_dict)
    obuf_read_energy = float(obuf_data['read_energy_nJ']) / obuf_bits
    obuf_write_energy = float(obuf_data['write_energy_nJ']) / obuf_bits
    obuf_leak_power = float(obuf_data['leak_power_mW']) * obuf_bank
    obuf_area = float(obuf_data['area_mm^2']) * obuf_bank

    self.logger.debug('OBUF :')
    self.logger.debug(
        '\tBanks : {0:>8}'.format(obuf_bank))
    self.logger.debug(
        '\tBitWidth : {0:>8} bits'.format(obuf_bits))
    self.logger.debug(
        '\tWords : {0:>8}'.format(obuf_word))
    self.logger.debug(
        '\tTotal Size : {0:>8} kBytes'.format(obuf_size / 8. / 1024.))
    self.logger.debug(
        '\tTotal Area : {0:>8.2f} mm^2'.format(obuf_area))
    self.logger.debug(
        '\tLeak Energy (per clock) : {0:>8.4f} mWatt'.format(
            obuf_leak_power))
    self.logger.debug(
        '\tRead Energy : {0:>8.4f} pJ/bit'.format(
            obuf_read_energy * 1.e3))
    self.logger.debug(
        '\tWrite Energy : {0:>8.4f} pJ/bit'.format(
            obuf_write_energy * 1.e3))

    ##################################################
    # Get stats for systolic array from the synthesis results table.
    core_csv = os.path.join('./results', 'systolic_array_synth.csv')
    core_synth_data = pandas.read_csv(core_csv)

    lookup_dict = {}
    lookup_dict['Max Precision (bits)'] = pmax
    lookup_dict['Min Precision (bits)'] = pmin
    lookup_dict['N'] = N
    lookup_dict['M'] = M
    core_data = lookup_pandas_dataframe(core_synth_data, lookup_dict)
    if len(core_data) == 0:
        # No exact synthesis point: fall back to 4x4 and scale by array size.
        lookup_dict['N'] = 4
        lookup_dict['M'] = 4
        core_data = lookup_pandas_dataframe(core_synth_data, lookup_dict)
        assert len(core_data) == 1
        core_area = float(core_data['Area (um^2)']) * 1.e-6 * (N * M) / 16.
        core_dyn_power = float(
            core_data['Dynamic Power (nW)']) * (N * M) / 16.
        core_dyn_energy = core_dyn_power / float(core_data['Frequency'])
        core_leak_power = float(
            core_data['Leakage Power (nW)']) * (N * M) / 16.
        core_leak_energy = core_leak_power / float(core_data['Frequency'])
    else:
        core_area = float(core_data['Area (um^2)']) * 1.e-6
        core_dyn_power = float(core_data['Dynamic Power (nW)'])
        core_dyn_energy = core_dyn_power / float(core_data['Frequency'])
        core_leak_power = float(core_data['Leakage Power (nW)'])
        core_leak_energy = core_leak_power / float(core_data['Frequency'])

    self.logger.debug('Core :')
    self.logger.debug(
        '\tDimensions : {0}x{1}-systolic array'.format(N, M))
    self.logger.debug('\tMax-Precision : {}'.format(pmax))
    self.logger.debug('\tMin-Precision : {}'.format(pmin))
    self.logger.debug(
        '\tLeak power : {} (nW)'.format(core_leak_energy))
    self.logger.debug(
        '\tDynamic Energy (nJ) : {}'.format(core_dyn_energy))
    self.logger.debug('\tArea (mm^2) : {}'.format(core_area))

    ##################################################
    energy_tuple = EnergyTuple(core_dyn_energy, wbuf_read_energy,
                               wbuf_write_energy, ibuf_read_energy,
                               ibuf_write_energy, obuf_read_energy,
                               obuf_write_energy)

    # BUGFIX: store the result so the memoization guard above actually
    # works; previously the SRAM model and CSV were re-queried every call.
    self.energy_costs = energy_tuple
    return energy_tuple