def __init__(self, SimConfig_path, indata=0, rdata=0, outprecision=8,
             default_inbuf_size=16, default_outbuf_size=4,
             default_inchannel=64, default_size=9):
    """Assemble the latency of a pooling stage from its buffer and logic parts.

    Args:
        SimConfig_path: configuration file path handed to every sub-module.
        indata: volume of input data for pooling (Byte).
        rdata: volume of data moved from the buffer to iReg (Byte).
        outprecision: output precision; the output-buffer write volume is
            ``default_inchannel * outprecision / 8`` (presumably bits
            converted to bytes -- confirm).
        default_inbuf_size: default PE-level input buffer size (KB).
        default_outbuf_size: default Tile-level output buffer size (KB).
        default_inchannel: ``inchannel`` argument of the pooling latency model.
        default_size: ``insize`` argument of the pooling latency model.
    """
    pooling_unit = Pooling(SimConfig_path=SimConfig_path)
    self.pooling = pooling_unit

    # Input side: write `indata` into the level-1 buffer, then read `rdata`.
    in_buffer = buffer(SimConfig_path=SimConfig_path,
                       buf_level=1,
                       default_buf_size=default_inbuf_size)
    self.inbuf = in_buffer
    in_buffer.calculate_buf_write_latency(indata)
    self.inbuf_wlatency = in_buffer.buf_wlatency  # unit: ns
    in_buffer.calculate_buf_read_latency(rdata)
    self.inbuf_rlatency = in_buffer.buf_rlatency

    # Pooling logic itself.
    pooling_unit.calculate_Pooling_latency(inchannel=default_inchannel,
                                           insize=default_size)
    self.digital_latency = pooling_unit.Pooling_latency

    # Output side: only the write into the level-2 buffer is counted.
    out_buffer = buffer(SimConfig_path=SimConfig_path,
                        buf_level=2,
                        default_buf_size=default_outbuf_size)
    self.outbuf = out_buffer
    written_volume = default_inchannel * outprecision / 8
    out_buffer.calculate_buf_write_latency(wdata=written_volume)
    self.outbuf_rlatency = 0
    self.outbuf_wlatency = out_buffer.buf_wlatency

    self.pooling_latency = (self.inbuf_wlatency
                            + self.inbuf_rlatency
                            + self.digital_latency
                            + self.outbuf_rlatency
                            + self.outbuf_wlatency)
def calculate_PE_area(self, SimConfig_path=None, default_inbuf_size=16):
    """Compute the area of one PE and of each of its component groups.

    Unit: um^2.

    Args:
        SimConfig_path: configuration file path used to build the input buffer.
        default_inbuf_size: size forwarded to the level-1 input buffer as
            ``default_buf_size``.

    Side effects:
        Rebuilds ``self.inbuf`` and sets ``PE_xbar_area``, ``PE_ADC_area``,
        ``PE_DAC_area``, ``PE_adder_area``, ``PE_shiftreg_area``,
        ``PE_iReg_area``, ``PE_oReg_area``, ``PE_input_demux_area``,
        ``PE_output_mux_area``, ``PE_digital_area``, ``PE_inbuf_area`` and
        ``PE_area``.
    """
    in_buffer = buffer(SimConfig_path=SimConfig_path,
                       buf_level=1,
                       default_buf_size=default_inbuf_size)
    self.inbuf = in_buffer
    in_buffer.calculate_buf_area()

    # Refresh the per-unit areas before scaling them by instance counts.
    self.calculate_xbar_area()
    self.calculate_demux_area()
    self.calculate_mux_area()
    self.calculate_DAC_area()
    self.calculate_ADC_area()
    self.PE_adder.calculate_adder_area()
    self.PE_shiftreg.calculate_shiftreg_area()
    self.PE_iReg.calculate_reg_area()
    self.PE_oReg.calculate_reg_area()

    adc_count = self.PE_ADC_num
    dac_count = self.PE_DAC_num

    # Analog parts.
    self.PE_xbar_area = self.PE_xbar_num * self.xbar_area
    self.PE_ADC_area = self.ADC_area * adc_count
    self.PE_DAC_area = self.DAC_area * dac_count

    # Digital parts: ADC-side units scale with the ADC count, DAC-side
    # units with the DAC count.
    self.PE_adder_area = (self.PE_group_ADC_num * self.PE_adder_num
                          * self.PE_adder.adder_area)
    self.PE_shiftreg_area = adc_count * self.PE_shiftreg.shiftreg_area
    self.PE_iReg_area = dac_count * self.PE_iReg.reg_area
    self.PE_oReg_area = adc_count * self.PE_oReg.reg_area
    self.PE_input_demux_area = self.input_demux_area * dac_count
    self.PE_output_mux_area = self.output_mux_area * adc_count
    self.PE_digital_area = (self.PE_adder_area
                            + self.PE_shiftreg_area
                            + self.PE_input_demux_area
                            + self.PE_output_mux_area
                            + self.PE_iReg_area
                            + self.PE_oReg_area)

    self.PE_inbuf_area = in_buffer.buf_area
    self.PE_area = (self.PE_xbar_area
                    + self.PE_ADC_area
                    + self.PE_DAC_area
                    + self.PE_digital_area
                    + self.PE_inbuf_area)
def __init__(self, SimConfig_path, read_row=0, read_column=0, indata=0, rdata=0, inprecision=8,
             PE_num=0, default_inbuf_size=16, default_outbuf_size=4):
    """Compute the latency of one tile on top of the PE-level latency.

    Args:
        SimConfig_path: configuration file path.
        read_row: activated WL number in crossbar.
        read_column: activated BL number in crossbar.
        indata: volume of input data (for PE) (Byte).
        rdata: volume of data from buffer to iReg (Byte).
        inprecision: input data precision of each Xbar.
        PE_num: used PE number in one tile.  ``0`` or ``1`` means there is
            nothing to merge, so the merge depth is 0.
        default_inbuf_size: the default PE-level input buffer size (unit: KB).
        default_outbuf_size: the default Tile-level output buffer size
            (unit: KB).

    Raises:
        ValueError: if ``PE_num`` is negative.
        AssertionError: if the configured PE grid is not positive or
            ``PE_num`` exceeds the number of PEs in the tile.
    """
    PE_latency_analysis.__init__(self, SimConfig_path,
                                 read_row=read_row, read_column=read_column,
                                 indata=indata, rdata=rdata,
                                 inprecision=inprecision,
                                 default_buf_size=default_inbuf_size)
    tilel_config = cp.ConfigParser()
    tilel_config.read(SimConfig_path, encoding='UTF-8')
    self.intra_tile_bandwidth = float(tilel_config.get('Tile level', 'Intra_Tile_Bandwidth'))

    # Depth of the merge tree over the used PEs.  math.log2(0) raises
    # "math domain error", which made the default PE_num=0 unusable; an empty
    # or single-PE tile simply needs no merge stage.
    if PE_num < 0:
        raise ValueError("PE_num must be non-negative, got %r" % (PE_num,))
    merge_time = math.ceil(math.log2(PE_num)) if PE_num > 1 else 0

    self.tile_PE_num = list(map(int, tilel_config.get('Tile level', 'PE_Num').split(',')))
    if self.tile_PE_num[0] == 0:
        # A leading 0 in the configuration selects the default 4x4 PE grid.
        self.tile_PE_num[0] = 4
        self.tile_PE_num[1] = 4
    assert self.tile_PE_num[0] > 0, "PE number in one PE < 0"
    assert self.tile_PE_num[1] > 0, "PE number in one PE < 0"
    self.tile_PE_total_num = self.tile_PE_num[0] * self.tile_PE_num[1]
    assert PE_num <= self.tile_PE_total_num, "PE number exceeds the range"

    self.outbuf = buffer(SimConfig_path=SimConfig_path, buf_level=2,
                         default_buf_size=default_outbuf_size)
    total_level = math.ceil(math.log2(self.tile_PE_total_num))
    self.jointmodule_latency = merge_time * self.digital_period
    self.transfer_latency = (total_level * (self.PE.ADC_precision + merge_time)
                             - merge_time * (merge_time + 1) / 2) \
        * read_column / self.intra_tile_bandwidth
    self.outbuf.calculate_buf_write_latency(
        wdata=((self.PE.ADC_precision + merge_time) * read_column * PE_num / 8))
    self.tile_buf_rlatency = 0  # do not consider
    self.tile_buf_wlatency = self.outbuf.buf_wlatency
    self.tile_latency = (self.PE_latency + self.jointmodule_latency
                         + self.transfer_latency + self.tile_buf_wlatency)
def calculate_PE_read_power_fast(self, max_column=0, max_row=0, max_group=0,
                                 SimConfig_path=None, default_inbuf_size=16):
    """Coarse but fast estimation of the PE read power.

    Unit: W.

    Args:
        max_column: maximum used column in one crossbar in this tile.
        max_row: maximum used row in one crossbar in this tile.
        max_group: maximum used groups in one PE.
        SimConfig_path: configuration file path used to build the input buffer.
        default_inbuf_size: size forwarded to the level-1 input buffer as
            ``default_buf_size``.

    Side effects:
        Rebuilds ``self.inbuf`` and sets every ``PE_*_read_power`` attribute,
        ``input_demux_read_power``, ``output_mux_read_power`` and the
        ``PE_inbuf_read_*power`` attributes.
    """
    self.inbuf = buffer(SimConfig_path=SimConfig_path, buf_level=1,
                        default_buf_size=default_inbuf_size)
    self.inbuf.calculate_buf_read_power()
    self.inbuf.calculate_buf_write_power()
    self.calculate_DAC_power()
    self.calculate_ADC_power()
    self.calculate_demux_power()
    self.calculate_mux_power()
    self.PE_shiftreg.calculate_shiftreg_power()
    self.PE_iReg.calculate_reg_power()
    self.PE_oReg.calculate_reg_power()
    self.PE_adder.calculate_adder_power()

    # Reset first so a failure below never leaves stale figures behind.
    self.PE_read_power = 0
    self.PE_xbar_read_power = 0
    self.PE_ADC_read_power = 0
    self.PE_DAC_read_power = 0
    self.PE_adder_read_power = 0
    self.PE_shiftreg_read_power = 0
    self.PE_iReg_read_power = 0
    self.PE_oReg_read_power = 0
    self.input_demux_read_power = 0
    self.output_mux_read_power = 0
    self.PE_digital_read_power = 0

    self.xbar_read_config(read_row=max_row, read_column=max_column)
    self.calculate_xbar_read_power()

    # Number of active DAC-side / ADC-side units in one group.
    rows_per_group = math.ceil(max_row / self.input_demux)
    columns_per_group = math.ceil(max_column / self.output_mux)
    # Merging max_group groups takes max_group - 1 adders per column; with no
    # active group (the default) this must be 0 rather than a negative count,
    # which would yield a negative power.
    adder_groups = max(max_group - 1, 0)

    self.PE_xbar_read_power = self.PE_multiplex_xbar_num[1] * max_group \
        * self.xbar_read_power / self.input_demux / self.output_mux
    self.PE_DAC_read_power = max_group * rows_per_group * self.DAC_power
    self.PE_ADC_read_power = max_group * columns_per_group * self.ADC_power
    self.input_demux_read_power = max_group * rows_per_group * self.input_demux_power
    self.output_mux_read_power = max_group * columns_per_group * self.output_mux_power
    self.PE_adder_read_power = adder_groups * columns_per_group * self.PE_adder.adder_power
    self.PE_shiftreg_read_power = max_group * columns_per_group * self.PE_shiftreg.shiftreg_power
    self.PE_iReg_read_power = max_group * rows_per_group * self.PE_iReg.reg_power
    self.PE_oReg_read_power = max_group * columns_per_group * self.PE_oReg.reg_power
    self.PE_digital_read_power = (self.input_demux_read_power
                                  + self.output_mux_read_power
                                  + self.PE_adder_read_power
                                  + self.PE_shiftreg_read_power
                                  + self.PE_iReg_read_power
                                  + self.PE_oReg_read_power)

    # Buffer figures are scaled by 1e-3 (presumably mW -> W; confirm against
    # the buffer model).
    self.PE_inbuf_read_rpower = self.inbuf.buf_rpower * 1e-3
    self.PE_inbuf_read_wpower = self.inbuf.buf_wpower * 1e-3
    self.PE_inbuf_read_power = self.PE_inbuf_read_rpower + self.PE_inbuf_read_wpower

    self.PE_read_power = (self.PE_xbar_read_power
                          + self.PE_DAC_read_power
                          + self.PE_ADC_read_power
                          + self.PE_digital_read_power
                          + self.PE_inbuf_read_power)
def calculate_model_area(self):
    """Fill the per-layer and whole-architecture area figures.

    The tile area is evaluated once on ``self.graph.tile`` and scaled by the
    number of tiles of each layer; the global buffer and the global adders are
    added to the architecture totals.
    """
    # TODO: NoC area
    config_path = self.SimConfig_path
    graph = self.graph

    graph.tile.calculate_tile_area(
        SimConfig_path=config_path,
        default_inbuf_size=graph.max_inbuf_size,
        default_outbuf_size=graph.max_outbuf_size)
    self.global_buf = buffer(SimConfig_path=config_path,
                             buf_level=1,
                             default_buf_size=graph.global_buf_size)
    self.global_buf.calculate_buf_area()
    self.global_add = adder(SimConfig_path=config_path,
                            bitwidth=graph.global_adder_bitwidth)
    self.global_add.calculate_adder_area()

    # (per-layer list on self, per-tile figure on graph.tile)
    area_fields = (
        ('arch_area', 'tile_area'),
        ('arch_xbar_area', 'tile_xbar_area'),
        ('arch_ADC_area', 'tile_ADC_area'),
        ('arch_DAC_area', 'tile_DAC_area'),
        ('arch_digital_area', 'tile_digital_area'),
        ('arch_adder_area', 'tile_adder_area'),
        ('arch_shiftreg_area', 'tile_shiftreg_area'),
        ('arch_iReg_area', 'tile_iReg_area'),
        ('arch_oReg_area', 'tile_oReg_area'),
        ('arch_input_demux_area', 'tile_input_demux_area'),
        ('arch_output_mux_area', 'tile_output_mux_area'),
        ('arch_jointmodule_area', 'tile_jointmodule_area'),
        ('arch_buf_area', 'tile_buffer_area'),
        ('arch_pooling_area', 'tile_pooling_area'),
    )
    for layer_index in range(self.total_layer_num):
        tile_count = graph.layer_tileinfo[layer_index]['tilenum']
        for arch_field, tile_field in area_fields:
            getattr(self, arch_field)[layer_index] = \
                getattr(graph.tile, tile_field) * tile_count

    # The global adders count as digital logic as well as adders.
    global_adder_area = self.global_add.adder_area * graph.global_adder_num

    self.arch_total_area = sum(self.arch_area)
    self.arch_total_xbar_area = sum(self.arch_xbar_area)
    self.arch_total_ADC_area = sum(self.arch_ADC_area)
    self.arch_total_DAC_area = sum(self.arch_DAC_area)
    self.arch_total_digital_area = sum(self.arch_digital_area) + global_adder_area
    self.arch_total_adder_area = sum(self.arch_adder_area) + global_adder_area
    self.arch_total_shiftreg_area = sum(self.arch_shiftreg_area)
    self.arch_total_iReg_area = sum(self.arch_iReg_area)
    self.arch_total_oReg_area = sum(self.arch_oReg_area)
    self.arch_total_input_demux_area = sum(self.arch_input_demux_area)
    self.arch_total_output_mux_area = sum(self.arch_output_mux_area)
    self.arch_total_jointmodule_area = sum(self.arch_jointmodule_area)
    self.arch_total_buf_area = sum(self.arch_buf_area) + self.global_buf.buf_area
    self.arch_total_pooling_area = sum(self.arch_pooling_area)
def calculate_tile_area(self, SimConfig_path=None, default_inbuf_size=16,
                        default_outbuf_size=4):
    """Compute the area of one tile by summing its PEs and tile-level modules.

    Unit: um^2.

    Args:
        SimConfig_path: configuration file path used to build the buffers.
        default_inbuf_size: size forwarded to each PE's ``calculate_PE_area``.
        default_outbuf_size: size forwarded to the level-2 tile buffer as
            ``default_buf_size``.

    Note:
        The tile-level adder, shift register and register areas are refreshed
        but are not added to the sums; only the joint modules, the pooling
        unit and the tile buffer contribute on top of the PEs.
    """
    for field in ('tile_area', 'tile_xbar_area', 'tile_ADC_area',
                  'tile_DAC_area', 'tile_input_demux_area',
                  'tile_output_mux_area', 'tile_shiftreg_area',
                  'tile_iReg_area', 'tile_oReg_area', 'tile_adder_area',
                  'tile_buffer_area', 'tile_digital_area'):
        setattr(self, field, 0)

    self.tile_adder.calculate_adder_area()
    self.tile_shiftreg.calculate_shiftreg_area()
    self.tile_iReg.calculate_reg_area()
    self.tile_oReg.calculate_reg_area()
    self.tile_jointmodule.calculate_jointmodule_area()
    self.tile_buffer = buffer(SimConfig_path=SimConfig_path,
                              buf_level=2,
                              default_buf_size=default_outbuf_size)
    self.tile_buffer.calculate_buf_area()
    self.tile_pooling.calculate_Pooling_area()

    grid_rows, grid_cols = self.tile_PE_num[0], self.tile_PE_num[1]
    for row in range(grid_rows):
        for col in range(grid_cols):
            pe = self.tile_PE_list[row][col]
            pe.calculate_PE_area(SimConfig_path=SimConfig_path,
                                 default_inbuf_size=default_inbuf_size)
            self.tile_xbar_area += pe.PE_xbar_area
            self.tile_ADC_area += pe.PE_ADC_area
            self.tile_DAC_area += pe.PE_DAC_area
            self.tile_input_demux_area += pe.PE_input_demux_area
            self.tile_output_mux_area += pe.PE_output_mux_area
            self.tile_shiftreg_area += pe.PE_shiftreg_area
            self.tile_iReg_area += pe.PE_iReg_area
            self.tile_oReg_area += pe.PE_oReg_area
            self.tile_adder_area += pe.PE_adder_area
            self.tile_buffer_area += pe.PE_inbuf_area

    self.tile_jointmodule_area = (self.tile_jointmodule_num
                                  * self.tile_jointmodule.jointmodule_area)
    # The digital total is rebuilt from its parts rather than accumulated
    # from each PE's PE_digital_area.
    self.tile_digital_area = (self.tile_input_demux_area
                              + self.tile_output_mux_area
                              + self.tile_adder_area
                              + self.tile_shiftreg_area
                              + self.tile_jointmodule_area
                              + self.tile_iReg_area
                              + self.tile_oReg_area)
    self.tile_pooling_area = self.tile_pooling.Pooling_area
    self.tile_buffer_area += self.tile_buffer.buf_area
    self.tile_area = (self.tile_xbar_area
                      + self.tile_ADC_area
                      + self.tile_DAC_area
                      + self.tile_digital_area
                      + self.tile_buffer_area
                      + self.tile_pooling_area)
def calculate_model_energy(self):
    """Fill the per-layer and whole-architecture energy figures.

    Every per-layer component energy is the component power from
    ``self.model_power`` multiplied by the matching accumulated latency from
    ``self.model_latency``.  The global adders and the global buffer are added
    to the architecture totals.

    Side effects:
        Rebuilds ``self.global_buf`` and ``self.global_add`` and writes the
        ``arch_*_energy`` lists and ``arch_total_*_energy`` attributes.
    """
    self.global_buf = buffer(SimConfig_path=self.SimConfig_path,
                             buf_level=1,
                             default_buf_size=self.graph.global_buf_size)
    self.global_buf.calculate_buf_read_power()
    self.global_buf.calculate_buf_write_power()
    self.global_add = adder(SimConfig_path=self.SimConfig_path,
                            bitwidth=self.graph.global_adder_bitwidth)
    self.global_add.calculate_adder_power()

    power = self.model_power
    latency = self.model_latency
    for i in range(self.total_layer_num):
        self.arch_xbar_energy[i] = power.arch_xbar_power[i] * latency.total_xbar_latency[i]
        self.arch_ADC_energy[i] = power.arch_ADC_power[i] * latency.total_ADC_latency[i]
        self.arch_DAC_energy[i] = power.arch_DAC_power[i] * latency.total_DAC_latency[i]
        self.arch_adder_energy[i] = power.arch_adder_power[i] * latency.total_adder_latency[i]
        self.arch_shiftreg_energy[i] = \
            power.arch_shiftreg_power[i] * latency.total_shiftreg_latency[i]
        self.arch_iReg_energy[i] = power.arch_iReg_power[i] * latency.total_iReg_latency[i]
        self.arch_oReg_energy[i] = power.arch_oReg_power[i] * latency.total_oReg_latency[i]
        self.arch_input_demux_energy[i] = \
            power.arch_input_demux_power[i] * latency.total_input_demux_latency[i]
        self.arch_output_mux_energy[i] = \
            power.arch_output_mux_power[i] * latency.total_output_mux_latency[i]
        self.arch_jointmodule_energy[i] = \
            power.arch_jointmodule_power[i] * latency.total_jointmodule_latency[i]
        self.arch_buf_r_energy[i] = power.arch_buf_r_power[i] * latency.total_buffer_r_latency[i]
        self.arch_buf_w_energy[i] = power.arch_buf_w_power[i] * latency.total_buffer_w_latency[i]
        self.arch_buf_energy[i] = self.arch_buf_r_energy[i] + self.arch_buf_w_energy[i]
        self.arch_pooling_energy[i] = \
            power.arch_pooling_power[i] * latency.total_pooling_latency[i]
        # Adders are digital logic (the digital total below adds the global
        # adders), so the per-layer adder energy belongs in the digital energy
        # too.  It used to be left out, which dropped it from arch_energy as
        # well.
        self.arch_digital_energy[i] = (self.arch_adder_energy[i]
                                       + self.arch_shiftreg_energy[i]
                                       + self.arch_iReg_energy[i]
                                       + self.arch_oReg_energy[i]
                                       + self.arch_input_demux_energy[i]
                                       + self.arch_output_mux_energy[i]
                                       + self.arch_jointmodule_energy[i])
        self.arch_energy[i] = (self.arch_xbar_energy[i]
                               + self.arch_ADC_energy[i]
                               + self.arch_DAC_energy[i]
                               + self.arch_digital_energy[i]
                               + self.arch_buf_energy[i]
                               + self.arch_pooling_energy[i])

    global_adder_energy = (self.global_add.adder_power
                           * self.graph.global_adder_num
                           * self.global_add.adder_latency)
    # NOTE(review): the global buffer latencies are read here without being
    # recomputed in this method -- confirm they are set by the buffer model.
    global_buf_r_energy = self.global_buf.buf_rpower * 1e-3 * self.global_buf.buf_rlatency
    global_buf_w_energy = self.global_buf.buf_wpower * 1e-3 * self.global_buf.buf_wlatency

    self.arch_total_energy = sum(self.arch_energy) + self.arch_Noc_energy
    self.arch_total_xbar_energy = sum(self.arch_xbar_energy)
    self.arch_total_ADC_energy = sum(self.arch_ADC_energy)
    self.arch_total_DAC_energy = sum(self.arch_DAC_energy)
    self.arch_total_digital_energy = sum(self.arch_digital_energy) + global_adder_energy
    self.arch_total_adder_energy = sum(self.arch_adder_energy) + global_adder_energy
    self.arch_total_shiftreg_energy = sum(self.arch_shiftreg_energy)
    self.arch_total_iReg_energy = sum(self.arch_iReg_energy)
    # This total was never computed, so it kept whatever value it had before.
    self.arch_total_oReg_energy = sum(self.arch_oReg_energy)
    self.arch_total_input_demux_energy = sum(self.arch_input_demux_energy)
    self.arch_total_output_mux_energy = sum(self.arch_output_mux_energy)
    self.arch_total_jointmodule_energy = sum(self.arch_jointmodule_energy)
    self.arch_total_buf_energy = sum(self.arch_buf_energy) + global_buf_r_energy \
        + global_buf_w_energy
    self.arch_total_buf_r_energy = sum(self.arch_buf_r_energy) + global_buf_r_energy
    self.arch_total_buf_w_energy = sum(self.arch_buf_w_energy) + global_buf_w_energy
    self.arch_total_pooling_energy = sum(self.arch_pooling_energy)
def calculate_model_area(self):
    """Fill the area figures and the crossbar/DAC/ADC utilization rates.

    The tile area is evaluated once on ``self.graph.tile`` and scaled by the
    number of tiles of each layer.  Utilization is only evaluated for ``conv``
    and ``fc`` layers; if the network has none, the total utilization rates
    are reported as 0.
    """
    # TODO: NoC area
    self.graph.tile.calculate_tile_area(
        SimConfig_path=self.SimConfig_path,
        default_inbuf_size=self.graph.max_inbuf_size,
        default_outbuf_size=self.graph.max_outbuf_size)
    self.global_buf = buffer(SimConfig_path=self.SimConfig_path,
                             buf_level=1,
                             default_buf_size=self.graph.global_buf_size)
    self.global_buf.calculate_buf_area()
    self.global_add = adder(SimConfig_path=self.SimConfig_path,
                            bitwidth=self.graph.global_adder_bitwidth)
    self.global_add.calculate_adder_area()

    # Capacity of one tile, used as the utilization denominator.
    self.tile = tile(SimConfig_path=self.SimConfig_path)
    groups_per_tile = self.tile.tile_PE_total_num * self.tile.group_num
    self.tile_xbar_num = groups_per_tile * self.tile.xbar_column * self.tile.xbar_row
    self.tile_DAC_num = groups_per_tile * self.tile.xbar_row
    self.tile_ADC_num = groups_per_tile * self.tile.xbar_column

    total_tile_num = 0
    used_total_xbar_num = 0
    used_total_DAC_num = 0
    used_total_ADC_num = 0
    # not the real DAC/ADC num, but it reflects the DAC/ADC num

    # Key of the layer description holding the output width, per layer type.
    # Only consider the utilization rate of conv layer and fc layer.
    output_size_key = {'conv': 'Outputchannel', 'fc': 'Outfeature'}

    for i in range(self.total_layer_num):
        layer_dict = self.NetStruct[i][0][0]
        tileinfo = self.graph.layer_tileinfo[i]
        tile_num = tileinfo['tilenum']
        graph_tile = self.graph.tile
        self.arch_area[i] = graph_tile.tile_area * tile_num
        self.arch_xbar_area[i] = graph_tile.tile_xbar_area * tile_num
        self.arch_ADC_area[i] = graph_tile.tile_ADC_area * tile_num
        self.arch_DAC_area[i] = graph_tile.tile_DAC_area * tile_num
        self.arch_digital_area[i] = graph_tile.tile_digital_area * tile_num
        self.arch_adder_area[i] = graph_tile.tile_adder_area * tile_num
        self.arch_shiftreg_area[i] = graph_tile.tile_shiftreg_area * tile_num
        self.arch_iReg_area[i] = graph_tile.tile_iReg_area * tile_num
        self.arch_oReg_area[i] = graph_tile.tile_oReg_area * tile_num
        self.arch_input_demux_area[i] = graph_tile.tile_input_demux_area * tile_num
        self.arch_output_mux_area[i] = graph_tile.tile_output_mux_area * tile_num
        self.arch_jointmodule_area[i] = graph_tile.tile_jointmodule_area * tile_num
        self.arch_buf_area[i] = graph_tile.tile_buffer_area * tile_num
        self.arch_pooling_area[i] = graph_tile.tile_pooling_area * tile_num

        size_key = output_size_key.get(tileinfo['type'])
        if size_key is None:
            continue

        total_tile_num += tile_num
        used_xbar_num = tileinfo['x_width'] * tileinfo['y_height']
        used_DAC_num = tileinfo['y_height'] * tileinfo['weight_precision'] * math.ceil(
            int(layer_dict[size_key]) / self.tile.xbar_column)
        used_ADC_num = tileinfo['x_width'] * tileinfo['my']
        self.arch_xbar_utilization[i] = used_xbar_num / (tile_num * self.tile_xbar_num)
        self.arch_DAC_utilization[i] = used_DAC_num / (tile_num * self.tile_DAC_num)
        self.arch_ADC_utilization[i] = used_ADC_num / (tile_num * self.tile_ADC_num)
        used_total_xbar_num += used_xbar_num
        used_total_DAC_num += used_DAC_num
        used_total_ADC_num += used_ADC_num

    global_adder_area = self.global_add.adder_area * self.graph.global_adder_num
    self.arch_total_area = sum(self.arch_area)
    self.arch_total_xbar_area = sum(self.arch_xbar_area)
    self.arch_total_ADC_area = sum(self.arch_ADC_area)
    self.arch_total_DAC_area = sum(self.arch_DAC_area)
    self.arch_total_digital_area = sum(self.arch_digital_area) + global_adder_area
    self.arch_total_adder_area = sum(self.arch_adder_area) + global_adder_area
    self.arch_total_shiftreg_area = sum(self.arch_shiftreg_area)
    self.arch_total_iReg_area = sum(self.arch_iReg_area)
    self.arch_total_oReg_area = sum(self.arch_oReg_area)
    self.arch_total_input_demux_area = sum(self.arch_input_demux_area)
    self.arch_total_output_mux_area = sum(self.arch_output_mux_area)
    self.arch_total_jointmodule_area = sum(self.arch_jointmodule_area)
    self.arch_total_buf_area = sum(self.arch_buf_area) + self.global_buf.buf_area
    self.arch_total_pooling_area = sum(self.arch_pooling_area)

    if total_tile_num > 0:
        self.arch_total_xbar_utilization = used_total_xbar_num / (
            total_tile_num * self.tile_xbar_num)
        self.arch_total_DAC_utilization = used_total_DAC_num / (
            total_tile_num * self.tile_DAC_num)
        self.arch_total_ADC_utilization = used_total_ADC_num / (
            total_tile_num * self.tile_ADC_num)
    else:
        # No conv/fc layer at all: nothing is mapped onto crossbars, so avoid
        # dividing by zero and report zero utilization.
        self.arch_total_xbar_utilization = 0
        self.arch_total_DAC_utilization = 0
        self.arch_total_ADC_utilization = 0
def calculate_model_power(self):
    """Fill the per-layer and whole-architecture power figures.

    For every layer the tile read power is re-evaluated with that layer's
    maximum column/row/PE/group usage and scaled by the layer's tile count.
    The global adders and the global buffer are added to the totals.
    """
    config_path = self.SimConfig_path
    graph = self.graph

    self.global_buf = buffer(SimConfig_path=config_path,
                             buf_level=1,
                             default_buf_size=graph.global_buf_size)
    self.global_buf.calculate_buf_read_power()
    self.global_buf.calculate_buf_write_power()
    self.global_add = adder(SimConfig_path=config_path,
                            bitwidth=graph.global_adder_bitwidth)
    self.global_add.calculate_adder_power()

    # (per-layer list on self, per-tile figure on graph.tile)
    power_fields = (
        ('arch_power', 'tile_read_power'),
        ('arch_xbar_power', 'tile_xbar_read_power'),
        ('arch_ADC_power', 'tile_ADC_read_power'),
        ('arch_DAC_power', 'tile_DAC_read_power'),
        ('arch_digital_power', 'tile_digital_read_power'),
        ('arch_adder_power', 'tile_adder_read_power'),
        ('arch_shiftreg_power', 'tile_shiftreg_read_power'),
        ('arch_iReg_power', 'tile_iReg_read_power'),
        ('arch_oReg_power', 'tile_oReg_read_power'),
        ('arch_input_demux_power', 'tile_input_demux_read_power'),
        ('arch_output_mux_power', 'tile_output_mux_read_power'),
        ('arch_jointmodule_power', 'tile_jointmodule_read_power'),
        ('arch_buf_power', 'tile_buffer_read_power'),
        ('arch_buf_r_power', 'tile_buffer_r_read_power'),
        ('arch_buf_w_power', 'tile_buffer_w_read_power'),
        ('arch_pooling_power', 'tile_pooling_read_power'),
    )
    for layer_index in range(self.total_layer_num):
        info = graph.layer_tileinfo[layer_index]
        tile_count = info['tilenum']
        peak_column = info['max_column']
        peak_row = info['max_row']
        peak_PE = info['max_PE']
        peak_group = info['max_group']
        kind = graph.net[layer_index][0][0]['type']
        graph.tile.calculate_tile_read_power_fast(
            max_column=peak_column,
            max_row=peak_row,
            max_PE=peak_PE,
            max_group=peak_group,
            layer_type=kind,
            SimConfig_path=config_path,
            default_inbuf_size=graph.max_inbuf_size,
            default_outbuf_size=graph.max_outbuf_size)
        for arch_field, tile_field in power_fields:
            getattr(self, arch_field)[layer_index] = \
                getattr(graph.tile, tile_field) * tile_count

    # The global adders count as digital logic as well as adders.
    global_adder_power = self.global_add.adder_power * graph.global_adder_num

    self.arch_total_power = sum(self.arch_power)
    self.arch_total_xbar_power = sum(self.arch_xbar_power)
    self.arch_total_ADC_power = sum(self.arch_ADC_power)
    self.arch_total_DAC_power = sum(self.arch_DAC_power)
    self.arch_total_digital_power = sum(self.arch_digital_power) + global_adder_power
    self.arch_total_adder_power = sum(self.arch_adder_power) + global_adder_power
    self.arch_total_shiftreg_power = sum(self.arch_shiftreg_power)
    self.arch_total_iReg_power = sum(self.arch_iReg_power)
    self.arch_total_oReg_power = sum(self.arch_oReg_power)
    self.arch_total_input_demux_power = sum(self.arch_input_demux_power)
    self.arch_total_output_mux_power = sum(self.arch_output_mux_power)
    self.arch_total_jointmodule_power = sum(self.arch_jointmodule_power)
    # Global buffer figures are scaled by 1e-3 before being added.
    self.arch_total_buf_power = sum(self.arch_buf_power) + (
        self.global_buf.buf_wpower + self.global_buf.buf_rpower) * 1e-3
    self.arch_total_buf_r_power = sum(self.arch_buf_r_power) \
        + self.global_buf.buf_rpower * 1e-3
    self.arch_total_buf_w_power = sum(self.arch_buf_w_power) \
        + self.global_buf.buf_wpower * 1e-3
    self.arch_total_pooling_power = sum(self.arch_pooling_power)
def calculate_tile_read_power_fast(self, max_column=0, max_row=0, max_PE=0, max_group=0,
                                   layer_type=None, SimConfig_path=None,
                                   default_inbuf_size=16, default_outbuf_size=4):
    """Coarse but fast estimation of the tile read power.

    Unit: W.

    Args:
        max_column: maximum used column in one crossbar in this tile.
        max_row: maximum used row in one crossbar in this tile.
        max_PE: maximum used PE in this tile.
        max_group: maximum used groups in one PE.
        layer_type: ``'pooling'`` uses the pooling unit only; ``'conv'`` and
            ``'fc'`` scale the PE-level figures by ``max_PE``; any other value
            leaves the compute parts at 0.
        SimConfig_path: configuration file path used to build the buffers.
        default_inbuf_size: size forwarded to the PE-level estimation.
        default_outbuf_size: size forwarded to the level-2 tile buffer as
            ``default_buf_size``.

    Side effects:
        Rebuilds ``self.tile_buffer`` and sets every ``tile_*_read_power``
        attribute.
    """
    self.tile_read_power = 0
    self.tile_xbar_read_power = 0
    self.tile_ADC_read_power = 0
    self.tile_DAC_read_power = 0
    self.tile_digital_read_power = 0
    self.tile_adder_read_power = 0
    self.tile_shiftreg_read_power = 0
    self.tile_iReg_read_power = 0
    self.tile_oReg_read_power = 0
    self.tile_input_demux_read_power = 0
    self.tile_output_mux_read_power = 0
    self.tile_jointmodule_read_power = 0
    self.tile_pooling_read_power = 0
    self.tile_buffer_read_power = 0
    self.tile_buffer_r_read_power = 0
    self.tile_buffer_w_read_power = 0
    self.tile_buffer = buffer(SimConfig_path=SimConfig_path, buf_level=2,
                              default_buf_size=default_outbuf_size)

    if layer_type == 'pooling':
        self.tile_pooling.calculate_Pooling_power()
        self.tile_pooling_read_power = self.tile_pooling.Pooling_power
    elif layer_type == 'conv' or layer_type == 'fc':
        self.calculate_PE_read_power_fast(
            max_column=max_column, max_row=max_row, max_group=max_group,
            SimConfig_path=SimConfig_path, default_inbuf_size=default_inbuf_size)
        self.tile_xbar_read_power = max_PE * self.PE_xbar_read_power
        self.tile_ADC_read_power = max_PE * self.PE_ADC_read_power
        self.tile_DAC_read_power = max_PE * self.PE_DAC_read_power
        self.tile_adder_read_power = max_PE * self.PE_adder_read_power
        self.tile_shiftreg_read_power = max_PE * self.PE_shiftreg_read_power
        self.tile_iReg_read_power = max_PE * self.PE_iReg_read_power
        self.tile_oReg_read_power = max_PE * self.PE_oReg_read_power
        self.tile_input_demux_read_power = max_PE * self.input_demux_read_power
        self.tile_output_mux_read_power = max_PE * self.output_mux_read_power
        # Merging max_PE results takes max_PE - 1 joint modules per column;
        # with no active PE (the default) this must be 0, not a negative
        # count that would yield a negative power.
        self.tile_jointmodule_read_power = max(max_PE - 1, 0) * math.ceil(
            max_column / self.output_mux) * self.tile_jointmodule.jointmodule_power
        self.tile_buffer_r_read_power = max_PE * self.PE_inbuf_read_rpower
        self.tile_buffer_w_read_power = max_PE * self.PE_inbuf_read_wpower

    # The tile-level buffer is accounted for on top of the PE input buffers.
    # Buffer figures are scaled by 1e-3 (presumably mW -> W; confirm against
    # the buffer model).
    self.tile_buffer.calculate_buf_read_power()
    self.tile_buffer.calculate_buf_write_power()
    self.tile_buffer_r_read_power += self.tile_buffer.buf_rpower * 1e-3
    self.tile_buffer_w_read_power += self.tile_buffer.buf_wpower * 1e-3
    self.tile_buffer_read_power = self.tile_buffer_r_read_power + self.tile_buffer_w_read_power

    # Single definition of the digital total.  An earlier intermediate
    # assignment that left out the registers was always overwritten by this
    # one, so it has been removed.
    self.tile_digital_read_power = (self.tile_adder_read_power
                                    + self.tile_shiftreg_read_power
                                    + self.tile_iReg_read_power
                                    + self.tile_oReg_read_power
                                    + self.tile_input_demux_read_power
                                    + self.tile_output_mux_read_power
                                    + self.tile_jointmodule_read_power)
    self.tile_read_power = (self.tile_xbar_read_power
                            + self.tile_ADC_read_power
                            + self.tile_DAC_read_power
                            + self.tile_digital_read_power
                            + self.tile_pooling_read_power
                            + self.tile_buffer_read_power)
def __init__(self, SimConfig_path):
    """Load the tile configuration and build the PE grid and tile modules.

    Args:
        SimConfig_path: configuration file path; ``PE_Num`` is read from the
            ``Tile level`` section and ``Simulation_Level`` from the
            ``Algorithm Configuration`` section.

    Every area, power, latency and energy figure of the tile starts at 0.
    """
    # layer_num is a list with the size of 1xPE_num
    ProcessElement.__init__(self, SimConfig_path)
    parser = cp.ConfigParser()
    parser.read(SimConfig_path, encoding='UTF-8')

    grid_shape = [int(item) for item in parser.get('Tile level', 'PE_Num').split(',')]
    self.tile_PE_num = grid_shape
    if grid_shape[0] == 0:
        # A leading 0 in the configuration selects the default 4x4 PE grid.
        grid_shape[0] = 4
        grid_shape[1] = 4
    assert grid_shape[0] > 0, "PE number in one PE < 0"
    assert grid_shape[1] > 0, "PE number in one PE < 0"
    self.tile_PE_total_num = grid_shape[0] * grid_shape[1]
    self.tile_simulation_level = int(
        parser.get('Algorithm Configuration', 'Simulation_Level'))

    # One independent ProcessElement per grid slot, plus its enable flag.
    self.tile_PE_list = [
        [ProcessElement(SimConfig_path) for _ in range(grid_shape[1])]
        for _ in range(grid_shape[0])
    ]
    self.tile_PE_enable = [[0] * grid_shape[1] for _ in range(grid_shape[0])]

    self.layer_type = 'conv'
    for counter in ('tile_layer_num', 'tile_activation_precision',
                    'tile_sliding_times', 'tile_adder_num',
                    'tile_shiftreg_num', 'tile_jointmodule_num'):
        setattr(self, counter, 0)

    # Tile-level modules.
    self.tile_adder = adder(SimConfig_path)
    self.tile_shiftreg = shiftreg(SimConfig_path)
    self.tile_iReg = reg(SimConfig_path)
    self.tile_oReg = reg(SimConfig_path)
    self.tile_jointmodule = JointModule(SimConfig_path)
    self.tile_buffer = buffer(SimConfig_path)
    self.tile_pooling = Pooling(SimConfig_path)

    self.tile_utilization = 0
    self.num_occupied_PE = 0

    # Area and read-power figures (these two groups also cover the output
    # register, the pooling unit and the buffers).
    for metric in (
            'tile_area', 'tile_xbar_area', 'tile_ADC_area', 'tile_DAC_area',
            'tile_digital_area', 'tile_adder_area', 'tile_shiftreg_area',
            'tile_iReg_area', 'tile_oReg_area', 'tile_input_demux_area',
            'tile_output_mux_area', 'tile_jointmodule_area',
            'tile_pooling_area', 'tile_buffer_area',
            'tile_read_power', 'tile_xbar_read_power', 'tile_ADC_read_power',
            'tile_DAC_read_power', 'tile_digital_read_power',
            'tile_adder_read_power', 'tile_shiftreg_read_power',
            'tile_iReg_read_power', 'tile_oReg_read_power',
            'tile_input_demux_read_power', 'tile_output_mux_read_power',
            'tile_jointmodule_read_power', 'tile_pooling_read_power',
            'tile_buffer_read_power', 'tile_buffer_r_read_power',
            'tile_buffer_w_read_power'):
        setattr(self, metric, 0)

    # The remaining groups share one component list:
    # tile_<component><metric>, e.g. tile_xbar_write_power.
    components = ('', 'xbar_', 'ADC_', 'DAC_', 'digital_', 'adder_',
                  'shiftreg_', 'iReg_', 'input_demux_', 'output_mux_',
                  'jointmodule_')
    for metric in ('write_power', 'read_latency', 'write_latency',
                   'read_energy', 'write_energy'):
        for component in components:
            setattr(self, 'tile_' + component + metric, 0)

    self.calculate_intra_PE_connection()
def update_tile_buf_size(self, SimConfig_path, default_buf_size=16):
    """Replace ``self.tile_buffer`` with a freshly built buffer.

    Args:
        SimConfig_path: configuration file path handed to the buffer.
        default_buf_size: size forwarded to the buffer as ``default_buf_size``.
    """
    # NOTE(review): no buf_level is passed, so the buffer class default is
    # used -- confirm it is the level intended for the tile buffer.
    resized_buffer = buffer(SimConfig_path=SimConfig_path,
                            default_buf_size=default_buf_size)
    self.tile_buffer = resized_buffer
def __init__(self, SimConfig_path, read_row=0, read_column=0, indata=0,
             rdata=0, inprecision=8, default_buf_size=16):
    """Compute the latency of one PE and of each of its pipeline parts.

    Args:
        SimConfig_path: configuration file path.
        read_row: activated WL number in crossbar.
        read_column: activated BL number in crossbar.
        indata: volume of input data (for PE) (Byte).
        rdata: volume of data from buffer to iReg (Byte).
        inprecision: input data precision of each Xbar.
        default_buf_size: default input buffer size (KB).
    """
    def _stage_count(fan_in):
        # One stage to start with, plus one per division by 8 until the
        # fan-in reaches zero.
        stages = 1
        while fan_in > 0:
            fan_in = fan_in // 8
            stages += 1
        return stages

    config = cp.ConfigParser()
    config.read(SimConfig_path, encoding='UTF-8')

    self.inbuf = buffer(SimConfig_path=SimConfig_path,
                        buf_level=1,
                        default_buf_size=default_buf_size)
    self.PE = ProcessElement(SimConfig_path)
    pe = self.PE

    # Input buffer: write `indata`, then read `rdata`.
    self.inbuf.calculate_buf_write_latency(indata)
    self.PE_buf_wlatency = self.inbuf.buf_wlatency  # unit: ns
    self.digital_period = 1 / float(
        config.get('Digital module', 'Digital_Frequency')) * 1e3
    self.inbuf.calculate_buf_read_latency(rdata)
    self.PE_buf_rlatency = self.inbuf.buf_rlatency

    # Number of crossbar read passes: input bit slices x row passes x
    # column passes.
    bit_passes = math.ceil(inprecision / pe.DAC_precision)
    row_passes = math.ceil(read_row / pe.PE_group_DAC_num)
    column_passes = math.ceil(read_column / pe.PE_group_ADC_num)
    multiple_time = bit_passes * row_passes * column_passes
    pe.calculate_xbar_read_latency()

    # Parsed but not used below; the lookup is kept so a missing or malformed
    # entry still fails here.
    unused_transistor_tech = int(config.get('Crossbar level', 'Transistor_Tech'))
    xbar_shape = [float(item)
                  for item in config.get('Crossbar level', 'Xbar_Size').split(',')]
    dac_total = int(config.get('Process element level', 'DAC_Num'))
    adc_total = int(config.get('Process element level', 'ADC_Num'))
    xbar_rows = xbar_shape[0]
    xbar_columns = xbar_shape[1]

    # ns (using NVSim)
    decoder_1_to_8 = 0.27933  # 1:8, technology 65nm
    self.decoderLatency = _stage_count(math.ceil(xbar_rows / dac_total)) * decoder_1_to_8
    mux_8_to_1 = 32.744 / 1000
    self.muxLatency = _stage_count(math.ceil(xbar_columns / adc_total)) * mux_8_to_1

    # Analog path.
    self.xbar_latency = multiple_time * pe.xbar_read_latency
    pe.calculate_DAC_latency()
    self.DAC_latency = multiple_time * pe.DAC_latency
    pe.calculate_ADC_latency()
    self.ADC_latency = multiple_time * pe.ADC_latency

    # Digital path.
    period = self.digital_period
    self.iReg_latency = row_passes * column_passes * period + \
        multiple_time * period  # write and read
    self.shiftreg_latency = multiple_time * period
    self.input_demux_latency = multiple_time * self.decoderLatency
    self.adder_latency = column_passes * math.ceil(
        math.log2(pe.group_num)) * period
    self.output_mux_latency = multiple_time * self.muxLatency
    self.computing_latency = self.DAC_latency + self.xbar_latency + self.ADC_latency
    self.oreg_latency = column_passes * period
    self.PE_digital_latency = (self.iReg_latency
                               + self.shiftreg_latency
                               + self.input_demux_latency
                               + self.adder_latency
                               + self.output_mux_latency
                               + self.oreg_latency)

    self.PE_latency = (self.PE_buf_wlatency
                       + self.PE_buf_rlatency
                       + self.computing_latency
                       + self.PE_digital_latency)