# Imports reconstructed from the calls made in this section; the project-local
# classes used below (Debug_Printer, Progress_Printer, Results_Printer,
# Data_Reader, Statstack, Base_Model, Cache_Model, Branch_Model) are assumed
# to come from the surrounding package, whose module layout is not shown here.
import bisect
import math
import os
import random
from collections import deque

import numpy


# Standalone constructor fragment; judging from the cache-related
# configuration it reads, this is presumably the Cache_Model constructor
# (the rest of the class is not part of this section).
def __init__(self, constants, config, benchmark):
    self.config = config
    self.ROB_size = self.config.get_ROB_size()
    self.LLC_hit_delay = self.config.get_LLC_access_cost()
    self.cache_config = self.config.get_cache_config()
    self.output_dir = constants.output_dir
    self.debug_printer = Debug_Printer(self.output_dir, benchmark,
                                       "debug_base")
# Standalone constructor fragment; the branch-predictor configuration it reads
# suggests this is the Branch_Model constructor (the rest of the class is not
# part of this section).
def __init__(self, constants, config, benchmark):
    self.config = config
    self.dispatch_width = self.config.get_dispatch_width()
    self.ROB_size = self.config.get_ROB_size()
    self.front_end_refill_time = self.config.get_frontend_size()
    self.top_level_dir = constants.top_level_dir
    self.output_dir = constants.output_dir
    self.debug_printer = Debug_Printer(self.output_dir, benchmark,
                                       "debug_branch")
    self.branch_file = self.config.get_branch_predictor_name() + "_" + \
        self.config.get_IP_bits() + "_" + self.config.get_BHR_size() + ".cfg"
    self.read_branch_model()
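
# Illustration only (hypothetical values, not from the original file): the
# constructor above builds the branch-model file name as
# "<predictor>_<IP bits>_<BHR size>.cfg", so a gshare predictor with 12 IP
# bits and an 8-bit BHR would be read from "gshare_12_8.cfg".
def _example_branch_file_name():
    predictor, ip_bits, bhr_size = "gshare", "12", "8"
    return predictor + "_" + ip_bits + "_" + bhr_size + ".cfg"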
def calculate_model(benchmark, constants, config):
    random.seed(0)
    cache_sizes = config.get_cache_sizes()
    cacheline_size = config.get_cacheline_size()
    ROB_size = config.get_ROB_size()
    physical_dispatch_width = config.get_dispatch_width()
    input_root = os.path.join(constants.input_dir, benchmark)
    data_reader = Data_Reader(input_root, config)
    base_model = Base_Model(constants, config, benchmark)
    cache_model = Cache_Model(constants, config, benchmark)
    branch_model = Branch_Model(constants, config, benchmark)
    mlp_model = MLP_Model(constants, config, benchmark)
    profiler_metadata, phase_bounds, window_bounds = \
        data_reader.get_log_contents()
    # set up the progress printer to a log file
    progress_printer = Progress_Printer(
        len(window_bounds),
        log_file=os.path.join(constants.output_dir, benchmark, "log.out"))
    # execute statstack per benchmark
    # def __init__(self, input_root, benchmark, base_name, ss_version, _type, content)
    progress_printer.print_message("Executing preliminary Statstack work:")
    ss_data_load = Statstack(constants, benchmark, "data", "new", "sample",
                             "data", profiler_metadata, progress_printer)
    ss_data_store = Statstack(constants, benchmark, "data", "new", "sample",
                              "data", profiler_metadata, progress_printer)
    ss_trace = Statstack(constants, benchmark, "trace", "new", "trace",
                         "data", profiler_metadata, progress_printer)
    ss_instr = Statstack(constants, benchmark, "instr", "new", "sample",
                         "instr", profiler_metadata, progress_printer)
    ss_data_load_aligned_bursts = ss_data_load.align_bursts_windows(
        window_bounds)
    ss_data_store_aligned_bursts = ss_data_store.align_bursts_windows(
        window_bounds)
    ss_instr_aligned_bursts = ss_instr.align_bursts_windows(window_bounds)
    # make data structures
    global_D_eff, global_stats = [], []
    global_MLP, global_queuing_delay, global_prefetched, \
        global_cache_misses, global_LLC_load_misses, global_trace_misses, \
        global_DRAM = [], [], [], [], [], [], []
    global_base_component, global_dependence_component, \
        global_port_component, global_unit_component, \
        global_branch_component, global_I_cache_component, \
        global_LLC_chain_component, global_DRAM_component, global_cycles, \
        global_instructions, global_micro_ops = \
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    window_base_component, window_dependence_component, \
        window_port_component, window_unit_component, \
        window_branch_component, window_I_cache_component, \
        window_LLC_chain_component, window_DRAM_component, \
        window_window_cycles, window_instructions, window_micro_ops = \
        [], [], [], [], [], [], [], [], [], [], []
    # loop over windows
    trace_counter = 0
    progress_printer.setup_progressbar(message="\nCalculating model:")
    for window_instr, data_load_bursts, data_store_bursts, instr_bursts in zip(
            window_bounds, ss_data_load_aligned_bursts,
            ss_data_store_aligned_bursts, ss_instr_aligned_bursts):
        progress_printer.print_progress(trace_counter)
        # statstack: get stack distance histograms
        load_sd_hist = ss_data_load.get_sd_hists(_type='r',
                                                 bursts=data_load_bursts)
        store_sd_hist = ss_data_store.get_sd_hists(_type='w',
                                                   bursts=data_store_bursts)
        trace_sd_hist = ss_trace.get_sd_hists(_type='r',
                                              bursts=[trace_counter])
        instr_sd_hist = ss_instr.get_sd_hists(_type='r', bursts=instr_bursts)
        # transform stack distance histograms to miss rates
        load_miss_ratios = ss_data_load.calculate_sample_miss_ratios(
            cache_sizes, load_sd_hist, cacheline_size)
        store_miss_ratios = ss_data_store.calculate_sample_miss_ratios(
            cache_sizes, store_sd_hist, cacheline_size)
        # interpolate load and store miss rates
        interpolated_load_miss_ratios = ss_data_load.interpolate_miss_ratios(
            window_instr, data_load_bursts, load_miss_ratios)
        interpolated_store_miss_ratios = ss_data_store.interpolate_miss_ratios(
            window_instr, data_store_bursts, store_miss_ratios)
        # trace_load_miss_ratios should already be aligned with the window
        # boundaries of our instruction based samplers
        trace_load_miss_ratios = ss_trace.calculate_PC_miss_ratios(
            cache_sizes, trace_sd_hist, cacheline_size)
        # calculate instruction miss ratios
        instr_miss_ratios = ss_instr.calculate_sample_miss_ratios(
            cache_sizes, instr_sd_hist, cacheline_size)
        interpolated_instr_miss_ratios = ss_instr.interpolate_miss_ratios(
            window_instr, instr_bursts, instr_miss_ratios)
        # calculate the number of DRAM accesses
        LLC_load_misses = \
            interpolated_load_miss_ratios[config.get_LLC_size()][0] * \
            interpolated_load_miss_ratios[config.get_LLC_size()][1]
        global_LLC_load_misses.append(LLC_load_misses)
        # calculate the overall number of cache accesses
        global_cache_misses.append([])
        for cache_size, lmr in sorted(
                interpolated_load_miss_ratios.iteritems(),
                key=lambda x: x[0]):
            global_cache_misses[-1].append(lmr[0] * lmr[1])
        for cache_size, smr in sorted(
                interpolated_store_miss_ratios.iteritems(),
                key=lambda x: x[0]):
            global_cache_misses[-1].append(smr[0] * smr[1])
        for cache_size, imr in sorted(
                interpolated_instr_miss_ratios.iteritems(),
                key=lambda x: x[0]):
            global_cache_misses[-1].append(imr[0] * imr[1])
        # calculate misses per level (but exclude misses that also miss in a
        # lower level)
        exclusive_trace_miss_ratios, sum_loads = \
            ss_trace.calculate_exclusive_miss_ratios(trace_load_miss_ratios)
        global_trace_misses.append(
            exclusive_trace_miss_ratios[config.get_LLC_size()] *
            sum_loads[config.get_LLC_size()])

        #############################
        # calculate cache component #
        #############################
        # gather necessary variables
        L1D_load_hits = ss_data_load.interpolate_L1_hits(
            window_instr, data_load_bursts, load_miss_ratios[cache_sizes[0]])
        L1D_store_hits = ss_data_store.interpolate_L1_hits(
            window_instr, data_store_bursts,
            store_miss_ratios[cache_sizes[0]])
        load_misses, store_misses, instr_misses = {}, {}, {}
        for cache_size in cache_sizes:
            load_misses[cache_size] = \
                interpolated_load_miss_ratios[cache_size][0] * \
                interpolated_load_miss_ratios[cache_size][1]
            store_misses[cache_size] = \
                interpolated_store_miss_ratios[cache_size][0] * \
                interpolated_store_miss_ratios[cache_size][1]
            instr_misses[cache_size] = \
                interpolated_instr_miss_ratios[cache_size][0] * \
                interpolated_instr_miss_ratios[cache_size][1]
        cache_model.set_window_stats(L1D_load_hits, L1D_store_hits,
                                     load_misses, store_misses, instr_misses)
        # calculate I-cache component
        I_cache_component = cache_model.calculate_instruction_miss_penalty()

        ############################
        # calculate base component #
        ############################
        stats, dependences, uop_hist = data_reader.read_next_utrace()
        window_sample_rate = float(
            profiler_metadata["trace_window"]) / stats[0]
        micro_ops = stats[1] * window_sample_rate
        base_model.set_window_stats(stats, dependences, uop_hist,
                                    int(profiler_metadata["trace_window"]),
                                    cache_model.get_load_latency())
        base_model.calculate_base_performance()
        # calculate base component
        base_component, dependence_component, port_component, unit_component = \
            base_model.calculate_base_component()

        ##############################
        # calculate branch component #
        ##############################
        entropy = data_reader.read_next_entropy_window()
        # calculate branch misses based on entropy
        branch_model.estimate_branch_misses(entropy)
        # gather variables needed to calculate the branch resolution time
        trace_uops = sum(uop_hist.values())
        average_instruction_latency = \
            base_model.get_average_instruction_latency()
        path_lengths = base_model.get_path_lengths()
        independent_instructions = base_model.get_independent_instructions()
        # the instruction trace can vary in length (e.g. not enough loads
        # were seen); the window cannot
        branch_model.estimate_branch_resolution_time(
            trace_uops, average_instruction_latency, path_lengths,
            independent_instructions, window_sample_rate)
        # calculate branch component
        branch_component = branch_model.calculate_branch_component()

        ######################################
        # calculate DRAM component using MLP #
        ######################################
        # get MLP data for the next window
        load_dependences, pcs_rds, pcs_strides = \
            data_reader.read_next_MLP_window()
        cold_miss_distr = data_reader.read_next_cold_window()
        window_instr = window_instr[1] - window_instr[0]
        window_loads = interpolated_load_miss_ratios[config.get_LLC_size()][1]
        LLC_trace_miss_ratio = trace_load_miss_ratios[config.get_LLC_size()]
        # calculate MLP, queuing delay and prefetchable misses
        current_MLP, current_queue_delay, current_prefetched = \
            mlp_model.estimate_MLP(
                window_instr, stats, load_dependences, pcs_rds, pcs_strides,
                exclusive_trace_miss_ratios, LLC_trace_miss_ratio,
                window_loads, trace_counter, cold_miss_distr,
                interpolated_load_miss_ratios)
        current_prefetched *= window_sample_rate
        # subtract prefetched misses since they won't cause an extra delay
        LLC_load_misses -= min(LLC_load_misses, current_prefetched)
        DRAM_component = LLC_load_misses / current_MLP * (
            config.get_DRAM_latency_with_tag() + current_queue_delay)
        # save a couple of numbers to write to files
        global_MLP.append(current_MLP)
        global_queuing_delay.append(current_queue_delay)
        global_prefetched.append(current_prefetched)
        global_DRAM.append(LLC_load_misses / current_MLP *
                           (config.get_DRAM_latency_with_tag() +
                            current_queue_delay))

        #################################
        # calculate LLC chain component #
        #################################
        D_eff = base_model.get_effective_dispatch_rates()
        # ignore the effective dispatch rate caused by dependences since this
        # is similar, but do take into account lower dispatch rates due to
        # issue stage contention
        LLC_chain_penalty = cache_model.estimate_LLC_penalty(
            min(D_eff.values()), path_lengths[ROB_size], load_dependences,
            trace_uops, window_sample_rate)

        #####################################
        # calculate window execution cycles #
        #####################################
        window_cycles = base_component + dependence_component + \
            port_component + unit_component + branch_component + \
            I_cache_component + LLC_chain_penalty + DRAM_component
        # add to global counters
        global_base_component += base_component
        global_dependence_component += dependence_component
        global_port_component += port_component
        global_unit_component += unit_component
        global_branch_component += branch_component
        global_I_cache_component += I_cache_component
        global_LLC_chain_component += LLC_chain_penalty
        global_DRAM_component += DRAM_component
        global_cycles += window_cycles
        global_instructions += window_instr
        global_micro_ops += micro_ops
        # save per window counters
        window_base_component.append(base_component)
        window_dependence_component.append(dependence_component)
        window_port_component.append(port_component)
        window_unit_component.append(unit_component)
        window_branch_component.append(branch_component)
        window_I_cache_component.append(I_cache_component)
        window_LLC_chain_component.append(LLC_chain_penalty)
        window_DRAM_component.append(DRAM_component)
        window_window_cycles.append(window_cycles)
        window_instructions.append(window_instr)
        window_micro_ops.append(micro_ops)
        global_D_eff.append(D_eff)
        trace_counter += 1

    global_strides, global_no_strides, global_random_strides, \
        global_randomly_placed_misses, global_ss_misses = \
        mlp_model.get_overall_stats()
    results_printer = Results_Printer(constants.input_dir,
                                      constants.output_dir, benchmark)
    # print cache miss debug stats
    debug_statstack_printer = Debug_Printer(constants.output_dir, benchmark,
                                            "debug_statstack")
    # create labels
    cache_labels = []
    for cache_size, lmr in sorted(interpolated_load_miss_ratios.iteritems(),
                                  key=lambda x: x[0]):
        cache_labels.append("Load " + str(cache_size / 1024) + "k")
    for cache_size, smr in sorted(interpolated_store_miss_ratios.iteritems(),
                                  key=lambda x: x[0]):
        cache_labels.append("Store " + str(cache_size / 1024) + "k")
    for cache_size, imr in sorted(interpolated_instr_miss_ratios.iteritems(),
                                  key=lambda x: x[0]):
        cache_labels.append("Instr " + str(cache_size / 1024) + "k")
    debug_string = "S" + "-S" * (len(cache_labels) - 1)
    debug_statstack_printer.save_debug_stats(cache_labels,
                                             global_cache_misses,
                                             debug_string)
    # print mlp debug stats
    debug_mlp_printer = Debug_Printer(constants.output_dir, benchmark,
                                      "debug_mlp")
    debug_string = "S-WA-S-S"
    debug_mlp_printer.save_debug_stats(
        ["LLC load misses", "MLP", "prefetched", "queuing delay", "DRAM"],
        zip(global_LLC_load_misses, global_MLP, global_prefetched,
            global_queuing_delay, global_DRAM), debug_string)
    keys = [
        "Base", "Dependence", "Port", "Unit", "Branch", "I-cache",
        "LLC-chain", "DRAM", "Total", "Instructions", "Micro-Ops"
    ]
    values = zip(window_base_component, window_dependence_component,
                 window_port_component, window_unit_component,
                 window_branch_component, window_I_cache_component,
                 window_LLC_chain_component, window_DRAM_component,
                 window_window_cycles, window_instructions, window_micro_ops)
    results_printer.save_window_stats(keys, values)
    values = [
        global_base_component, global_dependence_component,
        global_port_component, global_unit_component,
        global_branch_component, global_I_cache_component,
        global_LLC_chain_component, global_DRAM_component, global_cycles,
        global_instructions, global_micro_ops
    ]
    results_printer.save_results(keys, values)
    results_printer.plot_cpi_stack(
        keys[:-3], [v / global_instructions for v in values[:-3]])
    progress_printer.close_log_file()
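
# Hypothetical driver (not part of the original file): a sketch of how
# calculate_model is presumably invoked per benchmark. `Constants` and
# `Config` stand in for however the surrounding tooling constructs these
# objects; only attributes and getters actually used above are hinted at.
#
#   constants = Constants()     # .input_dir, .output_dir, .alpha,
#                               # .mlp_model, .prefetch, .queue_model, ...
#   config = Config("cpu.cfg")  # .get_ROB_size(), .get_cache_sizes(),
#                               # .get_LLC_size(), .get_dispatch_width(), ...
#   calculate_model("gcc", constants, config)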
class MLP_Model():
    def __init__(self, constants, config, benchmark, debug=False):
        self.config = config
        self.alpha = constants.alpha
        self.mlp_model = constants.mlp_model
        self.output_dir = constants.output_dir
        self.prefetcher_enabled = constants.prefetch
        self.queue_model = constants.queue_model
        # often used configuration variables
        self.ROB_size = self.config.get_ROB_size()
        self.cacheline_size = self.config.get_cacheline_size()
        self.DRAM_page_size = self.config.get_DRAM_page_size()
        self.MSHR_entries = self.config.get_MSHR_entries()
        self.prefetch_in_page = self.config.get_prefetch_in_page()
        self.prefetcher_flows = self.config.get_prefetcher_flows()
        self.dispatch_width = self.config.get_dispatch_width()
        self.DRAM_latency_with_tag = self.config.get_DRAM_latency_with_tag()
        self.LLC_miss_cost = self.config.get_LLC_miss_cost()
        self.LLC_size = self.config.get_LLC_size()
        self.bus_transfer_cycles = self.config.get_bus_transfer_cycles()
        # stats
        self.total_strides = 0
        self.total_no_strides = 0
        self.total_random_strides = 0
        self.total_randomly_placed_misses = 0
        self.total_ss_misses = 0
        self.debug_printer = Debug_Printer(self.output_dir, benchmark,
                                           "debug_mlp")
        self.debug = debug
        self.run_avg_MLP = 1
        self.run_avg_prefetched = 0
        self.run_avg_prefetched_extrapolated = 0

    def calculate_stride_misses(self, stride, refs):
        # the number of refs == the number of reuses, meaning that if we have
        # 2 refs, we actually saw three memory accesses by that PC
        # calculate the number of cachelines referenced
        # stride is 0: always the same address is referenced
        if stride == 0:
            return [0] * refs
        if stride < self.cacheline_size:
            stride_pattern_misses = []
            # put the start of the stride pattern at half the cacheline size
            # (on average, this should be the most correct)
            cacheline = stride + self.cacheline_size / 2
            next_cacheline = self.cacheline_size
            for r in range(0, refs):
                if cacheline >= next_cacheline:
                    stride_pattern_misses.append(1)
                    next_cacheline += self.cacheline_size
                else:
                    stride_pattern_misses.append(0)
                cacheline += stride
        else:
            stride_pattern_misses = [1] * refs
        return stride_pattern_misses

    def calculate_misses_PC_stride(self, stride_distr):
        stride_distr = sorted(stride_distr)
        # we have no stride: only one reference
        if len(stride_distr) == 0:
            return ("NO_STRIDE", [False], [1])
        elif len(stride_distr) == 1:
            stride_miss_pattern = [1] + self.calculate_stride_misses(
                stride_distr[0][0], stride_distr[0][1])
            if stride_distr[0][0] > self.DRAM_page_size:
                if self.prefetch_in_page:
                    return ("STRIDE", [False] * (stride_distr[0][1] + 1),
                            stride_miss_pattern)
                else:
                    prefetchable = [False] + [
                        bool(p) for p in stride_miss_pattern[1:]
                    ]
                    return ("STRIDE", prefetchable, stride_miss_pattern)
            else:
                prefetchable = [False] + [
                    bool(p) for p in stride_miss_pattern[1:]
                ]
                return ("STRIDE", prefetchable, stride_miss_pattern)
        else:
            # find the strides with the biggest reference count
            stride_distr = sorted(stride_distr, key=lambda s: s[1],
                                  reverse=True)
            references = sum(ref for _, ref in stride_distr)
            # 1 stride + possible outliers
            # the stride is regular, but there are outliers (more than 60% of
            # the references should fall under one stride)
            biggest_1 = stride_distr[:1]
            if float(biggest_1[0][1]) / references >= 0.60:
                stride_miss_pattern = [1] + self.calculate_stride_misses(
                    biggest_1[0][0], biggest_1[0][1])
                # check if the stride stays within page boundaries
                if biggest_1[0][0] < self.DRAM_page_size:
                    prefetchable = [False]
                    prefetchable += [bool(p) for p in stride_miss_pattern[1:]]
                else:
                    if self.prefetch_in_page:
                        prefetchable = [False] * (references + 1)
                    else:
                        # big stride, so certainly not within one cache line
                        prefetchable = [bool(p) for p in stride_miss_pattern]
                # random strides miss
                stride_miss_pattern += [1] * (references - biggest_1[0][1])
                prefetchable += [False] * (references - biggest_1[0][1])
                return ("STRIDE", prefetchable, stride_miss_pattern)
            # 2 strides + possible outliers
            # the stride is regular, but there are 2 strides (more than 70%
            # of the references should fall under at most 2 different strides)
            biggest_2 = stride_distr[:2]
            biggest_2_refs = sum(ref for _, ref in biggest_2)
            if float(biggest_2_refs) / references >= 0.70:
                stride_miss_pattern = [1]
                prefetchable = [False]
                for stride, ref in biggest_2:
                    current_stride_miss_pattern = \
                        self.calculate_stride_misses(stride, ref)
                    if stride < self.DRAM_page_size:
                        prefetchable += [
                            bool(p) for p in current_stride_miss_pattern
                        ]
                    else:
                        if self.prefetch_in_page:
                            prefetchable += [False] * ref
                        else:
                            prefetchable += [
                                bool(p) for p in current_stride_miss_pattern
                            ]
                    stride_miss_pattern += current_stride_miss_pattern
                # random strides miss
                stride_miss_pattern += [1] * (references - biggest_2_refs)
                prefetchable += [False] * (references - biggest_2_refs)
                return ("STRIDE", prefetchable, stride_miss_pattern)
            # 3 strides + possible outliers
            # the stride is regular, but there are 3 strides (more than 80%
            # of the references should fall under at most 3 different strides)
            biggest_3 = stride_distr[:3]
            biggest_3_refs = sum(ref for _, ref in biggest_3)
            if float(biggest_3_refs) / references >= 0.80:
                stride_miss_pattern = [1]
                prefetchable = [False]
                for stride, ref in biggest_3:
                    current_stride_miss_pattern = \
                        self.calculate_stride_misses(stride, ref)
                    if stride < self.DRAM_page_size:
                        prefetchable += [
                            bool(p) for p in current_stride_miss_pattern
                        ]
                    else:
                        if self.prefetch_in_page:
                            prefetchable += [False] * ref
                        else:
                            prefetchable += [
                                bool(p) for p in current_stride_miss_pattern
                            ]
                    stride_miss_pattern += current_stride_miss_pattern
                # random strides miss
                stride_miss_pattern += [1] * (references - biggest_3_refs)
                prefetchable += [False] * (references - biggest_3_refs)
                return ("STRIDE", prefetchable, stride_miss_pattern)
            # 4 strides + possible outliers
            # the stride is regular, but there are 4 strides (more than 90%
            # of the references should fall under at most 4 different strides)
            biggest_4 = stride_distr[:4]
            biggest_4_refs = sum(ref for _, ref in biggest_4)
            if float(biggest_4_refs) / references >= 0.90:
                stride_miss_pattern = [1]
                prefetchable = [False]
                for stride, ref in biggest_4:
                    current_stride_miss_pattern = \
                        self.calculate_stride_misses(stride, ref)
                    if stride < self.DRAM_page_size:
                        prefetchable += [
                            bool(p) for p in current_stride_miss_pattern
                        ]
                    else:
                        if self.prefetch_in_page:
                            prefetchable += [False] * ref
                        else:
                            prefetchable += [
                                bool(p) for p in current_stride_miss_pattern
                            ]
                    stride_miss_pattern += current_stride_miss_pattern
                # random strides miss
                stride_miss_pattern += [1] * (references - biggest_4_refs)
                prefetchable += [False] * (references - biggest_4_refs)
                return ("STRIDE", prefetchable, stride_miss_pattern)
            # more strides: the stride is completely irregular
            return ("RANDOM_STRIDE", [False] * (references + 1),
                    [1] * (references + 1))

    def expand_distr(self, distr):
        choose_from = []
        for reuse, refs in distr[1:]:
            for r in range(0, refs):
                choose_from.append(reuse)
        random.shuffle(choose_from)
        expanded_distr = [distr[0]]
        for reuse in choose_from:
            expanded_distr.append(expanded_distr[-1] + reuse)
        return expanded_distr

    def estimate_MLP(self, window_instrs, counter, load_dep_distr,
                     pcs_rd_distr, pcs_stride_distr,
                     exclusive_trace_miss_ratios, LLC_PC_miss_ratios,
                     window_loads, trace_counter, cold_miss_distr,
                     miss_ratios):
        # default values
        estimated_MLP, estimated_queuing_delay, succesfully_prefetched = \
            1, self.bus_transfer_cycles, 0
        if self.mlp_model == "cold":
            # using this method we cannot estimate the efficacy of a stride
            # prefetcher
            estimated_MLP, estimated_queuing_delay = self.estimate_MLP_cold(
                cold_miss_distr, load_dep_distr, window_instrs, window_loads,
                miss_ratios, trace_counter)
        elif self.mlp_model == "stride":
            estimated_MLP, estimated_queuing_delay, succesfully_prefetched = \
                self.estimate_MLP_stride(
                    window_instrs, counter, load_dep_distr, pcs_rd_distr,
                    pcs_stride_distr, exclusive_trace_miss_ratios,
                    LLC_PC_miss_ratios, window_loads, trace_counter)
        else:
            # here we estimate the prefetcher to work as well for cold misses
            # as for capacity misses
            estimated_MLP, estimated_queuing_delay, succesfully_prefetched = \
                self.estimate_MLP_cold_stride(
                    window_instrs, counter, load_dep_distr, pcs_rd_distr,
                    pcs_stride_distr, exclusive_trace_miss_ratios,
                    LLC_PC_miss_ratios, window_loads, trace_counter,
                    cold_miss_distr, miss_ratios)
        return estimated_MLP, estimated_queuing_delay, succesfully_prefetched

    def estimate_MLP_cold(self, cold_miss_distr, load_dep_distr,
                          window_instrs, window_loads, miss_ratios,
                          trace_counter, stride_MLP=0.0):
        total_robs = window_instrs / self.ROB_size
        LLC_miss_chance = miss_ratios[self.config.get_LLC_size()][0]
        if window_loads == 0 or LLC_miss_chance == 0:
            if stride_MLP == 0.0:
                return 1, self.bus_transfer_cycles
            else:
                return 1, self.bus_transfer_cycles, 1
        total_cold_miss_robs, cold_misses = 0, 0.0
        for cm in cold_miss_distr:
            cold_misses += cm[0] * cm[1]
            total_cold_miss_robs += cm[1]
        miss_chance_conflict = (window_loads * LLC_miss_chance -
                                cold_misses) / window_loads
        if miss_chance_conflict < 0:
            miss_chance_conflict = 0
        average_loads_per_ROB = window_loads / total_robs
        # conflict miss MLP
        conflict_miss_MLP = 0
        for loads_on_path, freq in enumerate(load_dep_distr[1:]):
            dependent_MLP = (1 - LLC_miss_chance)**loads_on_path * \
                miss_chance_conflict * average_loads_per_ROB
            conflict_miss_MLP += freq * dependent_MLP
        conflict_miss_MLP = max(1, conflict_miss_MLP)
        # cold miss MLP
        cold_miss_MLP = 1
        if total_cold_miss_robs != 0:
            for loads_on_path, freq in enumerate(load_dep_distr[1:]):
                dependent_MLP = (1 - LLC_miss_chance)**loads_on_path * \
                    cold_misses / total_cold_miss_robs
                cold_miss_MLP += freq * dependent_MLP
        prev_cs = 0
        exclusive_miss_ratios = {}
        # NOTE: the subtraction below assumes ascending cache sizes, so we
        # iterate in sorted order (plain dict order is arbitrary here).
        for cs, miss_data in sorted(miss_ratios.iteritems()):
            exclusive_miss_ratios[cs] = miss_data[0]
            if prev_cs != 0:
                exclusive_miss_ratios[prev_cs] -= miss_data[0]
            prev_cs = cs
        # if no stride_MLP was provided, we are using the pure cold MLP
        # method: use a uniform number for the conflict miss MLP
        if stride_MLP == 0.0:
            estimated_MLP = cold_misses / max(
                cold_misses, window_loads * LLC_miss_chance) * cold_miss_MLP \
                + window_loads * miss_chance_conflict / max(
                    cold_misses,
                    window_loads * LLC_miss_chance) * conflict_miss_MLP
            scaling_factor_MSHR = self.scale_MLP_MSHR(
                window_loads, exclusive_miss_ratios, estimated_MLP,
                trace_counter)
            estimated_MLP *= scaling_factor_MSHR
            estimated_queuing_delay = self.estimate_queuing_delay(
                estimated_MLP)
            return estimated_MLP, estimated_queuing_delay
        else:
            scaling_factor_MSHR = self.scale_MLP_MSHR(
                window_loads, exclusive_miss_ratios, cold_miss_MLP,
                trace_counter)
            cold_miss_MLP *= scaling_factor_MSHR
            estimated_MLP = cold_misses / max(
                cold_misses, window_loads * LLC_miss_chance) * cold_miss_MLP \
                + window_loads * miss_chance_conflict / max(
                    cold_misses,
                    window_loads * LLC_miss_chance) * stride_MLP
            estimated_queuing_delay = self.estimate_queuing_delay(
                estimated_MLP)
            if window_loads * miss_chance_conflict != 0:
                cold_miss_multiplier = max(
                    cold_misses, window_loads * LLC_miss_chance) / (
                        window_loads * miss_chance_conflict)
            else:
                cold_miss_multiplier = 1.0
            return estimated_MLP, estimated_queuing_delay, \
                cold_miss_multiplier

    def estimate_MLP_stride(self, window_instrs, counter, load_dep_distr,
                            pcs_rd_distr, pcs_stride_distr,
                            exclusive_trace_miss_ratios, LLC_PC_miss_ratios,
                            window_loads, trace_counter):
        if counter[2] == 0 or exclusive_trace_miss_ratios[self.LLC_size] == 0:
            estimated_MLP = self.run_avg_MLP
            estimated_queuing_delay = self.estimate_queuing_delay(
                self.run_avg_MLP)
            if counter[2] != 0:
                extrapolate_prefetched = float(
                    self.run_avg_prefetched) / counter[2] * window_loads
            else:
                extrapolate_prefetched = self.run_avg_prefetched_extrapolated
            succesfully_prefetched = extrapolate_prefetched
            # should I modify the running averages?
            self.run_avg_MLP = self.alpha * 1 + \
                (1 - self.alpha) * self.run_avg_MLP
            self.run_avg_prefetched = self.alpha * 0 + \
                (1 - self.alpha) * self.run_avg_prefetched
            # NOTE: the decay term below reads run_avg_prefetched, not
            # run_avg_prefetched_extrapolated; kept as in the original,
            # though the EWMA pattern suggests the latter may have been
            # intended.
            self.run_avg_prefetched_extrapolated = self.alpha * 0 + \
                (1 - self.alpha) * self.run_avg_prefetched
        else:
            # place loads
            trace_loads, trace_load_addresses = self.place_loads_in_trace(
                counter[1], pcs_rd_distr, trace_counter)
            # place misses
            trace_misses, statstack_misses = self.place_misses_in_trace(
                counter[1], pcs_stride_distr, LLC_PC_miss_ratios,
                pcs_rd_distr, trace_loads, trace_counter)
            sum_trace_misses = sum([tm for tm, p in trace_misses])
            if round(sum_trace_misses) != round(statstack_misses):
                self.debug_printer.save_error_stats(
                    "Trace " + str(trace_counter) +
                    ", we placed a different number of misses:\tTM:" +
                    str(round(sum_trace_misses)) + "\tSS: " +
                    str(round(statstack_misses)))
            self.total_ss_misses += statstack_misses
            trace_misses, succesfully_prefetched = \
                self.remove_prefetchable_misses(trace_load_addresses,
                                                trace_misses)
            if sum_trace_misses > 0:
                estimated_MLP = self.estimate_MLP_window_stride(
                    trace_loads, trace_misses, load_dep_distr, trace_counter)
                scaling_factor_MSHR = self.scale_MLP_MSHR(
                    sum(trace_loads), exclusive_trace_miss_ratios,
                    estimated_MLP, trace_counter)
                estimated_MLP *= scaling_factor_MSHR
            else:
                estimated_MLP = 1.0
            estimated_queuing_delay = self.estimate_queuing_delay(
                estimated_MLP)
            extrapolate_prefetched = float(
                succesfully_prefetched) / counter[2] * window_loads
            self.run_avg_MLP = self.alpha * estimated_MLP + \
                (1 - self.alpha) * self.run_avg_MLP
            self.run_avg_prefetched = self.alpha * succesfully_prefetched + \
                (1 - self.alpha) * self.run_avg_prefetched
            # NOTE: same caveat as above regarding the decay term.
            self.run_avg_prefetched_extrapolated = \
                self.alpha * extrapolate_prefetched + \
                (1 - self.alpha) * self.run_avg_prefetched
        return estimated_MLP, estimated_queuing_delay, succesfully_prefetched

    def estimate_MLP_cold_stride(self, window_instrs, counter,
                                 load_dep_distr, pcs_rd_distr,
                                 pcs_stride_distr,
                                 exclusive_trace_miss_ratios,
                                 LLC_PC_miss_ratios, window_loads,
                                 trace_counter, cold_miss_distr, miss_ratios):
        estimated_MLP_stride, estimated_queuing_delay_stride, \
            succesfully_prefetched_stride = self.estimate_MLP_stride(
                window_instrs, counter, load_dep_distr, pcs_rd_distr,
                pcs_stride_distr, exclusive_trace_miss_ratios,
                LLC_PC_miss_ratios, window_loads, trace_counter)
        estimated_MLP, estimated_queuing_delay, cold_miss_multiplier = \
            self.estimate_MLP_cold(cold_miss_distr, load_dep_distr,
                                   window_instrs, window_loads, miss_ratios,
                                   trace_counter, estimated_MLP_stride)
        return estimated_MLP, estimated_queuing_delay, int(
            succesfully_prefetched_stride * cold_miss_multiplier)

    def place_loads_in_trace(self, trace_length, pcs_rd_distr, trace_counter):
        # place loads
        trace_loads = [0] * trace_length
        trace_load_addresses = [0] * trace_length
        load_conflicts = []
        for pc, rd_distr in sorted(pcs_rd_distr.iteritems()):
            expanded_rd_distr = self.expand_distr(rd_distr)
            for rd in expanded_rd_distr:
                if trace_loads[rd] == 0:
                    trace_loads[rd] = 1
                    trace_load_addresses[rd] = pc
                else:
                    load_conflicts.append(pc)
        # add load conflicts (due to not knowing the exact position) randomly
        # to the trace_loads array
        add_random_loads = len(load_conflicts)
        if add_random_loads > 0:
            zero_positions = [x for x, y in enumerate(trace_loads) if y == 0]
            while add_random_loads > 0:
                rnd = int(random.random() * len(zero_positions))
                trace_loads[zero_positions[rnd]] = 1
                trace_load_addresses[zero_positions[rnd]] = \
                    load_conflicts[len(load_conflicts) - add_random_loads]
                add_random_loads -= 1
                del zero_positions[rnd]
        if len(load_conflicts) > 0:
            self.debug_printer.save_log_stats("Trace: " + str(trace_counter))
            self.debug_printer.save_log_stats(
                "Placed " + str(len(load_conflicts)) +
                " loads randomly for a total of " + str(sum(trace_loads)) +
                " loads! Load PCs were " + str(load_conflicts))
        return trace_loads, trace_load_addresses

    def place_misses_in_trace(self, trace_length, pcs_stride_distr,
                              LLC_PC_miss_ratios, pcs_rd_distr, trace_loads,
                              trace_counter):
        # NOTE: the original used [[0, False]] * trace_length, which aliases
        # a single list into every slot; independent lists avoid accidental
        # shared mutation through the in-place += updates below.
        trace_misses = [[0, False] for _ in range(trace_length)]
        no_strides, no_no_strides, no_random_strides = 0, 0, 0
        statstack_misses, randomly_placed_misses = 0, 0
        succesfully_prefetched = 0
        for pc, stride_distr in sorted(pcs_stride_distr.iteritems()):
            # ignore the first entry in the array, which is the first address
            # referenced; this might help for prefetching later
            stride_distr = stride_distr[1:]
            stride, prefetchable, stride_misses = \
                self.calculate_misses_PC_stride(stride_distr)
            misses_placed, miss_conflicts, too_few_stride_misses = 0, 0, 0
            PC_statstack_misses = int(
                round(LLC_PC_miss_ratios[pc][0] * LLC_PC_miss_ratios[pc][1]))
            statstack_misses += PC_statstack_misses
            if stride == "STRIDE":
                # if statstack sees misses, the stride we see is for
                # non-repeating addresses; hence every xth cacheline
                # referenced is a new one and misses
                if PC_statstack_misses > 0:
                    rd_distr = self.expand_distr(pcs_rd_distr[pc])
                    assert (len(rd_distr) == len(stride_misses))
                    assert (LLC_PC_miss_ratios[pc][1] == len(rd_distr))
                    # Add all stride misses to the beginning of the stride
                    # pattern. This better reflects reality, as it is the
                    # first accesses of a repeating strided access pattern
                    # that miss (at least for most benchmarks). Using the
                    # miss ratios as for random strides leads to severe
                    # underestimations of the MLP.
                    miss_ratio = 1
                    for i in range(0, len(stride_misses)):
                        if stride_misses[i]:
                            if misses_placed < PC_statstack_misses:
                                if trace_misses[rd_distr[i]][0] > 0:
                                    miss_conflicts += 1
                                else:
                                    trace_misses[rd_distr[i]] = [
                                        miss_ratio, prefetchable[i]
                                    ]
                                misses_placed += miss_ratio
                            else:
                                break
                    # Because we append all strides together, it can happen
                    # that we underestimate the actual number of misses
                    # (e.g. 2 strides of 8, a random stride, 2 strides of 8
                    # can give more misses than 4 strides of 8 followed by a
                    # random one: 16-24-32-48-120 results in two misses,
                    # while 16-24-120-128-136 results in three misses). This
                    # shows up when the number of statstack misses is bigger
                    # than the number of misses we get by using the statstack
                    # missrate for the accesses marked as a miss in the
                    # stride pattern. Place them randomly afterwards.
                    if misses_placed < PC_statstack_misses:
                        too_few_stride_misses += \
                            PC_statstack_misses - misses_placed
                no_strides += 1
            elif stride == "NO_STRIDE":
                if PC_statstack_misses > 0:
                    location_pc = pcs_rd_distr[pc][0]
                    miss_ratio = LLC_PC_miss_ratios[pc][0]
                    if trace_misses[location_pc][0] > 0:
                        # NOTE: the original indexed trace_misses with
                        # rd_distr[i] here, stale variables from the STRIDE
                        # branch; location_pc is presumably what was meant.
                        if trace_misses[location_pc][0] < 1 - miss_ratio:
                            trace_misses[location_pc][0] += miss_ratio
                        else:
                            miss_conflicts += 1
                    else:
                        trace_misses[location_pc] = [miss_ratio, False]
                        misses_placed += miss_ratio
                no_no_strides += 1
            else:
                if PC_statstack_misses > 0:
                    rd_distr = self.expand_distr(pcs_rd_distr[pc])
                    assert (LLC_PC_miss_ratios[pc][1] == len(rd_distr))
                    # we have no idea which of these completely random
                    # accesses will actually miss; use the ratio provided by
                    # statstack for all possible misses
                    miss_ratio = LLC_PC_miss_ratios[pc][0]
                    for i in range(0, len(rd_distr)):
                        if trace_misses[rd_distr[i]][0] > 0:
                            if trace_misses[rd_distr[i]][0] < 1 - miss_ratio:
                                trace_misses[rd_distr[i]][0] += miss_ratio
                            else:
                                miss_conflicts += 1
                        else:
                            trace_misses[rd_distr[i]] = [miss_ratio, False]
                            misses_placed += miss_ratio
                no_random_strides += 1
            # Add miss conflicts (due to not knowing the exact position)
            # randomly to the trace_misses array (but in accordance with the
            # load array). Do the same for the underestimation of misses
            # caused by using a stride distribution (see above).
            add_random_misses = miss_conflicts + too_few_stride_misses
            if add_random_misses > 0:
                randomly_placed_misses += add_random_misses * miss_ratio
                # find positions in the trace_misses array where we can still
                # place a miss (value == 0), provided there is a load on that
                # position
                zero_positions = [
                    w for (w, (x, y)), z in zip(enumerate(trace_misses),
                                                trace_loads)
                    if x == 0 and z == 1
                ]
                if len(zero_positions) < add_random_misses:
                    self.debug_printer.save_error_stats(
                        "Trace " + str(trace_counter) +
                        ", we have too few zero positions for adding random misses"
                    )
                else:
                    while add_random_misses > 0:
                        rnd = int(random.random() * len(zero_positions))
                        trace_misses[zero_positions[rnd]] = [miss_ratio,
                                                             False]
                        add_random_misses -= 1
                        del zero_positions[rnd]
        self.debug_printer.save_log_stats(
            "Number of strided instructions: " + str(no_strides))
        self.debug_printer.save_log_stats(
            "Number of single instructions: " + str(no_no_strides))
        self.debug_printer.save_log_stats(
            "Number of random strided instructions: " +
            str(no_random_strides))
        self.debug_printer.save_log_stats(
            "Placed " + str(randomly_placed_misses) +
            " misses randomly for a total of " +
            str(sum([miss for miss, pref in trace_misses])) + " misses")
        self.total_strides += no_strides
        self.total_no_strides += no_no_strides
        self.total_random_strides += no_random_strides
        self.total_randomly_placed_misses += randomly_placed_misses
        return trace_misses, statstack_misses

    def remove_prefetchable_misses(self, trace_loads_addresses, trace_misses):
        trace_length = len(trace_loads_addresses)
        plain_trace_misses = [0] * trace_length
        # check the number of flows
        flows = deque(maxlen=self.prefetcher_flows)
        ind, succesfully_prefetched = 0, 0
        prefetchable = []
        # load address, (missrate, prefetchable)
        for tl, (tm_mr, tm_p) in zip(trace_loads_addresses, trace_misses):
            # can this load be prefetched (depends on the prefetch attribute
            # and on whether it is still in the observed flows)?
            if tm_mr > 0:
                if not tm_p:
                    plain_trace_misses[ind] = tm_mr
                else:
                    if tl not in flows:
                        plain_trace_misses[ind] = tm_mr
                    elif self.prefetcher_enabled:
                        assert (tm_mr == 1)
                        prefetchable.append(ind)
                    else:
                        plain_trace_misses[ind] = tm_mr
            # only append flows for loads that at least reach the LLC
            if tm_mr > 0 and tl != 0 and not tl in flows:
                flows.append(tl)
            ind += 1
        # check timeliness: the prefetch happens at the moment the previous
        # load executed; model non-timeliness as a lower missrate
        if len(prefetchable) > 0:
            # get the boundaries where there is a DRAM access; this marks the
            # potential start of a new ROB (trace_misses holds
            # [missrate, prefetchable] pairs, so numpy.where may yield
            # duplicate row indices, which the distance check below absorbs)
            np_trace_misses = numpy.asarray(trace_misses)
            ROB_indices = numpy.where(np_trace_misses > 0)[0]
            ROB_starts = [ROB_indices[0]]
            for ind in ROB_indices:
                if ind >= ROB_starts[-1] + self.ROB_size:
                    ROB_starts.append(ind)
            for ind in prefetchable:
                # prev_load_usage = trace_length - trace_loads_addresses[::-1].index(trace_loads_addresses[ind], trace_length - ind) - 1
                np_trace_loads_addresses = numpy.asarray(
                    trace_loads_addresses)
                load_indices = list(
                    numpy.where(np_trace_loads_addresses ==
                                trace_loads_addresses[ind])[0])
                load_indices = load_indices[:load_indices.index(ind)][::-1]
                prev_load_usage = load_indices[-1]
                for li in load_indices:
                    if trace_misses[li][1]:
                        prev_load_usage = li
                        break
                load_ROB = bisect.bisect(ROB_starts, ind) - 1
                prev_load_ROB = bisect.bisect(ROB_starts, prev_load_usage) - 1
                # the load that initiated the prefetch happened in a previous
                # ROB, so the prefetch will be done in time; either the ROBs
                # are different or the distance is too big (the latter check
                # is needed because the random placement of loads/misses can
                # break the bisect logic)
                if load_ROB != prev_load_ROB or \
                        prev_load_usage + self.ROB_size <= ind:
                    succesfully_prefetched += 1
                # the load that initiated the prefetch happened in the same
                # ROB, so the prefetch will not be done in time; modify the
                # missrate
                else:
                    # load_to_head = ind - prev_load_usage
                    load_to_head = ind - ROB_starts[load_ROB]
                    reach_cycles = float(load_to_head) / self.dispatch_width
                    fraction_DRAM = reach_cycles / self.DRAM_latency_with_tag
                    assert (1.0 - fraction_DRAM >= 0)
                    plain_trace_misses[ind] = 1.0 - fraction_DRAM
                    succesfully_prefetched += fraction_DRAM
        return plain_trace_misses, succesfully_prefetched

    def estimate_MLP_window_stride(self, trace_loads, trace_misses,
                                   load_dep_distr, trace_counter):
        mlp, miss_ROBs, counter = 0, 0, 0
        current_ROB_head = 0
        while current_ROB_head < len(trace_misses):
            # search the trace for the next ROB head (it has to start with an
            # LLC miss)
            if sum(trace_misses[current_ROB_head:]) == 0:
                break
            while trace_misses[current_ROB_head] == 0:
                current_ROB_head += 1
            # take into account that load misses depend on each other
            loads = sum(trace_loads[current_ROB_head:current_ROB_head +
                                    self.ROB_size])
            load_misses = float(
                sum(trace_misses[current_ROB_head:current_ROB_head +
                                 self.ROB_size]))
            if loads < load_misses:
                self.debug_printer.save_error_stats(
                    "Trace: " + str(trace_counter) + ", " + str(loads) +
                    " < " + str(load_misses))
                loads = load_misses
            miss_ratio = load_misses / loads
            # calculate the MLP taking dependences into account
            mlp += self.scale_MLP_dependences(load_dep_distr, miss_ratio,
                                              load_misses)
            current_ROB_head += self.ROB_size
            miss_ROBs += 1
        estimated_MLP = max(1, float(mlp) / miss_ROBs)
        return estimated_MLP

    def scale_MLP_dependences(self, load_dep_distr, miss_ratio, load_misses):
        estimated_MLP = 0
        for depending_on in range(1, len(load_dep_distr)):
            dependent_MLP = (1 - miss_ratio)**(depending_on - 1) * load_misses
            estimated_MLP += load_dep_distr[depending_on] * dependent_MLP
        return estimated_MLP

    def scale_MLP_MSHR(self, loads, trace_level_miss_rates, estimated_MLP,
                       trace_counter):
        if estimated_MLP == 0:
            return 1
        mshr_occupancy_time = 0
        cache_level_hits, cache_hit_cost = 0, 0
        for cs in sorted(trace_level_miss_rates.keys())[:-1]:
            cache_level_hits += trace_level_miss_rates[cs] * loads
            mshr_occupancy_time += trace_level_miss_rates[cs] * loads * \
                self.config.get_cache_miss_cost(cs)
            cache_hit_cost += trace_level_miss_rates[cs] * loads * \
                self.config.get_cache_miss_cost(cs)
        if cache_level_hits != 0:
            cache_hit_cost /= cache_level_hits
        scaling_factor_MSHR = 1
        # calculate the scaling factor
        if cache_level_hits + estimated_MLP > self.MSHR_entries:
            scaling_factor_MSHR = 0
            self.debug_printer.save_log_stats(
                "Trace: " + str(trace_counter) +
                ", scaled down MLP due to too many misses live in the ROB (" +
                str(estimated_MLP) + ")")
            cache_down, cache_up = math.floor(cache_level_hits), \
                math.ceil(cache_level_hits)
            mlp_down, mlp_up = math.floor(estimated_MLP), \
                math.ceil(estimated_MLP)
            cache_frac, MLP_frac = math.modf(cache_level_hits)[0], \
                math.modf(estimated_MLP)[0]
            if cache_down + mlp_down > self.MSHR_entries:
                scaling_factor_MSHR_down_down = \
                    self.calculate_rounded_MSHR_scaling_factor(
                        cache_down, mlp_down, cache_hit_cost)
                scaling_factor_MSHR += (1 - cache_frac) * (1 - MLP_frac) * \
                    scaling_factor_MSHR_down_down
            if cache_down + mlp_up > self.MSHR_entries:
                scaling_factor_MSHR_down_up = \
                    self.calculate_rounded_MSHR_scaling_factor(
                        cache_down, mlp_up, cache_hit_cost)
                scaling_factor_MSHR += (1 - cache_frac) * MLP_frac * \
                    scaling_factor_MSHR_down_up
            if cache_up + mlp_down > self.MSHR_entries:
                scaling_factor_MSHR_up_down = \
                    self.calculate_rounded_MSHR_scaling_factor(
                        cache_up, mlp_down, cache_hit_cost)
                scaling_factor_MSHR += cache_frac * (1 - MLP_frac) * \
                    scaling_factor_MSHR_up_down
            if cache_up + mlp_up > self.MSHR_entries:
                scaling_factor_MSHR_up_up = \
                    self.calculate_rounded_MSHR_scaling_factor(
                        cache_up, mlp_up, cache_hit_cost)
                scaling_factor_MSHR += cache_frac * MLP_frac * \
                    scaling_factor_MSHR_up_up
        assert scaling_factor_MSHR <= 1.0
        return scaling_factor_MSHR

    def calculate_rounded_MSHR_scaling_factor(self, rounded_cache_hits,
                                              rounded_MLP, cache_hit_cost):
        if rounded_MLP == 0:
            return 0
        events = rounded_cache_hits + rounded_MLP
        slots_filled_LLC = max(self.MSHR_entries - rounded_MLP, 0)
        slots_filled_DRAM = max(self.MSHR_entries - rounded_cache_hits, 0)
        slots_filled = slots_filled_LLC + slots_filled_DRAM
        slots_to_fill = int(self.MSHR_entries - slots_filled)
        P_c = (rounded_cache_hits -
               max(self.MSHR_entries - rounded_MLP, 0)) / (events -
                                                           slots_filled)
        P_DRAM = (rounded_MLP -
                  max(self.MSHR_entries - rounded_cache_hits, 0)) / (
                      events - slots_filled)
        assert P_c + P_DRAM == 1.0
        MLP = 0.0
        for i in range(0, slots_to_fill + 1):
            multiplier = math.factorial(slots_to_fill) / (
                math.factorial(i) * math.factorial(slots_to_fill - i))
            chance = P_c**i * P_DRAM**(slots_to_fill - i)
            state_chance = multiplier * chance
            T_MSHR_free = (
                (slots_filled_LLC + i) * cache_hit_cost +
                (slots_filled_DRAM + slots_to_fill - i) * self.LLC_miss_cost
            ) / self.MSHR_entries - \
                self.MSHR_entries / events * \
                self.ROB_size / self.dispatch_width
            # if events resolve faster than the time it takes to get to the
            # next event, this time will be negative
            T_MSHR_free = max(0, T_MSHR_free)
            DRAM_MSHR = self.MSHR_entries - (
                max(self.MSHR_entries - rounded_MLP, 0) + i)
            MLP += state_chance * (DRAM_MSHR + (rounded_MLP - DRAM_MSHR) *
                                   (self.LLC_miss_cost - T_MSHR_free) /
                                   self.LLC_miss_cost)
        scaling_factor_MSHR = MLP / rounded_MLP
        return scaling_factor_MSHR

    def estimate_queuing_delay(self, mlp):
        if self.queue_model == "MLP":
            queue_delay_down, queue_delay_up = 0.0, 0.0
            mlp_down, mlp_up = int(mlp), int(mlp) + 1
            for m in range(0, mlp_down):
                queue_delay_down += self.bus_transfer_cycles * (m + 1) - min(
                    self.bus_transfer_cycles * m,
                    self.ROB_size / mlp_down / self.dispatch_width * m)
            for m in range(0, mlp_up):
                queue_delay_up += self.bus_transfer_cycles * (m + 1) - min(
                    self.bus_transfer_cycles * m,
                    self.ROB_size / mlp_up / self.dispatch_width * m)
            fract_mlp = math.modf(mlp)[0]
            queue_delay = (1 - fract_mlp) * queue_delay_down + \
                fract_mlp * queue_delay_up
            return queue_delay / mlp
        else:
            return 0.0

    def get_overall_stats(self):
        return self.total_strides, self.total_no_strides, \
            self.total_random_strides, self.total_randomly_placed_misses, \
            self.total_ss_misses
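
# Worked example (standalone sketch, not part of the original file) mirroring
# the sub-cacheline branch of MLP_Model.calculate_stride_misses above: with a
# 64-byte cacheline and a 24-byte stride, the pattern starts mid-line, so
# roughly stride/cacheline_size of the references cross into a new cacheline
# and miss.
def _stride_miss_demo(stride=24, cacheline_size=64, refs=8):
    pattern = []
    cacheline = stride + cacheline_size / 2  # start mid-line, as above
    next_cacheline = cacheline_size
    for _ in range(refs):
        if cacheline >= next_cacheline:
            pattern.append(1)  # crossed into a new cacheline: miss
            next_cacheline += cacheline_size
        else:
            pattern.append(0)  # same cacheline as before: hit
        cacheline += stride
    return pattern  # [0, 1, 0, 1, 0, 0, 1, 0] for the defaults above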
class Base_Model:
    def __init__(self, constants, config, benchmark):
        self.config = config
        self.ROB_size = config.get_ROB_size()
        self.physical_dispatch_width = config.get_dispatch_width()
        self.output_dir = constants.output_dir
        self.debug_printer = Debug_Printer(self.output_dir, benchmark,
                                           "debug_base")
        self.build_issue_stage()

    def build_issue_stage(self):
        self.instruction_latencies = {}
        for k, v in self.config.get_instruction_latencies().iteritems():
            self.instruction_latencies[k] = int(v)
        self.functional_units_per_port = {}
        self.available_ports_split = []
        for k, v in self.config.get_functional_units_ports().iteritems():
            if not k in self.functional_units_per_port:
                self.functional_units_per_port[k] = []
            for port in v.replace(" ", "").split("|"):
                self.functional_units_per_port[k].append(
                    port.replace(" ", "").split("&"))
                for p in port.replace(" ", "").split("&"):
                    if not p in self.available_ports_split:
                        self.available_ports_split.append(p)
        self.available_ports_split = sorted(self.available_ports_split)
        self.functional_units_pipelined = {}
        for k, v in self.config.get_functional_unit_pipelined().iteritems():
            if v == "1":
                self.functional_units_pipelined[k] = True
            else:
                self.functional_units_pipelined[k] = False
        self.instruction_per_functional_unit = {}
        for k, v in self.config.get_instruction_functional_unit().iteritems():
            self.instruction_per_functional_unit[k] = \
                v.replace(" ", "").split("|")
        self.instruction_per_port = {}
        for ins, functional_units in \
                self.instruction_per_functional_unit.iteritems():
            self.instruction_per_port[ins] = []
            for fu_outer in functional_units:
                for fu_inner, ports in self.functional_units_per_port.items():
                    if fu_inner == fu_outer:
                        for port in ports:
                            self.instruction_per_port[ins].append(port)

    def set_window_stats(self, stats, dependences, uop_hist,
                         window_instruction_length, load_latency):
        self.stats = stats
        self.dependences = dependences
        self.uop_hist = uop_hist
        self.micro_op_count = sum(self.uop_hist.itervalues())
        self.window_instruction_length = window_instruction_length
        # set this value calculated by using the StatStack miss rate
        self.instruction_latencies["LOAD"] = load_latency

    def calculate_base_performance(self):
        self.interpolate_dependences()
        self.calculate_average_instruction_latency()
        self.calculate_independent_instructions()
        self.calculate_base_execution_rate()

    def calculate_independent_instructions(self):
        self.independent_instructions = {}
        # use the critical path
        for rob, paths in self.interpolated_dependences.iteritems():
            CP_latency = paths[2] * self.average_instruction_latency
            self.independent_instructions[rob] = float(rob) / CP_latency

    def calculate_average_instruction_latency(self):
        self.average_instruction_latency = 0.0
        for uop, freq in self.uop_hist.iteritems():
            self.average_instruction_latency += \
                float(freq) * self.instruction_latencies[uop]
        self.average_instruction_latency /= self.micro_op_count

    def interpolate_dependences(self):
        self.interpolated_dependences = {}
        # a logarithmic fit ( a + b * ln(x) ) yields the best results
        # compared to the real profiled results (but only if you fit point by
        # point, not over all measurements)
        # initialize for the first profiled ROB (always = 1)
        self.interpolated_dependences[self.dependences[0][0]] = (
            self.dependences[0][1], self.dependences[0][2],
            self.dependences[0][3])
        # interpolate between profiled ROBs; initialize previous on the
        # smallest ROB profiled
        previous = self.dependences[0]
        for dep in self.dependences[1:]:
            y1_AP, y2_AP, x1, x2 = previous[1], dep[1], previous[0], dep[0]
            b_AP = (2 * (y1_AP * math.log(x1) + y2_AP * math.log(x2)) -
                    (y1_AP + y2_AP) * (math.log(x1) + math.log(x2))) / (
                        2 * (math.log(x1)**2 + math.log(x2)**2) -
                        (math.log(x1) + math.log(x2))**2)
            a_AP = (y1_AP + y2_AP - b_AP * (math.log(x1) + math.log(x2))) / 2
            y1_ABP, y2_ABP = previous[2], dep[2]
            b_ABP = (2 * (y1_ABP * math.log(x1) + y2_ABP * math.log(x2)) -
                     (y1_ABP + y2_ABP) * (math.log(x1) + math.log(x2))) / (
                         2 * (math.log(x1)**2 + math.log(x2)**2) -
                         (math.log(x1) + math.log(x2))**2)
            a_ABP = (y1_ABP + y2_ABP - b_ABP *
                     (math.log(x1) + math.log(x2))) / 2
            y1_CP, y2_CP = previous[3], dep[3]
            b_CP = (2 * (y1_CP * math.log(x1) + y2_CP * math.log(x2)) -
                    (y1_CP + y2_CP) * (math.log(x1) + math.log(x2))) / (
                        2 * (math.log(x1)**2 + math.log(x2)**2) -
                        (math.log(x1) + math.log(x2))**2)
            a_CP = (y1_CP + y2_CP - b_CP * (math.log(x1) + math.log(x2))) / 2
            for r in range(previous[0] + 1, dep[0]):
                interp_AP = a_AP + b_AP * math.log(r)
                interp_ABP = a_ABP + b_ABP * math.log(r)
                interp_CP = a_CP + b_CP * math.log(r)
                self.interpolated_dependences[r] = (interp_AP, interp_ABP,
                                                    interp_CP)
            self.interpolated_dependences[dep[0]] = (dep[1], dep[2], dep[3])
            previous = dep

    def calculate_base_execution_rate(self):
        self.functional_port_issue_rate = self.physical_dispatch_width
        self.functional_unit_issue_rate = self.physical_dispatch_width
        self.calculate_functional_port_rate()
        self.calculate_functional_unit_rate()
        self.effective_dispatch_rates = {
            "DISPATCH": self.physical_dispatch_width,
            "DEPENDENCES": self.independent_instructions[self.ROB_size],
            "FUNCTIONAL_PORT": self.functional_port_issue_rate,
            "FUNCTIONAL_UNIT": self.functional_unit_issue_rate
        }
        self.debug_printer.save_debug_stats_live(
            (self.physical_dispatch_width,
             self.independent_instructions[self.ROB_size],
             self.functional_port_issue_rate,
             self.functional_unit_issue_rate))

    def calculate_functional_port_rate(self):
        # derives an issue rate from per-functional-unit occupancy
        cycles_per_unit = {}
        for k in self.functional_units_pipelined:
            cycles_per_unit[k] = 0
        for k, v in self.instruction_per_functional_unit.iteritems():
            # v is the list of functional units
            # k is the actual category of the uop
            for fu in v:
                if self.functional_units_pipelined[fu]:
                    if k in self.uop_hist:
                        cycles_per_unit[fu] += self.uop_hist[k] * 1.0 / len(
                            self.functional_units_per_port[fu]) / len(v)
                else:
                    if k in self.uop_hist:
                        cycles_per_unit[fu] += self.uop_hist[k] * float(
                            self.instruction_latencies[k]) / len(
                                self.functional_units_per_port[fu]) / len(v)
        self.functional_port_issue_rate = float(
            sum(self.uop_hist.itervalues())) / max(
                cycles_per_unit.itervalues())

    def calculate_functional_unit_rate(self):
        # derives an issue rate from per-port occupancy
        cycles_per_port_new = {}
        for k in self.available_ports_split:
            cycles_per_port_new[k] = 0
        # we assume optimal scheduling by first taking all the instructions
        # that can be scheduled on only one port; instructions with multiple
        # ports are then scheduled on their optimal port
        # example: schedule 30 adds on ports 0, 1 and 5 with the following
        # 'activity factors':
        # P0: 5   P1: 10   P5: 15
        # 15 adds are scheduled on port 0, 10 adds on port 1 and 5 adds on
        # port 5, which levels all three ports:
        # P0: 20  P1: 20   P5: 20
        for ins, ports in sorted(self.instruction_per_port.iteritems(),
                                 key=lambda s: len(s[1])):
            if len(ports) == 1:
                for p_outer in ports:
                    for p_inner in p_outer:
                        if ins in self.uop_hist:
                            cycles_per_port_new[p_inner] += self.uop_hist[ins]
            else:
                already_scheduled = []
                for p_outer in ports:
                    already_scheduled.append([])
                    for p_inner in p_outer:
                        already_scheduled[-1].append(
                            [float(cycles_per_port_new[p_inner]), p_inner])
                already_scheduled = sorted(already_scheduled)
                if ins in self.uop_hist:
                    current_port, activity_to_schedule = 0, float(
                        self.uop_hist[ins])
                else:
                    current_port, activity_to_schedule = 0, 0
                while activity_to_schedule > 0:
                    if len(already_scheduled) > current_port + 1:
                        can_schedule = (
                            already_scheduled[current_port + 1][0][0] -
                            already_scheduled[current_port][0][0]) * (
                                current_port + 1)
                        if can_schedule <= activity_to_schedule:
                            for i in range(0, current_port + 1):
                                for j in range(0, len(already_scheduled[i])):
                                    already_scheduled[i][j][0] += \
                                        can_schedule / (current_port + 1)
                            activity_to_schedule -= can_schedule
                        else:
                            for i in range(0, current_port + 1):
                                for j in range(0, len(already_scheduled[i])):
                                    already_scheduled[i][j][0] += \
                                        activity_to_schedule / \
                                        (current_port + 1)
                            activity_to_schedule = 0
                    else:
                        for i in range(0, current_port + 1):
                            for j in range(0, len(already_scheduled[i])):
                                already_scheduled[i][j][0] += \
                                    activity_to_schedule / (current_port + 1)
                        activity_to_schedule = 0
                    current_port += 1
                for i in range(0, len(already_scheduled)):
                    for cycles, port in already_scheduled[i]:
                        cycles_per_port_new[port] = 0
                for i in range(0, len(already_scheduled)):
                    for cycles, port in already_scheduled[i]:
                        cycles_per_port_new[port] += cycles
        self.functional_unit_issue_rate = float(
            sum(self.uop_hist.itervalues())) / max(
                cycles_per_port_new.itervalues())

    def get_average_instruction_latency(self):
        return self.average_instruction_latency

    def get_path_lengths(self):
        return self.interpolated_dependences

    def get_independent_instructions(self):
        return self.independent_instructions

    def get_effective_dispatch_rates(self):
        return self.effective_dispatch_rates

    def calculate_base_component(self):
        total_base_component = 0
        base_component = float(self.stats[1]) / self.stats[0] * \
            self.window_instruction_length / \
            self.effective_dispatch_rates["DISPATCH"]
        total_base_component += base_component
        dependence_component = max(
            0,
            float(self.stats[1]) / self.stats[0] *
            self.window_instruction_length /
            self.effective_dispatch_rates["DEPENDENCES"] -
            total_base_component)
        total_base_component += dependence_component
        port_component = max(
            0,
            float(self.stats[1]) / self.stats[0] *
            self.window_instruction_length /
            self.effective_dispatch_rates["FUNCTIONAL_PORT"] -
            total_base_component)
        total_base_component += port_component
        unit_component = max(
            0,
            float(self.stats[1]) / self.stats[0] *
            self.window_instruction_length /
            self.effective_dispatch_rates["FUNCTIONAL_UNIT"] -
            total_base_component)
        return base_component, dependence_component, port_component, \
            unit_component
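
# Standalone sketch (not part of the original file) of the optimal-scheduling
# idea in calculate_functional_unit_rate above: new activity is spread over
# the least-loaded ports first (water filling). With the example from the
# comment there, 30 adds on ports preloaded to 5/10/15 cycles leave every
# port at 20 cycles.
def _water_fill_demo(existing=(5.0, 10.0, 15.0), new_activity=30.0):
    levels = sorted(existing)
    n = 1  # number of ports currently being filled
    while new_activity > 0:
        if n < len(levels):
            # room before the n lowest ports reach the next-loaded port
            room = (levels[n] - levels[n - 1]) * n
            if room <= new_activity:
                for i in range(n):
                    levels[i] += room / n
                new_activity -= room
                n += 1
                continue
        # spread the remainder evenly over the n lowest ports
        for i in range(n):
            levels[i] += new_activity / n
        new_activity = 0
    return levels  # [20.0, 20.0, 20.0] for the defaults above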