def get_cnum(self, vdd):
    """Return how many cores fit within the system power and area budgets.

    The L2 power/area are subtracted once from the system budgets and the
    remainder is divided by the per-core cost (core plus L1).  The tighter
    of the power bound and the area bound is returned.

    Parameters
    ----------
    vdd : num
      Supply voltage handed to ``self.core.power`` to obtain core power.

    Returns
    -------
    int
      The smaller of the two bounds, truncated with ``int()``.
    """
    core = self.core
    core_power = core.power(vdd)
    _debug(_bm_('core_power: {0}', core_power))

    l2 = self.l2_traits
    l2_power, l2_area = l2['power'], l2['area']
    _debug(_bm_('l2_power: {0}', l2_power))

    l1 = self.l1_traits
    l1_power, l1_area = l1['power'], l1['area']
    _debug(_bm_('l1_power: {0}', l1_power))

    # one bound per budget; the more restrictive one decides
    power_bound = (self.sys_power - l2_power) / (core_power + l1_power)
    area_bound = (self.sys_area - l2_area) / (core.area + l1_area)
    return int(min(power_bound, area_bound))
def perf(self, vdd, app, cnum=None):
    """Estimate the absolute performance of a synthetic application.

    For every kernel of ``app`` the L1/L2 miss rates are rescaled from the
    kernel's nominal cache sizes to this system's cache sizes, an average
    memory delay is derived from them, and the resulting parallel speedup
    is folded into a coverage-weighted, normalised run time.  Coverage that
    is not claimed by any kernel is added unscaled.

    Parameters
    ----------
    vdd : num
      Supply voltage handed to ``core.freq``.
    app : object
      The application; its ``type`` must be ``'synthetic'``.
    cnum : int, optional
      Number of active cores.  Any falsy value (``None`` or ``0``) is
      replaced by ``self.get_cnum(vdd)``.

    Returns
    -------
    num
      ``core.perfnom`` divided by the normalised run time.

    Raises
    ------
    HomogSysError
      If ``app.type`` is not ``'synthetic'``.
    """
    if app.type != 'synthetic':
        raise HomogSysError('Requires a synthetic application')

    if not cnum:
        cnum = self.get_cnum(vdd)
    _debug(_bm_('cnum: {0}', cnum))

    core = self.core
    _debug(_bm_('freq: {0}', core.freq(vdd)))

    uncovered = 1      # coverage not yet attributed to a kernel
    norm_time = 0      # coverage-weighted run time, 1.0 == no speedup

    # kernels will be accelerated by multi-cores
    for kid in app.get_all_kernels():
        kcov = app.get_cov(kid)
        kobj = app.get_kernel(kid)

        # rescale the miss rates to the actual cache sizes, capped at 1
        l1_ratio = self.cache_sz_l1 / (kobj.cache_sz_l1_nom)
        miss_l1 = min(1, kobj.miss_l1 * (l1_ratio ** (1 - kobj.alpha_l1)))
        l2_ratio = self.cache_sz_l2 / (cnum * kobj.cache_sz_l2_nom)
        miss_l2 = min(1, kobj.miss_l2 * (l2_ratio ** (1 - kobj.alpha_l2)))
        _debug(_bm_('l1_miss: {0}, l2_miss: {1}', miss_l1, miss_l2))

        # average access delay over the L1 / L2 / memory levels
        l1_part = (1 - miss_l1) * self.delay_l1
        l2_part = miss_l1 * (1 - miss_l2) * self.delay_l2
        mem_part = miss_l1 * miss_l2 * self.delay_mem
        t0 = (l1_part + l2_part + mem_part)
        t = t0 * core.freq(vdd) / core.freq(core.vnom)
        _debug(_bm_('t: {0}', t))

        eta = 1 / (1 + t * kobj.rm / kobj.cpi_exe)
        eta0 = 1 / (1 + t0 * kobj.rm / kobj.cpi_exe)
        _debug(_bm_('eta: {0}, eta0: {1}', eta, eta0))
        _debug(_bm_('freq: {0}, freq0: {1}', core.freq(vdd), core.fnom))
        _debug(_bm_('vdd: {0}, v0: {1}', vdd, core.vnom))

        p_speedup = (core.freq(vdd) / core.fnom) * cnum * (eta / eta0)
        _debug(_bm_('p_speedup: {0}', p_speedup))

        # NOTE(review): vdd_max is computed but never used below
        vdd_max = min(core.vnom * VSF_MAX, core.vmax)
        s_speedup = 1
        _debug(_bm_('s_speedup: {0}', s_speedup))

        norm_time += kcov * ((1 - kobj.pf + kobj.pf / p_speedup))
        uncovered -= kcov

    # non-kernels will not be speedup
    norm_time += uncovered

    # speedup = 1 / norm_time
    return core.perfnom / norm_time
def __init__(self, budget, tech, serial_core=None, tput_core=None):
    """Set up the system from a budget, a technology node and its cores.

    Parameters
    ----------
    budget : object
      Provides ``area``, ``power`` and ``bw``; ``bw`` is indexed by ``tech``.
    tech : object
      Technology node, used as the key into ``budget.bw`` and passed to
      ``BaseCore`` when no throughput core is supplied.
    serial_core : object, optional
      Serial core; when truthy its ``area`` is taken out of the area left
      for throughput cores.
    tput_core : object, optional
      Throughput core; when falsy a ``BaseCore(tech, 'cmos', 'hp', 'io')``
      is created instead.
    """
    self.sys_area = budget.area
    self.sys_power = budget.power
    self._sys_bw = budget.bw
    self.tech = tech
    self.sys_bandwidth = self._sys_bw[tech]

    # asic_dict[kid][acc_id] = Acc_obj
    self.asic_dict = {}

    # asic_power_dict[kid][acc_id] = (power_max, power_min, power_nom)
    self.asic_power_dict = {}

    self.thru_core = (tput_core if tput_core
                      else BaseCore(tech, 'cmos', 'hp', 'io'))
    self.dim_perf = None

    self.serial_core = serial_core
    if not serial_core:
        # no serial core: the whole area is left for throughput cores
        self.thru_core_area = self.sys_area
    else:
        self.thru_core_area = self.sys_area - serial_core.area
        _debug(_bm_('Serial core: {0}, area: {1}',
                    self.serial_core.ctype, self.serial_core.area))

    self.use_gpacc = False
def __init__(self, budget, tech, serial_core=None, tput_core=None):
    """Set up the system from a budget, a technology node and its cores.

    Parameters
    ----------
    budget : object
      Provides ``area``, ``power`` and ``bw``; ``bw`` is indexed by ``tech``.
    tech : object
      Technology node, used as the key into ``budget.bw`` and passed to
      ``BaseCore`` when no throughput core is supplied.
    serial_core : object, optional
      Serial core; when truthy its ``area`` is taken out of the area left
      for throughput cores.
    tput_core : object, optional
      Throughput core; when falsy a ``BaseCore(tech, 'cmos', 'hp', 'io')``
      is created instead.
    """
    self.sys_area = budget.area
    self.sys_power = budget.power
    self._sys_bw = budget.bw
    self.tech = tech
    self.sys_bandwidth = self._sys_bw[tech]

    # asic_dict[kid][acc_id] = Acc_obj
    self.asic_dict = {}

    # asic_power_dict[kid][acc_id] = (power_max, power_min, power_nom)
    self.asic_power_dict = {}

    self.thru_core = (tput_core if tput_core
                      else BaseCore(tech, 'cmos', 'hp', 'io'))
    self.dim_perf = None

    self.serial_core = serial_core
    if not serial_core:
        # no serial core: the whole area is left for throughput cores
        self.thru_core_area = self.sys_area
    else:
        self.thru_core_area = self.sys_area - serial_core.area
        _debug(_bm_('Serial core: {0}, area: {1}',
                    self.serial_core.ctype, self.serial_core.area))

    self.use_gpacc = False
def add_kernel(self, kernel, cov):
    """Register a kernel to be accelerated.

    The kernel could be accelerated by a dedicated ASIC, or by a more
    general GPU/FPGA.

    Parameters
    ----------
    kernel : :class:`~lumos.model.workload.kernel.Kernel`
      The kernel object; it is stored under its ``name``.
    cov : float
      The coverage of the kernel, relative to the serial execution.  It is
      deducted from ``self.f_noacc``.

    Returns
    -------
    False when a kernel of the same name is already registered (nothing is
    changed in that case), otherwise None.

    Raises
    ------
    AppError
      the given coverage (cov) is larger than the remaining ``f_noacc``
    """
    kname = kernel.name

    if kname in self.kernels:
        _debug(_bm_('Kernel {0} already exist', kname))
        return False

    remaining = self.f_noacc
    if cov > remaining:
        raise AppError(
            '[add_kernel]: cov of {0} is too large to exceed the overall '
            'parallel ratio {1}'.format(cov, remaining))

    self.kernels[kname] = kernel
    self.kernels_coverage[kname] = cov
    self.f_noacc -= cov
    self.tag = self.tag_update()
def load_from_xmltree(cls, xmltree, kernels):
    """Build an application from an XML element.

    NOTE(review): the first parameter is ``cls``; presumably this is
    decorated as a classmethod outside the visible span -- confirm.

    Parameters
    ----------
    cls : type
      Called as ``cls(name)`` to create the application object.
    xmltree : element
      Must carry a ``name`` attribute and a ``perf_config`` child; a
      ``kernel_config`` child is optional.
    kernels : mapping
      Maps kernel names to kernel objects.

    Returns
    -------
    The object created by ``cls(name)``, with one float attribute set per
    non-comment child of ``perf_config`` and one ``add_kernel`` call per
    child of ``kernel_config``.

    Raises
    ------
    AppError
      If the element has no ``name``.
    Exception
      If ``perf_config`` is missing, or a kernel entry lacks ``name`` or
      ``cov``.
    """
    name = xmltree.get('name')
    if not name:
        raise AppError("No name in app config")

    a = cls(name)

    ps = xmltree.find('perf_config')
    if ps is None:
        raise Exception('No performance configuration in {0}'.format(name))
    for ele in ps:
        if ele.tag is etree.Comment:
            # XML comments carry no configuration value
            continue
        setattr(a, ele.tag, float(ele.text))

    ks = xmltree.find('kernel_config')
    if ks is None:
        _debug(_bm_('No kernel config in {0}', name))
        return a

    for ele in ks:
        kname = ele.get('name')
        if not kname:
            raise Exception('No name for kernel {0} in app {1}'.format(
                kname, name))

        val_ = ele.get('cov')
        if not val_:
            raise Exception(
                'No covreage for kernel {0} in app {1}'.format(
                    kname, name))
        k_cov = float(val_)

        # the original called ``a_.add_kernel`` -- an undefined name
        a.add_kernel(kernels[kname], k_cov)

    return a
def _dim_perf_opt(self, power_budget=None, cnum_max=None):
    """Find the core count that gives the best throughput under the budget.

    Core counts from 1 up to ``cnum_max`` are tried in turn through
    :meth:`_dim_perf_cnum`; the best-performing configuration seen is
    kept.  The scan stops at the first core count whose performance comes
    back as 0.

    Parameters
    ----------
    power_budget : float
      power budget, forwarded unchanged to ``_dim_perf_cnum``
    cnum_max : int
      largest core count to try; a falsy value is replaced by
      ``int(self.sys_area / core.area)``

    Returns
    -------
    perf : float
      best throughput-based performance found (0 if none)
    vdd : int
      supply voltage of that configuration
    cnum : int
      number of active cores of that configuration
    power_eff : float
      effective power of the throughput cores in that configuration
    """
    if not cnum_max:
        cnum_max = int(self.sys_area / self.thru_core.area)

    _debug(_bm_('power budget: {0}', power_budget))

    best = (0, 0, 0, 0)  # (perf, vdd, cnum, power_eff)
    for cnum in range(1, cnum_max + 1):
        perf_, vdd_, power_ = self._dim_perf_cnum(
            cnum, power_budget=power_budget)

        if perf_ == 0:
            # power budget is too small to power such many cores
            break

        if perf_ > best[0]:
            best = (perf_, vdd_, cnum, power_)

        _debug(_bm_('cnum: {0}, perf_opt: {1}, power_eff: {2}',
                    cnum, best[0], best[3]))

    return best
def perf(self, vdd, app, cnum=None):
    """Estimate the absolute performance of a synthetic application.

    For every kernel of ``app`` the L1/L2 miss rates are rescaled from the
    kernel's nominal cache sizes to this system's cache sizes, an average
    memory delay is derived from them, and the resulting parallel speedup
    is folded into a coverage-weighted, normalised run time.  Coverage that
    is not claimed by any kernel is added unscaled.

    Parameters
    ----------
    vdd : num
      Supply voltage handed to ``core.freq``.
    app : object
      The application; its ``type`` must be ``'synthetic'``.
    cnum : int, optional
      Number of active cores.  Any falsy value (``None`` or ``0``) is
      replaced by ``self.get_cnum(vdd)``.

    Returns
    -------
    num
      ``core.perfnom`` divided by the normalised run time.

    Raises
    ------
    HomogSysError
      If ``app.type`` is not ``'synthetic'``.
    """
    if app.type != 'synthetic':
        raise HomogSysError('Requires a synthetic application')

    if not cnum:
        cnum = self.get_cnum(vdd)
    _debug(_bm_('cnum: {0}', cnum))

    core = self.core
    _debug(_bm_('freq: {0}', core.freq(vdd)))

    uncovered = 1      # coverage not yet attributed to a kernel
    norm_time = 0      # coverage-weighted run time, 1.0 == no speedup

    # kernels will be accelerated by multi-cores
    for kid in app.get_all_kernels():
        kcov = app.get_cov(kid)
        kobj = app.get_kernel(kid)

        # rescale the miss rates to the actual cache sizes, capped at 1
        l1_ratio = self.cache_sz_l1 / (kobj.cache_sz_l1_nom)
        miss_l1 = min(1, kobj.miss_l1 * (l1_ratio ** (1 - kobj.alpha_l1)))
        l2_ratio = self.cache_sz_l2 / (cnum * kobj.cache_sz_l2_nom)
        miss_l2 = min(1, kobj.miss_l2 * (l2_ratio ** (1 - kobj.alpha_l2)))
        _debug(_bm_('l1_miss: {0}, l2_miss: {1}', miss_l1, miss_l2))

        # average access delay over the L1 / L2 / memory levels
        l1_part = (1 - miss_l1) * self.delay_l1
        l2_part = miss_l1 * (1 - miss_l2) * self.delay_l2
        mem_part = miss_l1 * miss_l2 * self.delay_mem
        t0 = (l1_part + l2_part + mem_part)
        t = t0 * core.freq(vdd) / core.freq(core.vnom)
        _debug(_bm_('t: {0}', t))

        eta = 1 / (1 + t * kobj.rm / kobj.cpi_exe)
        eta0 = 1 / (1 + t0 * kobj.rm / kobj.cpi_exe)
        _debug(_bm_('eta: {0}, eta0: {1}', eta, eta0))
        _debug(_bm_('freq: {0}, freq0: {1}', core.freq(vdd), core.fnom))
        _debug(_bm_('vdd: {0}, v0: {1}', vdd, core.vnom))

        p_speedup = (core.freq(vdd) / core.fnom) * cnum * (eta / eta0)
        _debug(_bm_('p_speedup: {0}', p_speedup))

        # NOTE(review): vdd_max is computed but never used below
        vdd_max = min(core.vnom * VSF_MAX, core.vmax)
        s_speedup = 1
        _debug(_bm_('s_speedup: {0}', s_speedup))

        norm_time += kcov * ((1 - kobj.pf + kobj.pf / p_speedup))
        uncovered -= kcov

    # non-kernels will not be speedup
    norm_time += uncovered

    # speedup = 1 / norm_time
    return core.perfnom / norm_time
def _dim_perf_opt(self, power_budget=None, cnum_max=None):
    """Find the core count that gives the best throughput under the budget.

    Core counts from 1 up to ``cnum_max`` are tried in turn through
    :meth:`_dim_perf_cnum`; the best-performing configuration seen is
    kept.  The scan stops at the first core count whose performance comes
    back as 0.

    Parameters
    ----------
    power_budget : float
      power budget, forwarded unchanged to ``_dim_perf_cnum``
    cnum_max : int
      largest core count to try; a falsy value is replaced by
      ``int(self.sys_area / core.area)``

    Returns
    -------
    perf : float
      best throughput-based performance found (0 if none)
    vdd : int
      supply voltage of that configuration
    cnum : int
      number of active cores of that configuration
    power_eff : float
      effective power of the throughput cores in that configuration
    """
    if not cnum_max:
        cnum_max = int(self.sys_area / self.thru_core.area)

    _debug(_bm_('power budget: {0}', power_budget))

    best = (0, 0, 0, 0)  # (perf, vdd, cnum, power_eff)
    for cnum in range(1, cnum_max + 1):
        perf_, vdd_, power_ = self._dim_perf_cnum(
            cnum, power_budget=power_budget)

        if perf_ == 0:
            # power budget is too small to power such many cores
            break

        if perf_ > best[0]:
            best = (perf_, vdd_, cnum, power_)

        _debug(_bm_('cnum: {0}, perf_opt: {1}, power_eff: {2}',
                    cnum, best[0], best[3]))

    return best
def get_speedup_appdag_serial(self, appdag):
    """Speedup on an :class:`~lumos.model.AppDAG` with kernels run one by one.

    Each kernel's run time is its length divided by the speedup of whatever
    executes it: its accelerator when ``has_asacc`` says one exists, else
    the throughput core.  The run times are summed and compared with the
    sum of the kernel lengths.

    Parameters
    ----------
    appdag : :class:`~lumos.model.AppDAG`
      The target application

    Returns
    -------
    float
      sum of the kernel lengths divided by the accumulated run time
    """
    dim_perf, opt_vdd, opt_cnum, _ = self._dim_perf_opt(
        power_budget=self.sys_power)
    core = self.thru_core

    # speedups of the throughput cores at the optimal operating point
    thrucore_serial_su = core.perf_by_vdd(opt_vdd) / PERF_BASE
    thrucore_thru_su = dim_perf / PERF_BASE
    _debug(_bm_('thrucore serial su: {0}, thrucore parallel su: {1}',
                thrucore_serial_su, thrucore_thru_su))

    # speedup of one core at its nominal voltage
    serial_su = core.perf_by_vdd(core.vnom) / PERF_BASE
    _debug(_bm_('serial su: {0}', serial_su))

    ker_lengths = appdag.get_all_kernel_lengths()
    ker_objs = appdag.get_all_kernels(mode='object')
    _debug(_bm_('dim_perf: {0} cnum: {1}, vdd: {2}',
                dim_perf, opt_cnum, opt_vdd))

    baseline = sum(ker_lengths)
    total_rt = 0
    for kl, ko in zip(ker_lengths, ker_objs):
        _debug(_bm_('-------{0}---------', ko.name))
        if self.has_asacc(ko.name):
            acc = self.get_asacc(ko.name)
            rt = kl / acc.perf()
            _debug(_bm_('Accelerator speedup: {0}', acc.perf()))
        elif ko.pf == 0:
            # this is a serial application
            rt = kl / serial_su
        else:
            # this is a parallel application
            rt = (kl * (1 - ko.pf) / thrucore_serial_su
                  + kl * ko.pf / thrucore_thru_su)
        total_rt += rt
        _debug(_bm_('runtime: {0}, accu runtime: {1}', rt, total_rt))

    _debug(_bm_('baseline: {0}, bench: {1}', baseline, total_rt))
    return baseline / total_rt
def load_from_xmltree(cls, xmltree, kernels):
    """Build an application from an XML element.

    NOTE(review): the first parameter is ``cls``; presumably this is
    decorated as a classmethod outside the visible span -- confirm.

    Parameters
    ----------
    cls : type
      Called as ``cls(name)`` to create the application object.
    xmltree : element
      Must carry a ``name`` attribute; a ``kernel_config`` child is
      optional.
    kernels : mapping
      Maps kernel names to kernel objects.

    Returns
    -------
    The object created by ``cls(name)``, after one ``add_kernel`` call per
    child of ``kernel_config``.  A missing ``rc_count`` defaults to 1 and a
    missing ``rc_time`` to 0.

    Raises
    ------
    AppError
      If the element has no ``name``.
    Exception
      If a kernel entry lacks ``name`` or ``cov``.
    """
    name = xmltree.get('name')
    if not name:
        raise AppError("No name in app config")

    a = cls(name)

    ks = xmltree.find('kernel_config')
    if ks is None:
        _debug(_bm_('No kernel config in {0}', name))
        return a

    for ele in ks:
        kname = ele.get('name')
        if not kname:
            raise Exception('No name for kernel {0} in app {1}'.format(
                kname, name))

        cov_txt = ele.get('cov')
        if not cov_txt:
            raise Exception(
                'No covreage for kernel {0} in app {1}'.format(
                    kname, name))
        k_cov = float(cov_txt)

        # optional attributes fall back to their defaults
        count_txt = ele.get('rc_count')
        k_rc_count = int(count_txt) if count_txt else 1

        time_txt = ele.get('rc_time')
        k_rc_time = float(time_txt) if time_txt else 0

        a.add_kernel(kernels[kname], k_cov, k_rc_count, k_rc_time)
        _debug(_bm_('Add kernel {0}, cov {1}, rc_count {2}, rc_time {3}',
                    kname, k_cov, k_rc_count, k_rc_time))

    return a
def get_speedup_appdag_serial(self, appdag):
    """Speedup on an :class:`~lumos.model.AppDAG` with kernels run one by one.

    Each kernel's run time is its length divided by the speedup of whatever
    executes it: its accelerator when ``has_asacc`` says one exists, else
    the throughput core.  The run times are summed and compared with the
    sum of the kernel lengths.

    Parameters
    ----------
    appdag : :class:`~lumos.model.AppDAG`
      The target application

    Returns
    -------
    float
      sum of the kernel lengths divided by the accumulated run time
    """
    dim_perf, opt_vdd, opt_cnum, _ = self._dim_perf_opt(
        power_budget=self.sys_power)
    core = self.thru_core

    # speedups of the throughput cores at the optimal operating point
    thrucore_serial_su = core.perf_by_vdd(opt_vdd) / PERF_BASE
    thrucore_thru_su = dim_perf / PERF_BASE
    _debug(_bm_('thrucore serial su: {0}, thrucore parallel su: {1}',
                thrucore_serial_su, thrucore_thru_su))

    # speedup of one core at its nominal voltage
    serial_su = core.perf_by_vdd(core.vnom) / PERF_BASE
    _debug(_bm_('serial su: {0}', serial_su))

    ker_lengths = appdag.get_all_kernel_lengths()
    ker_objs = appdag.get_all_kernels(mode='object')
    _debug(_bm_('dim_perf: {0} cnum: {1}, vdd: {2}',
                dim_perf, opt_cnum, opt_vdd))

    baseline = sum(ker_lengths)
    total_rt = 0
    for kl, ko in zip(ker_lengths, ker_objs):
        _debug(_bm_('-------{0}---------', ko.name))
        if self.has_asacc(ko.name):
            acc = self.get_asacc(ko.name)
            rt = kl / acc.perf()
            _debug(_bm_('Accelerator speedup: {0}', acc.perf()))
        elif ko.pf == 0:
            # this is a serial application
            rt = kl / serial_su
        else:
            # this is a parallel application
            rt = (kl * (1 - ko.pf) / thrucore_serial_su
                  + kl * ko.pf / thrucore_thru_su)
        total_rt += rt
        _debug(_bm_('runtime: {0}, accu runtime: {1}', rt, total_rt))

    _debug(_bm_('baseline: {0}, bench: {1}', baseline, total_rt))
    return baseline / total_rt
def _dim_perf_cnum(self, cnum, vmin=None, power_budget=None):
    """Best throughput of ``cnum`` active cores within a power budget.

    The budget is split evenly across the cores, and the highest supply
    voltage whose per-core power still fits is located by binary search
    between the lower voltage bound and ``core.vmax``.

    Parameters
    ----------
    cnum : int
      The number of active cores
    vmin : float, optional
      The minimum voltage of cores.  A falsy value, or one below
      ``core.vmin``, is replaced by ``core.vmin``.
    power_budget : float
      The power budget; a falsy value (``None`` or ``0``) is replaced by
      ``self.sys_power``

    Returns
    -------
    perf : float
      The performance score (``cnum`` times the per-core performance),
      not a relative speedup
    vdd : int
      The supply voltage selected by the search
    power : float
      The effective power consumption of the cores at that voltage.  If
      it is less than the provided power_budget, this usually means the
      performance is constrained by area_budget instead of power_budget

    ``(0, 0, 0)`` is returned when even the lowest voltage exceeds the
    per-core budget.
    """
    core = self.thru_core

    budget = power_budget if power_budget else self.sys_power
    per_core_budget = budget / float(cnum)

    if not vmin:
        lo = core.vmin
    elif vmin < core.vmin:
        _debug(_bm_('Provided vmin {0}mV is lower than core.vmin {1}mV. '
                    'vmin is set to core.vmin', vmin, core.vmin))
        lo = core.vmin
    else:
        lo = vmin

    # If the lowest voltage already breaks the per-core budget, either the
    # voltage floor is too high or the budget is too tight: give up.
    if core.power(lo) > per_core_budget:
        _debug(_bm_('Power budget {1:.3g}W is not met even at vmin of {0}mV',
                    lo, budget))
        return (0, 0, 0)

    # binary search for the highest per-core vdd that stays within budget;
    # ``lo`` always satisfies the budget
    hi = core.vmax
    while hi > lo:
        mid = math.ceil((lo + hi) / 2)
        if core.power(mid) > per_core_budget:
            hi = mid - 1
        else:
            lo = mid

    perf_opt = cnum * core.perf_by_vdd(lo)
    power_eff = cnum * core.power(lo)
    return (perf_opt, lo, power_eff)
def perf_by_cnum(self, cnum, app, vmin=None):
    """Relative performance of the system with ``cnum`` active cores.

    The power budget is split evenly across ``cnum`` cores and a bisection
    between the lower voltage bound and ``core.vmax`` looks for the highest
    supply voltage whose per-core power fits that share.

    Parameters
    ----------
    cnum: num
      The number of cores required to be active.
    app: :class:`~lumos.model.application.Application`
      The targeted application; only its ``f`` attribute is read.
    vmin: num
      An optional lower bound for supply-voltage scaling.  A falsy value,
      or one below ``core.vmin``, is replaced by ``core.vmin``.

    Returns
    -------
    dict or None
      ``None`` when ``cnum`` is not positive or exceeds the number of cores
      the area can hold.  Otherwise a dict with five keys:

      perf : num
        The relative performance.
      vdd : num
        The supply voltage of the selected configuration.
      freq : num
        The frequency at that supply voltage.
      cnum : num
        The actual number of active cores; it is lower than requested when
        the requested number cannot be powered even at ``vmin``.
      util : num
        The utilization of the system, in percent.
    """
    core = self.core
    f = app.f

    cnum_max = int(self.area / core.area)
    # cnum == 0 must be rejected too: it would divide by zero just below
    if cnum <= 0 or cnum > cnum_max:
        return None

    cpower = self.power / float(cnum)
    _debug(_bm_('Per-core power budget: {0}', cpower))

    # Serial performance is achieved by the highest vdd
    sperf = core.perf_by_vdd(core.vmax)

    if not vmin:
        vmin = core.vmin

    # Check whether vmin can meet the power requirement
    if vmin >= core.vmin:
        if core.power(vmin) > cpower:
            # Either vmin is too high or active_cnum is too large
            # so that the system could not meet the power budget even
            # with the minimum vdd. Return the active core number with vmin
            # Users can capture the exception by comparing the active_cnum
            # and the returned active_cnum
            active_cnum = min(cnum_max, int(self.power / core.power(vmin)))
            perf = 1 / ((1 - f) / sperf +
                        f / (active_cnum * core.perf_by_vdd(vmin)))
            util = float(100 * active_cnum) / float(cnum)
            _debug(_bm_('vmin is too high or active_cnum is too large'))
            return {
                'perf': perf / PERF_BASE,
                'vdd': vmin,
                'cnum': active_cnum,
                'freq': core.freq(vmin),
                'util': util
            }
    else:
        # NOTE(review): a vmin below core.vmin is clamped without the
        # feasibility check above -- confirm this is intended
        vmin = core.vmin

    # bisect [vl, vr] until the interval is no wider than V_PRECISION
    vl = vmin
    vr = core.vmax
    while (vr - vl) > V_PRECISION:
        vm = int((vl + vr) / 2)
        _debug(_bm_('[Core]\t:vl: {0}mV, vr: {1}mV, vm: {2}mV, '
                    'freq: {3}, power: {4}, area: {5}',
                    vl, vr, vm, core.freq(vm), core.power(vm), core.area))
        if core.power(vm) > cpower:
            vr = vm
        else:
            vl = vm

    _debug(_bm_('End of bin-search, vl: {0}mV, vr: {1}mV', vl, vr))
    core.vdd = vl

    # prefer the right bound when it meets the power constraint
    vdd = vr if core.power(vr) <= cpower else vl
    perf = 1 / ((1 - f) / sperf + f / (cnum * core.perf_by_vdd(vdd)))
    return {
        'perf': perf / PERF_BASE,
        'vdd': vdd,
        'cnum': cnum,
        'freq': core.freq(vdd),
        'util': float(100 * cnum) / float(cnum_max)
    }
if mo: return int(mo.group(1)) else: raise TechModelError('no technology node from the name of {0}'.format(model_file)) model_name = 'homoTFET30nm' freq_dict = dict() dynamic_power_dict = dict() static_power_dict = dict() model_files = glob.glob(os.path.join( _MODEL_DIR, '{0}_{1}_*.data'.format( settings.TFET_SIM_CIRCUIT, model_name))) for model_file in model_files: _debug(_bm_('found model {0}', model_file)) model_file_mtime = os.path.getmtime(model_file) tech = _get_tech_node(model_file) pickle_file = os.path.join( _MODEL_DIR, '{0}_{1}_{2}.p'.format( settings.TFET_SIM_CIRCUIT, model_name, tech)) try: pickle_file_mtime = os.path.getmtime(pickle_file) except OSError: pickle_file_mtime = 0 if pickle_file_mtime > model_file_mtime: with open(pickle_file, 'rb') as f: freq_dict[tech] = pickle.load(f) dynamic_power_dict[tech] = pickle.load(f)
def instantiate(self):
    """Compute how many throughput cores fit in the throughput-core area.

    Stores ``int(self.thru_core_area / self.thru_core.area)`` in
    ``self._thru_core_num`` and logs the result.
    """
    core = self.thru_core
    self._thru_core_num = int(self.thru_core_area / core.area)
    _debug(_bm_('Tput core: {0}, area: {1}, cnum: {2}',
                core.ctype, core.area, self._thru_core_num))
def _dim_perf_cnum(self, cnum, vmin=None, power_budget=None):
    """Best throughput of ``cnum`` active cores within a power budget.

    The budget is split evenly across the cores, and the highest supply
    voltage whose per-core power still fits is located by binary search
    between the lower voltage bound and ``core.vmax``.

    Parameters
    ----------
    cnum : int
      The number of active cores
    vmin : float, optional
      The minimum voltage of cores.  A falsy value, or one below
      ``core.vmin``, is replaced by ``core.vmin``.
    power_budget : float
      The power budget; a falsy value (``None`` or ``0``) is replaced by
      ``self.sys_power``

    Returns
    -------
    perf : float
      The performance score (``cnum`` times the per-core performance),
      not a relative speedup
    vdd : int
      The supply voltage selected by the search
    power : float
      The effective power consumption of the cores at that voltage.  If
      it is less than the provided power_budget, this usually means the
      performance is constrained by area_budget instead of power_budget

    ``(0, 0, 0)`` is returned when even the lowest voltage exceeds the
    per-core budget.
    """
    core = self.thru_core

    budget = power_budget if power_budget else self.sys_power
    per_core_budget = budget / float(cnum)

    if not vmin:
        lo = core.vmin
    elif vmin < core.vmin:
        _debug(_bm_('Provided vmin {0}mV is lower than core.vmin {1}mV. '
                    'vmin is set to core.vmin', vmin, core.vmin))
        lo = core.vmin
    else:
        lo = vmin

    # If the lowest voltage already breaks the per-core budget, either the
    # voltage floor is too high or the budget is too tight: give up.
    if core.power(lo) > per_core_budget:
        _debug(_bm_('Power budget {1:.3g}W is not met even at vmin of {0}mV',
                    lo, budget))
        return (0, 0, 0)

    # binary search for the highest per-core vdd that stays within budget;
    # ``lo`` always satisfies the budget
    hi = core.vmax
    while hi > lo:
        mid = math.ceil((lo + hi) / 2)
        if core.power(mid) > per_core_budget:
            hi = mid - 1
        else:
            lo = mid

    perf_opt = cnum * core.perf_by_vdd(lo)
    power_eff = cnum * core.power(lo)
    return (perf_opt, lo, power_eff)
def get_speedup_appdag_parallel_greedy(self, appdag):
    """Speedup on an :class:`~lumos.model.AppDAG` with greedy power sharing.

    Kernels of the same depth are visited longest first.  Each one takes
    its power from the remaining budget: an accelerator when
    ``has_asacc`` reports one, otherwise the throughput cores via
    :meth:`_dim_perf_opt`.  Once no more than 0.1 of the budget is left,
    the kernels scheduled so far form a "warp" whose duration is the
    longest run time among them, and the budget is restored for the next
    warp.  The warp durations are summed over all depths.

    Parameters
    ----------
    appdag : :class:`~lumos.model.AppDAG`
      The target application

    Returns
    -------
    float
      sum of all kernel lengths divided by the accumulated warp durations
    """
    baseline = sum(appdag.get_all_kernel_lengths())
    finish = 0

    for node_list in appdag.kernels_depth_sort():
        lengths = [appdag.get_kernel_length(idx_) for idx_ in node_list]
        # longest kernel first
        by_length = sorted(zip(lengths, node_list), reverse=True)

        power_budget = self.sys_power
        warp = []  # run times of the kernels sharing the current budget
        for kl, idx_ in by_length:
            ko = appdag.get_kernel(idx_)
            if self.has_asacc(ko.name):
                asacc = self.get_asacc(ko.name)
                asacc_su = asacc.perf(power=power_budget) / PERF_BASE
                power_budget -= asacc.power_eff
                rt = kl / asacc_su
                warp.append(rt)
                _debug(_bm_('runtime: {0}', rt))
            else:
                perf_, vdd_, _, power_eff = self._dim_perf_opt(
                    power_budget=power_budget)
                if power_eff == 0:
                    # remaining power budget is too small
                    # NOTE(review): this kernel gets no run time in any
                    # warp -- confirm that dropping it is intended
                    power_budget = 0
                else:
                    thru_su = perf_ / PERF_BASE
                    serial_su = self.thru_core.perf_by_vdd(vdd_) / PERF_BASE
                    power_budget -= power_eff
                    rt = kl * (1 - ko.pf) / serial_su + kl * ko.pf / thru_su
                    warp.append(rt)
                    _debug(_bm_('runtime: {0}', rt))

            if power_budget <= 0.1:
                # budget exhausted: close this warp and start a new one
                # NOTE(review): max() raises if the warp is still empty
                finish += max(warp)
                _debug(_bm_('=====warp finish====='))
                _debug(_bm_('run time of this warp: {0}', max(warp)))
                warp = []
                power_budget = self.sys_power

        if warp:
            # kernels left over after the last budget reset
            finish += max(warp)
            _debug(_bm_('=====warp finish====='))
            _debug(_bm_('run time of this warp: {0}', max(warp)))

    _debug(_bm_('baseline: {0}, bench: {1}', baseline, finish))
    return baseline / finish
def get_speedup_appdag_parallel_greedy(self, appdag):
    """Speedup on an :class:`~lumos.model.AppDAG` with greedy power sharing.

    Kernels of the same depth are visited longest first.  Each one takes
    its power from the remaining budget: an accelerator when
    ``has_asacc`` reports one, otherwise the throughput cores via
    :meth:`_dim_perf_opt`.  Once no more than 0.1 of the budget is left,
    the kernels scheduled so far form a "warp" whose duration is the
    longest run time among them, and the budget is restored for the next
    warp.  The warp durations are summed over all depths.

    Parameters
    ----------
    appdag : :class:`~lumos.model.AppDAG`
      The target application

    Returns
    -------
    float
      sum of all kernel lengths divided by the accumulated warp durations
    """
    baseline = sum(appdag.get_all_kernel_lengths())
    finish = 0

    for node_list in appdag.kernels_depth_sort():
        lengths = [appdag.get_kernel_length(idx_) for idx_ in node_list]
        # longest kernel first
        by_length = sorted(zip(lengths, node_list), reverse=True)

        power_budget = self.sys_power
        warp = []  # run times of the kernels sharing the current budget
        for kl, idx_ in by_length:
            ko = appdag.get_kernel(idx_)
            if self.has_asacc(ko.name):
                asacc = self.get_asacc(ko.name)
                asacc_su = asacc.perf(power=power_budget) / PERF_BASE
                power_budget -= asacc.power_eff
                rt = kl / asacc_su
                warp.append(rt)
                _debug(_bm_('runtime: {0}', rt))
            else:
                perf_, vdd_, _, power_eff = self._dim_perf_opt(
                    power_budget=power_budget)
                if power_eff == 0:
                    # remaining power budget is too small
                    # NOTE(review): this kernel gets no run time in any
                    # warp -- confirm that dropping it is intended
                    power_budget = 0
                else:
                    thru_su = perf_ / PERF_BASE
                    serial_su = self.thru_core.perf_by_vdd(vdd_) / PERF_BASE
                    power_budget -= power_eff
                    rt = kl * (1 - ko.pf) / serial_su + kl * ko.pf / thru_su
                    warp.append(rt)
                    _debug(_bm_('runtime: {0}', rt))

            if power_budget <= 0.1:
                # budget exhausted: close this warp and start a new one
                # NOTE(review): max() raises if the warp is still empty
                finish += max(warp)
                _debug(_bm_('=====warp finish====='))
                _debug(_bm_('run time of this warp: {0}', max(warp)))
                warp = []
                power_budget = self.sys_power

        if warp:
            # kernels left over after the last budget reset
            finish += max(warp)
            _debug(_bm_('=====warp finish====='))
            _debug(_bm_('run time of this warp: {0}', max(warp)))

    _debug(_bm_('baseline: {0}, bench: {1}', baseline, finish))
    return baseline / finish
def instantiate(self):
    """Compute how many throughput cores fit in the throughput-core area.

    Stores ``int(self.thru_core_area / self.thru_core.area)`` in
    ``self._thru_core_num`` and logs the result.
    """
    core = self.thru_core
    self._thru_core_num = int(self.thru_core_area / core.area)
    _debug(_bm_('Tput core: {0}, area: {1}, cnum: {2}',
                core.ctype, core.area, self._thru_core_num))
raise TechModelError( 'no technology node from the name of {0}'.format(model_file)) model_name = 'homoTFET30nm' freq_dict = dict() dynamic_power_dict = dict() static_power_dict = dict() model_files = glob.glob( os.path.join( _MODEL_DIR, '{0}_{1}_*.data'.format(settings.TFET_SIM_CIRCUIT, model_name))) for model_file in model_files: _debug(_bm_('found model {0}', model_file)) model_file_mtime = os.path.getmtime(model_file) tech = _get_tech_node(model_file) pickle_file = os.path.join( _MODEL_DIR, '{0}_{1}_{2}.p'.format(settings.TFET_SIM_CIRCUIT, model_name, tech)) try: pickle_file_mtime = os.path.getmtime(pickle_file) except OSError: pickle_file_mtime = 0 if pickle_file_mtime > model_file_mtime: with open(pickle_file, 'rb') as f: freq_dict[tech] = pickle.load(f) dynamic_power_dict[tech] = pickle.load(f)
def perf_by_cnum(self, cnum, app, vmin=None):
    """Relative performance of the system with ``cnum`` active cores.

    The power budget is split evenly across ``cnum`` cores and a bisection
    between the lower voltage bound and ``core.vmax`` looks for the highest
    supply voltage whose per-core power fits that share.

    Parameters
    ----------
    cnum: num
      The number of cores required to be active.
    app: :class:`~lumos.model.application.Application`
      The targeted application; only its ``f`` attribute is read.
    vmin: num
      An optional lower bound for supply-voltage scaling.  A falsy value,
      or one below ``core.vmin``, is replaced by ``core.vmin``.

    Returns
    -------
    dict or None
      ``None`` when ``cnum`` is not positive or exceeds the number of cores
      the area can hold.  Otherwise a dict with five keys:

      perf : num
        The relative performance.
      vdd : num
        The supply voltage of the selected configuration.
      freq : num
        The frequency at that supply voltage.
      cnum : num
        The actual number of active cores; it is lower than requested when
        the requested number cannot be powered even at ``vmin``.
      util : num
        The utilization of the system, in percent.
    """
    core = self.core
    f = app.f

    cnum_max = int(self.area / core.area)
    # cnum == 0 must be rejected too: it would divide by zero just below
    if cnum <= 0 or cnum > cnum_max:
        return None

    cpower = self.power / float(cnum)
    _debug(_bm_('Per-core power budget: {0}', cpower))

    # Serial performance is achieved by the highest vdd
    sperf = core.perf_by_vdd(core.vmax)

    if not vmin:
        vmin = core.vmin

    # Check whether vmin can meet the power requirement
    if vmin >= core.vmin:
        if core.power(vmin) > cpower:
            # Either vmin is too high or active_cnum is too large
            # so that the system could not meet the power budget even
            # with the minimum vdd. Return the active core number with vmin
            # Users can capture the exception by comparing the active_cnum
            # and the returned active_cnum
            active_cnum = min(cnum_max, int(self.power / core.power(vmin)))
            perf = 1 / ((1 - f) / sperf +
                        f / (active_cnum * core.perf_by_vdd(vmin)))
            util = float(100 * active_cnum) / float(cnum)
            _debug(_bm_('vmin is too high or active_cnum is too large'))
            return {
                'perf': perf / PERF_BASE,
                'vdd': vmin,
                'cnum': active_cnum,
                'freq': core.freq(vmin),
                'util': util
            }
    else:
        # NOTE(review): a vmin below core.vmin is clamped without the
        # feasibility check above -- confirm this is intended
        vmin = core.vmin

    # bisect [vl, vr] until the interval is no wider than V_PRECISION
    vl = vmin
    vr = core.vmax
    while (vr - vl) > V_PRECISION:
        vm = int((vl + vr) / 2)
        _debug(_bm_('[Core]\t:vl: {0}mV, vr: {1}mV, vm: {2}mV, '
                    'freq: {3}, power: {4}, area: {5}',
                    vl, vr, vm, core.freq(vm), core.power(vm), core.area))
        if core.power(vm) > cpower:
            vr = vm
        else:
            vl = vm

    _debug(_bm_('End of bin-search, vl: {0}mV, vr: {1}mV', vl, vr))
    core.vdd = vl

    # prefer the right bound when it meets the power constraint
    vdd = vr if core.power(vr) <= cpower else vl
    perf = 1 / ((1 - f) / sperf + f / (cnum * core.perf_by_vdd(vdd)))
    return {
        'perf': perf / PERF_BASE,
        'vdd': vdd,
        'cnum': cnum,
        'freq': core.freq(vdd),
        'util': float(100 * cnum) / float(cnum_max)
    }