Example #1
0
 def get_cnum(self, vdd):
     """Return how many cores fit in the power and area budgets at ``vdd``.

     The L2 cost is taken out of each system budget once; what remains is
     divided by the cost of one core plus its L1.  The tighter of the two
     limits, truncated to an integer, is returned.
     """
     core = self.core
     power_per_core = core.power(vdd)
     _debug(_bm_('core_power: {0}', power_per_core))

     l2 = self.l2_traits
     l2_power, l2_area = l2['power'], l2['area']
     _debug(_bm_('l2_power: {0}', l2_power))

     l1 = self.l1_traits
     l1_power, l1_area = l1['power'], l1['area']
     _debug(_bm_('l1_power: {0}', l1_power))

     # Each limit is (budget left after the L2) / (cost of one core + L1).
     limit_by_power = (self.sys_power - l2_power) / (power_per_core + l1_power)
     limit_by_area = (self.sys_area - l2_area) / (core.area + l1_area)
     return int(min(limit_by_power, limit_by_area))
Example #2
0
 def get_cnum(self, vdd):
     """Compute the core count allowed by the system budgets at ``vdd``.

     Both the power budget and the area budget are reduced by the L2 cost
     and then divided by the per-core cost (core plus L1).  The smaller
     quotient is converted with ``int`` and returned.
     """
     core = self.core

     p_core = core.power(vdd)
     _debug(_bm_('core_power: {0}', p_core))

     p_l2 = self.l2_traits['power']
     a_l2 = self.l2_traits['area']
     _debug(_bm_('l2_power: {0}', p_l2))

     p_l1 = self.l1_traits['power']
     a_l1 = self.l1_traits['area']
     _debug(_bm_('l1_power: {0}', p_l1))

     quotients = (
         (self.sys_power - p_l2) / (p_core + p_l1),  # power-limited count
         (self.sys_area - a_l2) / (core.area + a_l1),  # area-limited count
     )
     return int(min(quotients))
Example #3
0
    def perf(self, vdd, app, cnum=None):
        """Estimate the absolute performance of a synthetic application.

        Every kernel's covered fraction is weighted by
        ``1 - pf + pf / p_speedup``, where ``p_speedup`` combines the
        frequency ratio, the core count and a memory-stall efficiency ratio.
        The fraction not covered by any kernel is added unscaled.

        Parameters
        ----------
        vdd
          Supply voltage handed to the core's ``freq`` model.
        app
          Application object; its ``type`` must be ``'synthetic'``.
        cnum
          Number of cores; when falsy, ``self.get_cnum(vdd)`` is used.

        Returns
        -------
        ``core.perfnom`` divided by the accumulated normalised runtime.

        Raises
        ------
        HomogSysError
          If ``app.type`` is not ``'synthetic'``.
        """
        if app.type != 'synthetic':
            raise HomogSysError('Requires a synthetic application')

        if not cnum:
            cnum = self.get_cnum(vdd)
        _debug(_bm_('cnum: {0}', cnum))

        core = self.core
        _debug(_bm_('freq: {0}', core.freq(vdd)))

        uncovered = 1  # share of the application not claimed by a kernel
        norm_time = 0  # normalised runtime accumulated over the kernels

        # kernels are the parts accelerated by the multi-core
        for kid in app.get_all_kernels():
            kcov = app.get_cov(kid)
            kernel = app.get_kernel(kid)

            # Miss rates are scaled by a power law of the cache-size ratio
            # and capped at 1.
            l1_scale = (self.cache_sz_l1 /
                        (kernel.cache_sz_l1_nom)) ** (1 - kernel.alpha_l1)
            miss_l1 = min(1, kernel.miss_l1 * l1_scale)
            l2_scale = (self.cache_sz_l2 /
                        (cnum * kernel.cache_sz_l2_nom)) ** (1 - kernel.alpha_l2)
            miss_l2 = min(1, kernel.miss_l2 * l2_scale)
            _debug(_bm_('l1_miss: {0}, l2_miss: {1}', miss_l1, miss_l2))

            # Average access delay over the three levels.
            from_l1 = (1 - miss_l1) * self.delay_l1
            from_l2 = miss_l1 * (1 - miss_l2) * self.delay_l2
            from_mem = miss_l1 * miss_l2 * self.delay_mem
            t0 = from_l1 + from_l2 + from_mem
            t = t0 * core.freq(vdd) / core.freq(core.vnom)
            _debug(_bm_('t: {0}', t))

            eta = 1 / (1 + t * kernel.rm / kernel.cpi_exe)
            eta0 = 1 / (1 + t0 * kernel.rm / kernel.cpi_exe)
            _debug(_bm_('eta: {0}, eta0: {1}', eta, eta0))
            _debug(_bm_('freq: {0}, freq0: {1}', core.freq(vdd), core.fnom))
            _debug(_bm_('vdd: {0}, v0: {1}', vdd, core.vnom))

            p_speedup = (core.freq(vdd) / core.fnom) * cnum * (eta / eta0)
            _debug(_bm_('p_speedup: {0}', p_speedup))

            # vdd_max is evaluated but not used further in this method.
            vdd_max = min(core.vnom * VSF_MAX, core.vmax)
            s_speedup = 1
            _debug(_bm_('s_speedup: {0}', s_speedup))

            norm_time += kcov * ((1 - kernel.pf + kernel.pf / p_speedup))
            uncovered -= kcov

        # the non-kernel remainder gets no speedup
        norm_time += uncovered

        # speedup = 1 / norm_time, scaled by the nominal core performance
        return core.perfnom / norm_time
Example #4
0
    def __init__(self, budget, tech, serial_core=None, tput_core=None):
        """Initialise the system from a budget, a technology node and cores.

        Parameters
        ----------
        budget
          Object providing ``area``, ``power`` and ``bw``; ``bw`` is indexed
          by ``tech``.
        tech
          Technology node, used as the key into ``budget.bw``.
        serial_core
          Optional serial core; when given, its area is subtracted from the
          area available to throughput cores.
        tput_core
          Optional throughput core; when falsy a
          ``BaseCore(tech, 'cmos', 'hp', 'io')`` is created.
        """
        self.sys_area = budget.area
        self.sys_power = budget.power
        self._sys_bw = budget.bw

        self.tech = tech
        self.sys_bandwidth = self._sys_bw[tech]

        # asic_dict[kid][acc_id] = Acc_obj
        self.asic_dict = {}
        # asic_power_dict[kid][acc_id] = (power_max, power_min, power_nom)
        self.asic_power_dict = {}

        self.thru_core = (tput_core if tput_core
                          else BaseCore(tech, 'cmos', 'hp', 'io'))

        self.dim_perf = None

        self.serial_core = serial_core
        if not serial_core:
            # no serial core: the whole area goes to throughput cores
            self.thru_core_area = self.sys_area
        else:
            self.thru_core_area = self.sys_area - serial_core.area
            _debug(_bm_('Serial core: {0}, area: {1}',
                        self.serial_core.ctype, self.serial_core.area))

        self.use_gpacc = False
Example #5
0
    def __init__(self, budget, tech, serial_core=None, tput_core=None):
        """Build the system description.

        Copies area, power and bandwidth from ``budget`` (bandwidth is
        looked up by ``tech``), creates empty accelerator tables, selects
        the throughput core (a default ``BaseCore`` when none is supplied)
        and works out the area left for throughput cores once an optional
        serial core has been placed.
        """
        self.sys_area, self.sys_power = budget.area, budget.power
        self._sys_bw = budget.bw

        self.tech = tech
        self.sys_bandwidth = self._sys_bw[tech]

        # asic_dict[kid][acc_id] = Acc_obj
        self.asic_dict = dict()
        # asic_power_dict[kid][acc_id] = (power_max, power_min, power_nom)
        self.asic_power_dict = dict()

        # fall back to the default throughput core when none is provided
        self.thru_core = tput_core or BaseCore(tech, 'cmos', 'hp', 'io')

        self.dim_perf = None

        self.serial_core = serial_core
        area_for_tput = self.sys_area
        if serial_core:
            area_for_tput = self.sys_area - serial_core.area
        self.thru_core_area = area_for_tput
        if serial_core:
            _debug(_bm_('Serial core: {0}, area: {1}',
                        self.serial_core.ctype, self.serial_core.area))

        self.use_gpacc = False
Example #6
0
    def add_kernel(self, kernel, cov):
        """Register a kernel to be accelerated.

        The kernel could be accelerated by a dedicated ASIC, or by a more
        general GPU/FPGA.

        Parameters
        ----------
        kernel : :class:`~lumos.model.workload.kernel.Kernel`
          The kernel object
        cov : float
          The coverage of the kernel, relative to the serial execution

        Returns
        -------
        ``False`` when a kernel with the same name is already registered;
        nothing is returned explicitly on success.

        Raises
        ------
        AppError
          the given coverage (cov) is larger than the overall parallel ratio

        """
        kname = kernel.name
        if kname in self.kernels:
            _debug(_bm_('Kernel {0} already exist', kname))
            return False

        available = self.f_noacc
        if cov > available:
            message = ('[add_kernel]: cov of {0} is too large to exceed the overall '
                       'parallel ratio {1}').format(cov, available)
            raise AppError(message)

        self.kernels[kname] = kernel
        self.kernels_coverage[kname] = cov
        # the coverage just claimed is no longer un-accelerated
        self.f_noacc = available - cov

        self.tag = self.tag_update()
Example #7
0
    def load_from_xmltree(cls, xmltree, kernels):
        """Build an application from its XML description.

        Parameters
        ----------
        xmltree
          XML element with a ``name`` attribute, a mandatory ``perf_config``
          child and an optional ``kernel_config`` child.
        kernels
          Mapping from kernel name to kernel object, used to resolve the
          entries of ``kernel_config``.

        Returns
        -------
        The new ``cls`` instance.  Every ``perf_config`` child becomes a
        float attribute named after its tag, and every ``kernel_config``
        child is registered through ``add_kernel``.

        Raises
        ------
        AppError
          If the element has no ``name`` attribute.
        Exception
          If ``perf_config`` is missing, or a kernel entry lacks ``name`` or
          ``cov``.
        """
        name = xmltree.get('name')
        if not name:
            raise AppError("No name in app config")

        a = cls(name)

        ps = xmltree.find('perf_config')
        if ps is None:
            raise Exception('No performance configuration in {0}'.format(name))
        for ele in ps:
            if ele.tag is etree.Comment:
                continue
            setattr(a, ele.tag, float(ele.text))

        ks = xmltree.find('kernel_config')
        if ks is None:
            _debug(_bm_('No kernel config in {0}', name))
            return a

        for ele in ks:
            # Skip XML comments here as well, as the perf_config loop does.
            if ele.tag is etree.Comment:
                continue

            kname = ele.get('name')
            if not kname:
                raise Exception('No name for kernel {0} in app {1}'.format(
                    kname, name))

            val_ = ele.get('cov')
            if not val_:
                raise Exception(
                    'No covreage for kernel {0} in app {1}'.format(
                        kname, name))
            k_cov = float(val_)
            # Register on the instance built above; the original referenced
            # an undefined name ``a_`` here and raised NameError.
            a.add_kernel(kernels[kname], k_cov)

        return a
Example #8
0
    def _dim_perf_opt(self, power_budget=None, cnum_max=None):
        """Find the best throughput over all feasible core counts.

        Core counts from 1 up to ``cnum_max`` are tried in order; each one
        is evaluated with :meth:`_dim_perf_cnum`.  The sweep stops at the
        first count whose performance is reported as 0.

        Parameters
        ----------
        power_budget : float
          power budget, forwarded unchanged to :meth:`_dim_perf_cnum`
        cnum_max : int
          maximum core number; when falsy it is derived from the system
          area divided by the throughput-core area

        Returns
        -------
        perf : float
          optimal throughput-based performance
        vdd : int
          supply voltage when achieving the optimal throughput
        cnum : int
          the number of active cores when achieving the optimal throughput
        power_eff : float
          the effective power of throughput cores

        All four values are 0 when no core count is feasible.
        """
        core = self.thru_core
        limit = cnum_max if cnum_max else int(self.sys_area / core.area)

        _debug(_bm_('power budget: {0}', power_budget))

        # (perf, vdd, cnum, power) of the best configuration seen so far
        best = (0, 0, 0, 0)
        for cnum in range(1, limit + 1):
            perf_, vdd_, power_ = self._dim_perf_cnum(
                cnum, power_budget=power_budget)

            # power budget is too small to power this many cores
            if perf_ == 0:
                break

            if perf_ > best[0]:
                best = (perf_, vdd_, cnum, power_)
            _debug(_bm_('cnum: {0}, perf_opt: {1}, power_eff: {2}',
                        cnum, best[0], best[3]))
        return best
Example #9
0
    def perf(self, vdd, app, cnum=None):
        """Return the absolute performance for a synthetic application.

        For each kernel the method derives scaled L1/L2 miss rates, an
        average memory delay, a stall-efficiency ratio and finally a
        parallel speedup; the kernel's coverage is then weighted by
        ``1 - pf + pf / p_speedup``.  The uncovered remainder is added
        without speedup and ``core.perfnom`` is divided by the total.

        Parameters
        ----------
        vdd
          Supply voltage passed to ``core.freq``.
        app
          Application; ``app.type`` must equal ``'synthetic'``.
        cnum
          Core count; ``self.get_cnum(vdd)`` is used when it is falsy.

        Raises
        ------
        HomogSysError
          If the application is not synthetic.
        """
        def _capped_miss(base, size_ratio, alpha):
            # power-law scaling of the miss rate, never above 1
            return min(1, base * (size_ratio ** (1 - alpha)))

        if app.type != 'synthetic':
            raise HomogSysError('Requires a synthetic application')

        if not cnum:
            cnum = self.get_cnum(vdd)

        _debug(_bm_('cnum: {0}', cnum))
        core = self.core
        _debug(_bm_('freq: {0}', core.freq(vdd)))

        remaining_cov = 1
        total = 0
        # kernels will be accelerated by multi-cores
        for kid in app.get_all_kernels():
            kcov = app.get_cov(kid)
            ko = app.get_kernel(kid)

            miss_l1 = _capped_miss(
                ko.miss_l1,
                self.cache_sz_l1 / (ko.cache_sz_l1_nom),
                ko.alpha_l1)
            miss_l2 = _capped_miss(
                ko.miss_l2,
                self.cache_sz_l2 / (cnum * ko.cache_sz_l2_nom),
                ko.alpha_l2)
            _debug(_bm_('l1_miss: {0}, l2_miss: {1}', miss_l1, miss_l2))

            t0 = ((1 - miss_l1) * self.delay_l1
                  + miss_l1 * (1 - miss_l2) * self.delay_l2
                  + miss_l1 * miss_l2 * self.delay_mem)
            t = t0 * core.freq(vdd) / core.freq(core.vnom)
            _debug(_bm_('t: {0}', t))

            eta = 1 / (1 + t * ko.rm / ko.cpi_exe)
            eta0 = 1 / (1 + t0 * ko.rm / ko.cpi_exe)
            _debug(_bm_('eta: {0}, eta0: {1}', eta, eta0))
            _debug(_bm_('freq: {0}, freq0: {1}', core.freq(vdd), core.fnom))
            _debug(_bm_('vdd: {0}, v0: {1}', vdd, core.vnom))

            freq_ratio = core.freq(vdd) / core.fnom
            p_speedup = freq_ratio * cnum * (eta / eta0)
            _debug(_bm_('p_speedup: {0}', p_speedup))

            # evaluated as in the original flow; the value is not used below
            vdd_max = min(core.vnom * VSF_MAX, core.vmax)
            s_speedup = 1
            _debug(_bm_('s_speedup: {0}', s_speedup))

            total += kcov * ((1 - ko.pf + ko.pf / p_speedup))
            remaining_cov -= kcov

        # non-kernels will not be sped up
        total += remaining_cov

        abs_perf = core.perfnom / total  # speedup = 1 / total
        return abs_perf
Example #10
0
    def _dim_perf_opt(self, power_budget=None, cnum_max=None):
        """Sweep the core count and return the best throughput found.

        Parameters
        ----------
        power_budget : float
          power budget handed to :meth:`_dim_perf_cnum` for every core count
        cnum_max : int
          upper bound of the sweep; when falsy, ``int(sys_area / core.area)``
          of the throughput core is used

        Returns
        -------
        perf : float
          optimal throughput-based performance
        vdd : int
          supply voltage when achieving the optimal throughput
        cnum : int
          the number of active cores when achieving the optimal throughput
        power_eff : float
          the effective power of throughput cores
        """
        core = self.thru_core
        if not cnum_max:
            cnum_max = int(self.sys_area / core.area)

        _debug(_bm_('power budget: {0}', power_budget))

        top_perf = top_vdd = top_cnum = top_power = 0
        for n_cores in range(1, cnum_max + 1):
            result = self._dim_perf_cnum(n_cores, power_budget=power_budget)
            cur_perf, cur_vdd, cur_power = result

            if cur_perf == 0:
                # the budget cannot power this many cores; larger counts
                # are not tried
                break

            improved = cur_perf > top_perf
            if improved:
                top_perf, top_vdd = cur_perf, cur_vdd
                top_cnum, top_power = n_cores, cur_power

            _debug(_bm_('cnum: {0}, perf_opt: {1}, power_eff: {2}',
                        n_cores, top_perf, top_power))

        return (top_perf, top_vdd, top_cnum, top_power)
Example #11
0
    def get_speedup_appdag_serial(self, appdag):
        """Speedup on an :class:`~lumos.model.AppDAG` application when its
        kernels run one after another.

        Kernels with a dedicated accelerator use the accelerator's
        performance.  Other kernels run on the throughput cores: a kernel
        with ``pf == 0`` at the nominal-voltage speedup, any other kernel
        split into a serial and a parallel share at the optimal operating
        point returned by :meth:`_dim_perf_opt`.

        Parameters
        ----------
        appdag : :class:`~lumos.model.AppDAG`
          The target application

        Returns
        -------
        float
          speedup relative to running all kernels at the baseline performance.

        """
        dim_perf, opt_vdd, opt_cnum, power_eff = self._dim_perf_opt(
            power_budget=self.sys_power)

        tput = self.thru_core
        su_dim_serial = tput.perf_by_vdd(opt_vdd) / PERF_BASE
        su_dim_parallel = dim_perf / PERF_BASE
        _debug(_bm_('thrucore serial su: {0}, thrucore parallel su: {1}',
                    su_dim_serial, su_dim_parallel))

        su_nominal = tput.perf_by_vdd(tput.vnom) / PERF_BASE
        _debug(_bm_('serial su: {0}', su_nominal))

        lengths = appdag.get_all_kernel_lengths()
        kernels = appdag.get_all_kernels(mode='object')

        _debug(_bm_('dim_perf: {0} cnum: {1}, vdd: {2}',
                    dim_perf, opt_cnum, opt_vdd))

        baseline = sum(lengths)
        elapsed = 0
        for length, kernel in zip(lengths, kernels):
            _debug(_bm_('-------{0}---------', kernel.name))

            if self.has_asacc(kernel.name):
                acc = self.get_asacc(kernel.name)
                rt = length / acc.perf()
                _debug(_bm_('Accelerator speedup: {0}', acc.perf()))
            elif kernel.pf == 0:
                # this is a serial application
                rt = length / su_nominal
            else:
                # this is a parallel application
                serial_share = length * (1 - kernel.pf) / su_dim_serial
                parallel_share = length * kernel.pf / su_dim_parallel
                rt = serial_share + parallel_share

            elapsed += rt
            _debug(_bm_('runtime: {0}, accu runtime: {1}', rt, elapsed))

        _debug(_bm_('baseline: {0}, bench: {1}', baseline, elapsed))

        return baseline / elapsed
Example #12
0
    def load_from_xmltree(cls, xmltree, kernels):
        """Create an application from an XML element.

        Parameters
        ----------
        xmltree
          Element carrying a ``name`` attribute and, optionally, a
          ``kernel_config`` child whose entries have ``name`` and ``cov``
          attributes plus optional ``rc_count`` (default 1) and ``rc_time``
          (default 0).
        kernels
          Mapping from kernel name to kernel object.

        Returns
        -------
        The ``cls`` instance with every listed kernel added.

        Raises
        ------
        AppError
          If the element has no ``name``.
        Exception
          If a kernel entry has no ``name`` or no ``cov``.
        """
        name = xmltree.get('name')
        if not name:
            raise AppError("No name in app config")

        a = cls(name)

        ks = xmltree.find('kernel_config')
        if ks is None:
            _debug(_bm_('No kernel config in {0}', name))
            return a

        for entry in ks:
            kname = entry.get('name')
            if not kname:
                raise Exception('No name for kernel {0} in app {1}'.format(
                    kname, name))

            raw_cov = entry.get('cov')
            if not raw_cov:
                raise Exception(
                    'No covreage for kernel {0} in app {1}'.format(
                        kname, name))
            k_cov = float(raw_cov)

            # optional attributes fall back to their defaults when absent
            raw_count = entry.get('rc_count')
            k_rc_count = int(raw_count) if raw_count else 1

            raw_time = entry.get('rc_time')
            k_rc_time = float(raw_time) if raw_time else 0

            a.add_kernel(kernels[kname], k_cov, k_rc_count, k_rc_time)
            _debug(_bm_('Add kernel {0}, cov {1}, rc_count {2}, rc_time {3}',
                        kname, k_cov, k_rc_count, k_rc_time))

        return a
Example #13
0
    def get_speedup_appdag_serial(self, appdag):
        """Get the speedup of the system on an :class:`~lumos.model.AppDAG`
        application whose kernels are processed in serial.

        The runtime of each kernel is its length divided by the speedup of
        whatever executes it; the result is the sum of the kernel lengths
        divided by the sum of those runtimes.

        Parameters
        ----------
        appdag : :class:`~lumos.model.AppDAG`
          The target application

        Returns
        -------
        float
          speedup relative to running all kernels at the baseline performance.

        """
        dim_perf, opt_vdd, opt_cnum, power_eff = self._dim_perf_opt(
            power_budget=self.sys_power)

        thru_core = self.thru_core
        thrucore_serial_su = thru_core.perf_by_vdd(opt_vdd) / PERF_BASE

        thrucore_thru_su = dim_perf / PERF_BASE
        _debug(_bm_('thrucore serial su: {0}, thrucore parallel su: {1}',
                    thrucore_serial_su, thrucore_thru_su))

        serial_su = thru_core.perf_by_vdd(thru_core.vnom) / PERF_BASE
        _debug(_bm_('serial su: {0}', serial_su))

        def _kernel_runtime(kl, ko):
            # runtime of one kernel on the unit that executes it
            if self.has_asacc(ko.name):
                acc = self.get_asacc(ko.name)
                runtime = kl / acc.perf()
                _debug(_bm_('Accelerator speedup: {0}', acc.perf()))
                return runtime
            if ko.pf == 0:
                # this is a serial application
                return kl / serial_su
            # this is a parallel application
            return (kl * (1 - ko.pf) / thrucore_serial_su
                    + kl * ko.pf / thrucore_thru_su)

        ker_lengths = appdag.get_all_kernel_lengths()
        ker_objs = appdag.get_all_kernels(mode='object')

        _debug(_bm_('dim_perf: {0} cnum: {1}, vdd: {2}',
                    dim_perf, opt_cnum, opt_vdd))
        baseline = sum(ker_lengths)

        bench = 0
        for kl, ko in zip(ker_lengths, ker_objs):
            _debug(_bm_('-------{0}---------', ko.name))
            rt = _kernel_runtime(kl, ko)
            bench += rt
            _debug(_bm_('runtime: {0}, accu runtime: {1}', rt, bench))

        _debug(_bm_('baseline: {0}, bench: {1}', baseline, bench))

        return baseline / bench
Example #14
0
    def _dim_perf_cnum(self, cnum, vmin=None, power_budget=None):
        """Get the performance for a given number of active cores.

        The power budget is split evenly over ``cnum`` cores and the highest
        supply voltage whose per-core power stays within that share is
        located with a binary search.

        Parameters
        ----------
        cnum : int
          The number of active cores
        vmin : float, optional
          The minimum voltage of cores; values that are falsy or below
          ``core.vmin`` are replaced by ``core.vmin``
        power_budget : float
          The power budget; when falsy, the system power budget is used

        Returns
        -------
        perf : float
          The performance score, not relative speedup
        vdd : int
          The supply voltage in mV, when the system achieves the optimal
          throughput
        power : float
          The effective power consumption of dim cores. If it is less than
          the provided power_budget, this usually means the performance is
          constrained by area_budget instead of power_budget

        ``(0, 0, 0)`` is returned when the budget is not met even at the
        minimum voltage.
        """
        core = self.thru_core

        budget = power_budget if power_budget else self.sys_power
        per_core_budget = budget / float(cnum)

        if not vmin:
            lo = core.vmin
        elif vmin < core.vmin:
            _debug(_bm_('Provided vmin {0}mV is lower than core.vmin {1}mV. '
                        'vmin is set to core.vmin', vmin, core.vmin))
            lo = core.vmin
        else:
            lo = vmin

        # If even the lowest voltage exceeds the per-core share, either vmin
        # is too high or the budget is too tight: report zeros.
        if core.power(lo) > per_core_budget:
            _debug(_bm_('Power budget {1:.3g}W is not met even at vmin of {0}mV',
                        lo, budget))
            return (0, 0, 0)

        # Binary search for the highest per-core vdd within the budget.  The
        # midpoint is rounded up so that ``lo = mid`` always makes progress.
        hi = core.vmax
        while lo < hi:
            mid = math.ceil((lo + hi) / 2)
            if core.power(mid) > per_core_budget:
                hi = mid - 1
            else:
                lo = mid

        perf_opt = cnum * core.perf_by_vdd(lo)
        power_eff = cnum * core.power(lo)
        return (perf_opt, lo, power_eff)
Example #15
0
    def perf_by_cnum(self, cnum, app, vmin=None):
        """
        Get the relative performance of the system for a given application, with a
        given constraint on the number of active cores.

        Parameters
        ----------
        cnum: num
          The number of cores required to be active.
        app: :class:`~lumos.model.application.Application`
          The targeted application.
        vmin: num
          An optional argument to specify the lowest boundary which supply
          voltage can be scaled down.

        Returns
        -------
        dict or None: ``None`` when ``cnum`` is not positive or exceeds the
        number of cores that fit in the area budget; otherwise a python dict
        with five keys:

        perf : num
          The relative performance.
        vdd : num
          The supply voltage of the optimal configuration under core number
          constraint.
        freq : num
          The frequency with the optimal configuration under core number
          constraint.
        cnum : num
          The actual number of active cores, if the required number can not be met.
        util : num
          The utilization of the system at the optimal configuration.

        """
        core = self.core
        f = app.f

        cnum_max = int(self.area / core.area)

        # cnum == 0 would divide the power budget by zero just below, so it
        # is rejected together with negative and over-large requests.
        if cnum <= 0 or cnum > cnum_max:
            return None

        cpower = self.power / float(cnum)
        _debug(_bm_('Per-core power budget: {0}', cpower))

        # Serial performance is achieved by the highest vdd
        sperf = core.perf_by_vdd(core.vmax)

        if not vmin:
            vmin = core.vmin

        # Check whether vmin can meet the power requirement
        if vmin >= core.vmin:
            if core.power(vmin) > cpower:
                # Either vmin is too high or active_cnum is too large
                # so that the system could not meet the power budget even
                # with the minimum vdd. Return the active core number with vmin
                # Users can capture the exception by comparing the active_cnum
                # and the returned active_cnum
                # NOTE(review): active_cnum is 0 when the budget cannot power
                # a single core at vmin, which makes the division below fail
                # -- confirm the intended result for that case.
                active_cnum = min(cnum_max,
                                  int(self.power / core.power(vmin)))

                perf = 1 / ((1 - f) / sperf + f /
                            (active_cnum * core.perf_by_vdd(vmin)))
                util = float(100 * active_cnum) / float(cnum)
                _debug(_bm_('vmin is too high or active_cnum is too large'))
                return {
                    'perf': perf / PERF_BASE,
                    'vdd': vmin,
                    'cnum': active_cnum,
                    'freq': core.freq(vmin),
                    'util': util
                }
        else:
            # a requested vmin below the core's own minimum is clamped
            vmin = core.vmin

        # Binary search between vmin and vmax: vl always meets the per-core
        # power budget, vr moves down whenever the midpoint exceeds it.
        vl = vmin
        vr = core.vmax
        while (vr - vl) > V_PRECISION:
            vm = int((vl + vr) / 2)

            _debug(
                _bm_(
                    '[Core]\t:vl: {0}mV, vr: {1}mV, vm: {2}mV, '
                    'freq: {3}, power: {4}, area: {5}', vl, vr, vm,
                    core.freq(vm), core.power(vm), core.area))
            if core.power(vm) > cpower:
                vr = vm
            else:
                vl = vm

        _debug(_bm_('End of bin-search, vl: {0}mV, vr: {1}mV', vl, vr))
        # The core is left at the lower bound, as before.
        # NOTE(review): this happens even when vr is the voltage reported
        # below -- confirm that callers expect this.
        core.vdd = vl

        # Prefer the right bound when it still meets the power constraint.
        vdd = vr if core.power(vr) <= cpower else vl
        perf = 1 / ((1 - f) / sperf + f / (cnum * core.perf_by_vdd(vdd)))

        return {
            'perf': perf / PERF_BASE,
            'vdd': vdd,
            'cnum': cnum,
            'freq': core.freq(vdd),
            'util': float(100 * cnum) / float(cnum_max)
        }
Example #16
0
    if mo:
        return int(mo.group(1))
    else:
        raise TechModelError('no technology node from the name of {0}'.format(model_file))

# Module-level loading of the model tables for ``model_name``.
# Each dictionary below is keyed by the technology node returned by
# ``_get_tech_node``.
model_name = 'homoTFET30nm'
freq_dict = dict()
dynamic_power_dict = dict()
# Not filled anywhere in the lines visible here.
static_power_dict = dict()

# All data files in _MODEL_DIR named '<circuit>_<model_name>_*.data'.
model_files = glob.glob(os.path.join(
    _MODEL_DIR, '{0}_{1}_*.data'.format(
        settings.TFET_SIM_CIRCUIT, model_name)))

for model_file in model_files:
    _debug(_bm_('found model {0}', model_file))
    model_file_mtime = os.path.getmtime(model_file)

    tech = _get_tech_node(model_file)
    # Pickle file for this technology node, stored next to the data files.
    pickle_file = os.path.join(
        _MODEL_DIR, '{0}_{1}_{2}.p'.format(
            settings.TFET_SIM_CIRCUIT, model_name, tech))
    try:
        pickle_file_mtime = os.path.getmtime(pickle_file)
    except OSError:
        # getmtime failed (e.g. no pickle file yet): an mtime of 0 makes the
        # comparison below treat the pickle as older than the data file.
        pickle_file_mtime = 0

    # Only load the pickle when it is newer than the data file.
    if pickle_file_mtime > model_file_mtime:
        # NOTE(review): pickle.load can execute arbitrary code from the file;
        # this is only safe while _MODEL_DIR is trusted.
        with open(pickle_file, 'rb') as f:
            # Two objects are read back-to-back from the same file.
            freq_dict[tech] = pickle.load(f)
            dynamic_power_dict[tech] = pickle.load(f)
Example #17
0
 def instantiate(self):
     """Derive the number of throughput cores from the area left for them.

     The count is ``thru_core_area`` divided by the area of one throughput
     core, truncated with ``int`` and stored in ``_thru_core_num``.
     """
     area_available = self.thru_core_area
     tput = self.thru_core
     self._thru_core_num = int(area_available / tput.area)
     _debug(_bm_('Tput core: {0}, area: {1}, cnum: {2}',
                 self.thru_core.ctype, self.thru_core.area,
                 self._thru_core_num))
Example #18
0
    def _dim_perf_cnum(self, cnum, vmin=None, power_budget=None):
        """Evaluate the throughput cores when exactly ``cnum`` are active.

        Every core receives an equal share of the power budget.  The method
        searches for the highest supply voltage at which one core stays
        within its share and reports the aggregate performance and power at
        that voltage.

        Parameters
        ----------
        cnum : int
          The number of active cores
        vmin : float, optional
          Lower bound of the voltage search; replaced by ``core.vmin`` when
          it is falsy or smaller than ``core.vmin``
        power_budget : float
          The power budget, if falsy, system power budget will be used

        Returns
        -------
        perf : float
          The performance score, not relative speedup
        vdd : int
          The supply voltage in mV, when the system achieves the optimal
          throughput
        power : float
          The effective power consumption of dim cores. If it is less than
          the provided power_budget, this usually means the performance is
          constrained by area_budget instead of power_budget

        All three values are 0 when the budget cannot be met at ``vmin``.
        """
        core = self.thru_core

        power_budget = power_budget or self.sys_power
        cpower = power_budget / float(cnum)

        if vmin and vmin < core.vmin:
            _debug(
                _bm_(
                    'Provided vmin {0}mV is lower than core.vmin {1}mV. '
                    'vmin is set to core.vmin', vmin, core.vmin))
        if not vmin or vmin < core.vmin:
            vmin = core.vmin

        # Feasibility check at the lowest voltage: when it already exceeds
        # the per-core share, vmin is too high or the budget too small.
        feasible = not (core.power(vmin) > cpower)
        if not feasible:
            _debug(
                _bm_('Power budget {1:.3g}W is not met even at vmin of {0}mV',
                     vmin, power_budget))
            return (0, 0, 0)

        # Binary search for the highest per-core vdd within the budget;
        # [vmin, vmax] shrinks until both ends meet.
        vmax = core.vmax
        while vmax > vmin:
            vmid = math.ceil((vmin + vmax) / 2)
            over_budget = core.power(vmid) > cpower
            if over_budget:
                vmax = vmid - 1
            else:
                vmin = vmid

        vdd_opt = vmin
        perf_opt = cnum * core.perf_by_vdd(vdd_opt)
        power_eff = cnum * core.power(vdd_opt)

        return (perf_opt, vdd_opt, power_eff)
Example #19
0
    def get_speedup_appdag_parallel_greedy(self, appdag):
        """Get the speedup of the system on an
        :class:`~lumos.model.AppDAG` application when the kernels of one
        depth level run in parallel and the power budget is handed out
        greedily.

        Within a level the kernels are visited from the longest to the
        shortest.  Each kernel takes power from the remaining budget, either
        through its accelerator or through the throughput cores.  When the
        remaining budget drops to 0.1 or below, the current group ("warp")
        is closed: the longest runtime in the group is added to the total
        and the budget is reset to the system power.

        Parameters
        ----------
        appdag : :class:`~lumos.model.AppDAG`
          The target application

        Returns
        -------
        float
          speedup relative to running all kernels at the baseline performance.

        """
        baseline = sum(appdag.get_all_kernel_lengths())

        finish = 0
        for node_list in appdag.kernels_depth_sort():
            lengths = [appdag.get_kernel_length(idx_) for idx_ in node_list]
            longest_first = sorted(zip(lengths, node_list), reverse=True)

            power_left = self.sys_power
            warp_runtimes = []
            for kl, idx_ in longest_first:
                ko = appdag.get_kernel(idx_)

                if self.has_asacc(ko.name):
                    asacc = self.get_asacc(ko.name)
                    asacc_su = asacc.perf(power=power_left) / PERF_BASE
                    power_left -= asacc.power_eff
                    warp_runtimes.append(kl / asacc_su)
                else:
                    perf_, vdd_, cnum_, power_eff = self._dim_perf_opt(
                        power_budget=power_left)
                    if power_eff != 0:
                        thru_su = perf_ / PERF_BASE
                        serial_su = (self.thru_core.perf_by_vdd(vdd_)
                                     / PERF_BASE)
                        power_left -= power_eff
                        serial_part = kl * (1 - ko.pf) / serial_su
                        parallel_part = kl * ko.pf / thru_su
                        rt = serial_part + parallel_part
                        warp_runtimes.append(rt)
                        _debug(_bm_('runtime: {0}', rt))
                    else:
                        # remaining power budget is too small
                        power_left = 0

                if power_left <= 0.1:
                    # budget exhausted: close this warp and start a new one
                    warp_time = max(warp_runtimes)
                    finish += warp_time
                    _debug(_bm_('=====warp finish====='))
                    _debug(_bm_('run time of this warp: {0}', warp_time))
                    warp_runtimes = []
                    power_left = self.sys_power

            if warp_runtimes:
                # kernels left over in a warp that was never closed
                warp_time = max(warp_runtimes)
                finish += warp_time
                _debug(_bm_('=====warp finish====='))
                _debug(_bm_('run time of this warp: {0}', warp_time))

        _debug(_bm_('baseline: {0}, bench: {1}', baseline, finish))
        return baseline / finish
Example #20
0
    def get_speedup_appdag_parallel_greedy(self, appdag):
        """Get the speedup of the system on an
        :class:`~lumos.model.AppDAG` application when the kernels of each
        DAG depth level are processed in parallel and the power budget is
        handed out greedily.

        Within one depth level the kernels are visited from the longest
        to the shortest. Each kernel takes its share of the remaining
        power budget: a kernel with a matching accelerator
        (``self.has_asacc``) is charged ``asacc.power_eff``, any other
        kernel is charged the ``power_eff`` reported by
        ``self._dim_perf_opt``. Once the remaining budget drops to 0.1
        or below, the kernels scheduled so far form one "warp": the
        longest run time among them is added to the total finish time,
        and the budget is reset to ``self.sys_power`` for the next warp.

        Parameters
        ----------
        appdag : :class:`~lumos.model.AppDAG`
          The target application

        Returns
        -------
        float
          speedup relative to running all kernels at the baseline performance.

        Raises
        ------
        ZeroDivisionError
          If no kernel contributed any run time (the accumulated finish
          time is still zero at the end).

        """
        baseline = sum(appdag.get_all_kernel_lengths())

        finish = 0
        for node_list in appdag.kernels_depth_sort():
            ker_lengths = [appdag.get_kernel_length(idx_) for idx_ in node_list]
            # Longest kernel first, so it gets first pick of the budget.
            ker_len_sorted = sorted(zip(ker_lengths, node_list), reverse=True)

            power_budget = self.sys_power
            runtime = []
            for kl, idx_ in ker_len_sorted:
                ko = appdag.get_kernel(idx_)
                if self.has_asacc(ko.name):
                    asacc = self.get_asacc(ko.name)
                    asacc_su = asacc.perf(power=power_budget) / PERF_BASE
                    power_budget -= asacc.power_eff
                    rt = kl / asacc_su
                    runtime.append(rt)
                else:
                    perf_, vdd_, _, power_eff = self._dim_perf_opt(
                        power_budget=power_budget)
                    if power_eff == 0:
                        # remaining power budget is too small
                        # NOTE(review): this kernel adds no run time and is
                        # not retried in the next warp -- confirm intended.
                        power_budget = 0
                    else:
                        thru_su = perf_ / PERF_BASE
                        serial_su = self.thru_core.perf_by_vdd(vdd_) / PERF_BASE
                        power_budget -= power_eff
                        # Amdahl-style split: serial part on one core,
                        # parallel fraction ``ko.pf`` on the throughput cores.
                        rt = kl * (1 - ko.pf) / serial_su + kl * ko.pf / thru_su
                        runtime.append(rt)
                        _debug(_bm_('runtime: {0}', rt))

                if power_budget <= 0.1:
                    # The warp may be empty when the very first kernel could
                    # not be powered; max() on an empty list would raise.
                    if runtime:
                        finish += max(runtime)
                        _debug(_bm_('=====warp finish====='))
                        _debug(_bm_('run time of this warp: {0}', max(runtime)))
                    runtime = []
                    power_budget = self.sys_power

            # Flush the last, partially filled warp of this depth level.
            if runtime:
                finish += max(runtime)
                _debug(_bm_('=====warp finish====='))
                _debug(_bm_('run time of this warp: {0}', max(runtime)))

        _debug(_bm_('baseline: {0}, bench: {1}', baseline, finish))
        return baseline / finish
Example #21
0
 def instantiate(self):
     """Derive and store the number of throughput cores.

     The count is the throughput-core area divided by the area of a
     single throughput core, truncated with ``int``. It is stored in
     ``self._thru_core_num`` and reported through the debug log.
     """
     area_budget = self.thru_core_area
     tput_core = self.thru_core
     self._thru_core_num = int(area_budget / tput_core.area)

     message = _bm_('Tput core: {0}, area: {1}, cnum: {2}',
                    self.thru_core.ctype, self.thru_core.area,
                    self._thru_core_num)
     _debug(message)
Example #22
0
        raise TechModelError(
            'no technology node from the name of {0}'.format(model_file))


# Name of the technology model; it is part of both the data-file pattern
# and the pickle cache file name built below.
model_name = 'homoTFET30nm'
# Per-technology-node tables, keyed by the value returned from
# _get_tech_node(). static_power_dict is not filled in the portion of the
# script visible here -- presumably populated further down; verify.
freq_dict = dict()
dynamic_power_dict = dict()
static_power_dict = dict()

# All '<circuit>_<model_name>_*.data' files found in the model directory.
model_files = glob.glob(
    os.path.join(
        _MODEL_DIR, '{0}_{1}_*.data'.format(settings.TFET_SIM_CIRCUIT,
                                            model_name)))

for model_file in model_files:
    _debug(_bm_('found model {0}', model_file))
    model_file_mtime = os.path.getmtime(model_file)

    tech = _get_tech_node(model_file)
    # Pickle cache that sits next to the data file for this node.
    pickle_file = os.path.join(
        _MODEL_DIR, '{0}_{1}_{2}.p'.format(settings.TFET_SIM_CIRCUIT,
                                           model_name, tech))
    try:
        pickle_file_mtime = os.path.getmtime(pickle_file)
    except OSError:
        # A pickle whose mtime cannot be read (e.g. it does not exist) is
        # treated as older than any data file.
        pickle_file_mtime = 0

    # Use the cache only when it is strictly newer than the data file.
    if pickle_file_mtime > model_file_mtime:
        with open(pickle_file, 'rb') as f:
            # Objects are read back in sequence from the same file, so the
            # order of these loads matters.
            freq_dict[tech] = pickle.load(f)
            dynamic_power_dict[tech] = pickle.load(f)
Example #23
0
    def perf_by_cnum(self, cnum, app, vmin=None):
        """
        Get the relative performance of the system for a given application, with a
        given constraint on the number of active cores.

        The supply voltage is found by a binary search between ``vmin`` and
        ``core.vmax`` for the highest voltage whose per-core power still
        fits into ``self.power / cnum``. As a side effect ``core.vdd`` is
        set to the left bound of the search when the search is run.

        Parameters
        ----------
        cnum: num
          The number of cores required to be active.
        app: :class:`~lumos.model.application.Application`
          The targeted application.
        vmin: num
          An optional argument to specify the lowest boundary to which the
          supply voltage can be scaled down. A falsy value, or a value below
          ``core.vmin``, falls back to ``core.vmin``.

        Returns
        -------
        dict or None
          ``None`` if ``cnum`` is not positive or exceeds the number of
          cores that fit into the system area. Otherwise the results are
          wrapped in a python dict with five keys:

        perf : num
          The relative performance.
        vdd : num
          The supply voltage of the optimal configuration under core number
          constraint.
        freq : num
          The frequency with the optimal configuration under core number
          constraint.
        cnum : num
          The actual number of active cores; it differs from the requested
          number if the required number can not be met.
        util : num
          The utilization of the system at the optimal configuration.

        """

        core = self.core
        f = app.f

        cnum_max = int(self.area / core.area)

        # A zero core count is rejected like any other invalid count; it
        # would otherwise divide by zero in the per-core power budget below.
        if cnum > cnum_max or cnum <= 0:
            return None

        cpower = self.power / float(cnum)
        _debug(_bm_('Per-core power budget: {0}', cpower))

        # Serial performance is achieved by the highest vdd
        sperf = core.perf_by_vdd(core.vmax)

        if not vmin:
            vmin = core.vmin

        # Check whether vmin can meet the power requirement
        if vmin >= core.vmin:
            vmin_power = core.power(vmin)
            if vmin_power > cpower:
                # Either vmin is too high or active_cnum is too large
                # so that the system could not meet the power budget even
                # with the minimum vdd. Return the active core number with vmin
                # Users can capture the exception by comparing the active_cnum
                # and the returned active_cnum
                active_cnum = min(cnum_max, int(self.power / vmin_power))

                perf = 1 / ((1 - f) / sperf + f /
                            (active_cnum * core.perf_by_vdd(vmin)))
                # Here utilization is relative to the requested core count.
                util = float(100 * active_cnum) / float(cnum)
                _debug(_bm_('vmin is too high or active_cnum is too large'))
                return {
                    'perf': perf / PERF_BASE,
                    'vdd': vmin,
                    'cnum': active_cnum,
                    'freq': core.freq(vmin),
                    'util': util
                }
        else:
            vmin = core.vmin

        # Binary search: shrink [vl, vr] until it is within V_PRECISION.
        vl = vmin
        vr = core.vmax

        while (vr - vl) > V_PRECISION:
            vm = int((vl + vr) / 2)

            _debug(_bm_('[Core]\t:vl: {0}mV, vr: {1}mV, vm: {2}mV, '
                        'freq: {3}, power: {4}, area: {5}', vl, vr, vm,
                        core.freq(vm), core.power(vm), core.area))
            if core.power(vm) > cpower:
                # Midpoint is over budget: the answer lies to the left.
                vr = vm
            else:
                vl = vm

        _debug(_bm_('End of bin-search, vl: {0}mV, vr: {1}mV', vl, vr))
        core.vdd = vl

        # Prefer the right bound when it still meets the power constraint,
        # otherwise fall back to the left bound.
        vdd = vr if core.power(vr) <= cpower else vl
        perf = 1 / ((1 - f) / sperf + f / (cnum * core.perf_by_vdd(vdd)))

        return {
            'perf': perf / PERF_BASE,
            'vdd': vdd,
            'cnum': cnum,
            'freq': core.freq(vdd),
            'util': float(100 * cnum) / float(cnum_max)
        }