Ejemplo n.º 1
0
def simulate(hostname, kernel, args):
    """Run cache predictions for *kernel* on *hostname* for all argument sets.

    For each entry in *args* (sequences whose elements after the first are the
    kernel's dimension sizes) the pycachesim cache simulator and -- as long as
    it keeps working for this kernel -- the layer-condition predictor are run.
    Per-level loads/misses/stores/evicts are collected into a DataFrame which
    is pickled to ``results/predicted/<hostname>_<kernel>.pkl.gz``.

    Returns the list of collected rows, or None if the result file already
    exists (nothing is recomputed in that case).
    """
    filename = f'results/predicted/{hostname}_{kernel}.pkl.gz'
    print(hostname, kernel, end=" ")
    if os.path.exists(filename):
        print("exists (unknown)")
        return
    data = []
    machine = get_machine_model(hostname)
    kc_kernel = get_kc_kernel(kernel)
    # Set once the LayerConditionPredictor raises; it is skipped afterwards.
    no_lcp = False
    # Progress dot roughly every 10% of the runs.  BUGFIX: guard against
    # len(args) < 10, which would make the modulus zero and raise
    # ZeroDivisionError below.
    progress_step = max(1, len(args) // 10)
    for c, arg in enumerate(args):
        # reset kernel state
        kc_kernel.clear_state()

        # arg[0] is skipped; the remaining entries are the dimension sizes
        for i, a in enumerate(arg[1:]):
            kc_kernel.set_constant(f'D{i}', a)
        csp = CacheSimulationPredictor(kc_kernel, machine)
        csp_infos = csp.get_infos()
        lcp_infos = None
        if not no_lcp:
            try:
                lcp = LayerConditionPredictor(kc_kernel, machine)
                lcp_infos = lcp.get_infos()
            except ValueError:
                # Layer conditions are not applicable to this kernel.
                no_lcp = True
        row = {'hostname': hostname, 'kernel': kernel, 'dimensions': arg[1:]}

        # Typically ['L1', 'L2', 'L3', 'MEM']
        levels = [mh['level'] for mh in machine['memory hierarchy']]
        stat_types = ['loads', 'misses', 'stores', 'evicts']

        cs_row = {'source': 'pycachesim'}
        cs_row.update(row)

        for st in stat_types:
            for level, stat in zip(levels, getattr(csp, f'get_{st}')()):
                if level == "MEM" and st in ['misses', 'evicts']:
                    continue  # makes no sense
                cs_row[level + '_' + st] = float(stat)

        cs_row['raw'] = csp_infos
        data.append(cs_row)

        if not no_lcp:
            lc_row = {'source': 'layer-conditions'}
            lc_row.update(row)
            for st in stat_types:
                for level, stat in zip(levels, getattr(lcp, f'get_{st}')()):
                    if level == "MEM" and st in ['misses', 'evicts']:
                        continue  # makes no sense
                    lc_row[level + '_' + st] = float(stat)
            lc_row['raw'] = lcp_infos
            data.append(lc_row)
        if c % progress_step == 0:
            print(".", end='', flush=True)
    df = pd.DataFrame(data)
    os.makedirs('results/predicted', exist_ok=True)
    # Reuse the path computed above instead of rebuilding the f-string.
    df.to_pickle(filename)
    print("saved (unknown)")
    return data
Ejemplo n.º 2
0
    def __init__(self, kernel, machine, args=None, parser=None, cores=1,
                 cache_predictor=CacheSimulationPredictor, verbose=0):
        """
        Build the Execution-Cache-Memory data model for *kernel* on *machine*.

        *kernel* is a Kernel object
        *machine* describes the machine (cpu, cache and memory) characteristics
        *args* (optional) are the parsed arguments from the command line
        *parser* (optional) is the parser that produced *args*

        When *args* is given it takes precedence; otherwise *cores*,
        *cache_predictor* and *verbose* are used.
        """
        self.kernel = kernel
        self.machine = machine
        self._args = args
        self._parser = parser
        self.results = None

        if not args:
            # No command-line arguments: take the explicit keyword values.
            self.cores = cores
            self.verbose = verbose
            self.predictor = cache_predictor(self.kernel, self.machine, self.cores)
            return

        # Command-line arguments take precedence.
        self.verbose = self._args.verbose
        self.cores = self._args.cores
        if self._args.cache_predictor == 'SIM':
            predictor_cls = CacheSimulationPredictor
        elif self._args.cache_predictor == 'LC':
            predictor_cls = LayerConditionPredictor
        else:
            raise NotImplementedError("Unknown cache predictor, only LC (layer condition) and "
                                      "SIM (cache simulation with pycachesim) is supported.")
        self.predictor = predictor_cls(self.kernel, self.machine, self.cores)
Ejemplo n.º 3
0
 def calculate_cache_access(self):
     """Dispatch to cache predictor to get cache stats."""
     predictors = {'SIM': CacheSimulationPredictor, 'LC': LayerConditionPredictor}
     choice = self._args.cache_predictor
     if choice not in predictors:
         raise NotImplementedError("Unknown cache predictor, only LC (layer condition) and "
                                   "SIM (cache simulation with pycachesim) is supported.")
     self.predictor = predictors[choice](self.kernel, self.machine, self._args.cores)
     self.results = {
         'cycles': [],  # will be filled by calculate_cycles()
         'misses': self.predictor.get_misses(),
         'hits': self.predictor.get_hits(),
         'evicts': self.predictor.get_evicts(),
         'verbose infos': self.predictor.get_infos(),  # only for verbose outputs
     }
Ejemplo n.º 4
0
    def __init__(self,
                 kernel,
                 machine,
                 args=None,
                 parser=None,
                 cores=1,
                 cache_predictor=LayerConditionPredictor,
                 verbose=0):
        """
        Build a roofline model for *kernel* on *machine*.

        *kernel* is a Kernel object
        *machine* describes the machine (cpu, cache and memory) characteristics
        *args* (optional) are the parsed arguments from the command line
        *parser* (optional) is the parser that produced *args*

        When *args* is given it takes precedence; otherwise *cores*,
        *cache_predictor* and *verbose* are used.
        """
        self.kernel = kernel
        self.machine = machine
        self._args = args
        self._parser = parser
        self.results = None

        if args:
            # Command-line arguments take precedence.
            self.verbose = self._args.verbose
            self.cores = self._args.cores
            if self._args.cache_predictor == 'SIM':
                predictor_cls = CacheSimulationPredictor
            elif self._args.cache_predictor == 'LC':
                predictor_cls = LayerConditionPredictor
            else:
                raise NotImplementedError(
                    "Unknown cache predictor, only LC (layer condition) and "
                    "SIM (cache simulation with pycachesim) is supported.")
            self.predictor = predictor_cls(self.kernel, self.machine, self.cores)
        else:
            # No command-line arguments: take the explicit keyword values.
            self.cores = cores
            self.verbose = verbose
            self.predictor = cache_predictor(self.kernel, self.machine,
                                             self.cores)

        # Roofline relates FLOPs to bandwidth; without FLOPs there is nothing
        # to model.
        if sum(self.kernel._flops.values()) == 0:
            raise ValueError(
                "The Roofline model requires that the sum of FLOPs is non-zero."
            )
Ejemplo n.º 5
0
    def calculate_cache_access(self):
        """Compute per-memory-level roofline bottlenecks from cache predictions.

        Runs the cache predictor selected on the command line, then computes
        for the CPU-L1 interface and every deeper memory level the achievable
        performance (arithmetic intensity * measured bandwidth) and records
        the slowest level in ``self.results['bottleneck level']``.

        Returns the filled ``self.results`` dict.
        Raises NotImplementedError for an unknown cache-predictor choice.
        """
        if self._args.cache_predictor == 'SIM':
            self.predictor = CacheSimulationPredictor(self.kernel,
                                                      self.machine)
        elif self._args.cache_predictor == 'LC':
            self.predictor = LayerConditionPredictor(self.kernel, self.machine)
        else:
            raise NotImplementedError(
                "Unknown cache predictor, only LC (layer condition) and "
                "SIM (cache simulation with pycachesim) is supported.")
        self.results = {
            'misses': self.predictor.get_misses(),
            'hits': self.predictor.get_hits(),
            'evicts': self.predictor.get_evicts(),
            'verbose infos':
            self.predictor.get_infos(),  # only for verbose outputs
            'bottleneck level': 0,
            'mem bottlenecks': []
        }

        element_size = self.kernel.datatypes_size[self.kernel.datatype]
        cacheline_size = float(self.machine['cacheline size'])
        elements_per_cacheline = int(cacheline_size // element_size)

        # FLOPs per cacheline's worth of iterations.
        total_flops = sum(self.kernel._flops.values()) * elements_per_cacheline

        # TODO let user choose threads_per_core:
        threads_per_core = 1

        # Compile relevant information

        # CPU-L1 stats (in bytes!)
        # We compile CPU-L1 stats on our own, because cacheprediction only works on cache lines
        read_offsets, write_offsets = zip(*list(
            self.kernel.compile_global_offsets(
                iteration=range(0, elements_per_cacheline))))
        read_offsets = {item for sublist in read_offsets for item in sublist}
        write_offsets = {item for sublist in write_offsets for item in sublist}

        write_streams = len(write_offsets)
        read_streams = len(read_offsets) + write_streams  # write-allocate
        total_loads = read_streams * element_size
        bw, measurement_kernel = self.machine.get_bandwidth(
            0,
            read_streams,
            write_streams,
            threads_per_core,
            cores=self._args.cores)

        # Calculate performance (arithmetic intensity * bandwidth with
        # arithmetic intensity = flops / bytes loaded )
        if total_loads == 0:
            # This happens in case of full-caching
            arith_intens = None
            performance = None
        else:
            arith_intens = float(total_flops) / total_loads
            performance = arith_intens * float(bw)

        self.results['mem bottlenecks'].append({
            'performance':
            PrefixedUnit(performance, 'FLOP/s'),
            'level':
            self.machine['memory hierarchy'][0]['level'],
            'arithmetic intensity':
            arith_intens,
            'bw kernel':
            measurement_kernel,
            'bandwidth':
            bw,
            'bytes transfered':
            total_loads
        })
        # BUGFIX: in the full-caching case *performance* is None and comparing
        # it would raise a TypeError on Python 3 -- only a real number can
        # become the bottleneck.
        if performance is not None and \
                performance <= self.results.get('min performance', performance):
            self.results['bottleneck level'] = len(
                self.results['mem bottlenecks']) - 1
            self.results['min performance'] = performance

        # for other cache and memory levels:
        for cache_level, cache_info in list(
                enumerate(self.machine['memory hierarchy']))[:-1]:
            # Compiling stats (in bytes!)
            total_misses = self.results['misses'][cache_level] * cacheline_size
            total_evicts = self.results['evicts'][cache_level] * cacheline_size

            # choose bw according to cache level and problem
            # first, compile stream counts at current cache level
            # write-allocate is already resolved above
            read_streams = self.results['misses'][cache_level]
            write_streams = self.results['evicts'][cache_level]
            # second, try to find best fitting kernel (closest to seen stream counts):
            bw, measurement_kernel = self.machine.get_bandwidth(
                cache_level + 1,
                read_streams,
                write_streams,
                threads_per_core,
                cores=self._args.cores)

            # Calculate performance (arithmetic intensity * bandwidth with
            # arithmetic intensity = flops / bytes transfered)
            bytes_transfered = total_misses + total_evicts

            if bytes_transfered == 0:
                # This happens in case of full-caching
                arith_intens = float('inf')
                performance = float('inf')
            else:
                arith_intens = float(total_flops) / bytes_transfered
                performance = arith_intens * float(bw)

            self.results['mem bottlenecks'].append({
                'performance':
                PrefixedUnit(performance, 'FLOP/s'),
                'level':
                (self.machine['memory hierarchy'][cache_level + 1]['level']),
                'arithmetic intensity':
                arith_intens,
                'bw kernel':
                measurement_kernel,
                'bandwidth':
                bw,
                'bytes transfered':
                bytes_transfered
            })
            if performance < self.results.get('min performance', performance):
                self.results['bottleneck level'] = len(
                    self.results['mem bottlenecks']) - 1
                self.results['min performance'] = performance

        return self.results