def test_non_variable_access(self):
    kernel = KernelCode('''
    double Y[s][n];
    double F[s][n];
    double A[s][s];
    double y[n];
    double h;

    for (int l = 0; l < s; ++l)
      for (int j = 0; j < n; ++j)
        Y[l][j] = A[l][0] * F[0][j] * h + y[j];
    ''', machine=self.machine)
    kernel.set_constant('s', 4)
    kernel.set_constant('n', 1000000)
    lcp = LayerConditionPredictor(kernel, self.machine)
    self.assertEqual(lcp.get_evicts(), [1, 1, 1, 0])
    self.assertEqual(lcp.get_misses(), [3, 3, 3, 0])
    self.assertEqual(lcp.get_hits(), [0, 0, 0, 3])
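
The assertions above can be reproduced outside the test harness. Below is a minimal sketch, assuming kerncraft's public API (MachineModel, KernelCode, LayerConditionPredictor) is available; the machine file name is a placeholder for an actual machine description YAML.

from kerncraft.machinemodel import MachineModel
from kerncraft.kernel import KernelCode
from kerncraft.cacheprediction import LayerConditionPredictor

machine = MachineModel('machine_file.yml')  # placeholder machine file
kernel = KernelCode('''
double Y[s][n];
double F[s][n];
double A[s][s];
double y[n];
double h;

for (int l = 0; l < s; ++l)
  for (int j = 0; j < n; ++j)
    Y[l][j] = A[l][0] * F[0][j] * h + y[j];
''', machine=machine)
kernel.set_constant('s', 4)
kernel.set_constant('n', 1000000)

lcp = LayerConditionPredictor(kernel, machine)
# One entry per memory-hierarchy level (e.g. L1, L2, L3, MEM):
print('hits:  ', lcp.get_hits())
print('misses:', lcp.get_misses())
print('evicts:', lcp.get_evicts())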
Example #3
class ECMData(object):
    """Representation of Data portion of the Execution-Cache-Memory Model."""

    name = "Execution-Cache-Memory (data transfers only)"

    @classmethod
    def configure_arggroup(cls, parser):
        """Configure argument group of parser."""
        pass

    def __init__(self,
                 kernel,
                 machine,
                 args=None,
                 parser=None,
                 cores=1,
                 cache_predictor=CacheSimulationPredictor,
                 verbose=0):
        """
        Create Execution-Cache-Memory data model from kernel and machine objects.

        *kernel* is a Kernel object
        *machine* describes the machine (cpu, cache and memory) characteristics
        *args* (optional) are the parsed arguments from the command line

        If *args* is None, *cores*, *cache_predictor* and *verbose* are taken into account,
        otherwise *args* takes precedence.
        """
        self.kernel = kernel
        self.machine = machine
        self._args = args
        self._parser = parser
        self.results = None

        if args:
            self.verbose = self._args.verbose
            self.cores = self._args.cores
            if self._args.cache_predictor == 'SIM':
                self.predictor = CacheSimulationPredictor(
                    self.kernel, self.machine, self.cores)
            elif self._args.cache_predictor == 'LC':
                self.predictor = LayerConditionPredictor(
                    self.kernel, self.machine, self.cores)
            else:
                raise NotImplementedError(
                    "Unknown cache predictor, only LC (layer condition) and "
                    "SIM (cache simulation with pycachesim) is supported.")
        else:
            self.cores = cores
            self.predictor = cache_predictor(self.kernel, self.machine,
                                             self.cores)
            self.verbose = verbose

    def calculate_cache_access(self):
        """Dispatch to cache predictor to get cache stats."""
        self.results = {
            'cycles': [],  # will be filled by calculate_cycles()
            'misses': self.predictor.get_misses(),
            'hits': self.predictor.get_hits(),
            'evicts': self.predictor.get_evicts(),
            'verbose infos': self.predictor.get_infos()  # only for verbose outputs
        }

    def calculate_cycles(self):
        """
        Calculate performance model cycles from cache stats.

        calculate_cache_access() needs to have been executed before.
        """
        element_size = self.kernel.datatypes_size[self.kernel.datatype]
        elements_per_cacheline = float(
            self.machine['cacheline size']) // element_size

        misses, evicts = (self.predictor.get_misses(),
                          self.predictor.get_evicts())

        for cache_level, cache_info in list(
                enumerate(self.machine['memory hierarchy']))[:-1]:
            cache_cycles = cache_info['cycles per cacheline transfer']

            if cache_cycles is not None:
                # only cache cycles count
                cycles = (misses[cache_level] +
                          evicts[cache_level]) * cache_cycles
            else:
                # Memory transfer
                # we use bandwidth to calculate cycles and then add penalty cycles (if given)

                # choose bw according to cache level and problem
                # first, compile stream counts at current cache level
                # write-allocate is already resolved in the cache predictor
                read_streams = misses[cache_level]
                write_streams = evicts[cache_level]
                # second, try to find the best-fitting kernel (closest to the observed stream counts):
                threads_per_core = 1
                bw, measurement_kernel = self.machine.get_bandwidth(
                    cache_level + 1, read_streams, write_streams,
                    threads_per_core)

                # calculate cycles
                cycles = float(misses[cache_level] + evicts[cache_level]) * \
                    float(elements_per_cacheline) * float(element_size) * \
                    float(self.machine['clock']) / float(bw)
                # add penalty cycles for each read stream
                if 'penalty cycles per read stream' in cache_info:
                    cycles += misses[cache_level] * \
                              cache_info['penalty cycles per read stream']

                self.results.update({
                    'memory bandwidth kernel': measurement_kernel,
                    'memory bandwidth': bw
                })

            level_pair = '{}-{}'.format(
                cache_info['level'],
                self.machine['memory hierarchy'][cache_level + 1]['level'])
            self.results['cycles'].append((level_pair, cycles))

            # TODO remove the following by making test cases more versatile:
            self.results[level_pair] = cycles

        return self.results

    def analyze(self):
        """Run complete anaylysis and return results."""
        self.calculate_cache_access()
        self.calculate_cycles()

        return self.results

    def conv_cy(self, cy_cl, unit, default='cy/CL'):
        """Convert cycles (cy/CL) to other units, such as FLOP/s or It/s."""
        if not isinstance(cy_cl, PrefixedUnit):
            cy_cl = PrefixedUnit(cy_cl, '', 'cy/CL')
        if not unit:
            unit = default

        clock = self.machine['clock']
        element_size = self.kernel.datatypes_size[self.kernel.datatype]
        elements_per_cacheline = int(
            self.machine['cacheline size']) // element_size
        if cy_cl != 0:
            it_s = clock / cy_cl * elements_per_cacheline
            it_s.unit = 'It/s'
        else:
            it_s = PrefixedUnit('inf It/s')
        flops_per_it = sum(self.kernel._flops.values())
        performance = it_s * flops_per_it
        performance.unit = 'FLOP/s'
        cy_it = cy_cl / elements_per_cacheline
        cy_it.unit = 'cy/It'

        return {
            'It/s': it_s,
            'cy/CL': cy_cl,
            'cy/It': cy_it,
            'FLOP/s': performance
        }[unit]

    def report(self, output_file=sys.stdout):
        """Print generated model data in human readable format."""
        if self.verbose > 1:
            print('{}'.format(pformat(self.results['verbose infos'])),
                  file=output_file)

        for level, cycles in self.results['cycles']:
            print('{} = {}'.format(
                level, self.conv_cy(float(cycles), self._args.unit)),
                  file=output_file)

        if self.verbose > 1:
            if 'memory bandwidth kernel' in self.results:
                print('memory cycles based on {} kernel with {}'.format(
                    self.results['memory bandwidth kernel'],
                    self.results['memory bandwidth']),
                      file=output_file)
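
A sketch of driving this model programmatically, assuming the kerncraft package layout (kerncraft.models.ECMData, the cache simulator via pycachesim) and a machine description file; the file name and the streaming kernel are placeholders. Since report() reads self._args.unit, with args=None the results dict is inspected directly instead.

from kerncraft.machinemodel import MachineModel
from kerncraft.kernel import KernelCode
from kerncraft.models import ECMData

machine = MachineModel('machine_file.yml')  # placeholder machine file
kernel = KernelCode('''
double a[N];
double b[N];
double c[N];

for (int i = 0; i < N; ++i)
    a[i] = b[i] + c[i];
''', machine=machine)
kernel.set_constant('N', 10000000)

model = ECMData(kernel, machine, cores=1)  # defaults to the cache simulator
results = model.analyze()
for level, cycles in results['cycles']:
    print('{} = {}'.format(level, cycles))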
Example #4
class Roofline(object):
    """
    class representation of the Roofline Model

    more info to follow...
    """

    name = "Roofline"

    @classmethod
    def configure_arggroup(cls, parser):
        pass

    def __init__(self, kernel, machine, args=None, parser=None):
        """
        *kernel* is a Kernel object
        *machine* describes the machine (cpu, cache and memory) characteristics
        *args* (optional) are the parsed arguments from the command line
        """
        self.kernel = kernel
        self.machine = machine
        self._args = args
        self._parser = parser

        if args:
            # handle CLI info
            pass

        if sum(self.kernel._flops.values()) == 0:
            raise ValueError(
                "The Roofline model requires that the sum of FLOPs is non-zero."
            )

    def calculate_cache_access(self):
        if self._args.cache_predictor == 'SIM':
            self.predictor = CacheSimulationPredictor(self.kernel,
                                                      self.machine)
        elif self._args.cache_predictor == 'LC':
            self.predictor = LayerConditionPredictor(self.kernel, self.machine)
        else:
            raise NotImplementedError(
                "Unknown cache predictor, only LC (layer condition) and "
                "SIM (cache simulation with pycachesim) is supported.")
        self.results = {
            'misses': self.predictor.get_misses(),
            'hits': self.predictor.get_hits(),
            'evicts': self.predictor.get_evicts(),
            'verbose infos': self.predictor.get_infos(),  # only for verbose outputs
            'bottleneck level': 0,
            'mem bottlenecks': []
        }

        element_size = self.kernel.datatypes_size[self.kernel.datatype]
        cacheline_size = float(self.machine['cacheline size'])
        elements_per_cacheline = int(cacheline_size // element_size)

        total_flops = sum(self.kernel._flops.values()) * elements_per_cacheline

        # TODO let user choose threads_per_core:
        threads_per_core = 1

        # Compile relevant information

        # CPU-L1 stats (in bytes!)
        # We compile CPU-L1 stats on our own, because cacheprediction only works on cache lines
        read_offsets, write_offsets = zip(*list(
            self.kernel.compile_global_offsets(
                iteration=range(0, elements_per_cacheline))))
        read_offsets = set(
            [item for sublist in read_offsets for item in sublist])
        write_offsets = set(
            [item for sublist in write_offsets for item in sublist])

        write_streams = len(write_offsets)
        read_streams = len(read_offsets) + write_streams  # write-allocate
        total_loads = read_streams * element_size
        total_evicts = write_streams * element_size
        bw, measurement_kernel = self.machine.get_bandwidth(
            0,
            read_streams,
            write_streams,
            threads_per_core,
            cores=self._args.cores)

        # Calculate performance (arithmetic intensity * bandwidth with
        # arithmetic intensity = flops / bytes loaded )
        if total_loads == 0:
            # This happens in case of full-caching
            arith_intens = None
            performance = None
        else:
            arith_intens = float(total_flops) / total_loads
            performance = arith_intens * float(bw)

        self.results['mem bottlenecks'].append({
            'performance': PrefixedUnit(performance, 'FLOP/s'),
            'level': self.machine['memory hierarchy'][0]['level'],
            'arithmetic intensity': arith_intens,
            'bw kernel': measurement_kernel,
            'bandwidth': bw,
            'bytes transfered': total_loads
        })
        if performance is not None and performance <= self.results.get(
                'min performance', performance):
            self.results['bottleneck level'] = len(
                self.results['mem bottlenecks']) - 1
            self.results['min performance'] = performance

        # for other cache and memory levels:
        for cache_level, cache_info in list(
                enumerate(self.machine['memory hierarchy']))[:-1]:
            # Compiling stats (in bytes!)
            total_misses = self.results['misses'][cache_level] * cacheline_size
            total_evicts = self.results['evicts'][cache_level] * cacheline_size

            # choose bw according to cache level and problem
            # first, compile stream counts at current cache level
            # write-allocate is already resolved above
            read_streams = self.results['misses'][cache_level]
            write_streams = self.results['evicts'][cache_level]
            # second, try to find the best-fitting kernel (closest to the observed stream counts):
            bw, measurement_kernel = self.machine.get_bandwidth(
                cache_level + 1,
                read_streams,
                write_streams,
                threads_per_core,
                cores=self._args.cores)

            # Calculate performance (arithmetic intensity * bandwidth with
            # arithmetic intensity = flops / bytes transferred)
            bytes_transfered = total_misses + total_evicts

            if bytes_transfered == 0:
                # This happens in case of full-caching
                arith_intens = float('inf')
                performance = float('inf')
            else:
                arith_intens = float(total_flops) / bytes_transfered
                performance = arith_intens * float(bw)

            self.results['mem bottlenecks'].append({
                'performance': PrefixedUnit(performance, 'FLOP/s'),
                'level': self.machine['memory hierarchy'][cache_level + 1]['level'],
                'arithmetic intensity': arith_intens,
                'bw kernel': measurement_kernel,
                'bandwidth': bw,
                'bytes transfered': bytes_transfered
            })
            if performance < self.results.get('min performance', performance):
                self.results['bottleneck level'] = len(
                    self.results['mem bottlenecks']) - 1
                self.results['min performance'] = performance

        return self.results

    def analyze(self):
        self.calculate_cache_access()

    def conv_perf(self, performance, unit, default='FLOP/s'):
        """Convert performance (FLOP/s) to other units, such as It/s or cy/CL."""
        if not unit:
            unit = default

        clock = self.machine['clock']
        flops_per_it = sum(self.kernel._flops.values())
        it_s = performance / flops_per_it
        it_s.unit = 'It/s'
        element_size = self.kernel.datatypes_size[self.kernel.datatype]
        elements_per_cacheline = int(float(
            self.machine['cacheline size'])) / element_size
        cy_cl = clock / it_s * elements_per_cacheline
        cy_cl.unit = 'cy/CL'
        cy_it = clock / it_s
        cy_it.unit = 'cy/It'

        return {
            'It/s': it_s,
            'cy/CL': cy_cl,
            'cy/It': cy_it,
            'FLOP/s': performance
        }[unit]

    def report(self, output_file=sys.stdout):
        precision = 'DP' if self.kernel.datatype == 'double' else 'SP'
        max_flops = self.machine['clock']*self._args.cores * \
                    self.machine['FLOPs per cycle'][precision]['total']
        max_flops.unit = "FLOP/s"

        if self._args and self._args.verbose >= 3:
            print('{}'.format(pformat(self.results)), file=output_file)

        if self._args and self._args.verbose >= 1:
            print('{}'.format(pformat(self.results['verbose infos'])),
                  file=output_file)
            print('Bottlenecks:', file=output_file)
            print(
                '  level | a. intensity |   performance   |   bandwidth  | bandwidth kernel',
                file=output_file)
            print(
                '--------+--------------+-----------------+--------------+-----------------',
                file=output_file)
            print('    CPU |              | {!s:>15} |              |'.format(
                self.conv_perf(max_flops, self._args.unit)),
                  file=output_file)
            for b in self.results['mem bottlenecks']:
                print(
                    '{level:>7} | {arithmetic intensity:>5.2} FLOP/B | {!s:>15} |'
                    ' {bandwidth!s:>12} | {bw kernel:<8}'.format(
                        self.conv_perf(b['performance'], self._args.unit),
                        **b),
                    file=output_file)
            print('', file=output_file)

        if self.results['min performance'] > max_flops:
            # CPU bound
            print('CPU bound with {} core(s)'.format(self._args.cores),
                  file=output_file)
            print('{!s} due to CPU max. FLOP/s'.format(max_flops),
                  file=output_file)
        else:
            # Cache or mem bound
            print('Cache or mem bound with {} core(s)'.format(
                self._args.cores),
                  file=output_file)

            bottleneck = self.results['mem bottlenecks'][
                self.results['bottleneck level']]
            print(
                '{!s} due to {} transfer bottleneck (with bw from {} benchmark)'
                .format(
                    self.conv_perf(bottleneck['performance'], self._args.unit),
                    bottleneck['level'], bottleneck['bw kernel']),
                file=output_file)
            print('Arithmetic Intensity: {:.2f} FLOP/B'.format(
                bottleneck['arithmetic intensity']),
                  file=output_file)
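
The core arithmetic of the model above is the roofline formula: predicted performance at a memory level is arithmetic intensity (FLOP/B) times measured bandwidth, and the overall prediction is the minimum over all levels, capped by the CPU's peak FLOP/s. A self-contained illustration with made-up numbers (none of these values come from a real machine file):

# Roofline arithmetic with illustrative values only:
total_flops = 2 * 8            # FLOPs per cacheline (e.g. 2 per it * 8 it/CL)
bytes_transferred = 3 * 64     # e.g. 2 read streams + 1 evict stream, 64 B CL
bandwidth = 50e9               # 50 GB/s, illustrative memory bandwidth
peak_flops = 96e9              # illustrative CPU peak FLOP/s

arith_intens = total_flops / bytes_transferred        # FLOP/B
mem_performance = arith_intens * bandwidth            # FLOP/s
prediction = min(peak_flops, mem_performance)
print('AI = {:.3f} FLOP/B, predicted {:.2f} GFLOP/s'.format(
    arith_intens, prediction / 1e9))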
Example #5
class ECMData(PerformanceModel):
    """Representation of Data portion of the Execution-Cache-Memory Model."""

    name = "Execution-Cache-Memory (data transfers only)"

    def __init__(self, kernel, machine, args=None, parser=None, cores=1,
                 cache_predictor=CacheSimulationPredictor, verbose=0):
        """
        Create Execution-Cache-Memory data model from kernel and machine objects.

        *kernel* is a Kernel object
        *machine* describes the machine (cpu, cache and memory) characteristics
        *args* (optional) are the parsed arguments from the command line

        If *args* is None, *cores*, *cache_predictor* and *verbose* are taken into account,
        otherwise *args* takes precedence.
        """
        self.kernel = kernel
        self.machine = machine
        self._args = args
        self._parser = parser
        self.results = {}

        if args:
            self.verbose = self._args.verbose
            self.cores = self._args.cores
            if self._args.cache_predictor == 'SIM':
                self.predictor = CacheSimulationPredictor(self.kernel, self.machine, self.cores)
            elif self._args.cache_predictor == 'LC':
                self.predictor = LayerConditionPredictor(self.kernel, self.machine, self.cores)
            else:
                raise NotImplementedError("Unknown cache predictor, only LC (layer condition) and "
                                          "SIM (cache simulation with pycachesim) is supported.")
        else:
            self.cores = cores
            self.predictor = cache_predictor(self.kernel, self.machine, self.cores)
            self.verbose = verbose

    def calculate_cache_access(self):
        """Dispatch to cache predictor to get cache stats."""
        self.results.update({
                        'cycles': [],  # will be filled by calculate_cycles()
                        'misses': self.predictor.get_misses(),
                        'hits': self.predictor.get_hits(),
                        'evicts': self.predictor.get_evicts(),
                        'verbose infos': self.predictor.get_infos()})  # only for verbose outputs

    def calculate_cycles(self):
        """
        Calculate performance model cycles from cache stats.

        calculate_cache_access() needs to have been executed before.
        """
        element_size = self.kernel.datatypes_size[self.kernel.datatype]
        elements_per_cacheline = float(self.machine['cacheline size']) // element_size
        iterations_per_cacheline = (sympy.Integer(self.machine['cacheline size']) /
                                    sympy.Integer(self.kernel.bytes_per_iteration))
        self.results['iterations per cacheline'] = iterations_per_cacheline
        cacheline_size = float(self.machine['cacheline size'])

        loads, stores = (self.predictor.get_loads(), self.predictor.get_stores())

        for cache_level, cache_info in list(enumerate(self.machine['memory hierarchy']))[1:]:
            throughput, duplexness = cache_info['non-overlap upstream throughput']

            if type(throughput) is str and throughput == 'full socket memory bandwidth':
                # Memory transfer
                # we use bandwidth to calculate cycles and then add penalty cycles (if given)

                # choose bw according to cache level and problem
                # first, compile stream counts at current cache level
                # write-allocate is already resolved in the cache predictor
                read_streams = loads[cache_level]
                write_streams = stores[cache_level]
                # second, try to find the best-fitting kernel (closest to the observed stream counts):
                threads_per_core = 1
                bw, measurement_kernel = self.machine.get_bandwidth(
                    cache_level, read_streams, write_streams, threads_per_core)

                # calculate cycles
                if duplexness == 'half-duplex':
                    cycles = float(loads[cache_level] + stores[cache_level]) * \
                             float(elements_per_cacheline) * float(element_size) * \
                             float(self.machine['clock']) / float(bw)
                else:  # full-duplex
                    raise NotImplementedError(
                        "full-duplex mode is not (yet) supported for memory transfers.")
                # add penalty cycles for each read stream
                if 'penalty cycles per read stream' in cache_info:
                    cycles += loads[cache_level] * \
                              cache_info['penalty cycles per read stream']

                self.results.update({
                    'memory bandwidth kernel': measurement_kernel,
                    'memory bandwidth': bw})
            else:
                # since throughput is given in B/cy, and we need CL/cy:
                throughput = float(throughput) / cacheline_size
                # only cache cycles count
                if duplexness == 'half-duplex':
                    cycles = (loads[cache_level] + stores[cache_level]) / float(throughput)
                elif duplexness == 'full-duplex':
                    cycles = max(loads[cache_level] / float(throughput),
                                 stores[cache_level] / float(throughput))
                else:
                    raise ValueError("Duplexness of cache throughput may only be 'half-duplex'"
                                     "or 'full-duplex', found {} in {}.".format(
                        duplexness, cache_info['name']))

            self.results['cycles'].append((cache_info['level'], cycles))

            self.results[cache_info['level']] = cycles

        return self.results

    def analyze(self):
        """Run complete anaylysis and return results."""
        self.calculate_cache_access()
        self.calculate_cycles()
        self.results['flops per iteration'] = sum(self.kernel._flops.values())

        return self.results

    def conv_cy(self, cy_cl):
        """Convert cycles (cy/CL) to other units, such as FLOP/s or It/s."""
        if not isinstance(cy_cl, PrefixedUnit):
            cy_cl = PrefixedUnit(cy_cl, '', 'cy/CL')

        clock = self.machine['clock']
        element_size = self.kernel.datatypes_size[self.kernel.datatype]
        elements_per_cacheline = int(self.machine['cacheline size']) // element_size
        if cy_cl != 0:
            it_s = clock/cy_cl*elements_per_cacheline
            it_s.unit = 'It/s'
        else:
            it_s = PrefixedUnit('inf It/s')
        flops_per_it = sum(self.kernel._flops.values())
        performance = it_s*flops_per_it
        performance.unit = 'FLOP/s'
        cy_it = cy_cl/elements_per_cacheline
        cy_it.unit = 'cy/It'

        return {'It/s': it_s,
                'cy/CL': cy_cl,
                'cy/It': cy_it,
                'FLOP/s': performance}

    def report_data_transfers(self):
        cacheline_size = float(self.machine['cacheline size'])
        r = "Data Transfers:\nLevel   | Loads    | Store    |\n"
        loads, stores = (self.predictor.get_loads(), self.predictor.get_stores())
        for cache_level, cache_info in list(enumerate(self.machine['memory hierarchy']))[1:]:
            r += ("{:>7} | {:>3.0f} B/CL | {:>3.0f} B/CL |\n".format(
                self.machine['memory hierarchy'][cache_level-1]['level']+'-'+cache_info['level'],
                loads[cache_level] * cacheline_size,
                stores[cache_level] * cacheline_size))
        return r

    def report(self, output_file=sys.stdout):
        """Print generated model data in human readable format."""
        if self.verbose > 1:
            print('{}'.format(pprint.pformat(self.results['verbose infos'])), file=output_file)

        for level, cycles in self.results['cycles']:
            print('{} = {}'.format(
                level, self.conv_cy(cycles)[self._args.unit]), file=output_file)

        if self.verbose > 1:
            if 'memory bandwidth kernel' in self.results:
                print('memory cycles based on {} kernel with {}'.format(
                          self.results['memory bandwidth kernel'],
                          self.results['memory bandwidth']),
                      file=output_file)

        if self.verbose > 1:
            print(file=output_file)
            print(self.report_data_transfers(), file=output_file)
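
The duplexness handling in calculate_cycles() above reduces to two small formulas. A standalone illustration with made-up numbers: the B/cy throughput is first converted to CL/cy, then half-duplex serializes load and store cachelines through the same path while full-duplex overlaps them.

# Half- vs. full-duplex cycle formulas, illustrative values only:
cacheline_size = 64.0   # bytes
throughput_b_cy = 32.0  # bytes per cycle
loads, stores = 3, 1    # cachelines transferred per cacheline of work

throughput = throughput_b_cy / cacheline_size  # cachelines per cycle

half_duplex_cycles = (loads + stores) / throughput
full_duplex_cycles = max(loads / throughput, stores / throughput)
print('half-duplex: {:.1f} cy/CL, full-duplex: {:.1f} cy/CL'.format(
    half_duplex_cycles, full_duplex_cycles))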
Example #6
class ECMData(PerformanceModel):
    """Representation of Data portion of the Execution-Cache-Memory Model."""

    name = "Execution-Cache-Memory (data transfers only)"

    def __init__(self,
                 kernel,
                 machine,
                 args=None,
                 parser=None,
                 cores=1,
                 cache_predictor=CacheSimulationPredictor,
                 verbose=0):
        """
        Create Execution-Cache-Memory data model from kernel and machine objects.

        *kernel* is a Kernel object
        *machine* describes the machine (cpu, cache and memory) characteristics
        *args* (optional) are the parsed arguments from the command line

        If *args* is None, *cores*, *cache_predictor* and *verbose* are taken into account,
        otherwise *args* takes precedence.
        """
        self.kernel = kernel
        self.machine = machine
        self._args = args
        self._parser = parser
        self.results = {}

        if args:
            self.verbose = self._args.verbose
            self.cores = self._args.cores
            if self._args.cache_predictor == 'SIM':
                self.predictor = CacheSimulationPredictor(
                    self.kernel, self.machine, self.cores)
            elif self._args.cache_predictor == 'LC':
                self.predictor = LayerConditionPredictor(
                    self.kernel, self.machine, self.cores)
            else:
                raise NotImplementedError(
                    "Unknown cache predictor, only LC (layer condition) and "
                    "SIM (cache simulation with pycachesim) is supported.")
        else:
            self.cores = cores
            self.predictor = cache_predictor(self.kernel, self.machine,
                                             self.cores)
            self.verbose = verbose

    def calculate_cache_access(self):
        """Dispatch to cache predictor to get cache stats."""
        self.results.update({
            'cycles': [],  # will be filled by calculate_cycles()
            'misses': self.predictor.get_misses(),
            'hits': self.predictor.get_hits(),
            'evicts': self.predictor.get_evicts(),
            'verbose infos': self.predictor.get_infos()  # only for verbose outputs
        })

    def calculate_cycles(self):
        """
        Calculate performance model cycles from cache stats.

        calculate_cache_access() needs to have been executed before.
        """
        element_size = self.kernel.datatypes_size[self.kernel.datatype]
        elements_per_cacheline = float(
            self.machine['cacheline size']) // element_size
        cacheline_size = float(self.machine['cacheline size'])

        loads, stores = (self.predictor.get_loads(),
                         self.predictor.get_stores())

        for cache_level, cache_info in list(
                enumerate(self.machine['memory hierarchy']))[1:]:
            throughput, duplexness = cache_info['non-overlap upstream throughput']

            if type(throughput) is str and throughput == 'full socket memory bandwidth':
                # Memory transfer
                # we use bandwidth to calculate cycles and then add penalty cycles (if given)

                # choose bw according to cache level and problem
                # first, compile stream counts at current cache level
                # write-allocate is already resolved in the cache predictor
                read_streams = loads[cache_level]
                write_streams = stores[cache_level]
                # second, try to find the best-fitting kernel (closest to the observed stream counts):
                threads_per_core = 1
                bw, measurement_kernel = self.machine.get_bandwidth(
                    cache_level, read_streams, write_streams, threads_per_core)

                # calculate cycles
                if duplexness == 'half-duplex':
                    cycles = float(loads[cache_level] + stores[cache_level]) * \
                             float(elements_per_cacheline) * float(element_size) * \
                             float(self.machine['clock']) / float(bw)
                else:  # full-duplex
                    raise NotImplementedError(
                        "full-duplex mode is not (yet) supported for memory transfers."
                    )
                # add penalty cycles for each read stream
                if 'penalty cycles per read stream' in cache_info:
                    cycles += loads[cache_level] * \
                              cache_info['penalty cycles per read stream']

                self.results.update({
                    'memory bandwidth kernel': measurement_kernel,
                    'memory bandwidth': bw
                })
            else:
                # since throughput is given in B/cy, and we need CL/cy:
                throughput = float(throughput) / cacheline_size
                # only cache cycles count
                if duplexness == 'half-duplex':
                    cycles = (loads[cache_level] +
                              stores[cache_level]) / float(throughput)
                elif duplexness == 'full-duplex':
                    cycles = max(loads[cache_level] / float(throughput),
                                 stores[cache_level] / float(throughput))
                else:
                    raise ValueError(
                        "Duplexness of cache throughput may only be 'half-duplex' "
                        "or 'full-duplex', found {} in {}.".format(
                            duplexness, cache_info['name']))

            self.results['cycles'].append((cache_info['level'], cycles))

            self.results[cache_info['level']] = cycles

        return self.results

    def analyze(self):
        """Run complete anaylysis and return results."""
        self.calculate_cache_access()
        self.calculate_cycles()
        self.results['flops per iteration'] = sum(self.kernel._flops.values())

        return self.results

    def conv_cy(self, cy_cl):
        """Convert cycles (cy/CL) to other units, such as FLOP/s or It/s."""
        if not isinstance(cy_cl, PrefixedUnit):
            cy_cl = PrefixedUnit(cy_cl, '', 'cy/CL')

        clock = self.machine['clock']
        element_size = self.kernel.datatypes_size[self.kernel.datatype]
        elements_per_cacheline = int(
            self.machine['cacheline size']) // element_size
        if cy_cl != 0:
            it_s = clock / cy_cl * elements_per_cacheline
            it_s.unit = 'It/s'
        else:
            it_s = PrefixedUnit('inf It/s')
        flops_per_it = sum(self.kernel._flops.values())
        performance = it_s * flops_per_it
        performance.unit = 'FLOP/s'
        cy_it = cy_cl / elements_per_cacheline
        cy_it.unit = 'cy/It'

        return {
            'It/s': it_s,
            'cy/CL': cy_cl,
            'cy/It': cy_it,
            'FLOP/s': performance
        }

    def report_data_transfers(self):
        cacheline_size = float(self.machine['cacheline size'])
        r = "Data Transfers:\nLevel   | Loads    | Store    |\n"
        loads, stores = (self.predictor.get_loads(),
                         self.predictor.get_stores())
        for cache_level, cache_info in list(
                enumerate(self.machine['memory hierarchy']))[1:]:
            r += ("{:>7} | {:>3.0f} B/CL | {:>3.0f} B/CL |\n".format(
                self.machine['memory hierarchy'][cache_level - 1]['level'] +
                '-' + cache_info['level'], loads[cache_level] * cacheline_size,
                stores[cache_level] * cacheline_size))
        return r

    def report(self, output_file=sys.stdout):
        """Print generated model data in human readable format."""
        if self.verbose > 1:
            print('{}'.format(pprint.pformat(self.results['verbose infos'])),
                  file=output_file)

        for level, cycles in self.results['cycles']:
            print('{} = {}'.format(level,
                                   self.conv_cy(cycles)[self._args.unit]),
                  file=output_file)

        if self.verbose > 1:
            if 'memory bandwidth kernel' in self.results:
                print('memory cycles based on {} kernel with {}'.format(
                    self.results['memory bandwidth kernel'],
                    self.results['memory bandwidth']),
                      file=output_file)

        if self.verbose > 1:
            print(file=output_file)
            print(self.report_data_transfers(), file=output_file)
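
The unit conversions in conv_cy() above can be checked by hand. A plain-float walk-through with illustrative numbers only (note that cy/It is cy/CL divided by the number of iterations per cacheline, since a cacheline covers several iterations):

# conv_cy() arithmetic with illustrative values only:
clock = 2.7e9          # Hz
cy_cl = 8.0            # cycles per cacheline, a hypothetical model result
element_size = 8       # bytes (double)
cacheline_size = 64    # bytes
flops_per_it = 2       # e.g. one mul + one add per iteration

elements_per_cacheline = cacheline_size // element_size   # 8
it_s = clock / cy_cl * elements_per_cacheline             # iterations per second
performance = it_s * flops_per_it                         # FLOP/s
cy_it = cy_cl / elements_per_cacheline                    # cycles per iteration
print('{:.2e} It/s, {:.2e} FLOP/s, {:.2f} cy/It'.format(
    it_s, performance, cy_it))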
Example #7
class Roofline(PerformanceModel):
    """
    Representation of the Roofline model based on simplistic FLOP analysis.

    more info to follow...
    """

    name = "Roofline"

    @classmethod
    def configure_arggroup(cls, parser):
        """Configure argument parser."""
        pass

    def __init__(self, kernel, machine, args=None, parser=None, cores=1,
                 cache_predictor=LayerConditionPredictor, verbose=0):
        """
        Create roofline model from kernel and machine objects.

        *kernel* is a Kernel object
        *machine* describes the machine (cpu, cache and memory) characteristics
        *args* (optional) are the parsed arguments from the command line


        If *args* is None, *cores*, *cache_predictor* and *verbose* will be used, otherwise
        *args* takes precedence.
        """
        self.kernel = kernel
        self.machine = machine
        self._args = args
        self._parser = parser
        self.results = None

        if args:
            self.verbose = self._args.verbose
            self.cores = self._args.cores
            if self._args.cache_predictor == 'SIM':
                self.predictor = CacheSimulationPredictor(self.kernel, self.machine, self.cores)
            elif self._args.cache_predictor == 'LC':
                self.predictor = LayerConditionPredictor(self.kernel, self.machine, self.cores)
            else:
                raise NotImplementedError("Unknown cache predictor, only LC (layer condition) and "
                                          "SIM (cache simulation with pycachesim) is supported.")
        else:
            self.cores = cores
            self.predictor = cache_predictor(self.kernel, self.machine, self.cores)
            self.verbose = verbose

        if sum(self.kernel._flops.values()) == 0:
            raise ValueError("The Roofline model requires that the sum of FLOPs is non-zero.")

    def calculate_cache_access(self):
        """Apply cache prediction to generate cache access behaviour."""
        self.results = {'misses': self.predictor.get_misses(),
                        'hits': self.predictor.get_hits(),
                        'evicts': self.predictor.get_evicts(),
                        'verbose infos': self.predictor.get_infos(),  # only for verbose outputs
                        'bottleneck level': 0,
                        'mem bottlenecks': []}

        element_size = self.kernel.datatypes_size[self.kernel.datatype]
        cacheline_size = float(self.machine['cacheline size'])
        elements_per_cacheline = int(cacheline_size // element_size)

        total_flops = sum(self.kernel._flops.values())*elements_per_cacheline

        # TODO let user choose threads_per_core:
        threads_per_core = 1

        # Compile relevant information

        # CPU-L1 stats (in bytes!)
        # We compile CPU-L1 stats on our own, because cacheprediction only works on cache lines
        read_offsets, write_offsets = zip(*list(self.kernel.compile_global_offsets(
            iteration=range(0, elements_per_cacheline))))
        read_offsets = set([item for sublist in read_offsets if sublist is not None
                            for item in sublist])
        write_offsets = set([item for sublist in write_offsets if sublist is not None
                             for item in sublist])

        write_streams = len(write_offsets)
        read_streams = len(read_offsets) + write_streams  # write-allocate
        total_loads = read_streams * element_size
        # total_evicts = write_streams * element_size
        bw, measurement_kernel = self.machine.get_bandwidth(
            0,
            read_streams - write_streams,  # no write-allocate in L1
            write_streams,
            threads_per_core,
            cores=self.cores)

        # Calculate performance (arithmetic intensity * bandwidth with
        # arithmetic intensity = flops / bytes loaded )
        if total_loads == 0:
            # This happens in case of full-caching
            arith_intens = None
            performance = None
        else:
            arith_intens = float(total_flops)/total_loads
            performance = PrefixedUnit(arith_intens * float(bw), 'FLOP/s')

        self.results['mem bottlenecks'].append({
            'performance': self.conv_perf(performance),
            'level': self.machine['memory hierarchy'][0]['level'],
            'arithmetic intensity': arith_intens,
            'bw kernel': measurement_kernel,
            'bandwidth': bw,
            'bytes transfered': total_loads})
        self.results['bottleneck level'] = len(self.results['mem bottlenecks'])-1
        self.results['min performance'] = self.conv_perf(performance)

        # for other cache and memory levels:
        for cache_level, cache_info in list(enumerate(self.machine['memory hierarchy']))[:-1]:
            # Compiling stats (in bytes!)
            total_misses = self.results['misses'][cache_level]*cacheline_size
            total_evicts = self.results['evicts'][cache_level]*cacheline_size

            # choose bw according to cache level and problem
            # first, compile stream counts at current cache level
            # write-allocate is already resolved above
            read_streams = self.results['misses'][cache_level]
            write_streams = self.results['evicts'][cache_level]
            # second, try to find the best-fitting kernel (closest to the observed stream counts):
            bw, measurement_kernel = self.machine.get_bandwidth(
                cache_level+1, read_streams, write_streams, threads_per_core,
                cores=self.cores)

            # Calculate performance (arithmetic intensity * bandwidth with
            # arithmetic intensity = flops / bytes transferred)
            bytes_transfered = total_misses + total_evicts

            if bytes_transfered == 0:
                # This happens in case of full-caching
                arith_intens = float('inf')
                performance = PrefixedUnit(float('inf'), 'FLOP/s')
            else:
                arith_intens = float(total_flops)/bytes_transfered
                performance = PrefixedUnit(arith_intens * float(bw), 'FLOP/s')

            self.results['mem bottlenecks'].append({
                'performance': self.conv_perf(performance),
                'level': (self.machine['memory hierarchy'][cache_level + 1]['level']),
                'arithmetic intensity': arith_intens,
                'bw kernel': measurement_kernel,
                'bandwidth': bw,
                'bytes transfered': bytes_transfered})
            if performance < self.results.get('min performance', {'FLOP/s': performance})['FLOP/s']:
                self.results['bottleneck level'] = len(self.results['mem bottlenecks'])-1
                self.results['min performance'] = self.conv_perf(performance)

        return self.results

    def analyze(self):
        """Run analysis."""
        precision = 'DP' if self.kernel.datatype == 'double' else 'SP'
        self.calculate_cache_access()

        self.results['max_perf'] = self.conv_perf(self.machine['clock'] * self.cores * \
            self.machine['FLOPs per cycle'][precision]['total'])

    def conv_perf(self, performance):
        """Convert performance (FLOP/s) to other units, such as It/s or cy/CL."""
        clock = self.machine['clock']
        flops_per_it = sum(self.kernel._flops.values())
        it_s = performance/flops_per_it
        it_s.unit = 'It/s'
        element_size = self.kernel.datatypes_size[self.kernel.datatype]
        elements_per_cacheline = int(float(self.machine['cacheline size'])) / element_size
        cy_cl = clock/it_s*elements_per_cacheline
        cy_cl.unit = 'cy/CL'
        cy_it = clock/it_s
        cy_it.unit = 'cy/It'

        return {'It/s': it_s,
                'cy/CL': cy_cl,
                'cy/It': cy_it,
                'FLOP/s': performance}

    def report(self, output_file=sys.stdout):
        """Report analysis outcome in human readable form."""
        max_perf = self.results['max_perf']

        if self._args and self._args.verbose >= 3:
            print('{}'.format(pformat(self.results)), file=output_file)

        if self._args and self._args.verbose >= 1:
            print('{}'.format(pformat(self.results['verbose infos'])), file=output_file)
            print('Bottlenecks:', file=output_file)
            print('  level | a. intensity |   performance   |   peak bandwidth  | peak bandwidth kernel',
                  file=output_file)
            print('--------+--------------+-----------------+-------------------+----------------------',
                  file=output_file)
            print('    CPU |              | {!s:>15} |                   |'.format(
                max_perf[self._args.unit]),
                  file=output_file)
            for b in self.results['mem bottlenecks']:
                print('{level:>7} | {arithmetic intensity:>5.2} FLOP/B | {0!s:>15} |'
                      ' {bandwidth!s:>17} | {bw kernel:<8}'.format(
                          b['performance'][self._args.unit], **b),
                      file=output_file)
            print('', file=output_file)

        if self.results['min performance']['FLOP/s'] > max_perf['FLOP/s']:
            # CPU bound
            print('CPU bound. {!s} due to CPU max. FLOP/s'.format(max_perf), file=output_file)
        else:
            # Cache or mem bound
            print('Cache or mem bound.', file=output_file)

            bottleneck = self.results['mem bottlenecks'][self.results['bottleneck level']]
            print('{!s} due to {} transfer bottleneck (with bw from {} benchmark)'.format(
                    bottleneck['performance'][self._args.unit],
                    bottleneck['level'],
                    bottleneck['bw kernel']),
                  file=output_file)
            print('Arithmetic Intensity: {:.2f} FLOP/B'.format(bottleneck['arithmetic intensity']),
                  file=output_file)
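
The bottleneck bookkeeping in calculate_cache_access() above amounts to tracking the minimum predicted performance over the memory hierarchy. A standalone sketch with illustrative numbers only:

# Bottleneck selection: the level with the lowest predicted performance wins.
levels = ['L1', 'L2', 'L3', 'MEM']
performances = [180e9, 120e9, 75e9, 20e9]  # FLOP/s per level, illustrative

bottleneck_level = min(range(len(levels)), key=lambda i: performances[i])
print('bottleneck: {} at {:.0f} GFLOP/s'.format(
    levels[bottleneck_level], performances[bottleneck_level] / 1e9))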
Example #8
class Roofline(PerformanceModel):
    """
    Representation of the Roofline model based on simplistic FLOP analysis.

    more info to follow...
    """

    name = "Roofline"

    @classmethod
    def configure_arggroup(cls, parser):
        """Configure argument parser."""
        pass

    def __init__(self,
                 kernel,
                 machine,
                 args=None,
                 parser=None,
                 cores=1,
                 cache_predictor=LayerConditionPredictor,
                 verbose=0):
        """
        Create roofline model from kernel and machine objects.

        *kernel* is a Kernel object
        *machine* describes the machine (cpu, cache and memory) characteristics
        *args* (optional) are the parsed arguments from the command line


        If *args* is None, *cores*, *cache_predictor* and *verbose* will be used, otherwise
        *args* takes precedence.
        """
        self.kernel = kernel
        self.machine = machine
        self._args = args
        self._parser = parser
        self.results = None

        if args:
            self.verbose = self._args.verbose
            self.cores = self._args.cores
            if self._args.cache_predictor == 'SIM':
                self.predictor = CacheSimulationPredictor(
                    self.kernel, self.machine, self.cores)
            elif self._args.cache_predictor == 'LC':
                self.predictor = LayerConditionPredictor(
                    self.kernel, self.machine, self.cores)
            else:
                raise NotImplementedError(
                    "Unknown cache predictor, only LC (layer condition) and "
                    "SIM (cache simulation with pycachesim) is supported.")
        else:
            self.cores = cores
            self.predictor = cache_predictor(self.kernel, self.machine,
                                             self.cores)
            self.verbose = verbose

        if sum(self.kernel._flops.values()) == 0:
            raise ValueError(
                "The Roofline model requires that the sum of FLOPs is non-zero."
            )

    def calculate_cache_access(self):
        """Apply cache prediction to generate cache access behaviour."""
        self.results = {
            'misses': self.predictor.get_misses(),
            'hits': self.predictor.get_hits(),
            'evicts': self.predictor.get_evicts(),
            'verbose infos': self.predictor.get_infos(),  # only for verbose outputs
            'bottleneck level': 0,
            'mem bottlenecks': []
        }

        element_size = self.kernel.datatypes_size[self.kernel.datatype]
        cacheline_size = float(self.machine['cacheline size'])
        elements_per_cacheline = int(cacheline_size // element_size)

        total_flops = sum(self.kernel._flops.values()) * elements_per_cacheline

        # TODO let user choose threads_per_core:
        threads_per_core = 1

        # Compile relevant information

        # CPU-L1 stats (in bytes!)
        # We compile CPU-L1 stats on our own, because cacheprediction only works on cache lines
        read_offsets, write_offsets = zip(*list(
            self.kernel.compile_global_offsets(
                iteration=range(0, elements_per_cacheline))))
        read_offsets = set([
            item for sublist in read_offsets if sublist is not None
            for item in sublist
        ])
        write_offsets = set([
            item for sublist in write_offsets if sublist is not None
            for item in sublist
        ])

        write_streams = len(write_offsets)
        read_streams = len(read_offsets) + write_streams  # write-allocate
        total_loads = read_streams * element_size
        # total_evicts = write_streams * element_size
        bw, measurement_kernel = self.machine.get_bandwidth(
            0,
            read_streams - write_streams,  # no write-allocate in L1
            write_streams,
            threads_per_core,
            cores=self.cores)

        # Calculate performance (arithmetic intensity * bandwidth with
        # arithmetic intensity = flops / bytes loaded )
        if total_loads == 0:
            # This happens in case of full-caching
            arith_intens = None
            performance = None
        else:
            arith_intens = float(total_flops) / total_loads
            performance = PrefixedUnit(arith_intens * float(bw), 'FLOP/s')

        self.results['mem bottlenecks'].append({
            'performance': self.conv_perf(performance),
            'level': self.machine['memory hierarchy'][0]['level'],
            'arithmetic intensity': arith_intens,
            'bw kernel': measurement_kernel,
            'bandwidth': bw,
            'bytes transfered': total_loads
        })
        self.results['bottleneck level'] = len(
            self.results['mem bottlenecks']) - 1
        self.results['min performance'] = self.conv_perf(performance)

        # for other cache and memory levels:
        for cache_level, cache_info in list(
                enumerate(self.machine['memory hierarchy']))[:-1]:
            # Compiling stats (in bytes!)
            total_misses = self.results['misses'][cache_level] * cacheline_size
            total_evicts = self.results['evicts'][cache_level] * cacheline_size

            # choose bw according to cache level and problem
            # first, compile stream counts at current cache level
            # write-allocate is already resolved above
            read_streams = self.results['misses'][cache_level]
            write_streams = self.results['evicts'][cache_level]
            # second, try to find the best-fitting kernel (closest to the observed stream counts):
            bw, measurement_kernel = self.machine.get_bandwidth(
                cache_level + 1,
                read_streams,
                write_streams,
                threads_per_core,
                cores=self.cores)

            # Calculate performance (arithmetic intensity * bandwidth with
            # arithmetic intensity = flops / bytes transferred)
            bytes_transfered = total_misses + total_evicts

            if bytes_transfered == 0:
                # This happens in case of full-caching
                arith_intens = float('inf')
                performance = PrefixedUnit(float('inf'), 'FLOP/s')
            else:
                arith_intens = float(total_flops) / bytes_transfered
                performance = PrefixedUnit(arith_intens * float(bw), 'FLOP/s')

            self.results['mem bottlenecks'].append({
                'performance': self.conv_perf(performance),
                'level': self.machine['memory hierarchy'][cache_level + 1]['level'],
                'arithmetic intensity': arith_intens,
                'bw kernel': measurement_kernel,
                'bandwidth': bw,
                'bytes transfered': bytes_transfered
            })
            if performance < self.results.get(
                    'min performance', {'FLOP/s': performance})['FLOP/s']:
                self.results['bottleneck level'] = len(
                    self.results['mem bottlenecks']) - 1
                self.results['min performance'] = self.conv_perf(performance)

        return self.results

    def analyze(self):
        """Run analysis."""
        precision = 'DP' if self.kernel.datatype == 'double' else 'SP'
        self.calculate_cache_access()

        self.results['max_perf'] = self.conv_perf(self.machine['clock'] * self.cores * \
            self.machine['FLOPs per cycle'][precision]['total'])

    def conv_perf(self, performance):
        """Convert performance (FLOP/s) to other units, such as It/s or cy/CL."""
        clock = self.machine['clock']
        flops_per_it = sum(self.kernel._flops.values())
        it_s = performance / flops_per_it
        it_s.unit = 'It/s'
        element_size = self.kernel.datatypes_size[self.kernel.datatype]
        elements_per_cacheline = int(float(
            self.machine['cacheline size'])) / element_size
        cy_cl = clock / it_s * elements_per_cacheline
        cy_cl.unit = 'cy/CL'
        cy_it = clock / it_s
        cy_it.unit = 'cy/It'

        return {
            'It/s': it_s,
            'cy/CL': cy_cl,
            'cy/It': cy_it,
            'FLOP/s': performance
        }

    def report(self, output_file=sys.stdout):
        """Report analysis outcome in human readable form."""
        max_perf = self.results['max_perf']

        if self._args and self._args.verbose >= 3:
            print('{}'.format(pformat(self.results)), file=output_file)

        if self._args and self._args.verbose >= 1:
            print('{}'.format(pformat(self.results['verbose infos'])),
                  file=output_file)
            print('Bottlenecks:', file=output_file)
            print(
                '  level | a. intensity |   performance   |   peak bandwidth  | peak bandwidth kernel',
                file=output_file)
            print(
                '--------+--------------+-----------------+-------------------+----------------------',
                file=output_file)
            print('    CPU |              | {!s:>15} |                   |'.format(
                max_perf[self._args.unit]),
                  file=output_file)
            for b in self.results['mem bottlenecks']:
                print(
                    '{level:>7} | {arithmetic intensity:>5.2} FLOP/B | {0!s:>15} |'
                    ' {bandwidth!s:>17} | {bw kernel:<8}'.format(
                        b['performance'][self._args.unit], **b),
                    file=output_file)
            print('', file=output_file)

        if self.results['min performance']['FLOP/s'] > max_perf['FLOP/s']:
            # CPU bound
            print('CPU bound. {!s} due to CPU max. FLOP/s'.format(max_perf),
                  file=output_file)
        else:
            # Cache or mem bound
            print('Cache or mem bound.', file=output_file)

            bottleneck = self.results['mem bottlenecks'][
                self.results['bottleneck level']]
            print(
                '{!s} due to {} transfer bottleneck (with bw from {} benchmark)'
                .format(bottleneck['performance'][self._args.unit],
                        bottleneck['level'], bottleneck['bw kernel']),
                file=output_file)
            print('Arithmetic Intensity: {:.2f} FLOP/B'.format(
                bottleneck['arithmetic intensity']),
                  file=output_file)

        if any([
                '_Complex' in var_info[0]
                for var_info in self.kernel.variables.values()
        ]):
            print(
                "WARNING: FLOP counts are probably wrong, because complex flops are counted\n"
                "         as single flops. All other units should not be affected.\n",
                file=sys.stderr)
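
Putting this variant together end to end: a minimal usage sketch, again assuming the kerncraft package layout and a placeholder machine file. Since report() expects parsed CLI args, the results dict is read directly after analyze().

from kerncraft.machinemodel import MachineModel
from kerncraft.kernel import KernelCode
from kerncraft.models import Roofline

machine = MachineModel('machine_file.yml')  # placeholder machine file
kernel = KernelCode('''
double a[N];
double b[N];
double s;

for (int i = 0; i < N; ++i)
    a[i] = s * b[i];
''', machine=machine)
kernel.set_constant('N', 10000000)

model = Roofline(kernel, machine, cores=1)  # defaults to layer conditions
model.analyze()
bn = model.results['mem bottlenecks'][model.results['bottleneck level']]
print('bottleneck:', bn['level'])
print('predicted performance:', bn['performance']['FLOP/s'])
print('peak:', model.results['max_perf']['FLOP/s'])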