Example #1
def measure_bw(type_, total_size, threads_per_core, max_threads_per_core,
               cores_per_socket, sockets):
    """*size* is given in kilo bytes"""
    groups = []
    for s in range(sockets):
        groups += [
            '-w', 'S' + str(s) + ':' + str(total_size) + 'kB:' +
            str(threads_per_core * cores_per_socket) + ':1:' +
            str(int(max_threads_per_core / threads_per_core))
        ]
    # for older likwid versions add ['-g', str(sockets), '-i', str(iterations)] to cmd
    cmd = ['likwid-bench', '-t', type_] + groups
    sys.stderr.write(' '.join(cmd))
    output = subprocess.Popen(
        cmd, stdout=subprocess.PIPE).communicate()[0].decode('utf-8')
    if not output:
        print(' '.join(cmd) +
              ' returned no output, possibly wrong version installed '
              '(requires 4.0 or later)',
              file=sys.stderr)
        sys.exit(1)
    bw = float(
        get_match_or_break(r'^MByte/s:\s+([0-9]+(?:\.[0-9]+)?)\s*$',
                           output)[0])
    print(' ', PrefixedUnit(bw, 'MB/s'), file=sys.stderr)
    return PrefixedUnit(bw, 'MB/s')
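
A minimal sketch of the likwid-bench workgroup string this helper assembles, using assumed (hypothetical) sizes and thread counts:

# Hypothetical values only; the workgroup format is
# <domain>:<size>:<threads>:<chunk>:<stride>, mirroring the string built in measure_bw().
total_size = 16384           # kB, assumed
threads_per_core = 1         # assumed
max_threads_per_core = 2     # assumed
cores_per_socket = 10        # assumed
workgroup = ('S0:' + str(total_size) + 'kB:' +
             str(threads_per_core * cores_per_socket) + ':1:' +
             str(int(max_threads_per_core / threads_per_core)))
# -> 'S0:16384kB:10:1:2', i.e. likwid-bench -t copy -w S0:16384kB:10:1:2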
Example #2
    def conv_cy(self, cy_cl, unit, default='cy/CL'):
        """Convert cycles (cy/CL) to other units, such as FLOP/s or It/s."""
        if not isinstance(cy_cl, PrefixedUnit):
            cy_cl = PrefixedUnit(cy_cl, '', 'cy/CL')
        if not unit:
            unit = default

        clock = self.machine['clock']
        element_size = self.kernel.datatypes_size[self.kernel.datatype]
        elements_per_cacheline = int(
            self.machine['cacheline size']) // element_size
        if cy_cl != 0:
            it_s = clock / cy_cl * elements_per_cacheline
            it_s.unit = 'It/s'
        else:
            it_s = PrefixedUnit('inf It/s')
        flops_per_it = sum(self.kernel._flops.values())
        performance = it_s * flops_per_it
        performance.unit = 'FLOP/s'
        cy_it = cy_cl * elements_per_cacheline
        cy_it.unit = 'cy/It'

        return {
            'It/s': it_s,
            'cy/CL': cy_cl,
            'cy/It': cy_it,
            'FLOP/s': performance
        }[unit]
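
A worked instance of the conversion, with assumed clock, data type, and cycle count (not taken from any machine file):

# Assumed numbers: 2.5 GHz clock, 8 B (double) elements, 64 B cache lines.
clock = 2.5e9                               # Hz, assumed
elements_per_cacheline = 64 // 8            # = 8 iterations per cacheline
cy_cl = 4.0                                 # cy/CL, assumed kernel runtime
it_s = clock / cy_cl * elements_per_cacheline   # = 5.0e9 It/s
flops_per_it = 2                            # assumed FLOPs per iteration
flop_s = it_s * flops_per_it                # = 1.0e10 FLOP/s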
Example #3
    def test_2d5pt_Roofline(self):
        store_file = os.path.join(self.temp_dir, 'test_2d5pt_Roofline.pickle')
        output_stream = StringIO()

        parser = kc.create_parser()
        args = parser.parse_args(['-m', self._find_file('phinally_gcc.yaml'),
                                  '-p', 'Roofline',
                                  self._find_file('2d-5pt.c'),
                                  '-D', 'N', '1024-4096:3log2',
                                  '-D', 'M', '50',
                                  '-vvv',
                                  '--store', store_file])
        kc.check_arguments(args, parser)
        kc.run(parser, args, output_file=output_stream)

        results = pickle.load(open(store_file, 'rb'))

        # Check if results contains correct kernel
        self.assertEqual(list(results), ['2d-5pt.c'])

        # Check for correct variations of constants
        six.assertCountEqual(self,
            [sorted(map(str, r)) for r in results['2d-5pt.c']],
            [sorted(map(str, r)) for r in [
                ((sympy.var('M'), 50), (sympy.var('N'), 1024)),
                ((sympy.var('M'), 50), (sympy.var('N'), 2048)),
                ((sympy.var('M'), 50), (sympy.var('N'), 4096))]])

        # Output of first result:
        result = results['2d-5pt.c'][
            [k for k in results['2d-5pt.c'] if (sympy.var('N'), 4096) in k][0]]

        six.assertCountEqual(self, result, ['Roofline'])

        roofline = result['Roofline']
        self.assertAlmostEqual(roofline['min performance'], 5802500000.0, places=0)
        self.assertEqual(roofline['bottleneck level'], 2)
        
        expected_btlncks = [{u'arithmetic intensity': 0.11764705882352941,
                             u'bandwidth': PrefixedUnit(122.97, u'G', u'B/s'),
                             u'bw kernel': 'copy',
                             u'level': u'L1',
                             u'performance': PrefixedUnit(14467058823.529411, u'', u'FLOP/s')},
                            {u'arithmetic intensity': 0.1,
                             u'bandwidth': PrefixedUnit(61.92, u'G', u'B/s'),
                             u'bw kernel': 'copy',
                             u'level': u'L2',
                             u'performance': PrefixedUnit(6192000000.0, u'', u'FLOP/s')},
                            {u'arithmetic intensity': 0.16666666666666666,
                             u'bandwidth': PrefixedUnit(34815.0, u'M', u'B/s'),
                             u'bw kernel': 'copy',
                             u'level': u'L3',
                             u'performance': PrefixedUnit(5802500000.0, u'', u'FLOP/s')},
                            {u'arithmetic intensity': float(0.5),
                             u'bandwidth': PrefixedUnit(12.01, u'G', u'B/s'),
                             u'bw kernel': 'load',
                             u'level': u'MEM',
                             u'performance': PrefixedUnit(6005000000.0, u'', u'FLOP/s')}]
        
        for i, btlnck in enumerate(expected_btlncks):
            for k,v in btlnck.items():
                self.assertEqual(roofline['mem bottlenecks'][i][k], v)
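
The expected bottleneck entries follow the usual roofline relation (performance = bandwidth * arithmetic intensity); a quick sanity check against the L2 entry above:

import math

# L2 entry from expected_btlncks: 61.92 GB/s copy bandwidth at 0.1 FLOP/B.
bandwidth = 61.92e9       # B/s
arith_intensity = 0.1     # FLOP/B
assert math.isclose(bandwidth * arith_intensity, 6192000000.0)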
Example #4
    def report(self, output_file=sys.stdout):
        cpu_flops = PrefixedUnit(
            self.results['cpu bottleneck']['performance throughput'], "FLOP/s")

        if self._args and self._args.verbose >= 3:
            print('{}'.format(pformat(self.results)), file=output_file)

        if self._args and self._args.verbose >= 1:
            print('Bottlenecks:', file=output_file)
            print(
                '  level | a. intensity |   performance   |   bandwidth  | bandwidth kernel',
                file=output_file)
            print(
                '--------+--------------+-----------------+--------------+-----------------',
                file=output_file)
            print('    CPU |              | {!s:>15} |              |'.format(
                self.conv_perf(cpu_flops, self._args.unit)),
                  file=output_file)
            for b in self.results['mem bottlenecks']:
                if b is None: continue  # Skip CPU-L1 from Roofline model
                print(
                    '{level:>7} | {arithmetic intensity:>5.2} FLOP/B | {!s:>15} |'
                    ' {bandwidth!s:>12} | {bw kernel:<8}'.format(
                        self.conv_perf(b['performance'], self._args.unit),
                        **b),
                    file=output_file)
            print('', file=output_file)
            print('IACA analysis:', file=output_file)
            print('{!s}'.format({
                k: v
                for k, v in list(self.results['cpu bottleneck'].items())
                if k not in ['IACA output']
            }),
                  file=output_file)

        if float(self.results['min performance']) > float(cpu_flops):
            # CPU bound
            print('CPU bound with {} core(s)'.format(self._args.cores),
                  file=output_file)
            print('{!s} due to CPU bottleneck'.format(
                self.conv_perf(cpu_flops, self._args.unit)),
                  file=output_file)
        else:
            # Cache or mem bound
            print('Cache or mem bound with {} core(s)'.format(
                self._args.cores),
                  file=output_file)

            bottleneck = self.results['mem bottlenecks'][
                self.results['bottleneck level']]
            print(
                '{!s} due to {} transfer bottleneck (bw from {} benchmark)'
                .format(
                    self.conv_perf(bottleneck['performance'], self._args.unit),
                    bottleneck['level'], bottleneck['bw kernel']),
                file=output_file)
            print('Arithmetic Intensity: {:.2f} FLOP/B'.format(
                bottleneck['arithmetic intensity']),
                  file=output_file)
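
The bound decision at the end of report() in miniature, with assumed performance numbers:

# Assumed numbers, not from any measurement:
cpu_flops = 20.0e9              # FLOP/s, in-core (CPU) bottleneck
min_mem_performance = 5.8e9     # FLOP/s, tightest memory-hierarchy bottleneck
if min_mem_performance > cpu_flops:
    verdict = 'CPU bound'
else:
    verdict = 'Cache or memory bound'
# -> 'Cache or memory bound' for these numbers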
Example #5
    def analyze(self):
        """Run complete analysis."""
        self._CPU.analyze()
        self._data.analyze()
        self.results = copy.deepcopy(self._CPU.results)
        self.results.update(copy.deepcopy(self._data.results))

        # Simple scaling prediction:
        # Assumptions are:
        #  - bottleneck is always LLC-MEM
        #  - all caches scale with number of cores (bw AND size(WRONG!))
        if self.results['cycles'][-1][1] == 0.0:
            # Full caching in higher cache level
            self.results['scaling cores'] = float('inf')
        else:
            self.results['scaling cores'] = (max(
                self.results['T_OL'], self.results['T_nOL'] +
                sum([c[1] for c in self.results['cycles']])) /
                                             self.results['cycles'][-1][1])

        # Compile total single-core prediction
        self.results['total cycles'] = self._CPU.conv_cy(
            max(
                self.results['T_OL'],
                sum([self.results['T_nOL']] +
                    [i[1] for i in self.results['cycles']])))

        # Detailed scaling:
        if self._args.cores > 1:
            notes = []
            cores_per_numa_domain = self.machine['cores per NUMA domain']
            innuma_cores = min(self._args.cores, cores_per_numa_domain)
            if innuma_cores <= self.results['scaling cores']:
                innuma_rectp = PrefixedUnit(
                    max(
                        sum([c[1] for c in self.results['cycles']]) +
                        self.results['T_nOL'], self.results['T_OL']) /
                    innuma_cores, "cy/CL")
                notes.append("memory-interface not saturated")
            else:
                innuma_rectp = PrefixedUnit(self.results['cycles'][-1][1],
                                            'cy/CL')
                notes.append("memory-interface saturated on first socket")

            if 0 < self._args.cores <= cores_per_numa_domain:
                # only in-numa scaling to consider
                multi_core_perf = self._CPU.conv_cy(innuma_rectp)
                notes.append("in-NUMA-domain scaling")
            elif self._args.cores <= self.machine[
                    'cores per socket'] * self.machine['sockets']:
                # out-of-numa scaling behavior
                multi_core_perf = self._CPU.conv_cy(
                    innuma_rectp * innuma_cores / self._args.cores)
                notes.append("out-of-NUMA-domain scaling")
            else:
                raise ValueError(
                    "Number of cores must be greater than zero and upto the max. "
                    "number of cores defined by cores per socket and sockets in"
                    "machine file.")

            self.results['multi-core'] = {
                'cores': self._args.cores,
                'performance': multi_core_perf,
                'notes': notes
            }
        else:
            self.results['multi-core'] = None
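
A worked instance of the 'scaling cores' estimate computed above, using assumed single-core cycle contributions:

# Assumed cy/CL contributions (not from a real analysis):
T_OL, T_nOL = 8.0, 4.0
cycles = [('L1-L2', 3.0), ('L2-L3', 5.0), ('L3-MEM', 6.0)]
single_core = max(T_OL, T_nOL + sum(c[1] for c in cycles))   # = 18.0 cy/CL
scaling_cores = single_core / cycles[-1][1]                  # = 3.0
# i.e. about 3 cores are expected to saturate the memory interface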
    def test_get_machine_topology(self):
        # patch environment to include dummy likwid
        environ_orig = os.environ
        os.environ['PATH'] = self._find_file(
            'dummy_likwid') + ':' + os.environ['PATH']

        self.maxDiff = None
        self.assertEqual(
            lba.get_machine_topology(cpuinfo_path=self._find_file('cpuinfo')),
            {
                'kerncraft version':
                kerncraft_version,
                'FLOPs per cycle': {
                    'DP': {
                        'ADD': 'INFORMATION_REQUIRED',
                        'FMA': 'INFORMATION_REQUIRED',
                        'MUL': 'INFORMATION_REQUIRED',
                        'total': 'INFORMATION_REQUIRED'
                    },
                    'SP': {
                        'ADD': 'INFORMATION_REQUIRED',
                        'FMA': 'INFORMATION_REQUIRED',
                        'MUL': 'INFORMATION_REQUIRED',
                        'total': 'INFORMATION_REQUIRED'
                    }
                },
                'NUMA domains per socket':
                1,
                'cacheline size':
                'INFORMATION_REQUIRED (in bytes, e.g. 64 B)',
                'clock':
                'INFORMATION_REQUIRED (e.g., 2.7 GHz)',
                'compiler':
                collections.OrderedDict([
                    ('icc',
                     'INFORMATION_REQUIRED (e.g., -O3 -fno-alias -xAVX)'),
                    ('clang',
                     'INFORMATION_REQUIRED (e.g., -O3 -mavx, -D_POSIX_C_SOURCE=200112L'
                     ),
                    ('gcc',
                     'INFORMATION_REQUIRED (e.g., -O3 -march=ivybridge)')
                ]),
                'cores per NUMA domain':
                10,
                'cores per socket':
                10,
                'memory hierarchy': [{
                    'cache per group': {
                        'cl_size': 'INFORMATION_REQUIRED '
                        '(sets*ways*cl_size=32.00 kB)',
                        'load_from': 'L2',
                        'replacement_policy': 'INFORMATION_REQUIRED (options: '
                        'LRU, FIFO, MRU, RR)',
                        'sets':
                        'INFORMATION_REQUIRED (sets*ways*cl_size=32.00 '
                        'kB)',
                        'store_to': 'L2',
                        'ways':
                        'INFORMATION_REQUIRED (sets*ways*cl_size=32.00 '
                        'kB)',
                        'write_allocate': 'INFORMATION_REQUIRED (True/False)',
                        'write_back': 'INFORMATION_REQUIRED (True/False)'
                    },
                    'cores per group':
                    1,
                    'groups':
                    20,
                    'level':
                    'L1',
                    'performance counter metrics': {
                        'accesses':
                        'INFORMATION_REQUIRED (e.g., '
                        'L1D_REPLACEMENT__PMC0)',
                        'evicts':
                        'INFORMATION_REQUIRED (e.g., '
                        'L2_LINES_OUT_DIRTY_ALL__PMC2)',
                        'misses':
                        'INFORMATION_REQUIRED (e.g., '
                        'L2_LINES_IN_ALL__PMC1)'
                    },
                    'size per group':
                    PrefixedUnit(32.0, 'k', 'B'),
                    'threads per group':
                    2
                }, {
                    'cache per group': {
                        'cl_size': 'INFORMATION_REQUIRED '
                        '(sets*ways*cl_size=256.00 kB)',
                        'load_from': 'L3',
                        'replacement_policy': 'INFORMATION_REQUIRED (options: '
                        'LRU, FIFO, MRU, RR)',
                        'sets':
                        'INFORMATION_REQUIRED (sets*ways*cl_size=256.00 '
                        'kB)',
                        'store_to': 'L3',
                        'ways':
                        'INFORMATION_REQUIRED (sets*ways*cl_size=256.00 '
                        'kB)',
                        'write_allocate': 'INFORMATION_REQUIRED (True/False)',
                        'write_back': 'INFORMATION_REQUIRED (True/False)'
                    },
                    'cores per group':
                    1,
                    'non-overlap upstream throughput': [
                        'INFORMATION_REQUIRED (e.g. 24 B/cy)',
                        'INFORMATION_REQUIRED (e.g. "half-duplex" or "full-duplex")'
                    ],
                    'groups':
                    20,
                    'level':
                    'L2',
                    'performance counter metrics': {
                        'accesses':
                        'INFORMATION_REQUIRED (e.g., '
                        'L1D_REPLACEMENT__PMC0)',
                        'evicts':
                        'INFORMATION_REQUIRED (e.g., '
                        'L2_LINES_OUT_DIRTY_ALL__PMC2)',
                        'misses':
                        'INFORMATION_REQUIRED (e.g., '
                        'L2_LINES_IN_ALL__PMC1)'
                    },
                    'size per group':
                    PrefixedUnit(256.0, 'k', 'B'),
                    'threads per group':
                    2
                }, {
                    'cache per group': {
                        'cl_size': 'INFORMATION_REQUIRED '
                        '(sets*ways*cl_size=25.00 MB)',
                        'replacement_policy': 'INFORMATION_REQUIRED (options: '
                        'LRU, FIFO, MRU, RR)',
                        'sets':
                        'INFORMATION_REQUIRED (sets*ways*cl_size=25.00 '
                        'MB)',
                        'ways':
                        'INFORMATION_REQUIRED (sets*ways*cl_size=25.00 '
                        'MB)',
                        'write_allocate': 'INFORMATION_REQUIRED (True/False)',
                        'write_back': 'INFORMATION_REQUIRED (True/False)'
                    },
                    'cores per group':
                    10,
                    'non-overlap upstream throughput': [
                        'INFORMATION_REQUIRED (e.g. 24 B/cy)',
                        'INFORMATION_REQUIRED (e.g. "half-duplex" or "full-duplex")'
                    ],
                    'groups':
                    2,
                    'level':
                    'L3',
                    'performance counter metrics': {
                        'accesses':
                        'INFORMATION_REQUIRED (e.g., '
                        'L1D_REPLACEMENT__PMC0)',
                        'evicts':
                        'INFORMATION_REQUIRED (e.g., '
                        'L2_LINES_OUT_DIRTY_ALL__PMC2)',
                        'misses':
                        'INFORMATION_REQUIRED (e.g., '
                        'L2_LINES_IN_ALL__PMC1)'
                    },
                    'size per group':
                    PrefixedUnit(25.0, 'M', 'B'),
                    'threads per group':
                    20
                }, {
                    'cores per group':
                    10,
                    'non-overlap upstream throughput': [
                        'full socket memory bandwidth',
                        'INFORMATION_REQUIRED (e.g. "half-duplex" or "full-duplex")'
                    ],
                    'level':
                    'MEM',
                    'penalty cycles per read stream':
                    0,
                    'size per group':
                    None,
                    'threads per group':
                    20
                }],
                'micro-architecture-modeler':
                'INFORMATION_REQUIRED (options: OSACA, IACA)',
                'micro-architecture':
                'INFORMATION_REQUIRED (options: NHM, WSM, SNB, IVB, HSW, BDW, SKL, SKX)',
                'model name':
                'Intel(R) Xeon(R) CPU E5-2660 v2 @ 2.20GHz',
                'model type':
                'Intel Xeon IvyBridge EN/EP/EX processor',
                'non-overlapping model': {
                    'performance counter metric':
                    'INFORAMTION_REQUIRED '
                    'Example:max(UOPS_DISPATCHED_PORT_PORT_0__PMC2, '
                    'UOPS_DISPATCHED_PORT_PORT_1__PMC3,    '
                    'UOPS_DISPATCHED_PORT_PORT_4__PMC0, '
                    'UOPS_DISPATCHED_PORT_PORT_5__PMC1)',
                    'ports':
                    'INFORAMTION_REQUIRED (list of ports as they appear in IACA, '
                    'e.g.), ["0", "0DV", "1", "2", "2D", "3", "3D", "4", "5", "6", '
                    '"7"])'
                },
                'overlapping model': {
                    'performance counter metric':
                    'INFORAMTION_REQUIRED '
                    'Example:max(UOPS_DISPATCHED_PORT_PORT_0__PMC2, '
                    'UOPS_DISPATCHED_PORT_PORT_1__PMC3,    '
                    'UOPS_DISPATCHED_PORT_PORT_4__PMC0, '
                    'UOPS_DISPATCHED_PORT_PORT_5__PMC1)',
                    'ports':
                    'INFORAMTION_REQUIRED (list of ports as they appear in IACA, '
                    'e.g.), ["0", "0DV", "1", "2", "2D", "3", "3D", "4", "5", "6", '
                    '"7"])'
                },
                'sockets':
                2,
                'threads per core':
                2
            })

        # restore environment
        os.environ = environ_orig
    def test_machine_model_update(self):
        # patch environment to include dummy likwid
        environ_orig = os.environ
        os.environ['PATH'] = self._find_file('dummy_likwid') + ':' + os.environ['PATH']

        m = machinemodel.MachineModel()
        m.update(readouts=True, memory_hierarchy=True, benchmarks=False, overwrite=True,
                 cpuinfo_path=self._find_file('cpuinfo'))

        self.maxDiff = None

        correct = {'kerncraft version': kerncraft_version,
                   'FLOPs per cycle': {'DP': {'ADD': 'INFORMATION_REQUIRED',
                                              'FMA': 'INFORMATION_REQUIRED',
                                              'MUL': 'INFORMATION_REQUIRED',
                                              'total': 'INFORMATION_REQUIRED'},
                                       'SP': {'ADD': 'INFORMATION_REQUIRED',
                                              'FMA': 'INFORMATION_REQUIRED',
                                              'MUL': 'INFORMATION_REQUIRED',
                                              'total': 'INFORMATION_REQUIRED'}},
                   'NUMA domains per socket': 1,
                   'benchmarks': 'INFORMATION_REQUIRED',
                   'cacheline size': 'INFORMATION_REQUIRED (in bytes, e.g. 64 B)',
                   'clock': PrefixedUnit(2200000000.0, '', 'Hz'),
                   'compiler': OrderedDict(
                       [('icc', 'INFORMATION_REQUIRED (e.g., -O3 -fno-alias -xAVX)'),
                        ('clang',
                         'INFORMATION_REQUIRED (e.g., -O3 -mavx, -D_POSIX_C_SOURCE=200112L, check `gcc -march=native -Q --help=target | '
                         'grep -- "-march="`)'),
                        ('gcc',
                         'INFORMATION_REQUIRED (e.g., -O3 -march=ivybridge, check `gcc -march=native -Q --help=target | grep -- '
                         '"-march="`)')]),
                   'cores per NUMA domain': 10,
                   'cores per socket': 10,
                   'in-core model': OrderedDict([('IACA',
                                                  'INFORMATION_REQUIRED (e.g., NHM, WSM, SNB, IVB, HSW, BDW, SKL, SKX)'),
                                                 ('OSACA',
                                                  'INFORMATION_REQUIRED (e.g., NHM, WSM, SNB, IVB, HSW, BDW, SKL, SKX)'),
                                                 ('LLVM-MCA',
                                                  'INFORMATION_REQUIRED (e.g., -mcpu=skylake-avx512)')]),
                   'memory hierarchy': [OrderedDict([('level', 'L1'),
                                                     ('performance counter metrics',
                                                      {
                                                          'accesses': 'INFORMATION_REQUIRED (e.g., L1D_REPLACEMENT__PMC0)',
                                                          'evicts': 'INFORMATION_REQUIRED (e.g., L2_LINES_OUT_DIRTY_ALL__PMC2)',
                                                          'misses': 'INFORMATION_REQUIRED (e.g., L2_LINES_IN_ALL__PMC1)'}),
                                                     ('cache per group',
                                                      OrderedDict([('sets',
                                                                    'INFORMATION_REQUIRED (sets*ways*cl_size=32.00 kB)'),
                                                                   ('ways',
                                                                    'INFORMATION_REQUIRED (sets*ways*cl_size=32.00 kB)'),
                                                                   ('cl_size',
                                                                    'INFORMATION_REQUIRED (sets*ways*cl_size=32.00 kB)'),
                                                                   ('replacement_policy',
                                                                    'INFORMATION_REQUIRED (options: LRU, FIFO, MRU, RR)'),
                                                                   ('write_allocate',
                                                                    'INFORMATION_REQUIRED (True/False)'),
                                                                   ('write_back',
                                                                    'INFORMATION_REQUIRED (True/False)'),
                                                                   ('load_from', 'L2'),
                                                                   ('store_to', 'L2')])),
                                                     ('size per group',
                                                      PrefixedUnit(32.0, 'k', 'B')),
                                                     ('groups', 20),
                                                     ('cores per group', 1),
                                                     ('threads per group', 2)]),
                                        OrderedDict([('level', 'L2'),
                                                     ('upstream throughput',
                                                      ['INFORMATION_REQUIRED (e.g. 24 B/cy)',
                                                       'INFORMATION_REQUIRED (e.g. "half-duplex" or "full-duplex")']),
                                                     ('performance counter metrics',
                                                      {
                                                          'accesses': 'INFORMATION_REQUIRED (e.g., L1D_REPLACEMENT__PMC0)',
                                                          'evicts': 'INFORMATION_REQUIRED (e.g., L2_LINES_OUT_DIRTY_ALL__PMC2)',
                                                          'misses': 'INFORMATION_REQUIRED (e.g., L2_LINES_IN_ALL__PMC1)'}),
                                                     ('cache per group',
                                                      OrderedDict([('sets',
                                                                    'INFORMATION_REQUIRED (sets*ways*cl_size=256.00 kB)'),
                                                                   ('ways',
                                                                    'INFORMATION_REQUIRED (sets*ways*cl_size=256.00 kB)'),
                                                                   ('cl_size',
                                                                    'INFORMATION_REQUIRED (sets*ways*cl_size=256.00 kB)'),
                                                                   ('replacement_policy',
                                                                    'INFORMATION_REQUIRED (options: LRU, FIFO, MRU, RR)'),
                                                                   ('write_allocate',
                                                                    'INFORMATION_REQUIRED (True/False)'),
                                                                   ('write_back',
                                                                    'INFORMATION_REQUIRED (True/False)'),
                                                                   ('load_from', 'L3'),
                                                                   ('store_to', 'L3')])),
                                                     ('size per group',
                                                      PrefixedUnit(256.0, 'k', 'B')),
                                                     ('groups', 20),
                                                     ('cores per group', 1),
                                                     ('threads per group', 2)]),
                                        OrderedDict([('level', 'L3'),
                                                     ('upstream throughput',
                                                      ['INFORMATION_REQUIRED (e.g. 24 B/cy)',
                                                       'INFORMATION_REQUIRED (e.g. "half-duplex" or "full-duplex")']),
                                                     ('performance counter metrics',
                                                      {
                                                          'accesses': 'INFORMATION_REQUIRED (e.g., L1D_REPLACEMENT__PMC0)',
                                                          'evicts': 'INFORMATION_REQUIRED (e.g., L2_LINES_OUT_DIRTY_ALL__PMC2)',
                                                          'misses': 'INFORMATION_REQUIRED (e.g., L2_LINES_IN_ALL__PMC1)'}),
                                                     ('cache per group',
                                                      OrderedDict([('sets',
                                                                    'INFORMATION_REQUIRED (sets*ways*cl_size=25.00 MB)'),
                                                                   ('ways',
                                                                    'INFORMATION_REQUIRED (sets*ways*cl_size=25.00 MB)'),
                                                                   ('cl_size',
                                                                    'INFORMATION_REQUIRED (sets*ways*cl_size=25.00 MB)'),
                                                                   ('replacement_policy',
                                                                    'INFORMATION_REQUIRED (options: LRU, FIFO, MRU, RR)'),
                                                                   ('write_allocate',
                                                                    'INFORMATION_REQUIRED (True/False)'),
                                                                   ('write_back',
                                                                    'INFORMATION_REQUIRED (True/False)')])),
                                                     ('size per group',
                                                      PrefixedUnit(25.0, 'M', 'B')),
                                                     ('groups', 2),
                                                     ('cores per group', 10),
                                                     ('threads per group', 20)]),
                                        OrderedDict([('level', 'MEM'),
                                                     ('cores per group', 10),
                                                     ('threads per group', 20),
                                                     ('upstream throughput',
                                                      ['full socket memory bandwidth',
                                                       'INFORMATION_REQUIRED (e.g. "half-duplex" or "full-duplex")']),
                                                     ('penalty cycles per read stream', 0),
                                                     ('size per group', None)])],
                   'model name': 'Intel(R) Xeon(R) CPU E5-2660 v2 @ 2.20GHz',
                   'model type': 'Intel Xeon IvyBridge EN/EP/EX processor',
                   'non-overlapping model': {
                       'performance counter metric': 'INFORMATION_REQUIRED Example:max(UOPS_DISPATCHED_PORT_PORT_0__PMC2, '
                                                     'UOPS_DISPATCHED_PORT_PORT_1__PMC3,    UOPS_DISPATCHED_PORT_PORT_4__PMC0, '
                                                     'UOPS_DISPATCHED_PORT_PORT_5__PMC1)',
                       'ports': 'INFORMATION_REQUIRED (list of ports as they appear in IACA, e.g.,, ["0", "0DV", "1", "2", "2D", "3", '
                                '"3D", "4", "5", "6", "7"])'},
                   'overlapping model': {
                       'performance counter metric': 'INFORMATION_REQUIRED Example:max(UOPS_DISPATCHED_PORT_PORT_0__PMC2, '
                                                     'UOPS_DISPATCHED_PORT_PORT_1__PMC3,    UOPS_DISPATCHED_PORT_PORT_4__PMC0, '
                                                     'UOPS_DISPATCHED_PORT_PORT_5__PMC1)',
                       'ports': 'INFORMATION_REQUIRED (list of ports as they appear in IACA, e.g.,, ["0", "0DV", "1", "2", "2D", "3", "3D", '
                                '"4", "5", "6", "7"])'},
                   'sockets': 2,
                   'threads per core': 2}

        for k in correct:
            self.assertEqual(m[k], correct[k])

        # restore environment
        os.environ = environ_orig
Example #8
    def report(self, output_file=sys.stdout):
        """Print generated model data in human readable format."""
        report = ''
        if self.verbose > 1:
            self._CPU.report()
            self._data.report()

        total_cycles = max(
            self.results['T_OL'],
            sum([self.results['T_nOL']] +
                [i[1] for i in self.results['cycles']]))
        report += '{{ {:.1f} || {:.1f} | {} }} cy/CL'.format(
            self.results['T_OL'], self.results['T_nOL'],
            ' | '.join(['{:.1f}'.format(i[1])
                        for i in self.results['cycles']]))

        if self._args.cores > 1:
            report += " (single core)"

        if self._args.unit:
            report += ' = {}'.format(
                self._CPU.conv_cy(total_cycles, self._args.unit))

        report += '\n{{ {:.1f} \\ {} }} cy/CL'.format(
            max(self.results['T_OL'], self.results['T_nOL']), ' \\ '.join([
                '{:.1f}'.format(
                    max(
                        sum([x[1] for x in self.results['cycles'][:i + 1]]) +
                        self.results['T_nOL'], self.results['T_OL']))
                for i in range(len(self.results['cycles']))
            ]))

        if self._args.cores > 1:
            report += " (single core)"

        report += '\nsaturating at {:.1f} cores'.format(
            self.results['scaling cores'])

        if self._args.cores > 1:
            report += "\nprediction for {} cores,".format(self._args.cores) + \
                      " assuming static scheduling:\n"

            # out-of-core scaling prediction:
            cores_per_numa_domain = self.machine['cores per NUMA domain']
            innuma_cores = min(self._args.cores, cores_per_numa_domain)
            if innuma_cores <= self.results['scaling cores']:
                innuma_rectp = PrefixedUnit(
                    max(
                        sum([c[1] for c in self.results['cycles']]) +
                        self.results['T_nOL'], self.results['T_OL']) /
                    innuma_cores, "cy/CL")
                note = "memory-interface not saturated"
            else:
                innuma_rectp = PrefixedUnit(self.results['cycles'][-1][1],
                                            'cy/CL')
                note = "memory-interface saturated on first socket"

            if 0 < self._args.cores <= cores_per_numa_domain:
                # only in-numa scaling to consider
                report += "{}".format(
                    self._CPU.conv_cy(innuma_rectp, self._args.unit))
                note += ", in-NUMA-domain scaling"
            elif self._args.cores <= self.machine[
                    'cores per socket'] * self.machine['sockets']:
                # out-of-numa scaling behavior
                report += "{}".format(
                    self._CPU.conv_cy(
                        innuma_rectp * innuma_cores / self._args.cores,
                        self._args.unit))
                note += ", out-of-NUMA-domain scaling"
            else:
                raise ValueError(
                    "Number of cores must be greater than zero and upto the max. "
                    "number of cores defined by cores per socket and sockets in"
                    "machine file.")

            report += " ({})\n".format(note)

        print(report, file=output_file)

        if self._args and self._args.ecm_plot:
            assert plot_support, "matplotlib couldn't be imported. Plotting is not supported."
            fig = plt.figure(frameon=False)
            self.plot(fig)
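
A small illustration of the ECM shorthand assembled above, with assumed contributions:

# Assumed cy/CL values, mirroring the format string used in report():
T_OL, T_nOL = 8.0, 4.0
cycles = [('L1-L2', 3.0), ('L2-L3', 5.0), ('L3-MEM', 6.0)]
shorthand = '{{ {:.1f} || {:.1f} | {} }} cy/CL'.format(
    T_OL, T_nOL, ' | '.join('{:.1f}'.format(c[1]) for c in cycles))
# -> '{ 8.0 || 4.0 | 3.0 | 5.0 | 6.0 } cy/CL'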
Example #9
    def analyze(self):
        """Run complete analysis."""
        self._CPU.analyze()
        self._data.analyze()
        self.results = copy.deepcopy(self._CPU.results)
        self.results.update(copy.deepcopy(self._data.results))

        cores_per_numa_domain = self.machine['cores per NUMA domain']

        # Compile ECM model
        ECM_OL, ECM_OL_construction = [self.results['T_comp']], ['T_comp']
        ECM_nOL, ECM_nOL_construction = [], []
        if self.machine['memory hierarchy'][0]['transfers overlap']:
            nonoverlap_region = False
            ECM_OL.append(self.results['T_RegL1'])
            ECM_OL_construction.append('T_RegL1')
        else:
            nonoverlap_region = True
            ECM_nOL.append(self.results['T_RegL1'])
            ECM_nOL_construction.append('T_RegL1')

        for cache_level, cache_info in list(
                enumerate(self.machine['memory hierarchy']))[1:]:
            cycles = self.results['cycles'][cache_level - 1][1]
            if cache_info['transfers overlap']:
                if nonoverlap_region:
                    raise ValueError(
                        "Overlapping changes back and forth between levels, this is "
                        "currently not supported.")
                ECM_OL.append(cycles)
                ECM_OL_construction.append(
                    'T_' + self.machine['memory hierarchy'][cache_level -
                                                            1]['level'] +
                    cache_info['level'])
            else:
                nonoverlap_region = True
                ECM_nOL.append(cycles)
                ECM_nOL_construction.append(
                    'T_' + self.machine['memory hierarchy'][cache_level -
                                                            1]['level'] +
                    cache_info['level'])
        # TODO consider multiple paths per cache level with victim caches
        self.results['ECM'] = tuple(ECM_OL + [tuple(ECM_nOL)])
        self.results['ECM Model Construction'] = tuple(
            ECM_OL_construction + [tuple(ECM_nOL_construction)])

        # Compile total single-core prediction
        self.results['total cycles'] = self._CPU.conv_cy(
            max(sum(ECM_nOL), *ECM_OL))
        T_ECM = float(self.results['total cycles']['cy/CL'])
        # T_MEM is the cycles accounted to memory transfers
        T_MEM = self.results['cycles'][-1][1]

        # Simple scaling prediction:
        # Assumptions are:
        #  - bottleneck is always LLC-MEM
        #  - all caches scale with number of cores (bw AND size(WRONG!))

        # Full caching in higher cache level
        self.results['scaling cores'] = float('inf')
        # Not full caching:
        if self.results['cycles'][-1][1] != 0.0:
            # Considering memory bus utilization
            utilization = [0]
            self.results['scaling cores'] = float('inf')
            for c in range(1, cores_per_numa_domain + 1):
                if c * T_MEM > (T_ECM + utilization[c - 1] *
                                (c - 1) * T_MEM / 2):
                    utilization.append(1.0)
                    self.results['scaling cores'] = min(
                        self.results['scaling cores'], c)
                else:
                    utilization.append(c * T_MEM /
                                       (T_ECM + utilization[c - 1] *
                                        (c - 1) * T_MEM / 2))
            utilization = utilization[1:]

            # scaling code
            scaling_predictions = []
            for cores in range(1, self.machine['cores per socket'] + 1):
                scaling = {
                    'cores': cores,
                    'notes': [],
                    'performance': None,
                    'in-NUMA performance': None
                }
                # Detailed scaling:
                if cores <= self.results['scaling cores']:
                    # Is it purely in-cache?
                    innuma_rectp = PrefixedUnit(T_ECM / (T_ECM / T_MEM),
                                                "cy/CL")
                    scaling['notes'].append("memory-interface not saturated")
                else:
                    innuma_rectp = PrefixedUnit(self.results['cycles'][-1][1],
                                                'cy/CL')
                    scaling['notes'].append(
                        "memory-interface saturated on first NUMA domain")
                # Include NUMA-local performance in results dict
                scaling['in-NUMA performance'] = innuma_rectp

                if 0 < cores <= cores_per_numa_domain:
                    # only in-numa scaling to consider
                    scaling['performance'] = self._CPU.conv_cy(
                        innuma_rectp / utilization[cores - 1])
                    scaling['notes'].append("in-NUMA-domain scaling")
                elif cores <= self.machine['cores per socket'] * self.machine[
                        'sockets']:
                    # out-of-numa scaling behavior
                    scaling['performance'] = self._CPU.conv_cy(
                        innuma_rectp * cores_per_numa_domain / cores)
                    scaling['notes'].append("out-of-NUMA-domain scaling")
                else:
                    raise ValueError(
                        "Number of cores must be greater than zero and upto the max. "
                        "number of cores defined by cores per socket and sockets in"
                        "machine file.")
                scaling_predictions.append(scaling)
        else:
            # pure in-cache performance (perfect scaling)
            scaling_predictions = [{
                'cores':
                cores,
                'notes': ['pure in-cache'],
                'performance':
                self._CPU.conv_cy(T_ECM / cores),
                'in-NUMA performance':
                self._CPU.conv_cy(T_ECM / cores_per_numa_domain)
            } for cores in range(1, self.machine['cores per socket'] + 1)]

        # Also include prediction for all in-NUMA core counts in results
        self.results['scaling prediction'] = scaling_predictions
        if self._args.cores:
            self.results['multi-core'] = scaling_predictions[self._args.cores -
                                                             1]
        else:
            self.results['multi-core'] = None
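
The memory-bus utilization recursion used above, run with assumed cycle counts to show how 'scaling cores' emerges:

# Assumed values: T_ECM (total single-core cy/CL) and T_MEM (memory share).
T_ECM, T_MEM = 18.0, 6.0
cores_per_numa_domain = 10                  # assumed
utilization = [0]
scaling_cores = float('inf')
for c in range(1, cores_per_numa_domain + 1):
    supply = T_ECM + utilization[c - 1] * (c - 1) * T_MEM / 2
    if c * T_MEM > supply:
        utilization.append(1.0)
        scaling_cores = min(scaling_cores, c)
    else:
        utilization.append(c * T_MEM / supply)
# utilization approaches 1.0 with growing core count; here scaling_cores evaluates to 5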
    def analyze(self, output_file=sys.stdout):
        """Run analysis."""
        bench = self.kernel.binary
        element_size = self.kernel.datatypes_size[self.kernel.datatype]

        # Build arguments to pass to command:
        input_args = []

        # Determine base runtime with 10 iterations
        runtimes = {'': 0.0}
        time_per_repetition = None

        repetitions = self.kernel.repetitions

        results = defaultdict(dict)

        # TODO if cores > 1, results are for openmp run. Things might need to be changed here!

        # Check for MEM group existence
        valid_groups = get_supported_likwid_groups()
        if "MEM" in valid_groups:
            group = "MEM"
        else:
            group = valid_groups[0]

        while min(runtimes.values()) < 1.5:

            if min(runtimes.values()) == 0.0:
                adjustable = True
            else:
                adjustable = self.adjust_variables(runtimes)

            if not adjustable:
                print(
                    "WARNING: Could not extrapolate to a 1.5s run (for at least one region). Measurements might not be accurate.",
                    file=output_file)
                break

            input_args = [
                str(variable['value'])
                for variable in self.kernel.define.values()
            ]

            results = self.perfctr([bench] + input_args, group=group)

            if not self.kernel.regions:
                # no region specified for --marker -> benchmark all
                self.kernel.regions = set(results.keys())
                if len(self.kernel.regions) > 1:
                    self.kernel.regions.discard('')
                self.kernel.check()
                repetitions = self.kernel.repetitions

            else:
                # check if specified region(s) are found in results
                for region in self.kernel.regions:
                    if region not in results:
                        print(
                            'Region \'{}\' was not found in the likwid output.'
                            .format(region),
                            file=output_file)
                        sys.exit(-1)

            runtimes = dict(
                zip(self.kernel.regions, [
                    results[r]['Runtime (RDTSC) [s]']
                    for r in self.kernel.regions
                ]))

            for region in self.kernel.regions:
                if self.kernel.repetitions[region]['marker']:
                    repetitions[region]['value'] = results[region][
                        'call count']
                elif self.kernel.repetitions[region]['variable']:
                    repetitions[region]['value'] = self.kernel.define[
                        self.kernel.repetitions[region]['variable']]['value']
                elif self.kernel.repetitions[region]['value']:
                    repetitions[region]['value'] = self.kernel.repetitions[
                        region]['value']

            time_per_repetition = {
                r: runtimes[r] / float(repetitions[r]['value'])
                for r in self.kernel.regions
            }
        raw_results_collection = [results]

        # repetitions were obtained from likwid marker and time per repetition is too small
        # -> overhead introduced by likwid markers is not negligible
        for region in self.kernel.regions:
            if self.kernel.repetitions[region]['marker']:
                # repetitions were obtained from likwid markers
                if time_per_repetition[region] < 1.0:
                    # time per repetition is <1000 ms (overhead is not negligible)
                    print(
                        "WARNING: Overhead introduced by likwid markers for region {} might not be negligible (usage of \'-R marker\').\n"
                        .format(region),
                        file=output_file)

        if self.benchmarked_regions - self.kernel.regions:
            print(
                'WARNING: the following likwid regions were found but not specified to be analysed:\n{}'
                .format(self.benchmarked_regions - self.kernel.regions))

        # Base metrics for further metric computations:

        # collect counters for phenoecm run
        if not self.no_phenoecm:
            # Build events and sympy expressions for all model metrics
            T_OL, event_counters = self.machine.parse_perfmetric(
                self.machine['overlapping model']
                ['performance counter metric'])
            T_data, event_dict = self.machine.parse_perfmetric(
                self.machine['non-overlapping model']
                ['performance counter metric'])
            event_counters.update(event_dict)
            cache_metrics = defaultdict(dict)
            for i in range(len(self.machine['memory hierarchy']) - 1):
                cache_info = self.machine['memory hierarchy'][i]
                name = cache_info['level']
                for k, v in cache_info['performance counter metrics'].items():
                    cache_metrics[name][
                        k], event_dict = self.machine.parse_perfmetric(v)
                    event_counters.update(event_dict)

            # Compile minimal runs to gather all required events
            minimal_runs = build_minimal_runs(list(event_counters.values()))
            measured_ctrs = {}

            for region in self.kernel.regions:
                measured_ctrs[region] = {}

            for run in minimal_runs:
                ctrs = ','.join([eventstr(e) for e in run])
                r = self.perfctr([bench] + input_args, group=ctrs)
                raw_results_collection.append(r)

                for region in self.kernel.regions:
                    measured_ctrs[region].update(r[region])

        # start analysing for each region
        for region in self.kernel.regions:

            raw_results = [r[region] for r in raw_results_collection]

            iterations_per_repetition = self.kernel.region__iterations_per_repetition(
                region)

            iterations_per_cacheline = (
                float(self.machine['cacheline size']) /
                self.kernel.region__bytes_per_iteration(region))
            cys_per_repetition = time_per_repetition[region] * float(
                self.machine['clock'])

            # Gather remaining counters
            if not self.no_phenoecm:

                # Match measured counters to symbols
                event_counter_results = {}
                for sym, ctr in event_counters.items():
                    event, regs, parameter = ctr[0], register_options(
                        ctr[1]), ctr[2]
                    for r in regs:
                        if r in measured_ctrs[region][event]:
                            event_counter_results[sym] = measured_ctrs[region][
                                event][r]

                # Analytical metrics needed for further calculation
                cl_size = float(self.machine['cacheline size'])
                total_iterations = iterations_per_repetition * repetitions[
                    region]['value']
                total_cachelines = total_iterations / iterations_per_cacheline

                T_OL_result = T_OL.subs(
                    event_counter_results) / total_cachelines
                cache_metric_results = defaultdict(dict)
                for cache, mtrcs in cache_metrics.items():
                    for m, e in mtrcs.items():
                        cache_metric_results[cache][m] = e.subs(
                            event_counter_results)

                # Inter-cache transfers per CL
                cache_transfers_per_cl = {
                    cache: {
                        k: PrefixedUnit(v / total_cachelines, 'CL/CL')
                        for k, v in d.items()
                    }
                    for cache, d in cache_metric_results.items()
                }
                cache_transfers_per_cl['L1']['accesses'].unit = 'LOAD/CL'

                # Select appropriate bandwidth
                mem_bw, mem_bw_kernel = self.machine.get_bandwidth(
                    -1,  # mem
                    cache_metric_results['L3']['misses'],  # load_streams
                    cache_metric_results['L3']['evicts'],  # store_streams
                    1)

                data_transfers = {
                    # Assuming 0.5 cy / LOAD (SSE on SNB or IVB; AVX on HSW, BDW, SKL or SKX)
                    'T_nOL': (cache_metric_results['L1']['accesses'] /
                              total_cachelines * 0.5),
                    'T_L1L2': ((cache_metric_results['L1']['misses'] +
                                cache_metric_results['L1']['evicts']) /
                               total_cachelines * cl_size /
                               self.machine['memory hierarchy'][1]
                               ['upstream throughput'][0]),
                    'T_L2L3': ((cache_metric_results['L2']['misses'] +
                                cache_metric_results['L2']['evicts']) /
                               total_cachelines * cl_size /
                               self.machine['memory hierarchy'][2]
                               ['upstream throughput'][0]),
                    'T_L3MEM':
                    ((cache_metric_results['L3']['misses'] +
                      cache_metric_results['L3']['evicts']) *
                     float(self.machine['cacheline size']) / total_cachelines /
                     mem_bw * float(self.machine['clock']))
                }

                # Build phenomenological ECM model:
                ecm_model = {'T_OL': T_OL_result}
                ecm_model.update(data_transfers)
            else:
                event_counters = {}
                ecm_model = None
                cache_transfers_per_cl = None

            self.results[region] = {
                'raw output': raw_results,
                'ECM': ecm_model,
                'data transfers': cache_transfers_per_cl,
                'Runtime (per repetition) [s]': time_per_repetition[region],
                'event counters': event_counters,
                'Iterations per repetition': iterations_per_repetition,
                'Iterations per cacheline': iterations_per_cacheline
            }

            self.results[region]['Runtime (per cacheline update) [cy/CL]'] = \
                (cys_per_repetition / iterations_per_repetition) * iterations_per_cacheline
            if 'Memory data volume [GBytes]' in results[region]:
                self.results[region]['MEM volume (per repetition) [B]'] = \
                    results[region]['Memory data volume [GBytes]'] * 1e9 / repetitions[region]['value']
            else:
                self.results[region][
                    'MEM volume (per repetition) [B]'] = float('nan')
            self.results[region]['Performance [MFLOP/s]'] = \
                self.kernel._flops[region] / (time_per_repetition[region] / iterations_per_repetition) / 1e6
            if 'Memory bandwidth [MBytes/s]' in results[region]:
                self.results[region]['MEM BW [MByte/s]'] = results[region][
                    'Memory bandwidth [MBytes/s]']
            elif 'Memory BW [MBytes/s]' in results[region]:
                self.results[region]['MEM BW [MByte/s]'] = results[region][
                    'Memory BW [MBytes/s]']
            else:
                self.results[region]['MEM BW [MByte/s]'] = float('nan')
            self.results[region]['Performance [MLUP/s]'] = \
                (iterations_per_repetition / time_per_repetition[region]) / 1e6
            self.results[region]['Performance [MIt/s]'] = \
                (iterations_per_repetition / time_per_repetition[region]) / 1e6
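
One of the phenomenological ECM terms computed above, evaluated with assumed counter values:

# Assumed event counts and machine properties (not measured):
cl_size = 64.0                       # B per cacheline
total_cachelines = 1.0e6
l1_misses, l1_evicts = 2.0e6, 1.0e6
l1l2_throughput = 32.0               # B/cy, assumed L2 'upstream throughput'
T_L1L2 = (l1_misses + l1_evicts) / total_cachelines * cl_size / l1l2_throughput
# -> 6.0 cy/CL attributed to L1<->L2 cacheline traffic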
Example #11
    def calculate_cache_access(self):
        """Apply cache prediction to generate cache access behaviour."""
        self.results = {
            'loads': self.predictor.get_loads(),
            'stores': self.predictor.get_stores(),
            'verbose infos':
            self.predictor.get_infos(),  # only for verbose outputs
            'bottleneck level': 0,
            'mem bottlenecks': []
        }

        element_size = self.kernel.datatypes_size[self.kernel.datatype]
        cacheline_size = float(self.machine['cacheline size'])
        elements_per_cacheline = int(cacheline_size // element_size)

        total_flops = sum(self.kernel._flops.values()) * elements_per_cacheline

        # TODO let user choose threads_per_core:
        threads_per_core = 1

        # Compile relevant information

        # CPU-L1 stats (in bytes!)
        # We compile CPU-L1 stats on our own, because cacheprediction only works on cache lines
        read_offsets, write_offsets = zip(*list(
            self.kernel.compile_global_offsets(
                iteration=range(0, elements_per_cacheline))))
        read_offsets = set([
            item for sublist in read_offsets if sublist is not None
            for item in sublist
        ])
        write_offsets = set([
            item for sublist in write_offsets if sublist is not None
            for item in sublist
        ])

        write_streams = len(write_offsets)
        read_streams = len(read_offsets) + write_streams  # write-allocate
        total_loads = read_streams * element_size
        total_evicts = write_streams * element_size
        bw, measurement_kernel = self.machine.get_bandwidth(
            0,
            read_streams,
            0,  # we do not consider stores to L1 
            threads_per_core,
            cores=self.cores)

        # Calculate performance (arithmetic intensity * bandwidth, with
        # arithmetic intensity = iterations / bytes loaded)
        if total_loads == 0:
            # This happens in case of full-caching
            arith_intens = None
            it_s = None
        else:
            arith_intens = 1.0 / (total_loads / elements_per_cacheline)
            it_s = PrefixedUnit(float(bw) * arith_intens, 'It/s')

        self.results['mem bottlenecks'].append({
            'performance': self.conv_perf(it_s),
            'level': self.machine['memory hierarchy'][0]['level'],
            'arithmetic intensity': arith_intens,
            'bw kernel': measurement_kernel,
            'bandwidth': bw,
            'bytes transfered': total_loads
        })
        self.results['bottleneck level'] = len(
            self.results['mem bottlenecks']) - 1
        self.results['min performance'] = self.conv_perf(it_s)

        # for other cache and memory levels:
        for cache_level, cache_info in list(
                enumerate(self.machine['memory hierarchy']))[:-1]:
            # Compiling stats (in bytes!)
            total_loads = self.results['loads'][cache_level +
                                                1] * cacheline_size
            total_stores = self.results['stores'][cache_level +
                                                  1] * cacheline_size

            # choose bw according to cache level and problem
            # first, compile stream counts at current cache level
            # write-allocate is already resolved above
            read_streams = self.results['loads'][cache_level + 1]
            write_streams = self.results['stores'][cache_level + 1]
            # second, try to find the best fitting kernel (closest to the observed stream counts):
            bw, measurement_kernel = self.machine.get_bandwidth(
                cache_level + 1,
                read_streams,
                write_streams,
                threads_per_core,
                cores=self.cores)

            # Calculate performance (arithmetic intensity * bandwidth), with
            # arithmetic intensity = flops / bytes transferred
            bytes_transfered = total_loads + total_stores

            if bytes_transfered == 0:
                # This happens in case of full-caching
                arith_intens = float('inf')
                it_s = PrefixedUnit(float('inf'), 'It/s')
            else:
                arith_intens = 1 / (bytes_transfered / elements_per_cacheline)
                it_s = PrefixedUnit(float(bw) * arith_intens, 'It/s')

            self.results['mem bottlenecks'].append({
                'performance': self.conv_perf(it_s),
                'level': self.machine['memory hierarchy'][cache_level + 1]['level'],
                'arithmetic intensity': arith_intens,
                'bw kernel': measurement_kernel,
                'bandwidth': bw,
                'bytes transfered': bytes_transfered
            })
            if it_s < self.results.get('min performance',
                                       {'It/s': it_s})['It/s']:
                self.results['bottleneck level'] = len(
                    self.results['mem bottlenecks']) - 1
                self.results['min performance'] = self.conv_perf(it_s)

        return self.results
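
The selection logic above is the usual roofline argument: every memory level yields a candidate performance of bandwidth times arithmetic intensity, and the slowest candidate becomes 'min performance' with its index stored as 'bottleneck level'. A reduced sketch of that selection, detached from the classes above (the bandwidths and intensities below are invented for illustration):

# Roofline-style bottleneck selection over memory levels (illustrative numbers).
levels = [
    {'level': 'L1',  'bandwidth': 84.1e9, 'arithmetic intensity': 0.029},
    {'level': 'L2',  'bandwidth': 47.2e9, 'arithmetic intensity': 0.025},
    {'level': 'L3',  'bandwidth': 32.9e9, 'arithmetic intensity': 0.041},
    {'level': 'MEM', 'bandwidth': 12.0e9, 'arithmetic intensity': 0.033},
]
for lvl in levels:
    # candidate performance contributed by this level
    lvl['performance'] = lvl['bandwidth'] * lvl['arithmetic intensity']

bottleneck = min(range(len(levels)), key=lambda i: levels[i]['performance'])
print('bottleneck level:', levels[bottleneck]['level'])
print('min performance:', levels[bottleneck]['performance'])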
Example #12
0
    def analyze(self):
        """Run complete analysis."""
        self.results = self.calculate_cache_access()
        try:
            incore_analysis, pointer_increment = self.kernel.incore_analysis(
                asm_block=self.asm_block,
                pointer_increment=self.pointer_increment,
                model=self._args.incore_model,
                verbose=self.verbose > 2)
        except RuntimeError as e:
            print("In-core analysis failed: " + str(e))
            sys.exit(1)

        block_throughput = incore_analysis['throughput']
        uops = incore_analysis['uops']
        incore_output = incore_analysis['output']
        port_cycles = incore_analysis['port cycles']

        # Normalize to cycles per cacheline
        elements_per_block = abs(
            pointer_increment /
            self.kernel.datatypes_size[self.kernel.datatype])
        block_size = elements_per_block * self.kernel.datatypes_size[
            self.kernel.datatype]
        try:
            block_to_cl_ratio = float(
                self.machine['cacheline size']) / block_size
        except ZeroDivisionError as e:
            print("Too small block_size / pointer_increment:",
                  e,
                  file=sys.stderr)
            sys.exit(1)

        port_cycles = dict([(i[0], i[1] * block_to_cl_ratio)
                            for i in list(port_cycles.items())])
        if uops is not None:
            uops = uops * block_to_cl_ratio
        cl_throughput = block_throughput * block_to_cl_ratio
        flops_per_element = sum(self.kernel._flops.values())

        # Overwrite CPU-L1 stats, because they are covered by In-Core Model
        self.results['mem bottlenecks'][0] = None

        # Reevaluate mem bottleneck
        self.results['min performance'] = self.conv_perf(
            PrefixedUnit(float('inf'), 'It/s'))
        self.results['bottleneck level'] = None
        for level, bottleneck in enumerate(self.results['mem bottlenecks']):
            if level == 0:
                # ignoring CPU-L1
                continue
            if bottleneck['performance']['It/s'] < self.results[
                    'min performance']['It/s']:
                self.results['bottleneck level'] = level
                self.results['min performance'] = bottleneck['performance']

        # Create result dictionary
        self.results.update({
            'cpu bottleneck': {
                'port cycles': port_cycles,
                'cl throughput': cl_throughput,
                'uops': uops,
                'performance throughput': self.conv_perf(PrefixedUnit(
                    self.machine['clock'] / block_throughput *
                    elements_per_block * self.cores, "It/s")),
                'in-core model output': incore_output
            }
        })
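
In this variant the in-core model replaces the CPU-L1 estimate, and the final prediction is the minimum of the in-core throughput (clock / block throughput * elements per block * cores) and the slowest remaining memory level. A compact sketch of that combination, with placeholder numbers standing in for the in-core analysis and the machine model:

# Combining an in-core throughput with a memory bottleneck (all numbers illustrative).
clock_hz = 2.7e9
block_throughput_cy = 6.0        # assumed cycles to execute one unrolled block
elements_per_block = 4           # assumed elements updated per block
cores = 1

cpu_bound_it_s = clock_hz / block_throughput_cy * elements_per_block * cores
mem_bound_it_s = 1.2e9           # assumed slowest memory-level prediction in It/s

predicted_it_s = min(cpu_bound_it_s, mem_bound_it_s)
print('CPU bound [It/s]:', cpu_bound_it_s)
print('memory bound [It/s]:', mem_bound_it_s)
print('roofline prediction [It/s]:', predicted_it_s)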
Example #13
0
    def test_2d5pt_Roofline(self):
        store_file = os.path.join(self.temp_dir, 'test_2d5pt_Roofline.pickle')

        parser = kc.create_parser()
        args = parser.parse_args([
            '-m',
            self._find_file('SandyBridgeEP_E5-2680.yml'), '-p', 'Roofline',
            self._find_file('2d-5pt.c'), '-D', 'N', '1024-4096:3log2', '-D',
            'M', '50', '-vvv', '--store', store_file
        ])
        kc.check_arguments(args, parser)
        kc.run(parser, args, output_file=sys.stdout)

        with open(store_file, 'rb') as f:
            results = pickle.load(f)

        # Check for correct variations of constants
        self.assertEqual(len(results), 3)

        # Check if results contain the correct kernel and some other information
        key = [
            k for k in results if ('define', (('M', 50), ('N', 4096))) in k
        ][0]
        key_dict = dict(key)
        self.assertEqual(key_dict['pmodel'], 'Roofline')

        # Output of first result:
        result = results[key]
        assertRelativlyEqual(result['min performance']['FLOP/s'], 4720000000.0,
                             0.01)
        self.assertEqual(result['bottleneck level'], 1)

        expected_btlncks = [{
            'arithmetic intensity': 0.029411764705882353,
            'bandwidth': PrefixedUnit(84.07, u'G', u'B/s'),
            'bw kernel': 'load',
            'level': u'L1',
            'performance': PrefixedUnit(9.89, u'G', u'FLOP/s')
        }, {
            'arithmetic intensity': 0.025,
            'bandwidth': PrefixedUnit(47.24, u'G', u'B/s'),
            'bw kernel': 'triad',
            'level': u'L2',
            'performance': PrefixedUnit(4.72, u'G', u'FLOP/s')
        }, {
            'arithmetic intensity': 0.041,
            'bandwidth': PrefixedUnit(32.9, 'G', 'B/s'),
            'bw kernel': 'copy',
            'level': u'L3',
            'performance': PrefixedUnit(5.33, u'G', u'FLOP/s')
        }, {
            'arithmetic intensity': float('inf'),
            'bandwidth': PrefixedUnit(12.01, u'G', u'B/s'),
            'bw kernel': 'load',
            'level': u'MEM',
            'performance': PrefixedUnit(float('inf'), u'', u'FLOP/s')
        }]

        for i, btlnck in enumerate(expected_btlncks):
            for k, v in btlnck.items():
                if type(v) is not str:
                    if k == 'performance':
                        assertRelativlyEqual(
                            result['mem bottlenecks'][i][k]['FLOP/s'], v, 0.05)
                    else:
                        assertRelativlyEqual(result['mem bottlenecks'][i][k],
                                             v, 0.05)
                else:
                    self.assertEqual(result['mem bottlenecks'][i][k], v)
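
assertRelativlyEqual is called above but not defined in this snippet; it presumably checks that two values agree within a relative tolerance. A minimal sketch of such a helper, with the name and signature inferred from the call sites and the body assumed:

# Hypothetical helper matching the call sites above: value, expected, relative tolerance.
def assertRelativlyEqual(value, expected, rel_tolerance):
    """Raise AssertionError unless value is within rel_tolerance of expected."""
    if expected == float('inf'):
        # infinities must match exactly
        assert value == expected, "{!r} != {!r}".format(value, expected)
        return
    assert abs(value - expected) <= abs(expected) * rel_tolerance, \
        "{!r} not within {:.1%} of {!r}".format(value, rel_tolerance, expected)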
Example #14
0
def get_machine_topology(cpuinfo_path='/proc/cpuinfo'):
    try:
        topo = subprocess.Popen(
            ['likwid-topology'],
            stdout=subprocess.PIPE).communicate()[0].decode("utf-8")
    except OSError as e:
        print('likwid-topology execution failed, is it installed and loaded?',
              file=sys.stderr)
        sys.exit(1)
    with open(cpuinfo_path, 'r') as f:
        cpuinfo = f.read()
    sockets = int(get_match_or_break(r'^Sockets:\s+([0-9]+)\s*$', topo)[0])
    cores_per_socket = int(
        get_match_or_break(r'^Cores per socket:\s+([0-9]+)\s*$', topo)[0])
    numa_domains_per_socket = \
        int(get_match_or_break(r'^NUMA domains:\s+([0-9]+)\s*$', topo)[0]) // sockets
    # cores per NUMA domain = cores per socket divided by NUMA domains per socket
    cores_per_numa_domain = cores_per_socket // numa_domains_per_socket
    machine = {
        'model type':
        get_match_or_break(r'^CPU type:\s+(.+?)\s*$', topo)[0],
        'model name':
        get_match_or_break(r'^model name\s+:\s+(.+?)\s*$', cpuinfo)[0],
        'sockets':
        sockets,
        'cores per socket':
        cores_per_socket,
        'threads per core':
        int(get_match_or_break(r'^Threads per core:\s+([0-9]+)\s*$', topo)[0]),
        'NUMA domains per socket':
        numa_domains_per_socket,
        'cores per NUMA domain':
        cores_per_numa_domain,
        'clock':
        'INFORMATION_REQUIRED (e.g., 2.7 GHz)',
        'FLOPs per cycle': {
            'SP': {
                'total': 'INFORMATION_REQUIRED',
                'FMA': 'INFORMATION_REQUIRED',
                'ADD': 'INFORMATION_REQUIRED',
                'MUL': 'INFORMATION_REQUIRED'
            },
            'DP': {
                'total': 'INFORMATION_REQUIRED',
                'FMA': 'INFORMATION_REQUIRED',
                'ADD': 'INFORMATION_REQUIRED',
                'MUL': 'INFORMATION_REQUIRED'
            }
        },
        'micro-architecture':
        'INFORMATION_REQUIRED (options: NHM, WSM, SNB, IVB, HSW)',
        # TODO retrieve flags automatically from compiler with -march=native
        'compiler': {
            'icc': ['INFORMATION_REQUIRED (e.g., -O3 -fno-alias -xAVX)'],
            'clang': [
                'INFORMATION_REQUIRED (e.g., -O3 -mavx, -D_POSIX_C_SOURCE=200112L)'
            ],
            'gcc': ['INFORMATION_REQUIRED (e.g., -O3 -march=ivybridge)']
        },
        'cacheline size':
        'INFORMATION_REQUIRED (in bytes, e.g. 64 B)',
        'overlapping model': {
            'ports':
            'INFORMATION_REQUIRED (list of ports as they appear in IACA, e.g., '
            '["0", "0DV", "1", "2", "2D", "3", "3D", "4", "5", "6", "7"])',
            'performance counter metric':
            'INFORMATION_REQUIRED Example: '
            'max(UOPS_DISPATCHED_PORT_PORT_0__PMC2, UOPS_DISPATCHED_PORT_PORT_1__PMC3, '
            'UOPS_DISPATCHED_PORT_PORT_4__PMC0, UOPS_DISPATCHED_PORT_PORT_5__PMC1)'
        },
        'non-overlapping model': {
            'ports':
            'INFORMATION_REQUIRED (list of ports as they appear in IACA, e.g., '
            '["0", "0DV", "1", "2", "2D", "3", "3D", "4", "5", "6", "7"])',
            'performance counter metric':
            'INFORMATION_REQUIRED Example: '
            'max(UOPS_DISPATCHED_PORT_PORT_0__PMC2, UOPS_DISPATCHED_PORT_PORT_1__PMC3, '
            'UOPS_DISPATCHED_PORT_PORT_4__PMC0, UOPS_DISPATCHED_PORT_PORT_5__PMC1)'
        }
    }

    threads_start = topo.find('HWThread')
    threads_end = topo.find('Cache Topology')
    threads = {}
    for line in topo[threads_start:threads_end].split('\n'):
        m = re.match(r'([0-9]+)\s+([0-9]+)\s+([0-9]+)\s+([0-9]+)', line)
        if m:
            threads[m.groups()[0]] = (m.groups()[1:])

    cache_start = topo.find('Cache Topology')
    cache_end = topo.find('NUMA Topology')
    machine['memory hierarchy'] = []
    mem_level = {}
    for line in topo[cache_start:cache_end].split('\n'):
        if line.startswith('Level:'):
            mem_level = {}
            mem_level['level'] = 'L' + line.split(':')[1].strip()
            machine['memory hierarchy'].append(mem_level)
        elif line.startswith('Size:'):
            size = PrefixedUnit(line.split(':')[1].strip())
            mem_level['cache per group'] = {
                'sets':
                'INFORMATION_REQUIRED (sets*ways*cl_size=' + str(size) + ')',
                'ways':
                'INFORMATION_REQUIRED (sets*ways*cl_size=' + str(size) + ')',
                'cl_size':
                'INFORMATION_REQUIRED (sets*ways*cl_size=' + str(size) + ')',
                'replacement_policy':
                'INFORMATION_REQUIRED (options: LRU, FIFO, MRU, RR)',
                'write_allocate':
                'INFORMATION_REQUIRED (True/False)',
                'write_back':
                'INFORMATION_REQUIRED (True/False)',
                'load_from':
                'L' + str(int(mem_level['level'][1:]) + 1),
                'store_to':
                'L' + str(int(mem_level['level'][1:]) + 1)
            }
            mem_level['size per group'] = size
        elif line.startswith('Cache groups:'):
            mem_level['groups'] = line.count('(')
            mem_level['cores per group'] = \
                (machine['cores per socket'] * machine['sockets']) / mem_level['groups']
            mem_level['threads per group'] = \
                mem_level['cores per group'] * machine['threads per core']
        mem_level['cycles per cacheline transfer'] = 'INFORMATION_REQUIRED'
        mem_level['performance counter metrics'] = {
            'accesses': 'INFORMATION_REQUIRED (e.g., L1D_REPLACEMENT__PMC0)',
            'misses': 'INFORMATION_REQUIRED (e.g., L2_LINES_IN_ALL__PMC1)',
            'evicts':
            'INFORMATION_REQUIRED (e.g., L2_LINES_OUT_DIRTY_ALL__PMC2)'
        }

    # Remove last caches load_from and store_to:
    del machine['memory hierarchy'][-1]['cache per group']['load_from']
    del machine['memory hierarchy'][-1]['cache per group']['store_to']

    machine['memory hierarchy'].append({
        'level': 'MEM',
        'cores per group': machine['cores per socket'],
        'threads per group': machine['threads per core'] * machine['cores per socket'],
        'cycles per cacheline transfer': None,
        'penalty cycles per read stream': 0,
        'size per group': None
    })

    return machine
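
get_match_or_break is used throughout this script but not shown; it apparently applies a multi-line regular expression to tool output and aborts if nothing matches. A plausible minimal implementation under that assumption (name and call pattern taken from the snippet, body assumed):

import re
import sys

# Hypothetical implementation of the helper used above: search multi-line text,
# return the capture groups of the first match, or exit with an error message.
def get_match_or_break(regex, text, message='Unable to parse likwid output.'):
    m = re.search(regex, text, re.MULTILINE)
    if not m:
        print(message, 'Expected pattern:', regex, file=sys.stderr)
        sys.exit(1)
    return m.groups()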
Example #15
0
def main():
    machine = get_machine_topology()
    pprint(machine)

    benchmarks = {'kernels': {}, 'measurements': {}}
    machine['benchmarks'] = benchmarks
    benchmarks['kernels'] = {
        'load': {
            'read streams': {
                'streams': 1,
                'bytes': PrefixedUnit(8, 'B')
            },
            'read+write streams': {
                'streams': 0,
                'bytes': PrefixedUnit(0, 'B')
            },
            'write streams': {
                'streams': 0,
                'bytes': PrefixedUnit(0, 'B')
            },
            'FLOPs per iteration': 0
        },
        'copy': {
            'read streams': {
                'streams': 1,
                'bytes': PrefixedUnit(8, 'B')
            },
            'read+write streams': {
                'streams': 0,
                'bytes': PrefixedUnit(0, 'B')
            },
            'write streams': {
                'streams': 1,
                'bytes': PrefixedUnit(8, 'B')
            },
            'FLOPs per iteration': 0
        },
        'update': {
            'read streams': {
                'streams': 1,
                'bytes': PrefixedUnit(8, 'B')
            },
            'read+write streams': {
                'streams': 1,
                'bytes': PrefixedUnit(8, 'B')
            },
            'write streams': {
                'streams': 1,
                'bytes': PrefixedUnit(8, 'B')
            },
            'FLOPs per iteration': 0
        },
        'triad': {
            'read streams': {
                'streams': 3,
                'bytes': PrefixedUnit(24, 'B')
            },
            'read+write streams': {
                'streams': 0,
                'bytes': PrefixedUnit(0, 'B')
            },
            'write streams': {
                'streams': 1,
                'bytes': PrefixedUnit(8, 'B')
            },
            'FLOPs per iteration': 2
        },
        'daxpy': {
            'read streams': {
                'streams': 2,
                'bytes': PrefixedUnit(16, 'B')
            },
            'read+write streams': {
                'streams': 1,
                'bytes': PrefixedUnit(8, 'B')
            },
            'write streams': {
                'streams': 1,
                'bytes': PrefixedUnit(8, 'B')
            },
            'FLOPs per iteration': 2
        },
    }

    USAGE_FACTOR = 0.66
    MEM_FACTOR = 15.0

    cores = list(range(1, machine['cores per socket'] + 1))
    for mem in machine['memory hierarchy']:
        measurement = {}
        machine['benchmarks']['measurements'][mem['level']] = measurement

        for threads_per_core in range(1, machine['threads per core'] + 1):
            threads = [c * threads_per_core for c in cores]
            if mem['size per group'] is not None:
                total_sizes = [
                    PrefixedUnit(
                        max(
                            int(mem['size per group']) *
                            c / mem['cores per group'],
                            int(mem['size per group'])) * USAGE_FACTOR, 'B')
                    for c in cores
                ]
            else:
                last_mem = machine['memory hierarchy'][-2]
                total_sizes = [
                    last_mem['size per group'] * MEM_FACTOR for c in cores
                ]
            sizes_per_core = [t / cores[i] for i, t in enumerate(total_sizes)]
            sizes_per_thread = [
                t / threads[i] for i, t in enumerate(total_sizes)
            ]

            measurement[threads_per_core] = {
                'threads per core': threads_per_core,
                'cores': copy(cores),
                'threads': threads,
                'size per core': sizes_per_core,
                'size per thread': sizes_per_thread,
                'total size': total_sizes,
                'results': {},
            }
    print('Progress: ', end='', file=sys.stderr)
    sys.stderr.flush()
    for mem_level in list(machine['benchmarks']['measurements'].keys()):
        for threads_per_core in list(
                machine['benchmarks']['measurements'][mem_level].keys()):
            measurement = machine['benchmarks']['measurements'][mem_level][
                threads_per_core]
            measurement['results'] = {}
            for kernel in list(machine['benchmarks']['kernels'].keys()):
                measurement['results'][kernel] = []
                for i, total_size in enumerate(measurement['total size']):
                    measurement['results'][kernel].append(
                        measure_bw(kernel,
                                   int(float(total_size) / 1000),
                                   threads_per_core,
                                   machine['threads per core'],
                                   measurement['cores'][i],
                                   sockets=1))

                    print('.', end='', file=sys.stderr)
                    sys.stderr.flush()

    if sys.version_info[0] == 2:
        yaml.representer.Representer.add_representer(unicode, my_unicode_repr)

    machineyaml = machine['model name']
    machineyaml = ' '.join(machineyaml.split())
    machineyaml = machineyaml.replace('(R)', '')
    machineyaml = machineyaml.replace('@', '')
    machineyaml = machineyaml.replace('(TM)', '')
    machineyaml = machineyaml.replace(' ', '_') + '.yml'

    with io.open(machineyaml, 'w', encoding='utf8') as outfile:
        yaml.dump(machine,
                  outfile,
                  default_flow_style=False,
                  allow_unicode=True)
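
my_unicode_repr, registered above for Python 2 only, is not part of this snippet; a representer of that kind usually just dumps unicode objects as plain YAML strings. A minimal sketch, assuming PyYAML:

# Hypothetical PyYAML representer matching the registration above (Python 2 only):
# emit unicode objects as plain YAML strings instead of !!python/unicode tags.
def my_unicode_repr(dumper, data):
    return dumper.represent_scalar('tag:yaml.org,2002:str', data)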
Example #16
0
    def calculate_cache_access(self):
        if self._args.cache_predictor == 'SIM':
            self.predictor = CacheSimulationPredictor(self.kernel,
                                                      self.machine)
        elif self._args.cache_predictor == 'LC':
            self.predictor = LayerConditionPredictor(self.kernel, self.machine)
        else:
            raise NotImplementedError(
                "Unknown cache predictor, only LC (layer condition) and "
                "SIM (cache simulation with pycachesim) is supported.")
        self.results = {
            'misses': self.predictor.get_misses(),
            'hits': self.predictor.get_hits(),
            'evicts': self.predictor.get_evicts(),
            'verbose infos': self.predictor.get_infos(),  # only for verbose outputs
            'bottleneck level': 0,
            'mem bottlenecks': []
        }

        element_size = self.kernel.datatypes_size[self.kernel.datatype]
        cacheline_size = float(self.machine['cacheline size'])
        elements_per_cacheline = int(cacheline_size // element_size)

        total_flops = sum(self.kernel._flops.values()) * elements_per_cacheline

        # TODO let user choose threads_per_core:
        threads_per_core = 1

        # Compile relevant information

        # CPU-L1 stats (in bytes!)
        # We compile CPU-L1 stats on our own, because cacheprediction only works on cache lines
        read_offsets, write_offsets = zip(*list(
            self.kernel.compile_global_offsets(
                iteration=range(0, elements_per_cacheline))))
        read_offsets = set(
            [item for sublist in read_offsets for item in sublist])
        write_offsets = set(
            [item for sublist in write_offsets for item in sublist])

        write_streams = len(write_offsets)
        read_streams = len(read_offsets) + write_streams  # write-allocate
        total_loads = read_streams * element_size
        total_evicts = write_streams * element_size
        bw, measurement_kernel = self.machine.get_bandwidth(
            0,
            read_streams,
            write_streams,
            threads_per_core,
            cores=self._args.cores)

        # Calculate performance (arithmetic intensity * bandwidth), with
        # arithmetic intensity = flops / bytes loaded
        if total_loads == 0:
            # This happens in case of full-caching
            arith_intens = None
            performance = None
        else:
            arith_intens = float(total_flops) / total_loads
            performance = arith_intens * float(bw)

        self.results['mem bottlenecks'].append({
            'performance': PrefixedUnit(performance, 'FLOP/s'),
            'level': self.machine['memory hierarchy'][0]['level'],
            'arithmetic intensity': arith_intens,
            'bw kernel': measurement_kernel,
            'bandwidth': bw,
            'bytes transfered': total_loads
        })
        if performance <= self.results.get('min performance', performance):
            self.results['bottleneck level'] = len(
                self.results['mem bottlenecks']) - 1
            self.results['min performance'] = performance

        # for other cache and memory levels:
        for cache_level, cache_info in list(
                enumerate(self.machine['memory hierarchy']))[:-1]:
            # Compiling stats (in bytes!)
            total_misses = self.results['misses'][cache_level] * cacheline_size
            total_evicts = self.results['evicts'][cache_level] * cacheline_size

            # choose bw according to cache level and problem
            # first, compile stream counts at current cache level
            # write-allocate is already resolved above
            read_streams = self.results['misses'][cache_level]
            write_streams = self.results['evicts'][cache_level]
            # second, try to find the best fitting kernel (closest to the observed stream counts):
            bw, measurement_kernel = self.machine.get_bandwidth(
                cache_level + 1,
                read_streams,
                write_streams,
                threads_per_core,
                cores=self._args.cores)

            # Calculate performance (arithmetic intensity * bandwidth), with
            # arithmetic intensity = flops / bytes transferred
            bytes_transfered = total_misses + total_evicts

            if bytes_transfered == 0:
                # This happens in case of full-caching
                arith_intens = float('inf')
                performance = float('inf')
            else:
                arith_intens = float(total_flops) / bytes_transfered
                performance = arith_intens * float(bw)

            self.results['mem bottlenecks'].append({
                'performance': PrefixedUnit(performance, 'FLOP/s'),
                'level': self.machine['memory hierarchy'][cache_level + 1]['level'],
                'arithmetic intensity': arith_intens,
                'bw kernel': measurement_kernel,
                'bandwidth': bw,
                'bytes transfered': bytes_transfered
            })
            if performance < self.results.get('min performance', performance):
                self.results['bottleneck level'] = len(
                    self.results['mem bottlenecks']) - 1
                self.results['min performance'] = performance

        return self.results
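
This variant of calculate_cache_access uses FLOP-based arithmetic intensity: each level's candidate performance is FLOPs per cacheline of work divided by the bytes transferred at that level, multiplied by the measured bandwidth. A small worked example of that formula with invented traffic numbers (none of the values below come from the code above):

# FLOP-based roofline candidate for one memory level (illustrative numbers).
flops_per_iteration = 4
elements_per_cacheline = 8
total_flops = flops_per_iteration * elements_per_cacheline   # FLOPs per cacheline of work

bytes_transferred = 3 * 64      # assumed: three cachelines moved per cacheline of work
bandwidth = 47.2e9              # assumed measured bandwidth in B/s

arith_intens = float(total_flops) / bytes_transferred        # FLOP/B
performance = arith_intens * bandwidth                       # FLOP/s
print('arithmetic intensity [FLOP/B]:', arith_intens)
print('performance bound [FLOP/s]:', performance)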
Example #17
0
    def test_2d5pt_Roofline(self):
        store_file = os.path.join(self.temp_dir, 'test_2d5pt_Roofline.pickle')
        output_stream = StringIO()

        parser = kc.create_parser()
        args = parser.parse_args([
            '-m',
            self._find_file('SandyBridgeEP_E5-2680.yml'), '-p', 'Roofline',
            self._find_file('2d-5pt.c'), '-D', 'N', '1024-4096:3log2', '-D',
            'M', '50', '-vvv', '--store', store_file
        ])
        kc.check_arguments(args, parser)
        kc.run(parser, args, output_file=output_stream)

        with open(store_file, 'rb') as f:
            results = pickle.load(f)

        # Check if results contains correct kernel
        self.assertEqual(list(results), ['2d-5pt.c'])

        # Check for correct variations of constants
        self.assertCountEqual([
            sorted(map(str, r)) for r in results['2d-5pt.c']
        ], [
            sorted(map(str, r)) for r in [((sympy.var('M'), 50), (
                sympy.var('N'),
                1024)), ((sympy.var('M'), 50),
                         (sympy.var('N'),
                          2048)), ((sympy.var('M'), 50),
                                   (sympy.var('N'), 4096))]
        ])

        # Output of first result:
        result = results['2d-5pt.c'][[
            k for k in results['2d-5pt.c'] if (sympy.var('N'), 4096) in k
        ][0]]

        self.assertCountEqual(result, ['Roofline'])

        roofline = result['Roofline']
        assert_relativly_equal(roofline['min performance']['FLOP/s'],
                               5115000000.0, 0.01)
        self.assertEqual(roofline['bottleneck level'], 1)

        expected_btlncks = [{
            'arithmetic intensity': 0.11764705882352941,
            'bandwidth': PrefixedUnit(81.61, u'G', u'B/s'),
            'bw kernel': 'triad',
            'level': u'L1',
            'performance': PrefixedUnit(9601176470.588236, u'', u'FLOP/s')
        }, {
            'arithmetic intensity': 0.1,
            'bandwidth': PrefixedUnit(51.15, u'G', u'B/s'),
            'bw kernel': 'triad',
            'level': u'L2',
            'performance': PrefixedUnit(5115000000.0, u'', u'FLOP/s')
        }, {
            'arithmetic intensity': 1.0 / 6.0,
            'bandwidth': PrefixedUnit(34815.0, 'M', 'B/s'),
            'bw kernel': 'copy',
            'level': u'L3',
            'performance': PrefixedUnit(5802500000.0, u'', u'FLOP/s')
        }, {
            'arithmetic intensity': float('inf'),
            'bandwidth': PrefixedUnit(12.01, u'G', u'B/s'),
            'bw kernel': 'load',
            'level': u'MEM',
            'performance': PrefixedUnit(float('inf'), u'', u'FLOP/s')
        }]

        for i, btlnck in enumerate(expected_btlncks):
            for k, v in btlnck.items():
                if type(v) is not str:
                    if k == 'performance':
                        assert_relativly_equal(
                            roofline['mem bottlenecks'][i][k]['FLOP/s'], v,
                            0.05)
                    else:
                        assert_relativly_equal(
                            roofline['mem bottlenecks'][i][k], v, 0.05)
                else:
                    self.assertEqual(roofline['mem bottlenecks'][i][k], v)
Example #18
0
    def analyze(self):
        """Run analysis."""
        bench = self.kernel.build(verbose=self.verbose > 1,
                                  openmp=self._args.cores > 1)
        element_size = self.kernel.datatypes_size[self.kernel.datatype]

        # Build arguments to pass to command:
        args = [str(s) for s in list(self.kernel.constants.values())]

        # Determine base runtime with 10 iterations
        runtime = 0.0
        time_per_repetition = 2.0 / 10.0
        repetitions = self.iterations // 10
        mem_results = {}

        # TODO if cores > 1, results are for openmp run. Things might need to be changed here!

        while runtime < 1.5:
            # Interpolate to a 2.0s run
            if time_per_repetition != 0.0:
                repetitions = 2.0 // time_per_repetition
            else:
                repetitions = int(repetitions * 10)

            mem_results = self.perfctr([bench] + [str(repetitions)] + args,
                                       group="MEM")
            runtime = mem_results['Runtime (RDTSC) [s]']
            time_per_repetition = runtime / float(repetitions)
        raw_results = [mem_results]

        # Gather remaining counters
        if not self.no_phenoecm:
            # Build events and sympy expressions for all model metrics
            T_OL, event_counters = self.machine.parse_perfmetric(
                self.machine['overlapping model']
                ['performance counter metric'])
            T_data, event_dict = self.machine.parse_perfmetric(
                self.machine['non-overlapping model']
                ['performance counter metric'])
            event_counters.update(event_dict)
            cache_metrics = defaultdict(dict)
            for i in range(len(self.machine['memory hierarchy']) - 1):
                cache_info = self.machine['memory hierarchy'][i]
                name = cache_info['level']
                for k, v in cache_info['performance counter metrics'].items():
                    cache_metrics[name][
                        k], event_dict = self.machine.parse_perfmetric(v)
                    event_counters.update(event_dict)

            # Compile minimal runs to gather all required events
            minimal_runs = build_minimal_runs(list(event_counters.values()))
            measured_ctrs = {}
            for run in minimal_runs:
                ctrs = ','.join([eventstr(e) for e in run])
                r = self.perfctr([bench] + [str(repetitions)] + args,
                                 group=ctrs)
                raw_results.append(r)
                measured_ctrs.update(r)
            # Match measured counters to symbols
            event_counter_results = {}
            for sym, ctr in event_counters.items():
                event, regs, parameter = ctr[0], register_options(
                    ctr[1]), ctr[2]
                for r in regs:
                    if r in measured_ctrs[event]:
                        event_counter_results[sym] = measured_ctrs[event][r]

            # Analytical metrics needed for further calculation
            cl_size = float(self.machine['cacheline size'])
            elements_per_cacheline = cl_size // element_size
            total_iterations = self.kernel.iteration_length() * repetitions
            total_cachelines = total_iterations / elements_per_cacheline

            T_OL_result = T_OL.subs(event_counter_results) / total_cachelines
            cache_metric_results = defaultdict(dict)
            for cache, mtrcs in cache_metrics.items():
                for m, e in mtrcs.items():
                    cache_metric_results[cache][m] = e.subs(
                        event_counter_results)

            # Inter-cache transfers per CL
            cache_transfers_per_cl = {
                cache: {
                    k: PrefixedUnit(v / total_cachelines, 'CL/CL')
                    for k, v in d.items()
                }
                for cache, d in cache_metric_results.items()
            }
            cache_transfers_per_cl['L1']['accesses'].unit = 'LOAD/CL'

            # Select appropriate bandwidth
            mem_bw, mem_bw_kernel = self.machine.get_bandwidth(
                -1,  # mem
                cache_metric_results['L3']['misses'],  # load_streams
                cache_metric_results['L3']['evicts'],  # store_streams
                1)

            data_transfers = {
                # Assuming 0.5 cy / LOAD (SSE on SNB or IVB; AVX on HSW, BDW, SKL or SKX)
                'T_nOL': (cache_metric_results['L1']['accesses'] /
                          total_cachelines * 0.5),
                'T_L1L2':
                ((cache_metric_results['L1']['misses'] +
                  cache_metric_results['L1']['evicts']) / total_cachelines *
                 cl_size / self.machine['memory hierarchy'][1]
                 ['non-overlap upstream throughput'][0]),
                'T_L2L3':
                ((cache_metric_results['L2']['misses'] +
                  cache_metric_results['L2']['evicts']) / total_cachelines *
                 cl_size / self.machine['memory hierarchy'][2]
                 ['non-overlap upstream throughput'][0]),
                'T_L3MEM':
                ((cache_metric_results['L3']['misses'] +
                  cache_metric_results['L3']['evicts']) *
                 float(self.machine['cacheline size']) / total_cachelines /
                 mem_bw * float(self.machine['clock']))
            }

            # Build phenomenological ECM model:
            ecm_model = {'T_OL': T_OL_result}
            ecm_model.update(data_transfers)
        else:
            event_counters = {}
            ecm_model = None
            cache_transfers_per_cl = None

        self.results = {
            'raw output': raw_results,
            'ECM': ecm_model,
            'data transfers': cache_transfers_per_cl,
            'Runtime (per repetition) [s]': time_per_repetition,
            'event counters': event_counters
        }

        # TODO make more generic to support other (and multiple) constant names
        iterations_per_repetition = reduce(operator.mul, [
            self.kernel.subs_consts(max_ - min_) /
            self.kernel.subs_consts(step)
            for idx, min_, max_, step in self.kernel._loop_stack
        ], 1)
        self.results['Iterations per repetition'] = iterations_per_repetition
        iterations_per_cacheline = float(
            self.machine['cacheline size']) / element_size
        cys_per_repetition = time_per_repetition * float(self.machine['clock'])
        self.results['Runtime (per cacheline update) [cy/CL]'] = \
            (cys_per_repetition / iterations_per_repetition) * iterations_per_cacheline
        self.results['MEM volume (per repetition) [B]'] = \
            mem_results['Memory data volume [GBytes]'] * 1e9 / repetitions
        self.results['Performance [MFLOP/s]'] = \
            sum(self.kernel._flops.values()) / (
            time_per_repetition / iterations_per_repetition) / 1e6
        if 'Memory bandwidth [MBytes/s]' in mem_results:
            self.results['MEM BW [MByte/s]'] = mem_results[
                'Memory bandwidth [MBytes/s]']
        else:
            self.results['MEM BW [MByte/s]'] = mem_results[
                'Memory BW [MBytes/s]']
        self.results['Performance [MLUP/s]'] = (iterations_per_repetition /
                                                time_per_repetition) / 1e6
        self.results['Performance [MIt/s]'] = (iterations_per_repetition /
                                               time_per_repetition) / 1e6
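
# Note (not part of the original snippet): in the phenomenological ECM model the
# contributions gathered above are typically combined as
#     T_ECM = max(T_OL, T_nOL + T_L1L2 + T_L2L3 + T_L3MEM)   [cy/CL]
# i.e., overlapping core cycles hide behind the summed non-overlapping data-transfer
# cycles. With assumed values T_OL=8, T_nOL=4, T_L1L2=3, T_L2L3=4, T_L3MEM=7.5 this
# gives max(8, 18.5) = 18.5 cy/CL. The actual combination happens outside this method.
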
def run_kernel(kernel, args):
    machine = get_machine_model()
    # get per cachelevel performance counter information:
    event_counters = {}
    cache_metrics = defaultdict(dict)
    for i, cache_info in enumerate(machine['memory hierarchy']):
        name = cache_info['level']
        for k, v in cache_info['performance counter metrics'].items():
            if v is None:
                # Some info cannot be measured; skip it
                continue
            try:
                cache_metrics[name][k], event_dict = machine.parse_perfmetric(
                    v)
            except SyntaxError as e:
                print(
                    'Syntax error in machine file perf. metric: {}'.format(v),
                    e,
                    file=sys.stderr)
                continue
            event_counters.update(event_dict)

    bench_filename = f"build/{kernel}.{platform.machine()}"
    raw_results = []
    global_infos = {}
    # Compile minimal runs to gather all required events
    minimal_runs = benchmark.build_minimal_runs(list(event_counters.values()))
    measured_ctrs = defaultdict(dict)
    for run in minimal_runs:
        ctrs = ','.join([benchmark.eventstr(e) for e in run])
        r, o = perfctr([bench_filename] +
                       list(map(lambda t: ' '.join(map(str, t)), args)),
                       cores=1,
                       group=ctrs)
        global_infos = {}
        for m in [
                re.match(r"(:?([a-z_\-0-9]+):)?([a-z]+): ([a-z\_\-0-9]+)", l)
                for l in o
        ]:
            if m is not None:
                try:
                    v = int(m.group(4))
                except ValueError:
                    v = m.group(4)
                if m.group(1) is None:
                    global_infos[m.group(3)] = v
                else:
                    r[m.group(2)][m.group(3)] = v

        raw_results.append(o)
        for k in r:
            measured_ctrs[k].update(r[k])

    # Analytical metrics needed for further calculation
    cl_size = int(machine['cacheline size'])
    elementsize = global_infos["elementsize"]
    base_iterations = cl_size // elementsize

    event_counter_results = {}
    cache_metric_results = {}
    cache_transfers_per_cl = {}
    for kernel_run in measured_ctrs:
        # Match measured counters to symbols
        event_counter_results[kernel_run] = {}
        for sym, ctr in event_counters.items():
            event, regs, parameters = ctr[0], benchmark.register_options(
                ctr[1]), ctr[2]
            if parameters:
                parameter_str = ':'.join(parameters)
                regs = [r + ':' + parameter_str for r in regs]
            for r in regs:
                if r in measured_ctrs[kernel_run][event]:
                    event_counter_results[kernel_run][sym] = measured_ctrs[
                        kernel_run][event][r]

        cache_metric_results[kernel_run] = defaultdict(dict)
        for cache, mtrcs in cache_metrics.items():
            for m, e in mtrcs.items():
                cache_metric_results[kernel_run][cache][m] = e.subs(
                    event_counter_results[kernel_run])

        total_iterations = \
            measured_ctrs[kernel_run]['iterations'] * measured_ctrs[kernel_run]['repetitions']
        # Inter-cache transfers per CL
        cache_transfers_per_cl[kernel_run] = {
            cache: {
                k: PrefixedUnit(v / (total_iterations / base_iterations),
                                'CL/{}It.'.format(base_iterations))
                for k, v in d.items()
            }
            for cache, d in cache_metric_results[kernel_run].items()
        }

        cache_transfers_per_cl[kernel_run]['L1']['loads'].unit = \
            'LOAD/{}It.'.format(base_iterations)
        cache_transfers_per_cl[kernel_run]['L1']['stores'].unit = \
            'STORE/{}It.'.format(base_iterations)

    return cache_transfers_per_cl, global_infos, raw_results
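
run_kernel normalizes every measured counter to transfers per base_iterations iterations, i.e., per cacheline worth of elements. The same normalization in isolation, with invented counter values (the iteration and miss counts below are assumptions for illustration only):

# Normalizing raw cache-event counts to per-cacheline transfers (illustrative values).
cl_size = 64
elementsize = 8
base_iterations = cl_size // elementsize          # iterations covered by one cacheline

total_iterations = 1000000 * 10                   # assumed: iterations * repetitions
l1_misses = 1300000                               # assumed raw counter value

transfers_per_cl = l1_misses / (total_iterations / base_iterations)
print('L1 misses per {} iterations (one CL): {:.3f}'.format(base_iterations, transfers_per_cl))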