def measure_bw(type_, total_size, threads_per_core, max_threads_per_core, cores_per_socket,
               sockets):
    """Run likwid-bench and return the measured bandwidth as a PrefixedUnit in MB/s.

    :param type_: likwid-bench kernel name (e.g. 'copy', 'load')
    :param total_size: working-set size in kilobytes (*size* is given in kilo bytes)
    :param threads_per_core: number of threads to place on each core
    :param max_threads_per_core: hardware SMT threads available per core
    :param cores_per_socket: number of cores per socket
    :param sockets: number of sockets to spread workgroups over

    Exits the interpreter with status 1 if likwid-bench produces no output
    (older versions before 4.0 are not supported).
    """
    # One '-w' workgroup per socket, likwid-bench syntax:
    #   S<socket>:<size>kB:<threads>:1:<stride over SMT threads>
    groups = []
    for s in range(sockets):
        groups += [
            '-w',
            'S' + str(s) + ':' + str(total_size) + 'kB:' +
            str(threads_per_core * cores_per_socket) + ':1:' +
            str(int(max_threads_per_core / threads_per_core))
        ]
    # for older likwid versions add ['-g', str(sockets), '-i', str(iterations)] to cmd
    cmd = ['likwid-bench', '-t', type_] + groups
    # FIX: terminate the echoed command with a newline so it does not run into
    # the next line written to stderr.
    sys.stderr.write(' '.join(cmd) + '\n')
    output = subprocess.Popen(
        cmd, stdout=subprocess.PIPE).communicate()[0].decode('utf-8')
    if not output:
        print(' '.join(cmd) + ' returned no output, possibly wrong version installed '
              '(requires 4.0 or later)', file=sys.stderr)
        sys.exit(1)
    # Extract the bandwidth line reported by likwid-bench.
    bw = float(
        get_match_or_break(r'^MByte/s:\s+([0-9]+(?:\.[0-9]+)?)\s*$', output)[0])
    print(' ', PrefixedUnit(bw, 'MB/s'), file=sys.stderr)
    return PrefixedUnit(bw, 'MB/s')
def conv_cy(self, cy_cl, unit, default='cy/CL'):
    """Convert cycles (cy/CL) to other units, such as FLOP/s or It/s."""
    # Normalize input to a PrefixedUnit carrying cy/CL.
    if not isinstance(cy_cl, PrefixedUnit):
        cy_cl = PrefixedUnit(cy_cl, '', 'cy/CL')
    # Fall back to the default target unit when none was requested.
    unit = unit or default

    clock = self.machine['clock']
    element_size = self.kernel.datatypes_size[self.kernel.datatype]
    elements_per_cacheline = int(self.machine['cacheline size']) // element_size

    # Iterations per second: clock / cycles-per-CL, scaled to elements per CL.
    if cy_cl == 0:
        it_s = PrefixedUnit('inf It/S')
    else:
        it_s = clock / cy_cl * elements_per_cacheline
        it_s.unit = 'It/s'

    # FLOP/s follows from the kernel's FLOPs per iteration.
    flops_per_it = sum(self.kernel._flops.values())
    performance = it_s * flops_per_it
    performance.unit = 'FLOP/s'

    # Cycles per single iteration.
    cy_it = cy_cl * elements_per_cacheline
    cy_it.unit = 'cy/It'

    conversions = {
        'It/s': it_s,
        'cy/CL': cy_cl,
        'cy/It': cy_it,
        'FLOP/s': performance,
    }
    return conversions[unit]
def test_2d5pt_Roofline(self):
    """End-to-end Roofline analysis of the 2d-5pt kernel through the CLI entry points.

    Runs kerncraft with the phinally_gcc machine file over three problem sizes,
    then validates the pickled results against hand-computed expectations.
    """
    store_file = os.path.join(self.temp_dir, 'test_2d5pt_Roofline.pickle')
    output_stream = StringIO()
    parser = kc.create_parser()
    # N sweeps 1024..4096 in 3 log2 steps; M is fixed at 50.
    args = parser.parse_args(['-m', self._find_file('phinally_gcc.yaml'),
                              '-p', 'Roofline',
                              self._find_file('2d-5pt.c'),
                              '-D', 'N', '1024-4096:3log2',
                              '-D', 'M', '50',
                              '-vvv',
                              '--store', store_file])
    kc.check_arguments(args, parser)
    kc.run(parser, args, output_file=output_stream)

    results = pickle.load(open(store_file, 'rb'))

    # Check if results contains correct kernel
    self.assertEqual(list(results), ['2d-5pt.c'])

    # Check for correct variations of constants
    six.assertCountEqual(
        self,
        [sorted(map(str, r)) for r in results['2d-5pt.c']],
        [sorted(map(str, r)) for r in [
            ((sympy.var('M'), 50), (sympy.var('N'), 1024)),
            ((sympy.var('M'), 50), (sympy.var('N'), 2048)),
            ((sympy.var('M'), 50), (sympy.var('N'), 4096))]])

    # Output of first result:
    result = results['2d-5pt.c'][[k for k in results['2d-5pt.c']
                                  if (sympy.var('N'), 4096) in k][0]]
    six.assertCountEqual(self, result, ['Roofline'])

    roofline = result['Roofline']
    self.assertAlmostEqual(roofline['min performance'], 5802500000.0, places=0)
    self.assertEqual(roofline['bottleneck level'], 2)

    # Expected per-level bottleneck entries (L1..MEM) for N=4096, M=50.
    expected_btlncks = [{u'arithmetic intensity': 0.11764705882352941,
                         u'bandwidth': PrefixedUnit(122.97, u'G', u'B/s'),
                         u'bw kernel': 'copy',
                         u'level': u'L1',
                         u'performance': PrefixedUnit(14467058823.529411, u'', u'FLOP/s')},
                        {u'arithmetic intensity': 0.1,
                         u'bandwidth': PrefixedUnit(61.92, u'G', u'B/s'),
                         u'bw kernel': 'copy',
                         u'level': u'L2',
                         u'performance': PrefixedUnit(6192000000.0, u'', u'FLOP/s')},
                        {u'arithmetic intensity': 0.16666666666666666,
                         u'bandwidth': PrefixedUnit(34815.0, u'M', u'B/s'),
                         u'bw kernel': 'copy',
                         u'level': u'L3',
                         u'performance': PrefixedUnit(5802500000.0, u'', u'FLOP/s')},
                        {u'arithmetic intensity': float(0.5),
                         u'bandwidth': PrefixedUnit(12.01, u'G', u'B/s'),
                         u'bw kernel': 'load',
                         u'level': u'MEM',
                         u'performance': PrefixedUnit(6005000000.0, u'', u'FLOP/s')}]

    # Compare key-by-key for more precise failure messages than one big dict diff.
    for i, btlnck in enumerate(expected_btlncks):
        for k, v in btlnck.items():
            self.assertEqual(roofline['mem bottlenecks'][i][k], v)
def report(self, output_file=sys.stdout):
    """Print a human-readable Roofline report to *output_file*.

    Expects ``self.results`` to be populated (CPU bottleneck, per-level memory
    bottlenecks, minimum performance). Verbosity controlled via ``self._args``.
    """
    cpu_flops = PrefixedUnit(
        self.results['cpu bottleneck']['performance throughput'], "FLOP/s")

    if self._args and self._args.verbose >= 3:
        print('{}'.format(pformat(self.results)), file=output_file)

    if self._args and self._args.verbose >= 1:
        print('Bottlenecks:', file=output_file)
        print(' level | a. intensity | performance | bandwidth | bandwidth kernel',
              file=output_file)
        print('--------+--------------+-----------------+--------------+-----------------',
              file=output_file)
        print(' CPU | | {!s:>15} | |'.format(
            self.conv_perf(cpu_flops, self._args.unit)), file=output_file)
        for b in self.results['mem bottlenecks']:
            if b is None:
                continue  # Skip CPU-L1 from Roofline model
            print('{level:>7} | {arithmetic intensity:>5.2} FLOP/B | {!s:>15} |'
                  ' {bandwidth!s:>12} | {bw kernel:<8}'.format(
                      self.conv_perf(b['performance'], self._args.unit), **b),
                  file=output_file)
        print('', file=output_file)
        # FIX: corrected typo "analisys" -> "analysis" in user-facing output.
        print('IACA analysis:', file=output_file)
        print('{!s}'.format({
            k: v
            for k, v in list(self.results['cpu bottleneck'].items())
            if k not in ['IACA output']
        }), file=output_file)

    if float(self.results['min performance']) > float(cpu_flops):
        # CPU bound
        print('CPU bound with {} core(s)'.format(self._args.cores), file=output_file)
        print('{!s} due to CPU bottleneck'.format(
            self.conv_perf(cpu_flops, self._args.unit)), file=output_file)
    else:
        # Cache or mem bound
        print('Cache or mem bound with {} core(s)'.format(self._args.cores),
              file=output_file)
        bottleneck = self.results['mem bottlenecks'][self.results['bottleneck level']]
        # FIX: corrected ungrammatical "bw with from" -> "bw from".
        print('{!s} due to {} transfer bottleneck (bw from {} benchmark)'.format(
            self.conv_perf(bottleneck['performance'], self._args.unit),
            bottleneck['level'],
            bottleneck['bw kernel']), file=output_file)
        print('Arithmetic Intensity: {:.2f} FLOP/B'.format(
            bottleneck['arithmetic intensity']), file=output_file)
def analyze(self):
    """Run complete analysis.

    Combines CPU and data analysis results, derives a simple scaling
    prediction and, if more than one core was requested, a multi-core
    performance estimate.

    :raises ValueError: if the requested core count exceeds the machine's
        total core count.
    """
    self._CPU.analyze()
    self._data.analyze()
    self.results = copy.deepcopy(self._CPU.results)
    self.results.update(copy.deepcopy(self._data.results))

    # Simple scaling prediction:
    # Assumptions are:
    #  - bottleneck is always LLC-MEM
    #  - all caches scale with number of cores (bw AND size(WRONG!))
    if self.results['cycles'][-1][1] == 0.0:
        # Full caching in higher cache level
        self.results['scaling cores'] = float('inf')
    else:
        self.results['scaling cores'] = (
            max(self.results['T_OL'],
                self.results['T_nOL'] + sum([c[1] for c in self.results['cycles']])) /
            self.results['cycles'][-1][1])

    # Compile total single-core prediction
    self.results['total cycles'] = self._CPU.conv_cy(
        max(self.results['T_OL'],
            sum([self.results['T_nOL']] + [i[1] for i in self.results['cycles']])))

    # Detailed scaling:
    if self._args.cores > 1:
        notes = []
        cores_per_numa_domain = self.machine['cores per NUMA domain']
        innuma_cores = min(self._args.cores, cores_per_numa_domain)
        if innuma_cores <= self.results['scaling cores']:
            innuma_rectp = PrefixedUnit(
                max(sum([c[1] for c in self.results['cycles']]) + self.results['T_nOL'],
                    self.results['T_OL']) / innuma_cores,
                "cy/CL")
            notes.append("memory-interface not saturated")
        else:
            innuma_rectp = PrefixedUnit(self.results['cycles'][-1][1], 'cy/CL')
            notes.append("memory-interface saturated on first socket")
        if 0 < self._args.cores <= cores_per_numa_domain:
            # only in-numa scaling to consider
            multi_core_perf = self._CPU.conv_cy(innuma_rectp)
            notes.append("in-NUMA-domain scaling")
        elif self._args.cores <= self.machine['cores per socket'] * self.machine['sockets']:
            # out-of-numa scaling behavior
            multi_core_perf = self._CPU.conv_cy(
                innuma_rectp * innuma_cores / self._args.cores)
            notes.append("out-of-NUMA-domain scaling")
        else:
            # FIX: the original adjacent string literals concatenated to
            # "...sockets inmachine file." (missing space) and used "upto".
            raise ValueError(
                "Number of cores must be greater than zero and up to the max. "
                "number of cores defined by cores per socket and sockets in "
                "machine file.")
        self.results['multi-core'] = {
            'cores': self._args.cores,
            'performance': multi_core_perf,
            'notes': notes
        }
    else:
        self.results['multi-core'] = None
def test_get_machine_topology(self):
    """Check the machine-file skeleton produced from a dummy likwid + cpuinfo fixture.

    The expected dict mixes concrete readouts (model name, core counts, cache
    sizes) with 'INFORMATION_REQUIRED ...' placeholders that the user must fill
    in by hand. Note: the 'INFORAMTION_REQUIRED' misspellings below match the
    strings actually emitted by the code under test and must not be "fixed" here.
    """
    # patch environment to include dummy likwid
    environ_orig = os.environ
    os.environ['PATH'] = self._find_file('dummy_likwid') + ':' + os.environ['PATH']

    self.maxDiff = None
    self.assertEqual(
        lba.get_machine_topology(cpuinfo_path=self._find_file('cpuinfo')),
        {'kerncraft version': kerncraft_version,
         'FLOPs per cycle': {'DP': {'ADD': 'INFORMATION_REQUIRED',
                                    'FMA': 'INFORMATION_REQUIRED',
                                    'MUL': 'INFORMATION_REQUIRED',
                                    'total': 'INFORMATION_REQUIRED'},
                             'SP': {'ADD': 'INFORMATION_REQUIRED',
                                    'FMA': 'INFORMATION_REQUIRED',
                                    'MUL': 'INFORMATION_REQUIRED',
                                    'total': 'INFORMATION_REQUIRED'}},
         'NUMA domains per socket': 1,
         'cacheline size': 'INFORMATION_REQUIRED (in bytes, e.g. 64 B)',
         'clock': 'INFORMATION_REQUIRED (e.g., 2.7 GHz)',
         'compiler': collections.OrderedDict([
             ('icc', 'INFORMATION_REQUIRED (e.g., -O3 -fno-alias -xAVX)'),
             ('clang',
              'INFORMATION_REQUIRED (e.g., -O3 -mavx, -D_POSIX_C_SOURCE=200112L'),
             ('gcc', 'INFORMATION_REQUIRED (e.g., -O3 -march=ivybridge)')]),
         'cores per NUMA domain': 10,
         'cores per socket': 10,
         # Three cache levels plus MEM, as reported by the dummy likwid fixture.
         'memory hierarchy': [{'cache per group': {
                                   'cl_size': 'INFORMATION_REQUIRED '
                                              '(sets*ways*cl_size=32.00 kB)',
                                   'load_from': 'L2',
                                   'replacement_policy': 'INFORMATION_REQUIRED (options: '
                                                         'LRU, FIFO, MRU, RR)',
                                   'sets': 'INFORMATION_REQUIRED (sets*ways*cl_size=32.00 '
                                           'kB)',
                                   'store_to': 'L2',
                                   'ways': 'INFORMATION_REQUIRED (sets*ways*cl_size=32.00 '
                                           'kB)',
                                   'write_allocate': 'INFORMATION_REQUIRED (True/False)',
                                   'write_back': 'INFORMATION_REQUIRED (True/False)'},
                               'cores per group': 1,
                               'groups': 20,
                               'level': 'L1',
                               'performance counter metrics': {
                                   'accesses': 'INFORMATION_REQUIRED (e.g., '
                                               'L1D_REPLACEMENT__PMC0)',
                                   'evicts': 'INFORMATION_REQUIRED (e.g., '
                                             'L2_LINES_OUT_DIRTY_ALL__PMC2)',
                                   'misses': 'INFORMATION_REQUIRED (e.g., '
                                             'L2_LINES_IN_ALL__PMC1)'},
                               'size per group': PrefixedUnit(32.0, 'k', 'B'),
                               'threads per group': 2},
                              {'cache per group': {
                                   'cl_size': 'INFORMATION_REQUIRED '
                                              '(sets*ways*cl_size=256.00 kB)',
                                   'load_from': 'L3',
                                   'replacement_policy': 'INFORMATION_REQUIRED (options: '
                                                         'LRU, FIFO, MRU, RR)',
                                   'sets': 'INFORMATION_REQUIRED (sets*ways*cl_size=256.00 '
                                           'kB)',
                                   'store_to': 'L3',
                                   'ways': 'INFORMATION_REQUIRED (sets*ways*cl_size=256.00 '
                                           'kB)',
                                   'write_allocate': 'INFORMATION_REQUIRED (True/False)',
                                   'write_back': 'INFORMATION_REQUIRED (True/False)'},
                               'cores per group': 1,
                               'non-overlap upstream throughput': [
                                   'INFORMATION_REQUIRED (e.g. 24 B/cy)',
                                   'INFORMATION_REQUIRED (e.g. "half-duplex" or "full-duplex")'],
                               'groups': 20,
                               'level': 'L2',
                               'performance counter metrics': {
                                   'accesses': 'INFORMATION_REQUIRED (e.g., '
                                               'L1D_REPLACEMENT__PMC0)',
                                   'evicts': 'INFORMATION_REQUIRED (e.g., '
                                             'L2_LINES_OUT_DIRTY_ALL__PMC2)',
                                   'misses': 'INFORMATION_REQUIRED (e.g., '
                                             'L2_LINES_IN_ALL__PMC1)'},
                               'size per group': PrefixedUnit(256.0, 'k', 'B'),
                               'threads per group': 2},
                              {'cache per group': {
                                   'cl_size': 'INFORMATION_REQUIRED '
                                              '(sets*ways*cl_size=25.00 MB)',
                                   'replacement_policy': 'INFORMATION_REQUIRED (options: '
                                                         'LRU, FIFO, MRU, RR)',
                                   'sets': 'INFORMATION_REQUIRED (sets*ways*cl_size=25.00 '
                                           'MB)',
                                   'ways': 'INFORMATION_REQUIRED (sets*ways*cl_size=25.00 '
                                           'MB)',
                                   'write_allocate': 'INFORMATION_REQUIRED (True/False)',
                                   'write_back': 'INFORMATION_REQUIRED (True/False)'},
                               'cores per group': 10,
                               'non-overlap upstream throughput': [
                                   'INFORMATION_REQUIRED (e.g. 24 B/cy)',
                                   'INFORMATION_REQUIRED (e.g. "half-duplex" or "full-duplex")'],
                               'groups': 2,
                               'level': 'L3',
                               'performance counter metrics': {
                                   'accesses': 'INFORMATION_REQUIRED (e.g., '
                                               'L1D_REPLACEMENT__PMC0)',
                                   'evicts': 'INFORMATION_REQUIRED (e.g., '
                                             'L2_LINES_OUT_DIRTY_ALL__PMC2)',
                                   'misses': 'INFORMATION_REQUIRED (e.g., '
                                             'L2_LINES_IN_ALL__PMC1)'},
                               'size per group': PrefixedUnit(25.0, 'M', 'B'),
                               'threads per group': 20},
                              {'cores per group': 10,
                               'non-overlap upstream throughput': [
                                   'full socket memory bandwidth',
                                   'INFORMATION_REQUIRED (e.g. "half-duplex" or "full-duplex")'],
                               'level': 'MEM',
                               'penalty cycles per read stream': 0,
                               'size per group': None,
                               'threads per group': 20}],
         'micro-architecture-modeler': 'INFORMATION_REQUIRED (options: OSACA, IACA)',
         'micro-architecture':
             'INFORMATION_REQUIRED (options: NHM, WSM, SNB, IVB, HSW, BDW, SKL, SKX)',
         'model name': 'Intel(R) Xeon(R) CPU E5-2660 v2 @ 2.20GHz',
         'model type': 'Intel Xeon IvyBridge EN/EP/EX processor',
         'non-overlapping model': {
             'performance counter metric': 'INFORAMTION_REQUIRED '
                 'Example:max(UOPS_DISPATCHED_PORT_PORT_0__PMC2, '
                 'UOPS_DISPATCHED_PORT_PORT_1__PMC3, '
                 'UOPS_DISPATCHED_PORT_PORT_4__PMC0, '
                 'UOPS_DISPATCHED_PORT_PORT_5__PMC1)',
             'ports': 'INFORAMTION_REQUIRED (list of ports as they appear in IACA, '
                      'e.g.), ["0", "0DV", "1", "2", "2D", "3", "3D", "4", "5", "6", '
                      '"7"])'},
         'overlapping model': {
             'performance counter metric': 'INFORAMTION_REQUIRED '
                 'Example:max(UOPS_DISPATCHED_PORT_PORT_0__PMC2, '
                 'UOPS_DISPATCHED_PORT_PORT_1__PMC3, '
                 'UOPS_DISPATCHED_PORT_PORT_4__PMC0, '
                 'UOPS_DISPATCHED_PORT_PORT_5__PMC1)',
             'ports': 'INFORAMTION_REQUIRED (list of ports as they appear in IACA, '
                      'e.g.), ["0", "0DV", "1", "2", "2D", "3", "3D", "4", "5", "6", '
                      '"7"])'},
         'sockets': 2,
         'threads per core': 2})
    # restore environment
    os.environ = environ_orig
def test_machine_model_update(self):
    """Check MachineModel.update() output against the dummy likwid + cpuinfo fixture.

    Compares key-by-key (rather than one big assertEqual) so a mismatch points
    at the offending top-level entry. The 'e.g.,,' double comma in the ports
    strings matches the code under test and must stay as-is.
    """
    # patch environment to include dummy likwid
    environ_orig = os.environ
    os.environ['PATH'] = self._find_file('dummy_likwid') + ':' + os.environ['PATH']

    m = machinemodel.MachineModel()
    # benchmarks=False: only readouts and memory hierarchy are refreshed here.
    m.update(readouts=True, memory_hierarchy=True, benchmarks=False, overwrite=True,
             cpuinfo_path=self._find_file('cpuinfo'))

    self.maxDiff = None

    correct = {'kerncraft version': kerncraft_version,
               'FLOPs per cycle': {'DP': {'ADD': 'INFORMATION_REQUIRED',
                                          'FMA': 'INFORMATION_REQUIRED',
                                          'MUL': 'INFORMATION_REQUIRED',
                                          'total': 'INFORMATION_REQUIRED'},
                                   'SP': {'ADD': 'INFORMATION_REQUIRED',
                                          'FMA': 'INFORMATION_REQUIRED',
                                          'MUL': 'INFORMATION_REQUIRED',
                                          'total': 'INFORMATION_REQUIRED'}},
               'NUMA domains per socket': 1,
               'benchmarks': 'INFORMATION_REQUIRED',
               'cacheline size': 'INFORMATION_REQUIRED (in bytes, e.g. 64 B)',
               'clock': PrefixedUnit(2200000000.0, '', 'Hz'),
               'compiler': OrderedDict(
                   [('icc', 'INFORMATION_REQUIRED (e.g., -O3 -fno-alias -xAVX)'),
                    ('clang',
                     'INFORMATION_REQUIRED (e.g., -O3 -mavx, -D_POSIX_C_SOURCE=200112L, check `gcc -march=native -Q --help=target | '
                     'grep -- "-march="`)'),
                    ('gcc',
                     'INFORMATION_REQUIRED (e.g., -O3 -march=ivybridge, check `gcc -march=native -Q --help=target | grep -- '
                     '"-march="`)')]),
               'cores per NUMA domain': 10,
               'cores per socket': 10,
               'in-core model': OrderedDict([
                   ('IACA',
                    'INFORMATION_REQUIRED (e.g., NHM, WSM, SNB, IVB, HSW, BDW, SKL, SKX)'),
                   ('OSACA',
                    'INFORMATION_REQUIRED (e.g., NHM, WSM, SNB, IVB, HSW, BDW, SKL, SKX)'),
                   ('LLVM-MCA',
                    'INFORMATION_REQUIRED (e.g., -mcpu=skylake-avx512)')]),
               # Ordered entries: L1, L2, L3 caches then MEM.
               'memory hierarchy': [
                   OrderedDict([('level', 'L1'),
                                ('performance counter metrics',
                                 {'accesses': 'INFORMATION_REQUIRED (e.g., L1D_REPLACEMENT__PMC0)',
                                  'evicts': 'INFORMATION_REQUIRED (e.g., L2_LINES_OUT_DIRTY_ALL__PMC2)',
                                  'misses': 'INFORMATION_REQUIRED (e.g., L2_LINES_IN_ALL__PMC1)'}),
                                ('cache per group',
                                 OrderedDict([('sets', 'INFORMATION_REQUIRED (sets*ways*cl_size=32.00 kB)'),
                                              ('ways', 'INFORMATION_REQUIRED (sets*ways*cl_size=32.00 kB)'),
                                              ('cl_size', 'INFORMATION_REQUIRED (sets*ways*cl_size=32.00 kB)'),
                                              ('replacement_policy', 'INFORMATION_REQUIRED (options: LRU, FIFO, MRU, RR)'),
                                              ('write_allocate', 'INFORMATION_REQUIRED (True/False)'),
                                              ('write_back', 'INFORMATION_REQUIRED (True/False)'),
                                              ('load_from', 'L2'),
                                              ('store_to', 'L2')])),
                                ('size per group', PrefixedUnit(32.0, 'k', 'B')),
                                ('groups', 20),
                                ('cores per group', 1),
                                ('threads per group', 2)]),
                   OrderedDict([('level', 'L2'),
                                ('upstream throughput',
                                 ['INFORMATION_REQUIRED (e.g. 24 B/cy)',
                                  'INFORMATION_REQUIRED (e.g. "half-duplex" or "full-duplex")']),
                                ('performance counter metrics',
                                 {'accesses': 'INFORMATION_REQUIRED (e.g., L1D_REPLACEMENT__PMC0)',
                                  'evicts': 'INFORMATION_REQUIRED (e.g., L2_LINES_OUT_DIRTY_ALL__PMC2)',
                                  'misses': 'INFORMATION_REQUIRED (e.g., L2_LINES_IN_ALL__PMC1)'}),
                                ('cache per group',
                                 OrderedDict([('sets', 'INFORMATION_REQUIRED (sets*ways*cl_size=256.00 kB)'),
                                              ('ways', 'INFORMATION_REQUIRED (sets*ways*cl_size=256.00 kB)'),
                                              ('cl_size', 'INFORMATION_REQUIRED (sets*ways*cl_size=256.00 kB)'),
                                              ('replacement_policy', 'INFORMATION_REQUIRED (options: LRU, FIFO, MRU, RR)'),
                                              ('write_allocate', 'INFORMATION_REQUIRED (True/False)'),
                                              ('write_back', 'INFORMATION_REQUIRED (True/False)'),
                                              ('load_from', 'L3'),
                                              ('store_to', 'L3')])),
                                ('size per group', PrefixedUnit(256.0, 'k', 'B')),
                                ('groups', 20),
                                ('cores per group', 1),
                                ('threads per group', 2)]),
                   OrderedDict([('level', 'L3'),
                                ('upstream throughput',
                                 ['INFORMATION_REQUIRED (e.g. 24 B/cy)',
                                  'INFORMATION_REQUIRED (e.g. "half-duplex" or "full-duplex")']),
                                ('performance counter metrics',
                                 {'accesses': 'INFORMATION_REQUIRED (e.g., L1D_REPLACEMENT__PMC0)',
                                  'evicts': 'INFORMATION_REQUIRED (e.g., L2_LINES_OUT_DIRTY_ALL__PMC2)',
                                  'misses': 'INFORMATION_REQUIRED (e.g., L2_LINES_IN_ALL__PMC1)'}),
                                ('cache per group',
                                 OrderedDict([('sets', 'INFORMATION_REQUIRED (sets*ways*cl_size=25.00 MB)'),
                                              ('ways', 'INFORMATION_REQUIRED (sets*ways*cl_size=25.00 MB)'),
                                              ('cl_size', 'INFORMATION_REQUIRED (sets*ways*cl_size=25.00 MB)'),
                                              ('replacement_policy', 'INFORMATION_REQUIRED (options: LRU, FIFO, MRU, RR)'),
                                              ('write_allocate', 'INFORMATION_REQUIRED (True/False)'),
                                              ('write_back', 'INFORMATION_REQUIRED (True/False)')])),
                                ('size per group', PrefixedUnit(25.0, 'M', 'B')),
                                ('groups', 2),
                                ('cores per group', 10),
                                ('threads per group', 20)]),
                   OrderedDict([('level', 'MEM'),
                                ('cores per group', 10),
                                ('threads per group', 20),
                                ('upstream throughput',
                                 ['full socket memory bandwidth',
                                  'INFORMATION_REQUIRED (e.g. "half-duplex" or "full-duplex")']),
                                ('penalty cycles per read stream', 0),
                                ('size per group', None)])],
               'model name': 'Intel(R) Xeon(R) CPU E5-2660 v2 @ 2.20GHz',
               'model type': 'Intel Xeon IvyBridge EN/EP/EX processor',
               'non-overlapping model': {
                   'performance counter metric':
                       'INFORMATION_REQUIRED Example:max(UOPS_DISPATCHED_PORT_PORT_0__PMC2, '
                       'UOPS_DISPATCHED_PORT_PORT_1__PMC3, UOPS_DISPATCHED_PORT_PORT_4__PMC0, '
                       'UOPS_DISPATCHED_PORT_PORT_5__PMC1)',
                   'ports':
                       'INFORMATION_REQUIRED (list of ports as they appear in IACA, e.g.,, ["0", "0DV", "1", "2", "2D", "3", '
                       '"3D", "4", "5", "6", "7"])'},
               'overlapping model': {
                   'performance counter metric':
                       'INFORMATION_REQUIRED Example:max(UOPS_DISPATCHED_PORT_PORT_0__PMC2, '
                       'UOPS_DISPATCHED_PORT_PORT_1__PMC3, UOPS_DISPATCHED_PORT_PORT_4__PMC0, '
                       'UOPS_DISPATCHED_PORT_PORT_5__PMC1)',
                   'ports':
                       'INFORMATION_REQUIRED (list of ports as they appear in IACA, e.g.,, ["0", "0DV", "1", "2", "2D", "3", "3D", '
                       '"4", "5", "6", "7"])'},
               'sockets': 2,
               'threads per core': 2}
    # Key-by-key comparison for readable failure output.
    for k in correct:
        self.assertEqual(m[k], correct[k])

    # restore environment
    os.environ = environ_orig
def report(self, output_file=sys.stdout):
    """Print generated model data in human readable format.

    Emits the compact ECM notation '{ T_OL || T_nOL | T_L1L2 | ... } cy/CL',
    the cumulative form, the saturation core count, and (for cores > 1) a
    static-scheduling multi-core prediction.

    :raises ValueError: if the requested core count exceeds the machine's
        total core count.
    """
    report = ''
    if self.verbose > 1:
        self._CPU.report()
        self._data.report()
    total_cycles = max(
        self.results['T_OL'],
        sum([self.results['T_nOL']] + [i[1] for i in self.results['cycles']]))
    report += '{{ {:.1f} || {:.1f} | {} }} cy/CL'.format(
        self.results['T_OL'], self.results['T_nOL'],
        ' | '.join(['{:.1f}'.format(i[1]) for i in self.results['cycles']]))
    if self._args.cores > 1:
        report += " (single core)"

    if self._args.unit:
        report += ' = {}'.format(self._CPU.conv_cy(total_cycles, self._args.unit))

    # FIX: use '\\' instead of the invalid escape sequence '\ ' (same runtime
    # string, but no DeprecationWarning/SyntaxWarning on modern Python).
    report += '\n{{ {:.1f} \\ {} }} cy/CL'.format(
        max(self.results['T_OL'], self.results['T_nOL']),
        ' \\ '.join(['{:.1f}'.format(
            max(sum([x[1] for x in self.results['cycles'][:i + 1]]) +
                self.results['T_nOL'],
                self.results['T_OL']))
            for i in range(len(self.results['cycles']))]))
    if self._args.cores > 1:
        report += " (single core)"

    report += '\nsaturating at {:.1f} cores'.format(self.results['scaling cores'])

    if self._args.cores > 1:
        report += "\nprediction for {} cores,".format(self._args.cores) + \
                  " assuming static scheduling:\n"
        # out-of-core scaling prediction:
        cores_per_numa_domain = self.machine['cores per NUMA domain']
        innuma_cores = min(self._args.cores, cores_per_numa_domain)
        if innuma_cores <= self.results['scaling cores']:
            innuma_rectp = PrefixedUnit(
                max(sum([c[1] for c in self.results['cycles']]) + self.results['T_nOL'],
                    self.results['T_OL']) / innuma_cores,
                "cy/CL")
            note = "memory-interface not saturated"
        else:
            innuma_rectp = PrefixedUnit(self.results['cycles'][-1][1], 'cy/CL')
            note = "memory-interface saturated on first socket"
        if 0 < self._args.cores <= cores_per_numa_domain:
            # only in-numa scaling to consider
            report += "{}".format(self._CPU.conv_cy(innuma_rectp, self._args.unit))
            note += ", in-NUMA-domain scaling"
        elif self._args.cores <= self.machine['cores per socket'] * self.machine['sockets']:
            # out-of-numa scaling behavior
            report += "{}".format(self._CPU.conv_cy(
                innuma_rectp * innuma_cores / self._args.cores, self._args.unit))
            note += ", out-of-NUMA-domain scaling"
        else:
            # FIX: the original adjacent string literals concatenated to
            # "...sockets inmachine file." (missing space) and used "upto".
            raise ValueError(
                "Number of cores must be greater than zero and up to the max. "
                "number of cores defined by cores per socket and sockets in "
                "machine file.")
        report += " ({})\n".format(note)

    print(report, file=output_file)

    if self._args and self._args.ecm_plot:
        assert plot_support, "matplotlib couldn't be imported. Plotting is not supported."
        fig = plt.figure(frameon=False)
        self.plot(fig)
def analyze(self):
    """Run complete analysis.

    Builds the ECM model tuple (overlapping + non-overlapping contributions),
    the total single-core prediction, a memory-bus-utilization based saturation
    estimate, and per-core-count scaling predictions.

    :raises ValueError: if overlap flags alternate between cache levels, or if
        the requested core count exceeds the machine's total core count.
    """
    self._CPU.analyze()
    self._data.analyze()
    self.results = copy.deepcopy(self._CPU.results)
    self.results.update(copy.deepcopy(self._data.results))
    cores_per_numa_domain = self.machine['cores per NUMA domain']

    # Compile ECM model
    ECM_OL, ECM_OL_construction = [self.results['T_comp']], ['T_comp']
    ECM_nOL, ECM_nOL_construction = [], []
    if self.machine['memory hierarchy'][0]['transfers overlap']:
        nonoverlap_region = False
        ECM_OL.append(self.results['T_RegL1'])
        ECM_OL_construction.append('T_RegL1')
    else:
        nonoverlap_region = True
        ECM_nOL.append(self.results['T_RegL1'])
        ECM_nOL_construction.append('T_RegL1')

    for cache_level, cache_info in list(enumerate(self.machine['memory hierarchy']))[1:]:
        cycles = self.results['cycles'][cache_level - 1][1]
        if cache_info['transfers overlap']:
            # Once a non-overlapping level was seen, no overlapping level may
            # follow (single overlap/non-overlap split assumed).
            if nonoverlap_region:
                raise ValueError("Overlapping changes back and forth between levels, this is "
                                 "currently not supported.")
            ECM_OL.append(cycles)
            ECM_OL_construction.append(
                'T_' + self.machine['memory hierarchy'][cache_level - 1]['level'] +
                cache_info['level'])
        else:
            nonoverlap_region = True
            ECM_nOL.append(cycles)
            ECM_nOL_construction.append(
                'T_' + self.machine['memory hierarchy'][cache_level - 1]['level'] +
                cache_info['level'])
    # TODO consider multiple paths per cache level with victim caches
    self.results['ECM'] = tuple(ECM_OL + [tuple(ECM_nOL)])
    self.results['ECM Model Construction'] = tuple(
        ECM_OL_construction + [tuple(ECM_nOL_construction)])

    # Compile total single-core prediction
    self.results['total cycles'] = self._CPU.conv_cy(max(sum(ECM_nOL), *ECM_OL))
    T_ECM = float(self.results['total cycles']['cy/CL'])
    # T_MEM is the cycles accounted to memory transfers
    T_MEM = self.results['cycles'][-1][1]

    # Simple scaling prediction:
    # Assumptions are:
    #  - bottleneck is always LLC-MEM
    #  - all caches scale with number of cores (bw AND size(WRONG!))

    # Full caching in higher cache level
    self.results['scaling cores'] = float('inf')
    # Not full caching:
    if self.results['cycles'][-1][1] != 0.0:
        # Considering memory bus utilization
        utilization = [0]
        self.results['scaling cores'] = float('inf')
        for c in range(1, cores_per_numa_domain + 1):
            if c * T_MEM > (T_ECM + utilization[c - 1] * (c - 1) * T_MEM / 2):
                utilization.append(1.0)
                self.results['scaling cores'] = min(self.results['scaling cores'], c)
            else:
                utilization.append(
                    c * T_MEM / (T_ECM + utilization[c - 1] * (c - 1) * T_MEM / 2))
        utilization = utilization[1:]

        # scaling code
        scaling_predictions = []
        for cores in range(1, self.machine['cores per socket'] + 1):
            scaling = {'cores': cores,
                       'notes': [],
                       'performance': None,
                       'in-NUMA performance': None}
            # Detailed scaling:
            if cores <= self.results['scaling cores']:
                # Is it purely in-cache?
                innuma_rectp = PrefixedUnit(T_ECM / (T_ECM / T_MEM), "cy/CL")
                scaling['notes'].append("memory-interface not saturated")
            else:
                innuma_rectp = PrefixedUnit(self.results['cycles'][-1][1], 'cy/CL')
                scaling['notes'].append("memory-interface saturated on first NUMA domain")
            # Include NUMA-local performance in results dict
            scaling['in-NUMA performance'] = innuma_rectp
            if 0 < cores <= cores_per_numa_domain:
                # only in-numa scaling to consider
                scaling['performance'] = self._CPU.conv_cy(
                    innuma_rectp / utilization[cores - 1])
                scaling['notes'].append("in-NUMA-domain scaling")
            elif cores <= self.machine['cores per socket'] * self.machine['sockets']:
                # out-of-numa scaling behavior
                scaling['performance'] = self._CPU.conv_cy(
                    innuma_rectp * cores_per_numa_domain / cores)
                scaling['notes'].append("out-of-NUMA-domain scaling")
            else:
                # FIX: the original adjacent string literals concatenated to
                # "...sockets inmachine file." (missing space) and used "upto".
                raise ValueError(
                    "Number of cores must be greater than zero and up to the max. "
                    "number of cores defined by cores per socket and sockets in "
                    "machine file.")
            scaling_predictions.append(scaling)
    else:
        # pure in-cache performace (perfect scaling)
        scaling_predictions = [
            {'cores': cores,
             'notes': ['pure in-cache'],
             'performance': self._CPU.conv_cy(T_ECM / cores),
             'in-NUMA performance': self._CPU.conv_cy(T_ECM / cores_per_numa_domain)}
            for cores in range(1, self.machine['cores per socket'] + 1)]

    # Also include prediction for all in-NUMA core counts in results
    self.results['scaling prediction'] = scaling_predictions
    if self._args.cores:
        self.results['multi-core'] = scaling_predictions[self._args.cores - 1]
    else:
        self.results['multi-core'] = None
def analyze(self, output_file=sys.stdout):
    """Run analysis.

    Executes the compiled kernel binary under likwid-perfctr, extrapolating the
    problem size until runtime reaches ~1.5 s, then derives per-region runtime,
    bandwidth and (unless disabled) a phenomenological ECM model from measured
    hardware counters.
    """
    bench = self.kernel.binary
    element_size = self.kernel.datatypes_size[self.kernel.datatype]

    # Build arguments to pass to command:
    input_args = []

    # Determine base runtime with 10 iterations
    runtimes = {'': 0.0}
    time_per_repetition = None

    repetitions = self.kernel.repetitions

    results = defaultdict(dict)

    # TODO if cores > 1, results are for openmp run. Things might need to be changed here!

    # Check for MEM group existence
    valid_groups = get_supported_likwid_groups()
    if "MEM" in valid_groups:
        group = "MEM"
    else:
        group = valid_groups[0]

    # Grow the problem until every region runs for at least 1.5 s, so that
    # measurement overhead becomes negligible.
    while min(runtimes.values()) < 1.5:
        if min(runtimes.values()) == 0.0:
            # First pass: always run once before attempting extrapolation.
            adjustable = True
        else:
            adjustable = self.adjust_variables(runtimes)
        if not adjustable:
            print("WARNING: Could not extrapolate to a 1.5s run (for at least one region). Measurements might not be accurate.",
                  file=output_file)
            break
        input_args = [str(variable['value']) for variable in self.kernel.define.values()]
        results = self.perfctr([bench] + input_args, group=group)
        if not self.kernel.regions:
            # no region specified for --marker -> benchmark all
            self.kernel.regions = set(results.keys())
            if len(self.kernel.regions) > 1:
                self.kernel.regions.discard('')
            self.kernel.check()
            repetitions = self.kernel.repetitions
        else:
            # check if specified region(s) are found in results
            for region in self.kernel.regions:
                if not region in results.keys():
                    print('Region \'{}\' was not found in the likwid output.'.format(region),
                          file=output_file)
                    sys.exit(-1)
        runtimes = dict(zip(self.kernel.regions,
                            [results[r]['Runtime (RDTSC) [s]']
                             for r in self.kernel.regions]))
        # Resolve the repetition count per region: from likwid marker call
        # counts, from a kernel define variable, or from a fixed value.
        for region in self.kernel.regions:
            if self.kernel.repetitions[region]['marker']:
                repetitions[region]['value'] = results[region]['call count']
            elif self.kernel.repetitions[region]['variable']:
                repetitions[region]['value'] = self.kernel.define[
                    self.kernel.repetitions[region]['variable']]['value']
            elif self.kernel.repetitions[region]['value']:
                repetitions[region]['value'] = self.kernel.repetitions[region]['value']
        time_per_repetition = {r: runtimes[r] / float(repetitions[r]['value'])
                               for r in self.kernel.regions}
    raw_results_collection = [results]

    # repetitions were obtained from likwid marker and time per repetition is too small
    # -> overhead introduced by likwid markers is not negligible
    for region in self.kernel.regions:
        if self.kernel.repetitions[region]['marker']:
            # repetitions were obtained from likwid markers
            if time_per_repetition[region] < 1.0:
                # time per repetition is <1000 ms (overhead is not negligible)
                print("WARNING: Overhead introduced by likwid markers for region {} might not be negligible (usage of \'-R marker\').\n".format(region),
                      file=output_file)

    # NOTE(review): this warning goes to stdout, not output_file, unlike the
    # other prints in this method — possibly unintentional; confirm upstream.
    if self.benchmarked_regions - self.kernel.regions:
        print('WARNING: following likwid regions were found but not specified to be analysed:\n{}'
              .format(self.benchmarked_regions - self.kernel.regions))

    # Base metrics for further metric computations:
    # collect counters for phenoecm run
    if not self.no_phenoecm:
        # Build events and sympy expressions for all model metrics
        T_OL, event_counters = self.machine.parse_perfmetric(
            self.machine['overlapping model']['performance counter metric'])
        T_data, event_dict = self.machine.parse_perfmetric(
            self.machine['non-overlapping model']['performance counter metric'])
        event_counters.update(event_dict)
        cache_metrics = defaultdict(dict)
        for i in range(len(self.machine['memory hierarchy']) - 1):
            cache_info = self.machine['memory hierarchy'][i]
            name = cache_info['level']
            for k, v in cache_info['performance counter metrics'].items():
                cache_metrics[name][k], event_dict = self.machine.parse_perfmetric(v)
                event_counters.update(event_dict)

        # Compile minimal runs to gather all required events
        minimal_runs = build_minimal_runs(list(event_counters.values()))
        measured_ctrs = {}
        for region in self.kernel.regions:
            measured_ctrs[region] = {}
        # One benchmark run per minimal counter set; each run contributes
        # measurements for every region.
        for run in minimal_runs:
            ctrs = ','.join([eventstr(e) for e in run])
            r = self.perfctr([bench] + input_args, group=ctrs)
            raw_results_collection.append(r)
            for region in self.kernel.regions:
                measured_ctrs[region].update(r[region])

    # start analysing for each region
    for region in self.kernel.regions:
        raw_results = [r[region] for r in raw_results_collection]
        iterations_per_repetition = self.kernel.region__iterations_per_repetition(region)

        iterations_per_cacheline = (float(self.machine['cacheline size']) /
                                    self.kernel.region__bytes_per_iteration(region))
        cys_per_repetition = time_per_repetition[region] * float(self.machine['clock'])

        # Gather remaining counters
        if not self.no_phenoecm:
            # Match measured counters to symbols
            event_counter_results = {}
            for sym, ctr in event_counters.items():
                event, regs, parameter = ctr[0], register_options(ctr[1]), ctr[2]
                for r in regs:
                    if r in measured_ctrs[region][event]:
                        event_counter_results[sym] = measured_ctrs[region][event][r]

            # Analytical metrics needed for further calculation
            cl_size = float(self.machine['cacheline size'])
            total_iterations = iterations_per_repetition * repetitions[region]['value']
            total_cachelines = total_iterations / iterations_per_cacheline

            T_OL_result = T_OL.subs(event_counter_results) / total_cachelines
            cache_metric_results = defaultdict(dict)
            for cache, mtrcs in cache_metrics.items():
                for m, e in mtrcs.items():
                    cache_metric_results[cache][m] = e.subs(event_counter_results)

            # Inter-cache transfers per CL
            cache_transfers_per_cl = {
                cache: {k: PrefixedUnit(v / total_cachelines, 'CL/CL')
                        for k, v in d.items()}
                for cache, d in cache_metric_results.items()}
            cache_transfers_per_cl['L1']['accesses'].unit = 'LOAD/CL'

            # Select appropriate bandwidth
            mem_bw, mem_bw_kernel = self.machine.get_bandwidth(
                -1,  # mem
                cache_metric_results['L3']['misses'],  # load_streams
                cache_metric_results['L3']['evicts'],  # store_streams
                1)

            data_transfers = {
                # Assuming 0.5 cy / LOAD (SSE on SNB or IVB; AVX on HSW, BDW, SKL or SKX)
                'T_nOL': (cache_metric_results['L1']['accesses'] /
                          total_cachelines * 0.5),
                'T_L1L2': ((cache_metric_results['L1']['misses'] +
                            cache_metric_results['L1']['evicts']) /
                           total_cachelines * cl_size /
                           self.machine['memory hierarchy'][1]['upstream throughput'][0]),
                'T_L2L3': ((cache_metric_results['L2']['misses'] +
                            cache_metric_results['L2']['evicts']) /
                           total_cachelines * cl_size /
                           self.machine['memory hierarchy'][2]['upstream throughput'][0]),
                'T_L3MEM': ((cache_metric_results['L3']['misses'] +
                             cache_metric_results['L3']['evicts']) *
                            float(self.machine['cacheline size']) /
                            total_cachelines / mem_bw *
                            float(self.machine['clock']))}

            # Build phenomenological ECM model:
            ecm_model = {'T_OL': T_OL_result}
            ecm_model.update(data_transfers)
        else:
            event_counters = {}
            ecm_model = None
            cache_transfers_per_cl = None

        self.results[region] = {'raw output': raw_results,
                                'ECM': ecm_model,
                                'data transfers': cache_transfers_per_cl,
                                'Runtime (per repetition) [s]': time_per_repetition[region],
                                'event counters': event_counters,
                                'Iterations per repetition': iterations_per_repetition,
                                'Iterations per cacheline': iterations_per_cacheline}
        self.results[region]['Runtime (per cacheline update) [cy/CL]'] = \
            (cys_per_repetition / iterations_per_repetition) * iterations_per_cacheline
        if 'Memory data volume [GBytes]' in results[region]:
            self.results[region]['MEM volume (per repetition) [B]'] = \
                results[region]['Memory data volume [GBytes]'] * 1e9 / \
                repetitions[region]['value']
        else:
            self.results[region]['MEM volume (per repetition) [B]'] = float('nan')
        self.results[region]['Performance [MFLOP/s]'] = \
            self.kernel._flops[region] / (
                time_per_repetition[region] / iterations_per_repetition) / 1e6
        # Likwid versions differ in the bandwidth metric name.
        if 'Memory bandwidth [MBytes/s]' in results[region]:
            self.results[region]['MEM BW [MByte/s]'] = \
                results[region]['Memory bandwidth [MBytes/s]']
        elif 'Memory BW [MBytes/s]' in results[region]:
            self.results[region]['MEM BW [MByte/s]'] = \
                results[region]['Memory BW [MBytes/s]']
        else:
            self.results[region]['MEM BW [MByte/s]'] = float('nan')
        self.results[region]['Performance [MLUP/s]'] = \
            (iterations_per_repetition / time_per_repetition[region]) / 1e6
        self.results[region]['Performance [MIt/s]'] = \
            (iterations_per_repetition / time_per_repetition[region]) / 1e6
def calculate_cache_access(self):
    """Apply cache prediction to generate cache access behaviour.

    Builds ``self.results`` with per-level bottleneck entries (It/s based)
    from the cache predictor's load/store counts and measured bandwidths,
    tracking the slowest level as 'bottleneck level' / 'min performance'.

    :return: the populated ``self.results`` dictionary
    """
    self.results = {
        'loads': self.predictor.get_loads(),
        'stores': self.predictor.get_stores(),
        'verbose infos': self.predictor.get_infos(),  # only for verbose outputs
        'bottleneck level': 0,
        'mem bottlenecks': []
    }
    element_size = self.kernel.datatypes_size[self.kernel.datatype]
    cacheline_size = float(self.machine['cacheline size'])
    elements_per_cacheline = int(cacheline_size // element_size)
    total_flops = sum(self.kernel._flops.values()) * elements_per_cacheline
    # TODO let user choose threads_per_core:
    threads_per_core = 1

    # Compile relevant information

    # CPU-L1 stats (in bytes!)
    # We compile CPU-L1 stats on our own, because cacheprediction only works on cache lines
    read_offsets, write_offsets = zip(*list(
        self.kernel.compile_global_offsets(
            iteration=range(0, elements_per_cacheline))))
    # Flatten per-iteration offset lists into unique sets; None sublists
    # (iterations without accesses) are skipped.
    read_offsets = set([
        item for sublist in read_offsets if sublist is not None
        for item in sublist
    ])
    write_offsets = set([
        item for sublist in write_offsets if sublist is not None
        for item in sublist
    ])
    write_streams = len(write_offsets)
    read_streams = len(read_offsets) + write_streams  # write-allocate
    total_loads = read_streams * element_size
    total_evicts = write_streams * element_size
    bw, measurement_kernel = self.machine.get_bandwidth(
        0,
        read_streams,
        0,  # we do not consider stores to L1
        threads_per_core,
        cores=self.cores)

    # Calculate performance (arithmetic intensity * bandwidth with
    # arithmetic intensity = Iterations / bytes loaded
    if total_loads == 0:
        # This happens in case of full-caching
        # NOTE(review): it_s stays None here but is still passed to
        # self.conv_perf() below — assumes conv_perf handles None; confirm.
        arith_intens = None
        it_s = None
    else:
        arith_intens = 1.0 / (total_loads / elements_per_cacheline)
        it_s = PrefixedUnit(float(bw) * arith_intens, 'It/s')

    self.results['mem bottlenecks'].append({
        'performance': self.conv_perf(it_s),
        'level': self.machine['memory hierarchy'][0]['level'],
        'arithmetic intensity': arith_intens,
        'bw kernel': measurement_kernel,
        'bandwidth': bw,
        'bytes transfered': total_loads
    })
    # L1 entry unconditionally initializes the bottleneck tracking.
    self.results['bottleneck level'] = len(
        self.results['mem bottlenecks']) - 1
    self.results['min performance'] = self.conv_perf(it_s)

    # for other cache and memory levels:
    for cache_level, cache_info in list(
            enumerate(self.machine['memory hierarchy']))[:-1]:
        # Compiling stats (in bytes!)
        total_loads = self.results['loads'][cache_level + 1] * cacheline_size
        total_stores = self.results['stores'][cache_level + 1] * cacheline_size

        # choose bw according to cache level and problem
        # first, compile stream counts at current cache level
        # write-allocate is allready resolved above
        read_streams = self.results['loads'][cache_level + 1]
        write_streams = self.results['stores'][cache_level + 1]
        # second, try to find best fitting kernel (closest to stream seen stream counts):
        bw, measurement_kernel = self.machine.get_bandwidth(
            cache_level + 1,
            read_streams,
            write_streams,
            threads_per_core,
            cores=self.cores)

        # Calculate performance (arithmetic intensity * bandwidth with
        # arithmetic intensity = flops / bytes transfered)
        bytes_transfered = total_loads + total_stores
        if bytes_transfered == 0:
            # This happens in case of full-caching
            arith_intens = float('inf')
            it_s = PrefixedUnit(float('inf'), 'It/s')
        else:
            arith_intens = 1 / (bytes_transfered / elements_per_cacheline)
            it_s = PrefixedUnit(float(bw) * arith_intens, 'It/s')

        self.results['mem bottlenecks'].append({
            'performance': self.conv_perf(it_s),
            'level': (self.machine['memory hierarchy'][cache_level + 1]['level']),
            'arithmetic intensity': arith_intens,
            'bw kernel': measurement_kernel,
            'bandwidth': bw,
            'bytes transfered': bytes_transfered
        })
        # Track slowest level; .get() default makes the comparison false on
        # the (unreachable after L1 init) missing-key case.
        if it_s < self.results.get('min performance', {'It/s': it_s})['It/s']:
            self.results['bottleneck level'] = len(
                self.results['mem bottlenecks']) - 1
            self.results['min performance'] = self.conv_perf(it_s)

    return self.results
def analyze(self):
    """Run complete analysis.

    Combines the cache-access Roofline (``calculate_cache_access``) with the
    in-core model: normalizes in-core port cycles/uops/throughput from one
    assembly block to one cacheline, replaces the CPU-L1 bottleneck entry by
    the in-core model, and re-evaluates the overall memory bottleneck.

    Exits the process (status 1) if the in-core analysis fails or the block
    size turns out to be zero.
    """
    self.results = self.calculate_cache_access()
    try:
        incore_analysis, pointer_increment = self.kernel.incore_analysis(
            asm_block=self.asm_block,
            pointer_increment=self.pointer_increment,
            model=self._args.incore_model,
            verbose=self.verbose > 2)
    except RuntimeError as e:
        # Error path: report on stderr (consistent with the block-size error
        # below) and abort.
        print("In-core analysis failed: " + str(e), file=sys.stderr)
        sys.exit(1)

    block_throughput = incore_analysis['throughput']
    uops = incore_analysis['uops']
    incore_output = incore_analysis['output']
    port_cycles = incore_analysis['port cycles']

    # Normalize to cycles per cacheline
    elements_per_block = abs(
        pointer_increment / self.kernel.datatypes_size[self.kernel.datatype])
    block_size = elements_per_block * self.kernel.datatypes_size[
        self.kernel.datatype]
    try:
        block_to_cl_ratio = float(
            self.machine['cacheline size']) / block_size
    except ZeroDivisionError as e:
        print("Too small block_size / pointer_increment:",
              e,
              file=sys.stderr)
        sys.exit(1)

    # Scale all per-block quantities to per-cacheline.
    port_cycles = {port: cycles * block_to_cl_ratio
                   for port, cycles in port_cycles.items()}
    if uops is not None:
        uops = uops * block_to_cl_ratio
    cl_throughput = block_throughput * block_to_cl_ratio

    # Overwrite CPU-L1 stats, because they are covered by In-Core Model
    self.results['mem bottlenecks'][0] = None

    # Reevaluate mem bottleneck
    self.results['min performance'] = self.conv_perf(
        PrefixedUnit(float('inf'), 'It/s'))
    self.results['bottleneck level'] = None
    for level, bottleneck in enumerate(self.results['mem bottlenecks']):
        if level == 0:
            # ignoring CPU-L1
            continue
        if bottleneck['performance']['It/s'] < self.results[
                'min performance']['It/s']:
            self.results['bottleneck level'] = level
            self.results['min performance'] = bottleneck['performance']

    # Create result dictionary
    self.results.update({
        'cpu bottleneck': {
            'port cycles': port_cycles,
            'cl throughput': cl_throughput,
            'uops': uops,
            'performance throughput':
            self.conv_perf(
                PrefixedUnit(
                    self.machine['clock'] / block_throughput *
                    elements_per_block * self.cores, "It/s")),
            'in-core model output': incore_output
        }
    })
def test_2d5pt_Roofline(self):
    """End-to-end Roofline run of the 2d-5pt kernel against the
    SandyBridgeEP machine file; checks pickled results for the N=4096 case.
    """
    store_file = os.path.join(self.temp_dir, 'test_2d5pt_Roofline.pickle')
    parser = kc.create_parser()
    args = parser.parse_args([
        '-m', self._find_file('SandyBridgeEP_E5-2680.yml'), '-p', 'Roofline',
        self._find_file('2d-5pt.c'), '-D', 'N', '1024-4096:3log2', '-D', 'M',
        '50', '-vvv', '--store', store_file
    ])
    kc.check_arguments(args, parser)
    kc.run(parser, args, output_file=sys.stdout)
    with open(store_file, 'rb') as f:
        results = pickle.load(f)

    # Check for correct variations of constants (N in {1024, 2048, 4096})
    self.assertEqual(len(results), 3)

    # Check if results contains correct kernel and some other information
    key = [
        k for k in results if ('define', (('M', 50), ('N', 4096))) in k
    ][0]
    key_dict = dict(key)
    self.assertEqual(key_dict['pmodel'], 'Roofline')

    # Output of first result:
    result = results[key]
    assertRelativlyEqual(result['min performance']['FLOP/s'], 4720000000.0,
                         0.01)
    self.assertEqual(result['bottleneck level'], 1)

    expected_btlncks = [{
        'arithmetic intensity': 0.029411764705882353,
        'bandwidth': PrefixedUnit(84.07, u'G', u'B/s'),
        'bw kernel': 'load',
        'level': u'L1',
        'performance': PrefixedUnit(9.89, u'G', u'FLOP/s')
    }, {
        'arithmetic intensity': 0.025,
        'bandwidth': PrefixedUnit(47.24, u'G', u'B/s'),
        'bw kernel': 'triad',
        'level': u'L2',
        'performance': PrefixedUnit(4.72, u'G', u'FLOP/s')
    }, {
        'arithmetic intensity': 0.041,
        'bandwidth': PrefixedUnit(32.9, 'G', 'B/s'),
        'bw kernel': 'copy',
        'level': u'L3',
        'performance': PrefixedUnit(5.33, u'G', u'FLOP/s')
    }, {
        'arithmetic intensity': float('inf'),
        'bandwidth': PrefixedUnit(12.01, u'G', u'B/s'),
        'bw kernel': 'load',
        'level': u'MEM',
        'performance': PrefixedUnit(float('inf'), u'', u'FLOP/s')
    }]

    # Numeric fields are compared with 5% tolerance; strings exactly.
    for i, btlnck in enumerate(expected_btlncks):
        for k, v in btlnck.items():
            if type(v) is not str:
                if k == 'performance':
                    assertRelativlyEqual(
                        result['mem bottlenecks'][i][k]['FLOP/s'], v, 0.05)
                else:
                    assertRelativlyEqual(result['mem bottlenecks'][i][k], v,
                                         0.05)
            else:
                self.assertEqual(result['mem bottlenecks'][i][k], v)
def get_machine_topology(cpuinfo_path='/proc/cpuinfo'):
    """Build a machine-description skeleton from likwid-topology and cpuinfo.

    Runs ``likwid-topology``, parses socket/core/thread/NUMA counts and the
    cache topology, and returns a machine dictionary in which every value
    that cannot be derived automatically is filled with an
    'INFORMATION_REQUIRED ...' placeholder for the user to complete.

    :param cpuinfo_path: path to a Linux cpuinfo file (default /proc/cpuinfo)
    :return: machine description dictionary
    """
    try:
        topo = subprocess.Popen(
            ['likwid-topology'],
            stdout=subprocess.PIPE).communicate()[0].decode("utf-8")
    except OSError:
        print('likwid-topology execution failed, is it installed and loaded?',
              file=sys.stderr)
        sys.exit(1)
    with open(cpuinfo_path, 'r') as f:
        cpuinfo = f.read()

    sockets = int(get_match_or_break(r'^Sockets:\s+([0-9]+)\s*$', topo)[0])
    cores_per_socket = int(
        get_match_or_break(r'^Cores per socket:\s+([0-9]+)\s*$', topo)[0])
    # likwid reports the total number of NUMA domains; normalize per socket.
    numa_domains_per_socket = \
        int(get_match_or_break(r'^NUMA domains:\s+([0-9]+)\s*$', topo)[0])/sockets
    # FIX: was numa_domains_per_socket / cores_per_socket, which is the
    # inverse of the intended quantity (e.g., 8 cores and 2 domains per
    # socket must give 4 cores per domain, not 0.25).
    cores_per_numa_domain = cores_per_socket / numa_domains_per_socket

    machine = {
        'model type':
        get_match_or_break(r'^CPU type:\s+(.+?)\s*$', topo)[0],
        'model name':
        get_match_or_break(r'^model name\s+:\s+(.+?)\s*$', cpuinfo)[0],
        'sockets': sockets,
        'cores per socket': cores_per_socket,
        'threads per core':
        int(get_match_or_break(r'^Threads per core:\s+([0-9]+)\s*$', topo)[0]),
        'NUMA domains per socket': numa_domains_per_socket,
        'cores per NUMA domain': cores_per_numa_domain,
        'clock': 'INFORMATION_REQUIRED (e.g., 2.7 GHz)',
        'FLOPs per cycle': {
            'SP': {
                'total': 'INFORMATION_REQUIRED',
                'FMA': 'INFORMATION_REQUIRED',
                'ADD': 'INFORMATION_REQUIRED',
                'MUL': 'INFORMATION_REQUIRED'
            },
            'DP': {
                'total': 'INFORMATION_REQUIRED',
                'FMA': 'INFORMATION_REQUIRED',
                'ADD': 'INFORMATION_REQUIRED',
                'MUL': 'INFORMATION_REQUIRED'
            }
        },
        'micro-architecture':
        'INFORMATION_REQUIRED (options: NHM, WSM, SNB, IVB, HSW)',
        # TODO retrieve flags automatically from compiler with -march=native
        'compiler': {
            'icc': ['INFORMATION_REQUIRED (e.g., -O3 -fno-alias -xAVX)'],
            # FIX: closed the parenthesis in the placeholder text.
            'clang': [
                'INFORMATION_REQUIRED (e.g., -O3 -mavx, -D_POSIX_C_SOURCE=200112L)'
            ],
            'gcc': ['INFORMATION_REQUIRED (e.g., -O3 -march=ivybridge)']
        },
        'cacheline size': 'INFORMATION_REQUIRED (in bytes, e.g. 64 B)',
        # FIX: 'INFORAMTION_REQUIRED' typos corrected so placeholder
        # detection (substring 'INFORMATION_REQUIRED') works on these, too.
        'overlapping model': {
            'ports':
            'INFORMATION_REQUIRED (list of ports as they appear in IACA, e.g.)'
            ', ["0", "0DV", "1", "2", "2D", "3", "3D", "4", "5", "6", "7"])',
            'performance counter metric':
            'INFORMATION_REQUIRED Example:'
            'max(UOPS_DISPATCHED_PORT_PORT_0__PMC2, UOPS_DISPATCHED_PORT_PORT_1__PMC3,'
            ' UOPS_DISPATCHED_PORT_PORT_4__PMC0, UOPS_DISPATCHED_PORT_PORT_5__PMC1)'
        },
        'non-overlapping model': {
            'ports':
            'INFORMATION_REQUIRED (list of ports as they appear in IACA, e.g.)'
            ', ["0", "0DV", "1", "2", "2D", "3", "3D", "4", "5", "6", "7"])',
            'performance counter metric':
            'INFORMATION_REQUIRED Example:'
            'max(UOPS_DISPATCHED_PORT_PORT_0__PMC2, UOPS_DISPATCHED_PORT_PORT_1__PMC3,'
            ' UOPS_DISPATCHED_PORT_PORT_4__PMC0, UOPS_DISPATCHED_PORT_PORT_5__PMC1)'
        }
    }

    # NOTE: the original also parsed the 'HWThread' table into a local dict
    # that was never used anywhere; that dead code has been removed.

    # Parse the cache topology section into the memory hierarchy list.
    cache_start = topo.find('Cache Topology')
    cache_end = topo.find('NUMA Topology')
    machine['memory hierarchy'] = []
    mem_level = {}
    for line in topo[cache_start:cache_end].split('\n'):
        if line.startswith('Level:'):
            mem_level = {}
            mem_level['level'] = 'L' + line.split(':')[1].strip()
            machine['memory hierarchy'].append(mem_level)
        elif line.startswith('Size:'):
            size = PrefixedUnit(line.split(':')[1].strip())
            mem_level['cache per group'] = {
                'sets':
                'INFORMATION_REQUIRED (sets*ways*cl_size=' + str(size) + ')',
                'ways':
                'INFORMATION_REQUIRED (sets*ways*cl_size=' + str(size) + ')',
                'cl_size':
                'INFORMATION_REQUIRED (sets*ways*cl_size=' + str(size) + ')',
                'replacement_policy':
                'INFORMATION_REQUIRED (options: LRU, FIFO, MRU, RR)',
                'write_allocate': 'INFORMATION_REQUIRED (True/False)',
                'write_back': 'INFORMATION_REQUIRED (True/False)',
                'load_from': 'L' + str(int(mem_level['level'][1:]) + 1),
                'store_to': 'L' + str(int(mem_level['level'][1:]) + 1)
            }
            mem_level['size per group'] = size
        elif line.startswith('Cache groups:'):
            # One '(...)' group per cache group in the likwid output.
            mem_level['groups'] = line.count('(')
            mem_level['cores per group'] = \
                (machine['cores per socket'] * machine['sockets']) / mem_level['groups']
            mem_level['threads per group'] = \
                mem_level['cores per group'] * machine['threads per core']
        # (Re)set on every line so each discovered level ends up with these
        # placeholder entries; idempotent.
        mem_level['cycles per cacheline transfer'] = 'INFORMATION_REQUIRED'
        mem_level['performance counter metrics'] = {
            'accesses': 'INFORMATION_REQUIRED (e.g., L1D_REPLACEMENT__PMC0)',
            'misses': 'INFORMATION_REQUIRED (e.g., L2_LINES_IN_ALL__PMC1)',
            'evicts': 'INFORMATION_REQUIRED (e.g., L2_LINES_OUT_DIRTY_ALL__PMC2)'
        }

    # Remove last caches load_from and store_to:
    # (the last cache level is backed by MEM, which is appended below)
    del machine['memory hierarchy'][-1]['cache per group']['load_from']
    del machine['memory hierarchy'][-1]['cache per group']['store_to']

    machine['memory hierarchy'].append({
        'level': 'MEM',
        'cores per group': machine['cores per socket'],
        'threads per group':
        machine['threads per core'] * machine['cores per socket'],
        'cycles per cacheline transfer': None,
        'penalty cycles per read stream': 0,
        'size per group': None
    })

    return machine
def main():
    """Detect machine topology, run likwid-bench sweeps and dump a YAML file.

    Builds the benchmark-kernel descriptions and measurement plan (per memory
    level, per threads-per-core, per core count), measures bandwidth for each
    combination via ``measure_bw``, and writes the resulting machine model to
    '<model_name>.yml' in the current directory.
    """
    machine = get_machine_topology()
    pprint(machine)

    benchmarks = {'kernels': {}, 'measurements': {}}
    machine['benchmarks'] = benchmarks
    # Stream/byte/FLOP characteristics of the standard likwid-bench kernels
    # (per scalar iteration, 8-byte elements).
    benchmarks['kernels'] = {
        'load': {
            'read streams': {
                'streams': 1,
                'bytes': PrefixedUnit(8, 'B')
            },
            'read+write streams': {
                'streams': 0,
                'bytes': PrefixedUnit(0, 'B')
            },
            'write streams': {
                'streams': 0,
                'bytes': PrefixedUnit(0, 'B')
            },
            'FLOPs per iteration': 0
        },
        'copy': {
            'read streams': {
                'streams': 1,
                'bytes': PrefixedUnit(8, 'B')
            },
            'read+write streams': {
                'streams': 0,
                'bytes': PrefixedUnit(0, 'B')
            },
            'write streams': {
                'streams': 1,
                'bytes': PrefixedUnit(8, 'B')
            },
            'FLOPs per iteration': 0
        },
        'update': {
            'read streams': {
                'streams': 1,
                'bytes': PrefixedUnit(8, 'B')
            },
            'read+write streams': {
                'streams': 1,
                'bytes': PrefixedUnit(8, 'B')
            },
            'write streams': {
                'streams': 1,
                'bytes': PrefixedUnit(8, 'B')
            },
            'FLOPs per iteration': 0
        },
        'triad': {
            'read streams': {
                'streams': 3,
                'bytes': PrefixedUnit(24, 'B')
            },
            'read+write streams': {
                'streams': 0,
                'bytes': PrefixedUnit(0, 'B')
            },
            'write streams': {
                'streams': 1,
                'bytes': PrefixedUnit(8, 'B')
            },
            'FLOPs per iteration': 2
        },
        'daxpy': {
            'read streams': {
                'streams': 2,
                'bytes': PrefixedUnit(16, 'B')
            },
            'read+write streams': {
                'streams': 1,
                'bytes': PrefixedUnit(8, 'B')
            },
            'write streams': {
                'streams': 1,
                'bytes': PrefixedUnit(8, 'B')
            },
            'FLOPs per iteration': 2
        },
    }

    # Fraction of a cache level to use, so the working set stays inside it.
    USAGE_FACTOR = 0.66
    # Memory working set: multiple of the last cache level so caches miss.
    MEM_FACTOR = 15.0

    cores = list(range(1, machine['cores per socket'] + 1))
    # Build the measurement plan: for each memory level and SMT setting,
    # compute the total/per-core/per-thread working-set sizes per core count.
    for mem in machine['memory hierarchy']:
        measurement = {}
        machine['benchmarks']['measurements'][mem['level']] = measurement
        for threads_per_core in range(1, machine['threads per core'] + 1):
            threads = [c * threads_per_core for c in cores]
            if mem['size per group'] is not None:
                # Cache level: scale with cores sharing the group, capped
                # below by one full group, then shrink by USAGE_FACTOR.
                total_sizes = [
                    PrefixedUnit(
                        max(
                            int(mem['size per group']) * c /
                            mem['cores per group'],
                            int(mem['size per group'])) * USAGE_FACTOR, 'B')
                    for c in cores
                ]
            else:
                # MEM level has no size; base it on the last cache level.
                last_mem = machine['memory hierarchy'][-2]
                total_sizes = [
                    last_mem['size per group'] * MEM_FACTOR for c in cores
                ]
            sizes_per_core = [t / cores[i] for i, t in enumerate(total_sizes)]
            sizes_per_thread = [
                t / threads[i] for i, t in enumerate(total_sizes)
            ]

            measurement[threads_per_core] = {
                'threads per core': threads_per_core,
                'cores': copy(cores),
                'threads': threads,
                'size per core': sizes_per_core,
                'size per thread': sizes_per_thread,
                'total size': total_sizes,
                'results': {},
            }

    print('Progress: ', end='', file=sys.stderr)
    sys.stderr.flush()
    # Execute the plan: one measure_bw call per (level, SMT, kernel, size).
    for mem_level in list(machine['benchmarks']['measurements'].keys()):
        for threads_per_core in list(
                machine['benchmarks']['measurements'][mem_level].keys()):
            measurement = machine['benchmarks']['measurements'][mem_level][
                threads_per_core]
            measurement['results'] = {}
            for kernel in list(machine['benchmarks']['kernels'].keys()):
                measurement['results'][kernel] = []
                for i, total_size in enumerate(measurement['total size']):
                    measurement['results'][kernel].append(
                        measure_bw(kernel,
                                   int(float(total_size) / 1000),
                                   threads_per_core,
                                   machine['threads per core'],
                                   measurement['cores'][i],
                                   sockets=1))
                    print('.', end='', file=sys.stderr)
                    sys.stderr.flush()

    # Python 2 needs a custom representer for unicode strings in YAML output.
    if sys.version_info[0] == 2:
        yaml.representer.Representer.add_representer(unicode, my_unicode_repr)

    # Derive a file name from the model name (strip trademark noise).
    machineyaml = machine['model name']
    machineyaml = ' '.join(machineyaml.split())
    machineyaml = machineyaml.replace('(R)', '')
    machineyaml = machineyaml.replace('@', '')
    machineyaml = machineyaml.replace('(TM)', '')
    machineyaml = machineyaml.replace(' ', '_') + '.yml'
    with io.open(machineyaml, 'w', encoding='utf8') as outfile:
        yaml.dump(machine,
                  outfile,
                  default_flow_style=False,
                  allow_unicode=True)
def calculate_cache_access(self):
    """Apply cache prediction to generate cache access behaviour.

    Builds ``self.results`` with per-level bottleneck entries (FLOP/s based)
    from the chosen cache predictor's miss/hit/evict counts and measured
    bandwidths, tracking the slowest level as 'bottleneck level' /
    'min performance'.

    :raises NotImplementedError: if the cache predictor is neither 'SIM'
        nor 'LC'
    :return: the populated ``self.results`` dictionary
    """
    if self._args.cache_predictor == 'SIM':
        self.predictor = CacheSimulationPredictor(self.kernel, self.machine)
    elif self._args.cache_predictor == 'LC':
        self.predictor = LayerConditionPredictor(self.kernel, self.machine)
    else:
        raise NotImplementedError(
            "Unknown cache predictor, only LC (layer condition) and "
            "SIM (cache simulation with pycachesim) is supported.")

    self.results = {
        'misses': self.predictor.get_misses(),
        'hits': self.predictor.get_hits(),
        'evicts': self.predictor.get_evicts(),
        'verbose infos': self.predictor.get_infos(),  # only for verbose outputs
        'bottleneck level': 0,
        'mem bottlenecks': []
    }

    element_size = self.kernel.datatypes_size[self.kernel.datatype]
    cacheline_size = float(self.machine['cacheline size'])
    elements_per_cacheline = int(cacheline_size // element_size)
    total_flops = sum(self.kernel._flops.values()) * elements_per_cacheline
    # TODO let user choose threads_per_core:
    threads_per_core = 1

    # Compile relevant information

    # CPU-L1 stats (in bytes!)
    # We compile CPU-L1 stats on our own, because cacheprediction only works on cache lines
    read_offsets, write_offsets = zip(*list(
        self.kernel.compile_global_offsets(
            iteration=range(0, elements_per_cacheline))))
    # Flatten per-iteration offset lists into unique sets.
    read_offsets = set(
        [item for sublist in read_offsets for item in sublist])
    write_offsets = set(
        [item for sublist in write_offsets for item in sublist])

    write_streams = len(write_offsets)
    read_streams = len(read_offsets) + write_streams  # write-allocate
    total_loads = read_streams * element_size
    total_evicts = write_streams * element_size
    bw, measurement_kernel = self.machine.get_bandwidth(
        0,
        read_streams,
        write_streams,
        threads_per_core,
        cores=self._args.cores)

    # Calculate performance (arithmetic intensity * bandwidth with
    # arithmetic intensity = flops / bytes loaded )
    if total_loads == 0:
        # This happens in case of full-caching
        # NOTE(review): performance stays None here; the comparison
        # `performance <= ...` below would then raise on Python 3 —
        # presumably this branch is not hit in practice; confirm.
        arith_intens = None
        performance = None
    else:
        arith_intens = float(total_flops) / total_loads
        performance = arith_intens * float(bw)

    self.results['mem bottlenecks'].append({
        'performance': PrefixedUnit(performance, 'FLOP/s'),
        'level': self.machine['memory hierarchy'][0]['level'],
        'arithmetic intensity': arith_intens,
        'bw kernel': measurement_kernel,
        'bandwidth': bw,
        'bytes transfered': total_loads
    })
    # First level: .get() default equals performance, so <= always succeeds
    # and initializes the bottleneck tracking.
    if performance <= self.results.get('min performance', performance):
        self.results['bottleneck level'] = len(
            self.results['mem bottlenecks']) - 1
        self.results['min performance'] = performance

    # for other cache and memory levels:
    for cache_level, cache_info in list(
            enumerate(self.machine['memory hierarchy']))[:-1]:
        # Compiling stats (in bytes!)
        total_misses = self.results['misses'][cache_level] * cacheline_size
        total_evicts = self.results['evicts'][cache_level] * cacheline_size

        # choose bw according to cache level and problem
        # first, compile stream counts at current cache level
        # write-allocate is allready resolved above
        read_streams = self.results['misses'][cache_level]
        write_streams = self.results['evicts'][cache_level]
        # second, try to find best fitting kernel (closest to stream seen stream counts):
        bw, measurement_kernel = self.machine.get_bandwidth(
            cache_level + 1,
            read_streams,
            write_streams,
            threads_per_core,
            cores=self._args.cores)

        # Calculate performance (arithmetic intensity * bandwidth with
        # arithmetic intensity = flops / bytes transfered)
        bytes_transfered = total_misses + total_evicts
        if bytes_transfered == 0:
            # This happens in case of full-caching
            arith_intens = float('inf')
            performance = float('inf')
        else:
            arith_intens = float(total_flops) / bytes_transfered
            performance = arith_intens * float(bw)

        self.results['mem bottlenecks'].append({
            'performance': PrefixedUnit(performance, 'FLOP/s'),
            'level': (self.machine['memory hierarchy'][cache_level + 1]['level']),
            'arithmetic intensity': arith_intens,
            'bw kernel': measurement_kernel,
            'bandwidth': bw,
            'bytes transfered': bytes_transfered
        })
        # Track the slowest (minimum-performance) level.
        if performance < self.results.get('min performance', performance):
            self.results['bottleneck level'] = len(
                self.results['mem bottlenecks']) - 1
            self.results['min performance'] = performance

    return self.results
def test_2d5pt_Roofline(self):
    """End-to-end Roofline run of the 2d-5pt kernel against the
    SandyBridgeEP machine file; checks the pickled results keyed by kernel
    file and constant assignments (N=4096 case).
    """
    store_file = os.path.join(self.temp_dir, 'test_2d5pt_Roofline.pickle')
    output_stream = StringIO()
    parser = kc.create_parser()
    args = parser.parse_args([
        '-m', self._find_file('SandyBridgeEP_E5-2680.yml'), '-p', 'Roofline',
        self._find_file('2d-5pt.c'), '-D', 'N', '1024-4096:3log2', '-D', 'M',
        '50', '-vvv', '--store', store_file
    ])
    kc.check_arguments(args, parser)
    kc.run(parser, args, output_file=output_stream)
    with open(store_file, 'rb') as f:
        results = pickle.load(f)

    # Check if results contains correct kernel
    self.assertEqual(list(results), ['2d-5pt.c'])

    # Check for correct variations of constants (N in {1024, 2048, 4096})
    self.assertCountEqual([
        sorted(map(str, r)) for r in results['2d-5pt.c']
    ], [
        sorted(map(str, r)) for r in [((sympy.var('M'), 50), (
            sympy.var('N'), 1024)), ((sympy.var('M'), 50),
                                     (sympy.var('N'), 2048)),
                                     ((sympy.var('M'), 50),
                                      (sympy.var('N'), 4096))]
    ])

    # Output of first result:
    result = results['2d-5pt.c'][[
        k for k in results['2d-5pt.c'] if (sympy.var('N'), 4096) in k
    ][0]]
    self.assertCountEqual(result, ['Roofline'])

    roofline = result['Roofline']
    assert_relativly_equal(roofline['min performance']['FLOP/s'],
                           5115000000.0, 0.01)
    self.assertEqual(roofline['bottleneck level'], 1)

    expected_btlncks = [{
        'arithmetic intensity': 0.11764705882352941,
        'bandwidth': PrefixedUnit(81.61, u'G', u'B/s'),
        'bw kernel': 'triad',
        'level': u'L1',
        'performance': PrefixedUnit(9601176470.588236, u'', u'FLOP/s')
    }, {
        'arithmetic intensity': 0.1,
        'bandwidth': PrefixedUnit(51.15, u'G', u'B/s'),
        'bw kernel': 'triad',
        'level': u'L2',
        'performance': PrefixedUnit(5115000000.0, u'', u'FLOP/s')
    }, {
        'arithmetic intensity': 1.0 / 6.0,
        'bandwidth': PrefixedUnit(34815.0, 'M', 'B/s'),
        'bw kernel': 'copy',
        'level': u'L3',
        'performance': PrefixedUnit(5802500000.0, u'', u'FLOP/s')
    }, {
        'arithmetic intensity': float('inf'),
        'bandwidth': PrefixedUnit(12.01, u'G', u'B/s'),
        'bw kernel': 'load',
        'level': u'MEM',
        'performance': PrefixedUnit(float('inf'), u'', u'FLOP/s')
    }]

    # Numeric fields are compared with 5% tolerance; strings exactly.
    for i, btlnck in enumerate(expected_btlncks):
        for k, v in btlnck.items():
            if type(v) is not str:
                if k == 'performance':
                    assert_relativly_equal(
                        roofline['mem bottlenecks'][i][k]['FLOP/s'], v, 0.05)
                else:
                    assert_relativly_equal(
                        roofline['mem bottlenecks'][i][k], v, 0.05)
            else:
                self.assertEqual(roofline['mem bottlenecks'][i][k], v)
def analyze(self):
    """Run analysis.

    Builds and benchmarks the kernel binary, calibrates the repetition count
    for a ~2 s run, measures MEM-group performance counters and (unless
    disabled) the additional counters needed for a phenomenological ECM
    model, and fills ``self.results`` with runtime, data-volume and
    performance metrics.
    """
    bench = self.kernel.build(verbose=self.verbose > 1,
                              openmp=self._args.cores > 1)
    element_size = self.kernel.datatypes_size[self.kernel.datatype]

    # Build arguments to pass to command:
    args = [str(s) for s in list(self.kernel.constants.values())]

    # Determine base runtime with 10 iterations
    runtime = 0.0
    time_per_repetition = 2.0 / 10.0
    repetitions = self.iterations // 10
    mem_results = {}

    # TODO if cores > 1, results are for openmp run. Things might need to be
    # changed here!

    # Repeat until the measured run is long enough (>= 1.5 s) for stable
    # counter readings.
    while runtime < 1.5:
        # Interpolate to a 2.0s run
        if time_per_repetition != 0.0:
            # NOTE(review): floor division of floats yields a float
            # (e.g. 20.0); it is passed on via str(repetitions) — assumes
            # the benchmark binary accepts that form; confirm.
            repetitions = 2.0 // time_per_repetition
        else:
            repetitions = int(repetitions * 10)

        mem_results = self.perfctr([bench] + [str(repetitions)] + args,
                                   group="MEM")
        runtime = mem_results['Runtime (RDTSC) [s]']
        time_per_repetition = runtime / float(repetitions)
    raw_results = [mem_results]

    # Gather remaining counters
    if not self.no_phenoecm:
        # Build events and sympy expressions for all model metrics
        T_OL, event_counters = self.machine.parse_perfmetric(
            self.machine['overlapping model']
            ['performance counter metric'])
        T_data, event_dict = self.machine.parse_perfmetric(
            self.machine['non-overlapping model']
            ['performance counter metric'])
        event_counters.update(event_dict)
        cache_metrics = defaultdict(dict)
        # All levels except the last (MEM) carry per-level counter metrics.
        for i in range(len(self.machine['memory hierarchy']) - 1):
            cache_info = self.machine['memory hierarchy'][i]
            name = cache_info['level']
            for k, v in cache_info['performance counter metrics'].items():
                cache_metrics[name][
                    k], event_dict = self.machine.parse_perfmetric(v)
                event_counters.update(event_dict)

        # Compile minimal runs to gather all required events
        minimal_runs = build_minimal_runs(list(event_counters.values()))
        measured_ctrs = {}
        for run in minimal_runs:
            ctrs = ','.join([eventstr(e) for e in run])
            r = self.perfctr([bench] + [str(repetitions)] + args,
                             group=ctrs)
            raw_results.append(r)
            measured_ctrs.update(r)

        # Match measured counters to symbols
        event_counter_results = {}
        for sym, ctr in event_counters.items():
            event, regs, parameter = ctr[0], register_options(
                ctr[1]), ctr[2]
            # Use the first register option that was actually measured.
            for r in regs:
                if r in measured_ctrs[event]:
                    event_counter_results[sym] = measured_ctrs[event][r]

        # Analytical metrics needed for futher calculation
        cl_size = float(self.machine['cacheline size'])
        elements_per_cacheline = cl_size // element_size
        total_iterations = self.kernel.iteration_length() * repetitions
        total_cachelines = total_iterations / elements_per_cacheline

        T_OL_result = T_OL.subs(event_counter_results) / total_cachelines
        cache_metric_results = defaultdict(dict)
        for cache, mtrcs in cache_metrics.items():
            for m, e in mtrcs.items():
                cache_metric_results[cache][m] = e.subs(
                    event_counter_results)

        # Inter-cache transfers per CL
        cache_transfers_per_cl = {
            cache: {
                k: PrefixedUnit(v / total_cachelines, 'CL/CL')
                for k, v in d.items()
            }
            for cache, d in cache_metric_results.items()
        }
        cache_transfers_per_cl['L1']['accesses'].unit = 'LOAD/CL'

        # Select appropriate bandwidth
        mem_bw, mem_bw_kernel = self.machine.get_bandwidth(
            -1,  # mem
            cache_metric_results['L3']['misses'],  # load_streams
            cache_metric_results['L3']['evicts'],  # store_streams
            1)

        # Per-cacheline transfer times (in cycles) between adjacent levels.
        data_transfers = {
            # Assuming 0.5 cy / LOAD (SSE on SNB or IVB; AVX on HSW, BDW, SKL or SKX)
            'T_nOL': (cache_metric_results['L1']['accesses'] /
                      total_cachelines * 0.5),
            'T_L1L2': ((cache_metric_results['L1']['misses'] +
                        cache_metric_results['L1']['evicts']) /
                       total_cachelines * cl_size /
                       self.machine['memory hierarchy'][1]
                       ['non-overlap upstream throughput'][0]),
            'T_L2L3': ((cache_metric_results['L2']['misses'] +
                        cache_metric_results['L2']['evicts']) /
                       total_cachelines * cl_size /
                       self.machine['memory hierarchy'][2]
                       ['non-overlap upstream throughput'][0]),
            'T_L3MEM': ((cache_metric_results['L3']['misses'] +
                         cache_metric_results['L3']['evicts']) *
                        float(self.machine['cacheline size']) /
                        total_cachelines / mem_bw *
                        float(self.machine['clock']))
        }

        # Build phenomenological ECM model:
        ecm_model = {'T_OL': T_OL_result}
        ecm_model.update(data_transfers)
    else:
        event_counters = {}
        ecm_model = None
        cache_transfers_per_cl = None

    self.results = {
        'raw output': raw_results,
        'ECM': ecm_model,
        'data transfers': cache_transfers_per_cl,
        'Runtime (per repetition) [s]': time_per_repetition,
        'event counters': event_counters
    }

    # TODO make more generic to support other (and multiple) constant names
    # Product of all loop trip counts (with constants substituted).
    iterations_per_repetition = reduce(operator.mul, [
        self.kernel.subs_consts(max_ - min_) / self.kernel.subs_consts(step)
        for idx, min_, max_, step in self.kernel._loop_stack
    ], 1)
    self.results['Iterations per repetition'] = iterations_per_repetition
    iterations_per_cacheline = float(
        self.machine['cacheline size']) / element_size
    cys_per_repetition = time_per_repetition * float(self.machine['clock'])
    self.results['Runtime (per cacheline update) [cy/CL]'] = \
        (cys_per_repetition / iterations_per_repetition) * iterations_per_cacheline
    self.results['MEM volume (per repetition) [B]'] = \
        mem_results['Memory data volume [GBytes]'] * 1e9 / repetitions
    self.results['Performance [MFLOP/s]'] = \
        sum(self.kernel._flops.values()) / (
            time_per_repetition / iterations_per_repetition) / 1e6
    # Key name differs between likwid versions; try both.
    if 'Memory bandwidth [MBytes/s]' in mem_results:
        self.results['MEM BW [MByte/s]'] = mem_results[
            'Memory bandwidth [MBytes/s]']
    else:
        self.results['MEM BW [MByte/s]'] = mem_results[
            'Memory BW [MBytes/s]']
    self.results['Performance [MLUP/s]'] = (iterations_per_repetition /
                                            time_per_repetition) / 1e6
    self.results['Performance [MIt/s]'] = (iterations_per_repetition /
                                           time_per_repetition) / 1e6
def run_kernel(kernel, args):
    """Benchmark a prebuilt kernel binary and collect per-cache transfer data.

    Runs the binary from ``build/<kernel>.<arch>`` under likwid performance
    counter groups derived from the machine file, parses additional
    "key: value" info lines from the program output, and normalizes the
    per-cache counter metrics to transfers per ``base_iterations``
    iterations (one cacheline worth of elements).

    :param kernel: kernel name (used to locate the binary)
    :param args: iterable of argument tuples; each tuple is joined with
        spaces into one command-line argument
    :return: (cache_transfers_per_cl, global_infos, raw_results)
    """
    machine = get_machine_model()

    # get per cachelevel performance counter information:
    event_counters = {}
    cache_metrics = defaultdict(dict)
    for i, cache_info in enumerate(machine['memory hierarchy']):
        name = cache_info['level']
        for k, v in cache_info['performance counter metrics'].items():
            if v is None:
                # Some info can not be measured, we skip it
                continue
            try:
                cache_metrics[name][k], event_dict = machine.parse_perfmetric(
                    v)
            except SyntaxError as e:
                print(
                    'Syntax error in machine file perf. metric: {}'.format(v),
                    e,
                    file=sys.stderr)
                continue
            event_counters.update(event_dict)

    bench_filename = f"build/{kernel}.{platform.machine()}"
    raw_results = []
    global_infos = {}

    # Compile minimal runs to gather all required events
    minimal_runs = benchmark.build_minimal_runs(list(event_counters.values()))
    measured_ctrs = defaultdict(dict)
    for run in minimal_runs:
        ctrs = ','.join([benchmark.eventstr(e) for e in run])
        r, o = perfctr([bench_filename] +
                       list(map(lambda t: ' '.join(map(str, t)), args)),
                       cores=1,
                       group=ctrs)
        # Parse "prefix:key: value" / "key: value" lines from program output;
        # unprefixed lines go into global_infos, prefixed ones into the
        # per-kernel-run counter dict.
        global_infos = {}
        for m in [
                re.match(r"(:?([a-z_\-0-9]+):)?([a-z]+): ([a-z\_\-0-9]+)", l)
                for l in o
        ]:
            if m is not None:
                try:
                    v = int(m.group(4))
                except ValueError:
                    v = m.group(4)
                if m.group(1) is None:
                    global_infos[m.group(3)] = v
                else:
                    r[m.group(2)][m.group(3)] = v
        raw_results.append(o)
        for k in r:
            measured_ctrs[k].update(r[k])

    # Analytical metrics needed for futher calculation
    cl_size = int(machine['cacheline size'])
    # elementsize is reported by the benchmark binary itself.
    elementsize = global_infos["elementsize"]
    base_iterations = cl_size // elementsize

    event_counter_results = {}
    cache_metric_results = {}
    cache_transfers_per_cl = {}
    for kernel_run in measured_ctrs:
        # Match measured counters to symbols
        event_counter_results[kernel_run] = {}
        for sym, ctr in event_counters.items():
            event, regs, parameters = ctr[0], benchmark.register_options(
                ctr[1]), ctr[2]
            if parameters:
                parameter_str = ':'.join(parameters)
                regs = [r + ':' + parameter_str for r in regs]
            # Use the first register option that was actually measured.
            for r in regs:
                if r in measured_ctrs[kernel_run][event]:
                    event_counter_results[kernel_run][sym] = measured_ctrs[
                        kernel_run][event][r]

        cache_metric_results[kernel_run] = defaultdict(dict)
        for cache, mtrcs in cache_metrics.items():
            for m, e in mtrcs.items():
                cache_metric_results[kernel_run][cache][m] = e.subs(
                    event_counter_results[kernel_run])

        total_iterations = \
            measured_ctrs[kernel_run]['iterations'] * measured_ctrs[kernel_run]['repetitions']
        # Inter-cache transfers per CL
        cache_transfers_per_cl[kernel_run] = {
            cache: {
                k: PrefixedUnit(v / (total_iterations / base_iterations),
                                'CL/{}It.'.format(base_iterations))
                for k, v in d.items()
            }
            for cache, d in cache_metric_results[kernel_run].items()
        }
        cache_transfers_per_cl[kernel_run]['L1']['loads'].unit = \
            'LOAD/{}It.'.format(base_iterations)
        # NOTE(review): stores are also labeled 'LOAD/...' — looks like a
        # copy-paste slip ('STORE/...' expected); confirm before changing,
        # as consumers may match on this unit string.
        cache_transfers_per_cl[kernel_run]['L1']['stores'].unit = \
            'LOAD/{}It.'.format(base_iterations)

    return cache_transfers_per_cl, global_infos, raw_results