def test_max(self):
    """``sn.max`` on a deferrable sequence is evaluated lazily."""
    data = [1, 2]
    deferred = make_deferrable(data)
    self.assertEqual(2, sn.max(deferred))

    # Mutations of the underlying list are visible on re-evaluation.
    data.append(3)
    self.assertEqual(3, sn.max(deferred))
def test_max(self):
    """``sn.max`` over a deferred list tracks later mutations."""
    data = [1, 2]
    deferred = sn.defer(data)
    self.assertEqual(2, sn.max(deferred))

    # Appending to the source list changes the deferred result.
    data.append(3)
    self.assertEqual(3, sn.max(deferred))
def __init__(self):
    '''Test Cray LibSci on the GPU (dgemm with libsci alloc).'''
    super().__init__()
    self.descr = 'Test Cray LibSci on the GPU (dgemm with libsci alloc)'
    self.build_system = 'SingleSource'
    # The example source ships with the Cray LibSci ACC installation,
    # so no local source directory is needed.
    self.sourcesdir = None
    self.sourcepath = ('$CRAY_LIBSCI_ACC_DIR/examples/examples/c_simple/'
                       'dgemm_simple.c')
    # Sanity: the 4096 problem dimensions must appear in the output.
    self.sanity_patterns = sn.assert_found(r'(4096\s+){3}', self.stdout)

    # Report the best (max) of the GPU and CPU flop rates found.
    regex = r'(\s+\d+){3}\s+(?P<gpu_flops>\S+)\s+(?P<cpu_flops>\S+)\s+'
    self.perf_patterns = {
        'dgemm_gpu': sn.max(sn.extractall(regex, self.stdout,
                                          'gpu_flops', float)),
        'dgemm_cpu': sn.max(sn.extractall(regex, self.stdout,
                                          'cpu_flops', float)),
    }
    # References allow up to 5% below nominal; no upper bound.
    self.reference = {
        'daint:gpu': {
            'dgemm_gpu': (2264.0, -0.05, None, 'GFLop/s'),
            'dgemm_cpu': (45.0, -0.05, None, 'GFLop/s'),
        },
        'dom:gpu': {
            'dgemm_gpu': (2264.0, -0.05, None, 'GFLop/s'),
            'dgemm_cpu': (45.0, -0.05, None, 'GFLop/s'),
        },
    }
def test_max():
    """``sn.max`` over a deferred list re-evaluates after mutation."""
    data = [1, 2]
    deferred = sn.defer(data)
    assert 2 == sn.max(deferred)

    # Deferred expressions follow later changes to the source list.
    data.append(3)
    assert 3 == sn.max(deferred)
def set_perf_patterns(self):
    '''Define the performance patterns of the test.

    The reported latency is the worst (max) kernel launch latency over
    all GPUs and ranks.
    '''
    latency_regex = (r'\[\S+\] \[gpu \d+\] Kernel launch latency: '
                     r'(?P<latency>\S+) us')
    self.perf_patterns = {
        'latency': sn.max(
            sn.extractall(latency_regex, self.stdout, 'latency', float)
        )
    }
def __init__(self):
    '''Memory eater check: allocate node memory until the OOM killer acts.'''
    super().__init__()
    self.maintainers = ['JG']
    self.valid_systems += ['eiger:mc', 'pilatus:mc']
    self.time_limit = '5m'
    self.sourcepath = 'eatmemory_mpi.c'
    self.tags.add('mem')
    # Request 100% of the node memory so the job eventually gets killed.
    self.executable_opts = ['100%']
    # Sanity: success means the job *was* OOM-killed.
    self.sanity_patterns = sn.assert_found(r'(oom-kill)|(Killed)',
                                           self.stderr)
    # {{{ perf
    regex = (r'^Eating \d+ MB\/mpi \*\d+mpi = -\d+ MB memory from \/proc\/'
             r'meminfo: total: \d+ GB, free: \d+ GB, avail: \d+ GB, using:'
             r' (\d+) GB')
    self.perf_patterns = {
        'max_cn_memory': sn.getattr(self, 'reference_meminfo'),
        # Largest amount of memory (GB) the test managed to allocate.
        'max_allocated_memory':
            sn.max(sn.extractall(regex, self.stdout, 1, int)),
    }
    no_limit = (0, None, None, 'GB')
    self.reference = {
        '*': {
            'max_cn_memory': no_limit,
            # Allocated memory must be within 5% of the node's meminfo.
            'max_allocated_memory': (
                sn.getattr(self, 'reference_meminfo'), -0.05, None, 'GB'
            ),
        }
    }
def __init__(self, **kwargs):
    '''OpenBLAS performance check for the Monch acceptance.'''
    super().__init__('Monch', **kwargs)
    self.tags = {'monch_acceptance'}
    self.valid_systems = ['monch:compute']
    self.valid_prog_environs = ['PrgEnv-gnu']
    # Single task with 20 OpenMP threads; SMT disabled.
    self.num_tasks = 1
    self.num_tasks_per_node = 1
    self.num_tasks_per_core = 1
    self.num_cpus_per_task = 20
    self.num_tasks_per_socket = 10
    self.use_multithreading = False
    # Build against the EasyBuild OpenBLAS installation.
    self.cflags = '-O3 -I$EBROOTOPENBLAS/include'
    self.ldflags = '-L$EBROOTOPENBLAS/lib -lopenblas -lpthread -lgfortran'
    self.variables = {
        'OMP_NUM_THREADS': str(self.num_cpus_per_task),
        # Disable MVAPICH2's own CPU affinity handling.
        'MV2_ENABLE_AFFINITY': '0'
    }
    # Best (max) GFLOPS over all reported runs.
    self.perf_patterns = {
        'perf': sn.max(
            sn.extractall(r'Run\s\d\s+:\s+(?P<gflops>\S+)\s\S+',
                          self.stdout, "gflops", float)
        )
    }
    # Allow up to 10% below 350 GFLOPS; no upper bound.
    self.reference = {
        'monch:compute': {
            'perf': (350, -0.1, None)
        }
    }
def __init__(self):
    '''GPU burn test: check sustained Gflop/s and temperature per GPU.'''
    self.valid_systems = ['cannon:local-gpu', 'cannon:gpu_test',
                          'fasse:fasse_gpu', 'test:gpu']
    self.descr = 'GPU burn test'
    self.valid_prog_environs = ['gpu']
    # Burn in double precision (-d) for 40 seconds.
    self.executable_opts = ['-d', '40']
    self.build_system = 'Make'
    self.build_system.makefile = 'makefile.cuda'
    self.executable = './gpu_burn.x'
    patt = (r'^\s*\[[^\]]*\]\s*GPU\s+\d+\(\S*\):\s+(?P<perf>\S*)\s+GF\/s'
            r'\s+(?P<temp>\S*)\s+Celsius')
    # Slowest GPU determines 'perf'; hottest GPU determines 'temp'.
    self.perf_patterns = {
        'perf': sn.min(sn.extractall(patt, self.stdout, 'perf', float)),
        'temp': sn.max(sn.extractall(patt, self.stdout, 'temp', float)),
    }
    # BUGFIX: the original dict literal listed the '*' key twice, so the
    # fallback 'perf' reference was silently overwritten by the 'temp'
    # entry.  Both metrics now live under a single '*' key.
    self.reference = {
        'cannon:local-gpu': {
            'perf': (6200, -0.10, None, 'Gflop/s per gpu'),
        },
        'cannon:gpu_test': {
            'perf': (6200, -0.10, None, 'Gflop/s per gpu'),
        },
        'test:gpu': {
            'perf': (4115, None, None, 'Gflop/s per gpu'),
        },
        '*': {
            'perf': (4115, None, None, 'Gflop/s per gpu'),
            'temp': (0, None, None, 'degC'),
        },
    }
def __init__(self, linkage, **kwargs): super().__init__('scalapack_performance_compile_run_', linkage, **kwargs) # FIXME: # Currently, this test case is only aimed for the monch acceptance, # yet it could be interesting to extend it to other systems. # NB: The test case is very small, but larger cases did not succeed! self.tags |= {'monch_acceptance'} self.sourcepath = 'scalapack_performance_compile_run.f' self.valid_systems = ['monch:compute'] self.valid_prog_environs = ['PrgEnv-gnu'] self.num_tasks = 64 self.num_tasks_per_node = 16 self.sanity_patterns = sn.assert_found(r'Run', self.stdout) self.perf_patterns = { 'perf': sn.max( sn.extractall(r'GFLOPS/s:\s+(?P<gflops>\S+)', self.stdout, 'gflops', float)) } self.reference = {'monch:compute': {'perf': (24., -0.1, None)}}
def mpip_perf_patterns(obj, reg): '''More perf_patterns for the tool .. code-block:: ----------------------------------- @--- MPI Time (seconds) ----------- ----------------------------------- Task AppTime MPITime MPI% 0 8.6 0.121 1.40 <-- min 1 8.6 0.157 1.82 2 8.6 5.92 68.84 <-- max * 25.8 6.2 24.02 <--- => NonMPI= AppTime - MPITime Typical performance reporting: .. code-block:: * mpip_avg_app_time: 8.6 s (= 25.8/3mpi) * mpip_avg_mpi_time: 2.07 s (= 6.2/3mpi) * %mpip_avg_mpi_time: 24.02 % * %max/%min * %mpip_avg_non_mpi_time: 75.98 % ''' # rpt = os.path.join(obj.stagedir, obj.rpt_file_txt) rpt = sn.extractsingle(r'^mpiP: Storing mpiP output in \[(?P<rpt>.*)\]', obj.stdout, 'rpt', str) regex_star = r'^\s+\*\s+(?P<appt>\S+)\s+(?P<mpit>\S+)\s+(?P<pct>\S+)$' regex_minmax = (r'^\s+(?P<mpirk>\S+)\s+(?P<appt>\S+)\s+(?P<mpit>\S+)\s+' r'(?P<pct>\S+)$') if reg == 1: # mpip_avg_mpi_time result = sn.round( sn.extractsingle(regex_star, rpt, 'mpit', float) / obj.num_tasks, 2) elif reg == 2: # mpip_avg_app_time result = sn.round( sn.extractsingle(regex_star, rpt, 'appt', float) / obj.num_tasks, 2) elif reg == 3: # %mpip_avg_mpi_time result = sn.extractsingle(regex_star, rpt, 'pct', float) elif reg == 4: # %nonmpi mpi_pct = sn.extractsingle(regex_star, rpt, 'pct', float) result = sn.round(100 - mpi_pct, 2) elif reg == 5: # %mpip_avg_mpi_time_max result = sn.max(sn.extractall(regex_minmax, rpt, 'pct', float)) elif reg == 6: # %mpip_avg_mpi_time_min result = sn.min(sn.extractall(regex_minmax, rpt, 'pct', float)) else: raise ValueError('unknown region id in mpip_perf_patterns') return result
def set_performance_patterns(self):
    '''Report the worst-case average pointer-chase latency.

    The value is the maximum, over all devices, of the average number of
    cycles per node jump printed by the benchmark.
    '''
    chase_regex = (r'^\s*\[[^\]]*\]\s* On device \d+, '
                   r'the chase took on average (\d+) '
                   r'cycles per node jump.')
    samples = sn.extractall(chase_regex, self.stdout, 1, int)
    self.perf_patterns = {
        'average_latency': sn.max(samples),
    }
def ru_maxrss_rk0(obj):
    '''Maximum ``ru_maxrss`` (resident set size) sample for MPI rank 0.

    The samples are read from the OTF2 report file of the test object.
    '''
    regex = r'^METRIC\s+0\s+.*ru_maxrss\" <2>; UINT64; (?P<rss>\d+)\)'
    samples = sn.extractall(regex, obj.rpt_otf2, 'rss', int)
    return sn.max(samples)
def stress_diff(ostream, ostream_ref):
    ''' Return the difference between obtained and reference stress
        tensor components'''
    stress = get_stress(ostream)
    stress_ref = get_stress(ostream_ref)
    # Maximum absolute element-wise deviation between the two tensors.
    # NOTE(review): only the upper-left 2x2 submatrix is compared; a full
    # stress tensor is 3x3 -- confirm whether range(2) is intentional.
    return sn.max(
        sn.abs(stress_ref[i][j] - stress[i][j])
        for i in range(2) for j in range(2))
def average_D2D_latency(self):
    '''Extract the average D2D latency.

    The pChase code returns a table with the cummulative latency for all
    D2D list traversals, and the last column of this table has the max
    values for each device.
    '''
    # NOTE(review): in the capture group `(\s*\d+.\s+)+` the `.` is an
    # unescaped wildcard and a repeated group captures only its *last*
    # match -- the extracted field relies on that to pick the final
    # column; confirm against the actual table format.
    return sn.max(
        sn.extractall(r'^\s*\[[^\]]*\]\s*GPU\s*\d+\s+(\s*\d+.\s+)+',
                      self.stdout, 1, int))
def __init__(self):
    '''GPU burn test: sustained Gflop/s and peak temperature per GPU.'''
    self.valid_systems = [
        'daint:gpu', 'dom:gpu', 'arolla:cn', 'tsa:cn', 'ault:amdv100',
        'ault:intelv100', 'ault:amda100', 'ault:amdvega'
    ]
    self.descr = 'GPU burn test'
    self.valid_prog_environs = ['PrgEnv-gnu']
    self.exclusive_access = True
    # Burn in double precision (-d) for 40 seconds.
    self.executable_opts = ['-d', '40']
    self.build_system = 'Make'
    self.executable = './gpu_burn.x'
    # Flexible allocation: one task on every node of the partition.
    self.num_tasks = 0
    self.num_tasks_per_node = 1
    self.sanity_patterns = self.assert_num_tasks()
    patt = (r'^\s*\[[^\]]*\]\s*GPU\s+\d+\(\S*\):\s+(?P<perf>\S*)\s+GF\/s'
            r'\s+(?P<temp>\S*)\s+Celsius')
    # Slowest GPU determines 'perf'; hottest GPU determines 'temp'.
    self.perf_patterns = {
        'perf': sn.min(sn.extractall(patt, self.stdout, 'perf', float)),
        'temp': sn.max(sn.extractall(patt, self.stdout, 'temp', float)),
    }
    # Per-system perf references (up to 10% below nominal); '*' only
    # records the temperature without enforcing a limit.
    self.reference = {
        'dom:gpu': {
            'perf': (4115, -0.10, None, 'Gflop/s'),
        },
        'daint:gpu': {
            'perf': (4115, -0.10, None, 'Gflop/s'),
        },
        'arolla:cn': {
            'perf': (5861, -0.10, None, 'Gflop/s'),
        },
        'tsa:cn': {
            'perf': (5861, -0.10, None, 'Gflop/s'),
        },
        'ault:amda100': {
            'perf': (15000, -0.10, None, 'Gflop/s'),
        },
        'ault:amdv100': {
            'perf': (5500, -0.10, None, 'Gflop/s'),
        },
        'ault:intelv100': {
            'perf': (5500, -0.10, None, 'Gflop/s'),
        },
        'ault:amdvega': {
            'perf': (3450, -0.10, None, 'Gflop/s'),
        },
        '*': {
            'temp': (0, None, None, 'degC')
        }
    }
    self.maintainers = ['AJ', 'TM']
    self.tags = {'diagnostic', 'benchmark', 'craype'}
def __init__(self):
    '''Run the pointer chase on all single- and multi-device systems.'''
    super().__init__()

    # Target both system groups declared by the base class.
    self.valid_systems = (self.single_device_systems
                          + self.multi_device_systems)

    # Worst-case (max over devices) average latency per node jump.
    chase_regex = (r'^\s*\[[^\]]*\]\s* On device \d+, '
                   r'the chase took on average (\d+) '
                   r'cycles per node jump.')
    self.perf_patterns = {
        'average_latency': sn.max(
            sn.extractall(chase_regex, self.stdout, 1, int)
        ),
    }
def set_perf_patterns(self):
    '''Record the minimum performance and the maximum temperature.

    Performance is reported in Gflops/s and temperature in degrees
    Celsius.
    '''
    patt = (r'^\s*\[[^\]]*\]\s*GPU\s+\d+\(\S*\):\s+(?P<perf>\S*)\s+GF\/s'
            r'\s+(?P<temp>\S*)\s+Celsius')
    perf_values = sn.extractall(patt, self.stdout, 'perf', float)
    temp_values = sn.extractall(patt, self.stdout, 'temp', float)
    self.perf_patterns = {
        'perf': sn.min(perf_values),
        'temp': sn.max(temp_values),
    }
def vtune_momentumAndEnergyIAD(self): ''' sphexa::sph::computeMomentumAndEnergyIADImpl<...> sqpatch.exe 40.919s sphexa::sph::computeMomentumAndEnergyIADImpl<...> sqpatch.exe 38.994s sphexa::sph::computeMomentumAndEnergyIADImpl<...> sqpatch.exe 40.245s sphexa::sph::computeMomentumAndEnergyIADImpl<...> sqpatch.exe 39.487s ''' # ^[sphexa::|MPI|[Others].*\s+(?P<sec>\S+)s$' regex1 = r'^\s+CPU Time: (?P<sec>\S+)s' result1 = sn.max(sn.extractall(regex1, self.stdout, 'sec', float)) regex2 = r'^sphexa::sph::computeMomentumAndEnergyIADImpl.*\s+(?P<x>\S+)s$' result2 = sn.max(sn.extractall(regex2, self.stdout, 'x', float)) print("vtune_cput=", result1) print("vtune_energ=", result2) print("vtune_cput/24=", result1 / 24) print("vtune_energ/24=", result2 / 24) # print("t=", result1/result2) # print("c=", self.num_tasks) # print("t=", (result1/result2) / self.num_tasks) # t= 5.208910219363269 / 24 = 0.2170379258068029 # vtune_momentumAndEnergyIAD: 5.2089 % return 0
def set_mpip_perf_patterns(self):
    '''More perf_patterns for the tool

    .. code-block::

      -----------------------------------
      @--- MPI Time (seconds) -----------
      -----------------------------------
      Task    AppTime    MPITime     MPI%
         0        8.6      0.121     1.40 <-- min
         1        8.6      0.157     1.82
         2        8.6       5.92    68.84 <-- max
         *       25.8        6.2    24.02 <---
                                          => NonMPI= AppTime - MPITime

    Typical performance reporting:

    .. code-block::

      * mpip_avg_app_time: 8.6 s (= 25.8/3mpi)
      * mpip_avg_mpi_time: 2.07 s (= 6.2/3mpi)
      * %mpip_avg_mpi_time: 24.02 %
      * %mpip_avg_non_mpi_time: 75.98 %
    '''
    # Aggregate ('*') line of the MPI Time table:
    regex_star = r'^\s+\*\s+(?P<appt>\S+)\s+(?P<mpit>\S+)\s+(?P<pct>\S+)$'
    app_t = sn.extractsingle(regex_star, self.rpt, 'appt', float)
    mpi_t = sn.extractsingle(regex_star, self.rpt, 'mpit', float)
    mpi_pct = sn.extractsingle(regex_star, self.rpt, 'pct', float)
    nonmpi_pct = sn.round(100 - mpi_pct, 2)
    # min/max over the per-rank lines of the same table:
    regex = (r'^\s+(?P<mpirk>\S+)\s+(?P<appt>\S+)\s+(?P<mpit>\S+)\s+'
             r'(?P<pct>\S+)$')
    mpi_pct_max = sn.max(sn.extractall(regex, self.rpt, 'pct', float))
    mpi_pct_min = sn.min(sn.extractall(regex, self.rpt, 'pct', float))
    perf_pattern = {
        # Aggregate times normalized by the number of MPI tasks.
        'mpip_avg_app_time': sn.round(app_t / self.num_tasks, 2),
        'mpip_avg_mpi_time': sn.round(mpi_t / self.num_tasks, 2),
        '%mpip_avg_mpi_time': mpi_pct,
        '%mpip_avg_mpi_time_max': mpi_pct_max,
        '%mpip_avg_mpi_time_min': mpi_pct_min,
        '%mpip_avg_non_mpi_time': nonmpi_pct,
    }
    # Merge with any perf patterns already defined by the test.
    if self.perf_patterns:
        self.perf_patterns = {**self.perf_patterns, **perf_pattern}
    else:
        self.perf_patterns = perf_pattern
def __init__(self, kernel_version):
    '''Kernel launch latency check.

    :param kernel_version: ``'sync'`` or ``'async'``; selects the kernel
        variant via the ``SYNCKERNEL`` preprocessor flag and the matching
        set of reference values.
    '''
    self.valid_systems = [
        'cannon:local-gpu', 'cannon:gpu_test', 'fasse:fasse_gpu',
        'test:gpu'
    ]
    self.valid_prog_environs = ['gpu']
    self.build_system = 'Make'
    self.executable = './kernel_latency.x'
    if kernel_version == 'sync':
        self.build_system.cppflags = ['-D SYNCKERNEL=1']
    else:
        self.build_system.cppflags = ['-D SYNCKERNEL=0']

    # Worst (max) launch latency over all GPUs and ranks.
    self.perf_patterns = {
        'latency': sn.max(
            sn.extractall(
                r'\[\S+\] \[gpu \d+\] Kernel launch latency: '
                r'(?P<latency>\S+) us', self.stdout, 'latency', float))
    }
    # Lower is better: references allow up to 10% above nominal on the
    # named partitions; '*' entries are unbounded fallbacks.
    self.sys_reference = {
        'sync': {
            'cannon:local-gpu': {
                'latency': (6.0, None, 0.10, 'us')
            },
            'cannon:gpu_test': {
                'latency': (4.0, None, 0.10, 'us')
            },
            '*': {
                'latency': (15.1, None, None, 'us')
            },
        },
        'async': {
            'cannon:local-gpu': {
                'latency': (6.0, None, 0.10, 'us')
            },
            'cannon:gpu_test': {
                'latency': (4.0, None, 0.10, 'us')
            },
            '*': {
                'latency': (2.2, None, None, 'us')
            },
        },
    }
    self.reference = self.sys_reference[kernel_version]
def forces_diff(ostream, ostream_ref):
    ''' Return the difference between obtained and reference atomic
        forces'''
    forces = get_forces(ostream)
    forces_ref = get_forces(ostream_ref)
    # Count elements by iteration; presumably the returned objects do
    # not support len() -- TODO confirm.
    na = 0
    for e in forces:
        na += 1
    na_ref = 0
    for e in forces_ref:
        na_ref += 1
    # Force immediate evaluation of the length check before indexing.
    sn.assert_eq(na, na_ref,
                 msg='Wrong length of forces array: {0} != {1}').evaluate()
    # Maximum absolute component-wise deviation.
    # NOTE(review): j iterates over range(2) only, i.e. the third (z)
    # component of each force vector is ignored -- confirm intent.
    return sn.max(
        sn.abs(forces[i][j] - forces_ref[i][j])
        for i in range(na) for j in range(2))
def pw_perf_patterns(obj):
    '''Report hardware counter values from the tool.

    .. code-block::

      collector                    time     time (%)  PAPI_REF_CYC  PAPI_L2_DCM
      --------------------------------------------------------------------------
      computeMomentumAndEnergyIAD  0.6816   100.00    1770550470    2438527
                                                                    ^^^^^^^
    :returns: dict with the min, avg (rounded to 1 decimal) and max of
        the last hardware counter column, extracted from stderr.
    '''
    regex = r'^computeMomentumAndEnergyIAD\s+\S+\s+\S+\s+\S+\s+(?P<hwc>\d+)$'
    return {
        'papiwrap_hwc_min': sn.min(
            sn.extractall(regex, obj.stderr, 'hwc', int)),
        'papiwrap_hwc_avg': sn.round(
            sn.avg(sn.extractall(regex, obj.stderr, 'hwc', int)), 1),
        'papiwrap_hwc_max': sn.max(
            sn.extractall(regex, obj.stderr, 'hwc', int)),
    }
def __init__(self, linkage):
    '''Scalapack performance check (monch acceptance).'''
    super().__init__(linkage)
    self.tags |= {'monch_acceptance'}
    self.sourcepath = 'scalapack_performance_compile_run.f'
    self.valid_systems = ['monch:compute']
    self.valid_prog_environs = ['PrgEnv-gnu']
    self.num_tasks = 64
    self.num_tasks_per_node = 16
    self.sanity_patterns = sn.assert_found(r'Run', self.stdout)
    # Best (max) GFLOPS over all reported runs.
    gflops = sn.extractall(r'GFLOPS/s:\s+(?P<gflops>\S+)',
                           self.stdout, 'gflops', float)
    self.perf_patterns = {'perf': sn.max(gflops)}
    self.reference = {'monch:compute': {'perf': (24., -0.1, None)}}
def density_SQ_INSTS_SALU(self):
    '''Maximum SQ_INSTS_SALU value from the CSV report (rounded).'''
    csv_name = self.metric_file.replace(".txt", ".csv")
    report = os.path.join(self.stagedir, csv_name)
    regex = self.set_regex('density')
    values = sn.extractall(regex, report, 'm4', int)
    return sn.round(sn.max(values), 0)
def __init__(self, kernel_version):
    '''Kernel launch latency check.

    :param kernel_version: ``'sync'`` or ``'async'``; selects the kernel
        variant via the ``SYNCKERNEL`` preprocessor flag and the matching
        set of reference values.
    '''
    super().__init__()
    # List known partitions here so as to avoid specifying them every time
    # with --system
    self.valid_systems = ['daint:gpu', 'dom:gpu', 'kesch:cn']
    # Flexible allocation: one task per node.
    self.num_tasks = 0
    self.num_tasks_per_node = 1
    self.sourcepath = 'kernel_latency.cu'
    self.build_system = 'SingleSource'
    self.build_system.cxxflags = ['-std=c++11']
    if self.current_system.name in {'dom', 'daint'}:
        self.num_gpus_per_node = 1
        gpu_arch = '60'
        self.modules = ['craype-accel-nvidia60']
        self.valid_prog_environs = [
            'PrgEnv-cray', 'PrgEnv-pgi', 'PrgEnv-gnu'
        ]
    elif self.current_system.name == 'kesch':
        self.num_gpus_per_node = 16
        self.valid_prog_environs = ['PrgEnv-cray', 'PrgEnv-pgi']
        self.modules = ['craype-accel-nvidia35']
        gpu_arch = '37'
    else:
        # Enable test when running on an unknown system
        self.num_gpus_per_node = 1
        self.valid_systems = ['*']
        self.valid_prog_environs = ['*']
        gpu_arch = None

    # Target the proper CUDA compute capability, when known.
    if gpu_arch:
        self.build_system.cxxflags += [
            '-arch=compute_%s' % gpu_arch, '-code=sm_%s' % gpu_arch
        ]

    if kernel_version == 'sync':
        self.build_system.cppflags = ['-D SYNCKERNEL=1']
    else:
        self.build_system.cppflags = ['-D SYNCKERNEL=0']

    # Sanity: every rank found its GPUs and printed one latency line
    # per GPU.
    self.sanity_patterns = sn.all([
        sn.assert_eq(
            sn.count(sn.findall(r'\[\S+\] Found \d+ gpu\(s\)',
                                self.stdout)),
            self.num_tasks_assigned),
        sn.assert_eq(
            sn.count(
                sn.findall(
                    r'\[\S+\] \[gpu \d+\] Kernel launch '
                    r'latency: \S+ us', self.stdout)),
            self.num_tasks_assigned * self.num_gpus_per_node)
    ])
    # Worst (max) launch latency over all GPUs and ranks.
    self.perf_patterns = {
        'latency': sn.max(
            sn.extractall(
                r'\[\S+\] \[gpu \d+\] Kernel launch latency: '
                r'(?P<latency>\S+) us', self.stdout, 'latency', float))
    }
    self.sys_reference = {
        'sync': {
            'dom:gpu': {
                'latency': (6.6, None, 0.10, 'us')
            },
            'daint:gpu': {
                'latency': (6.6, None, 0.10, 'us')
            },
            'kesch:cn': {
                'latency': (12.0, None, 0.10, 'us')
            },
            '*': {
                'latency': (0.0, None, None, 'us')
            }
        },
        'async': {
            'dom:gpu': {
                'latency': (2.2, None, 0.10, 'us')
            },
            'daint:gpu': {
                'latency': (2.2, None, 0.10, 'us')
            },
            'kesch:cn': {
                'latency': (5.7, None, 0.10, 'us')
            },
            '*': {
                'latency': (0.0, None, None, 'us')
            }
        },
    }
    self.reference = self.sys_reference[kernel_version]
    self.maintainers = ['TM']
    self.tags = {'benchmark', 'diagnostic'}
def max_gpu_memory(self):
    '''Peak GPU memory usage (MiB) over all reported nodes.

    Sample report::

      Node name     Usage     Max mem       Execution time
      ------------  --------  ------------  --------------
      nid06681      38 %      2749 MiB      00:00:06
    '''
    regex = r'^\s+nid\S+\s+\d+\s+%\s+(\d+)\s+MiB.*:'
    usage = sn.extractall(regex, self.stdout, 1, int)
    return sn.max(usage)
def vtune_time(self):
    '''Vtune creates 1 report per compute node. For example, a 48 mpi
    tasks job (= 2 compute nodes when running with 24 c/cn) will create
    2 directories:
    * rpt.nid00001/rpt.nid00001.vtune
    * rpt.nid00002/rpt.nid00002.vtune

    Typical output (for each compute node) is:

    .. code-block::

      Elapsed Time: 14.866s
          CPU Time: 319.177s            /24 = 13.3
              Effective Time: 308.218s  /24 = 12.8
                  Idle: 0s
                  Poor: 19.725s
                  Ok: 119.570s
                  Ideal: 168.922s
                  Over: 0s
              Spin Time: 10.959s        /24 = 0.4
                  MPI Busy Wait Time: 10.795s
                  Other: 0.164s
              Overhead Time: 0s
          Total Thread Count: 25
          Paused Time: 0s

    :returns: dict of deferred metrics (elapsed min/max plus per-task
        normalized CPU/Effective/Spin/MPI-wait times).
    '''
    result_d = {}
    # --- ranks per node (a job smaller than one node uses num_tasks)
    if self.num_tasks < self.num_tasks_per_node:
        vtune_tasks_per_node = self.num_tasks
    else:
        vtune_tasks_per_node = self.num_tasks_per_node

    # --- Elapsed Time (min, max) over all per-node reports
    regex = r'.*Elapsed Time: (?P<sec>\S+)s'
    result = sn.extractall(regex, self.stdout, 'sec', float)
    result_d['elapsed_min'] = sn.round(sn.min(result), 4)
    result_d['elapsed_max'] = sn.round(sn.max(result), 4)
    # --- CPU Time (max), normalized by tasks per node
    regex = r'^\s+CPU Time: (?P<sec>\S+)s'
    result = sn.extractall(regex, self.stdout, 'sec', float)
    result_d['elapsed_cput'] = sn.round(
        sn.max(result) / vtune_tasks_per_node, 4)
    # --- CPU Time: Effective Time (max)
    regex = r'^\s+Effective Time: (?P<sec>\S+)s'
    result = sn.extractall(regex, self.stdout, 'sec', float)
    result_d['elapsed_cput_efft'] = sn.round(
        sn.max(result) / vtune_tasks_per_node, 4)
    # --- CPU Time: Spin Time (max)
    regex = r'^\s+Spin Time: (?P<sec>\S+)s'
    result = sn.extractall(regex, self.stdout, 'sec', float)
    result_d['elapsed_cput_spint'] = sn.round(
        sn.max(result) / vtune_tasks_per_node, 4)
    # --- CPU Time: Spin Time: MPI Busy Wait (max)
    if self.num_tasks > 1:
        regex = r'\s+MPI Busy Wait Time: (?P<sec>\S+)s'
        result = sn.extractall(regex, self.stdout, 'sec', float)
        result_d['elapsed_cput_spint_mpit'] = sn.round(
            sn.max(result) / vtune_tasks_per_node, 4)
    else:
        # A serial run has no MPI busy-wait time.
        result_d['elapsed_cput_spint_mpit'] = 0

    # TODO:
    # 'vtune_momentumAndEnergyIAD':
    #     sphsintel.vtune_momentumAndEnergyIAD(self),
    # '%vtune_srcf_lookupTables': self.vtune_pct_lookupTables,
    # '%vtune_srcf_Octree': self.vtune_pct_Octree,
    # '%vtune_srcf_momentumAndEnergyIAD':
    #     self.vtune_pct_momentumAndEnergyIAD,
    # '%vtune_srcf_IAD': self.vtune_pct_IAD,
    return result_d
def __init__(self):
    '''GPU burn test (daint/dom/kesch/tiger variant).'''
    super().__init__()
    self.valid_systems = ['daint:gpu', 'dom:gpu', 'kesch:cn',
                          'tiger:gpu']
    self.descr = 'GPU burn test'
    self.valid_prog_environs = ['PrgEnv-gnu']
    if self.current_system.name == 'kesch':
        self.exclusive_access = True
        self.modules = ['craype-accel-nvidia35']
        # NOTE: The first option indicates the precision (-d for double)
        # while the seconds is the time (in secs) to run the test.
        # For multi-gpu nodes, we run the gpu burn test for more
        # time to get reliable measurements.
        self.executable_opts = ['-d', '40']
        self.num_gpus_per_node = 16
        gpu_arch = '37'
    elif self.current_system.name in {'daint', 'dom', 'tiger'}:
        self.modules = ['craype-accel-nvidia60']
        self.executable_opts = ['-d', '20']
        self.num_gpus_per_node = 1
        gpu_arch = '60'
    else:
        # Unknown system: build without a specific compute capability.
        self.num_gpus_per_node = 1
        gpu_arch = None

    self.sourcepath = 'gpu_burn.cu'
    self.build_system = 'SingleSource'
    if gpu_arch:
        self.build_system.cxxflags = [
            '-arch=compute_%s' % gpu_arch, '-code=sm_%s' % gpu_arch
        ]

    self.build_system.ldflags = ['-lcuda', '-lcublas', '-lnvidia-ml']
    # Sanity: every task reports 'OK'.
    self.sanity_patterns = sn.assert_eq(
        sn.count(sn.findall('OK', self.stdout)),
        self.num_tasks_assigned)
    patt = r'GPU\s+\d+\(\S*\): (?P<perf>\S*) GF\/s (?P<temp>\S*) Celsius'
    # Slowest GPU determines 'perf'; hottest GPU determines 'max_temp'.
    self.perf_patterns = {
        'perf': sn.min(sn.extractall(patt, self.stdout, 'perf', float)),
        'max_temp': sn.max(sn.extractall(patt, self.stdout, 'temp',
                                         float))
    }
    # Perf references allow up to 10% below nominal; temperature is
    # recorded without any limit.
    self.reference = {
        'dom:gpu': {
            'perf': (4115, -0.10, None, 'Gflop/s'),
            'max_temp': (0, None, None, 'Celsius')
        },
        'daint:gpu': {
            'perf': (4115, -0.10, None, 'Gflop/s'),
            'max_temp': (0, None, None, 'Celsius')
        },
        'kesch:cn': {
            'perf': (950, -0.10, None, 'Gflop/s'),
            'max_temp': (0, None, None, 'Celsius')
        },
        '*': {
            'perf': (0, None, None, 'Gflop/s'),
            'max_temp': (0, None, None, 'Celsius')
        }
    }
    # Flexible allocation: one task on every node.
    self.num_tasks = 0
    self.num_tasks_per_node = 1
    self.maintainers = ['AJ', 'TM']
    self.tags = {'diagnostic', 'benchmark', 'craype'}
def max_temp(self, nid=None):
    '''Highest temperature recorded (optionally restricted to *nid*).'''
    samples = self._extract_perf_metric('temp', nid)
    return sn.max(samples)
def speedup(self):
    '''Ratio of the slowest to the fastest measured timing (3 decimals).'''
    regex = r'^\S+(f32|f64)\s+(\S+) ns\s+'
    timings = sn.extractall(regex, self.stdout, 2, float)
    return sn.round(sn.max(timings) / sn.min(timings), 3)