def mpip_perf_patterns(obj, reg): '''More perf_patterns for the tool .. code-block:: ----------------------------------- @--- MPI Time (seconds) ----------- ----------------------------------- Task AppTime MPITime MPI% 0 8.6 0.121 1.40 <-- min 1 8.6 0.157 1.82 2 8.6 5.92 68.84 <-- max * 25.8 6.2 24.02 <--- => NonMPI= AppTime - MPITime Typical performance reporting: .. code-block:: * mpip_avg_app_time: 8.6 s (= 25.8/3mpi) * mpip_avg_mpi_time: 2.07 s (= 6.2/3mpi) * %mpip_avg_mpi_time: 24.02 % * %max/%min * %mpip_avg_non_mpi_time: 75.98 % ''' # rpt = os.path.join(obj.stagedir, obj.rpt_file_txt) rpt = sn.extractsingle(r'^mpiP: Storing mpiP output in \[(?P<rpt>.*)\]', obj.stdout, 'rpt', str) regex_star = r'^\s+\*\s+(?P<appt>\S+)\s+(?P<mpit>\S+)\s+(?P<pct>\S+)$' regex_minmax = (r'^\s+(?P<mpirk>\S+)\s+(?P<appt>\S+)\s+(?P<mpit>\S+)\s+' r'(?P<pct>\S+)$') if reg == 1: # mpip_avg_mpi_time result = sn.round( sn.extractsingle(regex_star, rpt, 'mpit', float) / obj.num_tasks, 2) elif reg == 2: # mpip_avg_app_time result = sn.round( sn.extractsingle(regex_star, rpt, 'appt', float) / obj.num_tasks, 2) elif reg == 3: # %mpip_avg_mpi_time result = sn.extractsingle(regex_star, rpt, 'pct', float) elif reg == 4: # %nonmpi mpi_pct = sn.extractsingle(regex_star, rpt, 'pct', float) result = sn.round(100 - mpi_pct, 2) elif reg == 5: # %mpip_avg_mpi_time_max result = sn.max(sn.extractall(regex_minmax, rpt, 'pct', float)) elif reg == 6: # %mpip_avg_mpi_time_min result = sn.min(sn.extractall(regex_minmax, rpt, 'pct', float)) else: raise ValueError('unknown region id in mpip_perf_patterns') return result
def seconds_elaps(self): '''Reports elapsed time in seconds using the internal timer from the code .. code-block:: === Total time for iteration(0) 3.61153s reports: * Elapsed: 3.6115 s ''' regex = r'^=== Total time for iteration\(\d+\)\s+(?P<sec>\d+\D\d+)s' res = sn.round(sn.sum(sn.extractall(regex, self.stdout, 'sec', float)), 4) if res > 0: return sn.round( sn.sum(sn.extractall(regex, self.stdout, 'sec', float)), 4) else: return 1
def scorep_inclusivepct_energy(obj): '''Reports % of elapsed time (inclusive) for MomentumAndEnergy function (small scale job) .. code-block:: > sqpatch_048mpi_001omp_125n_10steps_1000000cycles/rpt.exclusive 0.0193958 (0.0009252%) sqpatch.exe 1.39647 (0.06661%) + main ... 714.135 (34.063%) | + ... ******* _ZN6sphexa3sph31computeMomentumAndEnergyIADImplIdNS_13 ... ParticlesDataIdEEEEvRKNS_4TaskERT0_ 0.205453 (0.0098%) | + _ZN6sphexa3sph15computeTimestepIdNS0_21TimestepPress2ndOrderIdNS_13 ... ParticlesDataIdEEEES4_EEvRKSt6vectorINS_4TaskESaIS7_EERT1_ 201.685 (9.62%) | | + MPI_Allreduce ''' # regex = r'^\d+.\d+ \((?P<pct>\d+.\d+).*computeMomentumAndEnergy' # return sn.extractsingle(regex, obj.rpt_inclusive, 'pct', float) regex = r'^\d+.\d+\s+\((?P<pct>\d+.\d+).*momentumAndEnergyIAD' try: result = sn.round( sn.sum(sn.extractall(regex, obj.rpt_inclusive, 'pct', float)), 2) except Exception as e: printer.error(f'scorep_inclusivepct_energy failed: {e}') result = 0 return result
def patrun_walltime_and_memory(self): '''This table shows total wall clock time for the ranks with the maximum, mean, and minimum time, as well as the average across ranks. .. code-block:: Table 10: Wall Clock Time, Memory High Water Mark Process | Process | PE=[mmm] Time | HiMem | | (MiBytes) | 11.389914 | 76.3 | Total <-- avgt |-------------------------------- | 11.398188 | 57.7 | pe.24 <-- maxt | 11.389955 | 98.9 | pe.34 | 11.365630 | 54.0 | pe.93 <-- mint |================================ Typical output: * patrun_wallt_max: 11.3982 s * patrun_wallt_avg: 11.3899 s * patrun_wallt_min: 11.3656 s * patrun_mem_max: 57.7 MiBytes * patrun_mem_min: 54.0 MiBytes ''' # TODO: bug avg mem? res = {} # --- avg regex = (r'^Table \d+: Wall Clock Time, Memory High Water Mark\n' r'(.*\n){4}\s+(.*\n)\s+(?P<proct>\S+)\s+\|\s+(?P<mem>\S+)' r' \| Total$') res['patrun_wallt_avg'] = sn.extractsingle(regex, self.stdout, 'proct', float) res['patrun_mem_avg'] = sn.extractsingle(regex, self.stdout, 'mem', float) # --- max regex = (r'^Table \d+: Wall Clock Time, Memory High Water Mark\n' r'(.*\n){4}\s+(.*\n){2}\|\s+(?P<proct>\S+) \|\s+(?P<mem>\S+)' r'\s+\|\s(?P<pe>\S+)$') res['patrun_wallt_max'] = sn.extractsingle(regex, self.stdout, 'proct', float) res['patrun_mem_max'] = sn.extractsingle(regex, self.stdout, 'mem', float) res['patrun_mem_max_pe'] = sn.extractsingle(regex, self.stdout, 'pe', float) # --- min regex = (r'^Table \d+: Wall Clock Time, Memory High Water Mark\n' r'(.*\n){4}\s+(.*\n){4}\|\s+(?P<proct>\S+) \|\s+(?P<mem>\S+)' r'\s+\|\s(?P<pe>\S+)$') res['patrun_wallt_min'] = sn.extractsingle(regex, self.stdout, 'proct', float) res['patrun_mem_min'] = sn.extractsingle(regex, self.stdout, 'mem', float) res['patrun_mem_min_pe'] = sn.extractsingle(regex, self.stdout, 'pe', float) for kk, vv in res.items(): if not isinstance(vv, str): res[kk] = sn.round(vv, 4) self.patrun_perf_d = res
def nsys_report_computeIAD_pct(self): '''Reports ``CUDA Kernel`` Time (%) for computeIAD measured by the tool and averaged over compute nodes .. code-block:: > job.stdout # CUDA Kernel Statistics (nanoseconds) # # Time(%) Total Time Instances Average Minimum # ------- -------------- ---------- -------------- -------------- # 49.7 69968829 6 11661471.5 11507063 # 26.4 37101887 6 6183647.8 6047175 # **** # 24.0 33719758 24 1404989.9 1371531 # ... # Maximum Name # -------------- ------------------ # 11827539 computeMomentumAndEnergyIAD # 6678078 computeIAD # 1459594 density ''' # new regex: regex = (r'^\s+(?P<pctg>\S+).*::computeIAD<') result = sn.round(sn.avg(sn.extractall(regex, self.stdout, 'pctg', float)), 1) return result
def nsys_report_cudaMemcpy_pct(self): '''Reports ``CUDA API`` Time (%) for cudaMemcpy measured by the tool and averaged over compute nodes .. code-block:: > job.stdout # CUDA API Statistics (nanoseconds) # # Time(%) Total Time Calls Average Minimum # ------- -------------- ---------- -------------- -------------- # 44.9 309427138 378 818590.3 9709 # **** # 40.6 279978449 2 139989224.5 24173 # 9.5 65562201 308 212864.3 738 # 4.9 33820196 306 110523.5 2812 # 0.1 704223 36 19561.8 9305 # .... # Maximum Name # -------------- ------------------ # 11665852 cudaMemcpy # 279954276 cudaMemcpyToSymbol # 3382747 cudaFree # 591094 cudaMalloc # 34042 cudaLaunch ''' regex = r'^\s+(?P<pctg>\S+)\s+\S+\s+\S+\s+\S+\s+\S+\s+\S+\s+cudaMemcpy\s+$' result = sn.round(sn.avg(sn.extractall(regex, self.stdout, 'pctg', float)), 1) return result
def nsys_report_DtoH_KiB(self): '''Reports ``[CUDA memcpy DtoH]`` Memory Operation (KiB) measured by the tool and averaged over compute nodes .. code-block:: > job.stdout # CUDA Memory Operation Statistics (KiB) # # Total Operations Average Minimum # ----------------- -------------- ----------------- ----------------- # 1530313.0 296 5170.0 0.055 # 16500.0 84 196.4 62.500 # ******* # ... # Maximum Name # ----------------- ------------------- # 81250.0 [CUDA memcpy HtoD] # 250.0 [CUDA memcpy DtoH] ''' regex = (r'^\s+(?P<KiB>\d+.\d+)\s+\d+\s+\S+\s+\S+\s+\S+\s+' r'\[CUDA memcpy DtoH\]\s+$') result = sn.round(sn.avg(sn.extractall(regex, self.stdout, 'KiB', float)), 1) return result
def set_mpip_perf_patterns(self): '''More perf_patterns for the tool .. code-block:: ----------------------------------- @--- MPI Time (seconds) ----------- ----------------------------------- Task AppTime MPITime MPI% 0 8.6 0.121 1.40 <-- min 1 8.6 0.157 1.82 2 8.6 5.92 68.84 <-- max * 25.8 6.2 24.02 <--- => NonMPI= AppTime - MPITime Typical performance reporting: .. code-block:: * mpip_avg_app_time: 8.6 s (= 25.8/3mpi) * mpip_avg_mpi_time: 2.07 s (= 6.2/3mpi) * %mpip_avg_mpi_time: 24.02 % * %mpip_avg_non_mpi_time: 75.98 % ''' regex_star = r'^\s+\*\s+(?P<appt>\S+)\s+(?P<mpit>\S+)\s+(?P<pct>\S+)$' app_t = sn.extractsingle(regex_star, self.rpt, 'appt', float) mpi_t = sn.extractsingle(regex_star, self.rpt, 'mpit', float) mpi_pct = sn.extractsingle(regex_star, self.rpt, 'pct', float) nonmpi_pct = sn.round(100 - mpi_pct, 2) # min/max regex = (r'^\s+(?P<mpirk>\S+)\s+(?P<appt>\S+)\s+(?P<mpit>\S+)\s+' r'(?P<pct>\S+)$') mpi_pct_max = sn.max(sn.extractall(regex, self.rpt, 'pct', float)) mpi_pct_min = sn.min(sn.extractall(regex, self.rpt, 'pct', float)) perf_pattern = { 'mpip_avg_app_time': sn.round(app_t / self.num_tasks, 2), 'mpip_avg_mpi_time': sn.round(mpi_t / self.num_tasks, 2), '%mpip_avg_mpi_time': mpi_pct, '%mpip_avg_mpi_time_max': mpi_pct_max, '%mpip_avg_mpi_time_min': mpi_pct_min, '%mpip_avg_non_mpi_time': nonmpi_pct, } if self.perf_patterns: self.perf_patterns = {**self.perf_patterns, **perf_pattern} else: self.perf_patterns = perf_pattern
def density_ns(self): regex = self.set_regex('density') rpt = os.path.join(self.stagedir, self.metric_file.replace(".txt", ".csv")) begin_ns = sn.extractall(regex, rpt, 'begin', int) end_ns = sn.extractall(regex, rpt, 'end', int) ns_list = [zz[1] - zz[0] for zz in zip(begin_ns, end_ns)] return sn.round(sn.avg(ns_list), 0)
def speedup(self): # comparing fast and slow timings: slow = sn.extractsingle(r'^std_sqrt/f64\s+(?P<slow>\d+) ns', self.stdout, 'slow', int) fast = sn.extractsingle(r'^nsimd_cpu_sqrt/f64\s+(?P<fast>\d+) ns', self.stdout, 'fast', int) sp = sn.round(slow / fast, 3) return sp
def seconds_iad(self): '''Reports `IAD` time in seconds using the internal timer from the code .. code-block:: # IAD: 0.626564s reports: * IAD: 0.6284 s ''' regex = r'^# IAD:\s+(?P<sec>.*)s' return sn.round(sn.sum(sn.extractall(regex, self.stdout, 'sec', float)), 4)
def seconds_denst(self): '''Reports `Density` time in seconds using the internal timer from the code .. code-block:: # Density: 0.296224s reports: * Density: 0.296 s ''' regex = r'^# Density:\s+(?P<sec>.*)s' return sn.round(sn.sum(sn.extractall(regex, self.stdout, 'sec', float)), 4)
def rocprof_stats_computeMomentumAndEnergyIAD(self): """ "Name","Calls","TotalDurationNs","AverageNs","Percentage" "findNeighborsKernel",192,2334058542,12156554,47.20607754469413 "computeMomentumAndEnergyIAD",64,1796800226,28075003,36.34008713774581 """ # regex = '^\"(\S+)\",.*,(\S+)\d$' regex = r'^\"computeMomentumAndEnergyIAD\",.*,(\S+)\d$' rpt = os.path.join(self.stagedir, self.rpt_file) return sn.round(sn.extractsingle(regex, rpt, 1, float), 1)
def seconds_domaindistrib(self): '''Reports `domain::distribute` time in seconds using the internal timer from the code .. code-block:: # domain::distribute: 0.0983208s reports: * domain_distribute: 0.0983 s ''' regex = r'^# domain::distribute:\s+(?P<sec>.*)s' return sn.round(sn.sum(sn.extractall(regex, self.stdout, 'sec', float)), 4)
def seconds_step(self): '''Reports `Timestep` time in seconds using the internal timer from the code .. code-block:: # Timestep: 0.621583s reports: * Timestep: 0.6215 s ''' regex = r'^# Timestep:\s+(?P<sec>.*)s' return sn.round(sn.sum(sn.extractall(regex, self.stdout, 'sec', float)), 4)
def seconds_state(self): '''Reports `EquationOfState` time in seconds using the internal timer from the code .. code-block:: # EquationOfState: 0.00244751s reports: * EquationOfState: 0.0024 s ''' regex = r'^# EquationOfState:\s+(?P<sec>.*)s' return sn.round(sn.sum(sn.extractall(regex, self.stdout, 'sec', float)), 4)
def seconds_updat(self): '''Reports `UpdateQuantities` time in seconds using the internal timer from the code .. code-block:: # UpdateQuantities: 0.00498222s reports: * UpdateQuantities: 0.0049 s ''' regex = r'^# UpdateQuantities:\s+(?P<sec>.*)s' return sn.round(sn.sum(sn.extractall(regex, self.stdout, 'sec', float)), 4)
def seconds_consv(self): '''Reports `EnergyConservation` time in seconds using the internal timer from the code .. code-block:: # EnergyConservation: 0.00137127s reports: * EnergyConservation: 0.0013 s ''' regex = r'^# EnergyConservation:\s+(?P<sec>.*)s' return sn.round(sn.sum(sn.extractall(regex, self.stdout, 'sec', float)), 4)
def seconds_neigh(self): '''Reports `FindNeighbors` time in seconds using the internal timer from the code .. code-block:: # FindNeighbors: 0.354712s reports: * FindNeighbors: 0.3547 s ''' regex = r'^# FindNeighbors:\s+(?P<sec>.*)s' return sn.round(sn.sum(sn.extractall(regex, self.stdout, 'sec', float)), 4)
def seconds_smoothinglength(self): '''Reports `UpdateSmoothingLength` time in seconds using the internal timer from the code .. code-block:: # UpdateSmoothingLength: 0.00321161s reports: * SmoothingLength: 0.0032 s ''' regex = r'^# UpdateSmoothingLength:\s+(?P<sec>.*)s' return sn.round(sn.sum(sn.extractall(regex, self.stdout, 'sec', float)), 4)
def seconds_tree(self): '''Reports `domain:BuildTree` time in seconds using the internal timer from the code .. code-block:: # domain::buildTree: 0.084004s reports: * BuildTree: 0 s ''' regex = r'^# domain::buildTree:\s+(?P<sec>.*)s' return sn.round(sn.sum(sn.extractall(regex, self.stdout, 'sec', float)), 4)
def rpt_tracestats_omp(obj): '''Reports OpenMP statistics by reading the trace.stat file: - omp_ibarrier_wait: OMP Wait at Implicit Barrier (sec) in Cube GUI - omp_lock_contention_critical: OMP Critical Contention (sec) in Cube GUI Each column (Count Mean Median Minimum Maximum Sum Variance Quartil25 and Quartil75) is read, only Sum is reported here. ''' regex_omp_ibarrier = r'^omp_ibarrier_wait\s+(\S+\s+){5}(?P<Sum>\S+)' regex_omp_lock = r'^omp_lock_contention_critical\s+(\S+\s+){5}(?P<Sum>\S+)' # omp_ibarrier = sn.extractsingle(regex_omp_ibarrier, obj.stat_rpt, 'Sum', float) omp_lock = sn.extractsingle(regex_omp_lock, obj.stat_rpt, 'Sum', float) # n_procs = obj.compute_node * obj.num_tasks_per_node * obj.omp_threads res_d = { 'omp_ibarrier': sn.round(omp_ibarrier / n_procs, 4), 'omp_lock': sn.round(omp_lock / n_procs, 4), } return res_d
def seconds_updateTasks(self): '''Reports `updateTasks` time in seconds using the internal timer from the code .. code-block:: # updateTasks: 0.000900428s reports: ... ''' regex = r'^# updateTasks:\s+(?P<sec>.*)s' return sn.round(sn.sum(sn.extractall(regex, self.stdout, 'sec', float)), 4)
def ipc_rk0(obj): '''Reports the ``IPC`` (instructions per cycle) for rank 0 ''' regex1 = (r'^METRIC\s+0\s+.*Values: \(\"PAPI_TOT_INS\" <0>; UINT64;' r'\s+(?P<ins>\d+)\)') tot_ins_rk0 = sn.extractall(regex1, obj.rpt_otf2, 'ins', float) regex2 = (r'^METRIC\s+0\s+.*Values:.*\(\"PAPI_TOT_CYC\" <1>; UINT64;' r'\s+(?P<cyc>\d+)\)') tot_cyc_rk0 = sn.extractall(regex2, obj.rpt_otf2, 'cyc', float) ipc = [a / b for a, b in zip(tot_ins_rk0, tot_cyc_rk0)] return sn.round(max(ipc), 6)
def seconds_energ(self): '''Reports `MomentumEnergyIAD` time in seconds using the internal timer from the code .. code-block:: # MomentumEnergyIAD: 1.05951s reports: * MomentumEnergyIAD: 1.0595 s ''' regex = r'^# MomentumEnergyIAD:\s+(?P<sec>.*)s' return sn.round(sn.sum(sn.extractall(regex, self.stdout, 'sec', float)), 4)
def vtune_logical_core_utilization(self): '''Reports the minimum ``Physical Core Utilization`` (%) measured by the tool .. code-block:: Effective Logical Core Utilization: 96.0% (23.028 out of 24) Effective Logical Core Utilization: 95.9% (23.007 out of 24) Effective Logical Core Utilization: 95.5% (22.911 out of 24) ''' regex = r'^\s+Effective Logical Core Utilization: (?P<pct>\S+)%' return sn.round(sn.min(sn.extractall(regex, self.stdout, 'pct', float)), 4)
def vtune_physical_core_utilization(self): '''Reports the minimum ``Physical Core Utilization`` (%) measured by the tool .. code-block:: Effective Physical Core Utilization: 96.3% (11.554 out of 12) Effective Physical Core Utilization: 96.1% (11.534 out of 12) Effective Physical Core Utilization: 95.9% (11.512 out of 12) ''' regex = r'^Effective Physical Core Utilization: (?P<pct>\S+)%' return sn.round(sn.min(sn.extractall(regex, self.stdout, 'pct', float)), 4)
def seconds_halos(self): '''Reports `mpi::synchronizeHalos` time in seconds using the internal timer from the code .. code-block:: # mpi::synchronizeHalos: 0.0341479s # mpi::synchronizeHalos: 0.0770191s # mpi::synchronizeHalos: 0.344856s reports: * mpi_synchronizeHalos: 0.4560 s ''' regex = r'^# mpi::synchronizeHalos:\s+(?P<sec>.*)s' return sn.round(sn.sum(sn.extractall(regex, self.stdout, 'sec', float)), 4)
def set_vtune_perf_patterns_rpt(self): '''More perf_patterns for the tool Typical performance reporting: .. literalinclude:: ../../reframechecks/intel/intel_vtune.res :lines: 117-127 ''' regex_l = r'^vtune: Using result path .(?P<paths>\S+).$' paths_l = sn.extractall(regex_l, self.dir_rpt, 'paths') regex = (r'(?P<funcname>.*);(?P<cput>\d+.\d+);(?P<cput_efft>\S+);' r'(?P<cput_spint>\S+);(?P<cput_overhead>\S+)') res = {} res2 = {} for ii in range(self.vtune_paths): rpt = paths_l[ii] + '.csv' # rpt.nid00034.csv kk = 'vtune_cput_cn%s' % ii res[kk] = sn.round(sn.sum(sn.extractall(regex, rpt, 'cput', float)), 2) kk = 'vtune_cput_cn%s_efft' % ii res2[kk] = sn.round(sn.sum(sn.extractall(regex, rpt, 'cput_efft', float)), 2) kk = 'vtune_cput_cn%s_spint' % ii res2[kk] = sn.round(sn.sum(sn.extractall(regex, rpt, 'cput_spint', float)), 2) kk = '%svtune_cput_cn%s_efft' % ('%', ii) res[kk] = sn.round(res2['vtune_cput_cn%s_efft' % ii] / res['vtune_cput_cn%s' % ii] * 100, 1) kk = '%svtune_cput_cn%s_spint' % ('%', ii) res[kk] = sn.round(res2['vtune_cput_cn%s_spint' % ii] / res['vtune_cput_cn%s' % ii] * 100, 1) if self.perf_patterns: self.perf_patterns = {**self.perf_patterns, **res} else: self.perf_patterns = res
def nvprof_report_DtoH_pct(self): '''Reports ``[CUDA memcpy DtoH]`` Time(%) measured by the tool and averaged over compute nodes .. code-block:: > job.stdout (Name: [CUDA memcpy DtoH]) # Time(%) Time Calls Avg Min Max # 2.80% 1.3194ms 44 29.986us 29.855us 30.528us [CUDA memcpy DtoH] # 1.34% 1.7667ms 44 40.152us 39.519us 41.887us [CUDA memcpy DtoH] # ^^^^ ''' regex = r'^\s+\s+(?P<pctg>\S+)%.*\[CUDA memcpy DtoH\]$' result = sn.round(sn.avg(sn.extractall(regex, self.stdout, 'pctg', float)), 1) return result