def _emit_build_profiling(self):
    """
    Emit a PERF-level summary of the Operator build (compilation) time.

    Logs the total `op-compile` time, then recursively reports up to
    `max_hotspots` compilation passes per level, descending into a pass's
    sub-timings only while the pass accounts for more than `threshold`
    percent of the total. Finally, if the profiler recorded any symbolic
    optimizations, logs the flops reduction they achieved.
    """
    if not is_log_enabled_for('PERF'):
        return

    # Ceiling-rounder to 1/n precision (2 decimal places by default)
    fround = lambda i, n=100: ceil(i * n) / n

    timings = self._profiler.py_timers.copy()
    tot = timings.pop('op-compile')
    perf("Operator `%s` generated in %.2f s" % (self.name, fround(tot)))

    max_hotspots = 3
    threshold = 20.

    def _emit_timings(timings, indent=''):
        # Bug fix: the previous implementation popped 'total' in place,
        # but `self._profiler.py_timers.copy()` is a *shallow* copy, so the
        # nested dicts are shared with the live profiler state and the pop
        # silently corrupted it. Filter 'total' out instead of mutating.
        entries = sorted((i for i in timings if i != 'total'),
                         key=lambda i: timings[i]['total'], reverse=True)
        for i in entries[:max_hotspots]:
            v = fround(timings[i]['total'])
            perc = fround(v / tot * 100, n=10)
            if perc > threshold:
                perf("%s%s: %.2f s (%.1f %%)" % (indent, i.lstrip('_'), v, perc))
                # Recurse into the hotspot's own sub-timings, one level deeper
                _emit_timings(timings[i], ' ' * len(indent) + ' * ')

    _emit_timings(timings, ' * ')

    if self._profiler._ops:
        # Each entry is a (flops-before, flops-after) pair
        ops = ['%d --> %d' % i for i in self._profiler._ops]
        perf("Flops reduction after symbolic optimization: [%s]" % ' ; '.join(ops))
def _emit_build_profiling(self):
    """
    Log a PERF-level summary of how long the Operator took to build:
    the total `op-compile` time plus up to three hotspot passes, each
    reported only if it weighs more than 20% of the total.
    """
    if not is_log_enabled_for('PERF'):
        return

    # Ceiling-round `val` to 1/n precision (defaults to 2 decimal places)
    fround = lambda val, n=100: ceil(val * n) / n

    timings = self._profiler.py_timers.copy()
    tot = timings.pop('op-compile')
    perf("Operator `%s` generated in %.2f s" % (self.name, fround(tot)))

    # Rank the remaining passes by elapsed time, most expensive first
    max_hotspots = 3
    ranked = sorted(timings, key=timings.get, reverse=True)
    for key in ranked[:max_hotspots]:
        secs = fround(timings[key])
        share = fround(secs / tot * 100, n=10)
        if share <= 20.:
            # Too cheap to be worth reporting as a hotspot
            continue
        perf("- [Hotspot] %s: %.2f s (%.1f %%)" % (key.lstrip('_'), secs, share))
def _emit_apply_profiling(self, args):
    """
    Produce a performance summary of the profiled sections.

    Always logs the overall `apply` runtime at INFO level. If PERF logging
    is enabled, additionally emits: global performance indicators (OI,
    GFlops/s, GPts/s), per-rank section timings with iteration shapes,
    per-subsection timings, and the performance-related runtime arguments.

    Parameters
    ----------
    args : mapping
        The runtime arguments the Operator was applied with; used to look
        up the values of performance-knob parameters.

    Returns
    -------
    The summary object built by ``self._profiler.summary``.
    """
    # Rounder to 2 decimal places
    fround = lambda i: ceil(i * 100) / 100

    info("Operator `%s` ran in %.2f s" %
         (self.name, fround(self._profiler.py_timers['apply'])))

    summary = self._profiler.summary(args, self._dtype, reduce_over='apply')

    if not is_log_enabled_for('PERF'):
        # Do not waste time
        return summary

    if summary.globals:
        # Note that with MPI enabled, the global performance indicators
        # represent "cross-rank" performance data
        metrics = []

        v = summary.globals.get('vanilla')
        if v is not None:
            metrics.append("OI=%.2f" % fround(v.oi))
            metrics.append("%.2f GFlops/s" % fround(v.gflopss))

        v = summary.globals.get('fdlike')
        if v is not None:
            metrics.append("%.2f GPts/s" % fround(v.gpointss))

        if metrics:
            perf("Global performance: [%s]" % ', '.join(metrics))

        # Indent the per-rank lines under the global header
        perf("Local performance:")
        indent = " " * 2
    else:
        indent = ""

    # Emit local, i.e. "per-rank" performance. Without MPI, this is the only
    # thing that will be emitted
    for k, v in summary.items():
        rank = "[rank%d]" % k.rank if k.rank is not None else ""

        oi = "OI=%.2f" % fround(v.oi)
        gflopss = "%.2f GFlops/s" % fround(v.gflopss)
        # GPts/s is omitted when the section recorded no point throughput
        gpointss = "%.2f GPts/s" % fround(v.gpointss) if v.gpointss else None
        metrics = ", ".join(i for i in [oi, gflopss, gpointss] if i is not None)

        # One "d1,d2,..." string per iteration space traversed by the section
        itershapes = [",".join(str(i) for i in its) for its in v.itershapes]
        if len(itershapes) > 1:
            # Multiple iteration spaces: wrap each in angle brackets
            itershapes = ",".join("<%s>" % i for i in itershapes)
        elif len(itershapes) == 1:
            itershapes = itershapes[0]
        else:
            itershapes = ""
        name = "%s%s<%s>" % (k.name, rank, itershapes)

        perf("%s* %s ran in %.2f s [%s]" % (indent, name, fround(v.time), metrics))
        # Subsections are reported with their share of the parent's runtime
        for n, time in summary.subsections.get(k.name, {}).items():
            perf("%s+ %s ran in %.2f s [%.2f%%]" %
                 (indent * 2, n, time, fround(time / v.time * 100)))

    # Emit performance mode and arguments
    perf_args = {}
    for i in self.input + self.dimensions:
        if not i.is_PerfKnob:
            continue
        try:
            perf_args[i.name] = args[i.name]
        except KeyError:
            # Try with the aliases
            for a in i._arg_names:
                if a in args:
                    perf_args[a] = args[a]
                    break
    perf("Performance[mode=%s] arguments: %s" %
         (self._state['optimizations'], perf_args))

    return summary
def _emit_apply_profiling(self, args):
    """
    Produce a performance summary of the profiled sections.

    Always logs the overall `apply` runtime at INFO level. If PERF logging
    is enabled, additionally emits: global (cross-rank) performance
    indicators, per-rank section timings with iteration shapes, the
    optimization configuration, and the performance-related runtime
    arguments.

    Parameters
    ----------
    args : mapping
        The runtime arguments the Operator was applied with; used to look
        up the values of performance-knob parameters.

    Returns
    -------
    The summary object built by ``self._profiler.summary``.
    """
    # Rounder to 2 decimal places
    fround = lambda i: ceil(i * 100) / 100

    info("Operator `%s` run in %.2f s" %
         (self.name, fround(self._profiler.py_timers['apply'])))

    summary = self._profiler.summary(args, self._dtype, reduce_over='apply')

    if not is_log_enabled_for('PERF'):
        # Do not waste time
        return summary

    if summary.globals:
        # Indent the per-entry lines under the section headers
        indent = " " * 2
        perf("Global performance indicators")

        # With MPI enabled, the 'vanilla' entry contains "cross-rank" performance data
        v = summary.globals.get('vanilla')
        if v is not None:
            gflopss = "%.2f GFlops/s" % fround(v.gflopss)
            # GPts/s is omitted when no point throughput was recorded
            gpointss = "%.2f GPts/s" % fround(v.gpointss) if v.gpointss else None
            metrics = ", ".join(i for i in [gflopss, gpointss] if i is not None)
            perf("%s* Operator `%s` with OI=%.2f computed in %.2f s [%s]" %
                 (indent, self.name, fround(v.oi), fround(v.time), metrics))

        # Finite-difference-like throughput, if the profiler computed it
        v = summary.globals.get('fdlike')
        if v is not None:
            perf("%s* Achieved %.2f FD-GPts/s" % (indent, v.gpointss))

        perf("Local performance indicators")
    else:
        indent = ""

    # Emit local, i.e. "per-rank" performance. Without MPI, this is the only
    # thing that will be emitted
    for k, v in summary.items():
        rank = "[rank%d]" % k.rank if k.rank is not None else ""
        gflopss = "%.2f GFlops/s" % fround(v.gflopss)
        gpointss = "%.2f GPts/s" % fround(v.gpointss) if v.gpointss else None
        metrics = ", ".join(i for i in [gflopss, gpointss] if i is not None)

        # One "d1,d2,..." string per iteration space traversed by the section
        itershapes = [",".join(str(i) for i in its) for its in v.itershapes]
        if len(itershapes) > 1:
            # Multiple iteration spaces: wrap each in angle brackets
            name = "%s%s<%s>" % (k.name, rank,
                                 ",".join("<%s>" % i for i in itershapes))
            perf("%s* %s with OI=%.2f computed in %.2f s [%s]" %
                 (indent, name, fround(v.oi), fround(v.time), metrics))
        elif len(itershapes) == 1:
            name = "%s%s<%s>" % (k.name, rank, itershapes[0])
            perf("%s* %s with OI=%.2f computed in %.2f s [%s]" %
                 (indent, name, fround(v.oi), fround(v.time), metrics))
        else:
            # No iteration shapes recorded: emit only the runtime
            name = k.name
            perf("%s* %s%s computed in %.2f s" %
                 (indent, name, rank, fround(v.time)))

    # Emit relevant configuration values
    perf("Configuration: %s" % self._state['optimizations'])

    # Emit relevant performance arguments
    perf_args = {}
    for i in self.input + self.dimensions:
        if not i.is_PerfKnob:
            continue
        try:
            perf_args[i.name] = args[i.name]
        except KeyError:
            # Try with the aliases
            for a in i._arg_names:
                if a in args:
                    perf_args[a] = args[a]
                    break
    perf("Performance arguments: %s" % perf_args)

    return summary