Example #1
    def _emit_build_profiling(self):
        if not is_log_enabled_for('PERF'):
            return

        # Ceiling-based rounder: n=100 gives 2 decimal places, n=10 gives 1
        fround = lambda i, n=100: ceil(i * n) / n

        timings = self._profiler.py_timers.copy()

        tot = timings.pop('op-compile')
        perf("Operator `%s` generated in %.2f s" % (self.name, fround(tot)))

        max_hotspots = 3
        threshold = 20.

        def _emit_timings(timings, indent=''):
            timings.pop('total', None)
            entries = sorted(timings,
                             key=lambda i: timings[i]['total'],
                             reverse=True)
            for i in entries[:max_hotspots]:
                v = fround(timings[i]['total'])
                perc = fround(v / tot * 100, n=10)
                if perc > threshold:
                    perf("%s%s: %.2f s (%.1f %%)" %
                         (indent, i.lstrip('_'), v, perc))
                    _emit_timings(timings[i], ' ' * len(indent) + ' * ')

        _emit_timings(timings, '  * ')

        if self._profiler._ops:
            ops = ['%d --> %d' % i for i in self._profiler._ops]
            perf("Flops reduction after symbolic optimization: [%s]" %
                 ' ; '.join(ops))
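
For illustration, here is a minimal standalone sketch of the recursive hotspot printer above, with `perf` replaced by `print` and a hypothetical nested timings dict (the entry names and numbers are made up):

from math import ceil

# Hypothetical timings, in seconds; every entry carries its own 'total'
timings = {
    'total': 10.0,
    'lowering': {'total': 6.0, 'clusterizer': {'total': 4.0}, 'iet': {'total': 1.5}},
    'codegen': {'total': 3.0},
    'misc': {'total': 0.5},
}

fround = lambda i, n=100: ceil(i * n) / n

tot = timings.pop('total')
max_hotspots = 3
threshold = 20.

def emit_timings(timings, indent='  * '):
    timings.pop('total', None)
    entries = sorted(timings, key=lambda i: timings[i]['total'], reverse=True)
    for i in entries[:max_hotspots]:
        v = fround(timings[i]['total'])
        perc = fround(v / tot * 100, n=10)
        if perc > threshold:
            # Only entries above 20% of the total are reported as hotspots
            print("%s%s: %.2f s (%.1f %%)" % (indent, i, v, perc))
            emit_timings(timings[i], ' ' * len(indent) + ' * ')

emit_timings(timings)
# Expected output:
#   * lowering: 6.00 s (60.0 %)
#      * clusterizer: 4.00 s (40.0 %)
#   * codegen: 3.00 s (30.0 %)
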
Example #2
# Inner helper excerpted from Example #1; `fround`, `tot`, `max_hotspots`,
# `threshold` and `perf` come from the enclosing scope
def _emit_timings(timings, indent=''):
    timings.pop('total', None)
    entries = sorted(timings, key=lambda i: timings[i]['total'], reverse=True)
    for i in entries[:max_hotspots]:
        v = fround(timings[i]['total'])
        perc = fround(v / tot * 100, n=10)
        if perc > threshold:
            perf("%s%s: %.2f s (%.1f %%)" % (indent, i.lstrip('_'), v, perc))
            _emit_timings(timings[i], ' ' * len(indent) + ' * ')
Example #3
def _profile_output(self, args):
    """Produce a performance summary of the profiled sections."""
    summary = self._profiler.summary(args, self._dtype)
    info("Operator `%s` run in %.2f s" % (self.name, sum(summary.timings.values())))
    for k, v in summary.items():
        itershapes = [",".join(str(i) for i in its) for its in v.itershapes]
        if len(itershapes) > 1:
            name = "%s<%s>" % (k, ",".join("<%s>" % i for i in itershapes))
        elif len(itershapes) == 1:
            name = "%s<%s>" % (k, itershapes[0])
        else:
            name = None
        gpointss = ", %.2f GPts/s" % v.gpointss if v.gpointss else ''
        perf("* %s with OI=%.2f computed in %.3f s [%.2f GFlops/s%s]" %
             (name, v.oi, v.time, v.gflopss, gpointss))
    return summary
Example #4
    def _jit_compile(self):
        """
        JIT-compile the C code generated by the Operator.

        It is ensured that JIT compilation will only be performed once per
        Operator, regardless of how many times this method is invoked.
        """
        if self._lib is None:
            with self._profiler.timer_on('jit-compile'):
                recompiled, src_file = self._compiler.jit_compile(
                    self._soname, str(self.ccode))

            elapsed = self._profiler.py_timers['jit-compile']
            if recompiled:
                perf("Operator `%s` jit-compiled `%s` in %.2f s with `%s`" %
                     (self.name, src_file, elapsed, self._compiler))
            else:
                perf("Operator `%s` fetched `%s` in %.2f s from jit-cache" %
                     (self.name, src_file, elapsed))
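
The `timer_on` context manager and the `py_timers` dict above belong to Devito's profiler; a rough stand-in sketch of the same guard-and-time pattern (all class and method names here are hypothetical) might look like:

import time
from contextlib import contextmanager


class FakeProfiler:
    """Hypothetical stand-in for the profiler used above."""

    def __init__(self):
        self.py_timers = {}

    @contextmanager
    def timer_on(self, name):
        # Record the wall-clock time spent inside the `with` block
        tic = time.perf_counter()
        try:
            yield
        finally:
            self.py_timers[name] = time.perf_counter() - tic


class CompilesOnce:
    """Sketch of the compile-at-most-once guard used by _jit_compile."""

    def __init__(self):
        self._lib = None
        self._profiler = FakeProfiler()

    def jit_compile(self):
        if self._lib is not None:
            return self._lib  # Already compiled; skip recompilation
        with self._profiler.timer_on('jit-compile'):
            self._lib = object()  # Stands in for the compiled shared object
        return self._lib
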
Example #5
    def _emit_build_profiling(self):
        if not is_log_enabled_for('PERF'):
            return

        # Ceiling-based rounder: n=100 gives 2 decimal places, n=10 gives 1
        fround = lambda i, n=100: ceil(i * n) / n

        timings = self._profiler.py_timers.copy()

        tot = timings.pop('op-compile')
        perf("Operator `%s` generated in %.2f s" % (self.name, fround(tot)))

        max_hotspots = 3
        for i in sorted(timings, key=timings.get, reverse=True)[:max_hotspots]:
            v = fround(timings[i])
            perc = fround(v / tot * 100, n=10)
            if perc > 20.:
                perf("- [Hotspot] %s: %.2f s (%.1f %%)" %
                     (i.lstrip('_'), v, perc))
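
Note that `fround` rounds up (it uses `ceil`), so an elapsed time is never under-reported; the `n` argument selects the precision. A quick self-contained illustration:

from math import ceil

# Same ceiling-based rounder as in the snippets above
fround = lambda i, n=100: ceil(i * n) / n

print(fround(3.14159))       # 3.15 -- two decimal places, rounded up
print(fround(0.0001))        # 0.01 -- a tiny elapsed time never shows up as 0.00
print(fround(61.237, n=10))  # 61.3 -- one decimal place when n=10
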
Example #6
def autotune(operator, arguments, parameters, tunable):
    """
    Acting as a high-order function, take as input an operator and a list of
    operator arguments to perform empirical autotuning. Some of the operator
    arguments are marked as tunable.
    """
    # We get passed all the arguments, but the cfunction only requires a subset
    at_arguments = OrderedDict([(p.name, arguments[p.name])
                                for p in parameters])

    # User-provided output data must not be altered
    output = [i.name for i in operator.output]
    for k, v in arguments.items():
        if k in output:
            at_arguments[k] = v.copy()

    iterations = FindNodes(Iteration).visit(operator.body)
    dim_mapper = {i.dim.name: i.dim for i in iterations}

    # Shrink the iteration space of time-stepping dimension so that auto-tuner
    # runs will finish quickly
    steppers = [i for i in iterations if i.dim.is_Time]
    if len(steppers) == 0:
        timesteps = 1
    elif len(steppers) == 1:
        stepper = steppers[0]
        start = at_arguments[stepper.dim.min_name]
        timesteps = stepper.extent(start=start,
                                   finish=options['at_squeezer']) - 1
        if timesteps < 0:
            timesteps = options['at_squeezer'] - timesteps
            perf("AutoTuner: Number of timesteps adjusted to %d" % timesteps)
        at_arguments[stepper.dim.min_name] = start
        at_arguments[stepper.dim.max_name] = timesteps
        if stepper.dim.is_Stepping:
            at_arguments[stepper.dim.parent.min_name] = start
            at_arguments[stepper.dim.parent.max_name] = timesteps
    else:
        warning("AutoTuner: Couldn't understand loop structure; giving up")
        return arguments

    # Attempted block sizes ...
    mapper = OrderedDict([(i.argument.symbolic_size.name, i) for i in tunable])
    # ... Defaults (basic mode)
    blocksizes = [
        OrderedDict([(i, v) for i in mapper]) for v in options['at_blocksize']
    ]
    # ... Always try the entire iteration space (degenerate block)
    itershape = [
        mapper[i].iteration.symbolic_extent.subs(arguments) for i in mapper
    ]
    blocksizes.append(
        OrderedDict([(i, mapper[i].iteration.extent(0, j - 1))
                     for i, j in zip(mapper, itershape)]))
    # ... More attempts if auto-tuning in aggressive mode
    if configuration['autotuning'].level == 'aggressive':
        blocksizes = more_heuristic_attempts(blocksizes)

    # How many temporaries are allocated on the stack?
    # Will drop block sizes that might lead to a stack overflow
    functions = FindSymbols('symbolics').visit(operator.body +
                                               operator.elemental_functions)
    stack_shapes = [
        i.symbolic_shape for i in functions if i.is_Array and i._mem_stack
    ]
    stack_space = sum(reduce(mul, i, 1)
                      for i in stack_shapes) * operator._dtype().itemsize

    # Note: there is only a single loop over 'blocksize' because only
    # square blocks are tested
    timings = OrderedDict()
    for bs in blocksizes:
        illegal = False
        for k, v in at_arguments.items():
            if k in bs:
                val = bs[k]
                start = mapper[k].original_dim.symbolic_start.subs(arguments)
                end = mapper[k].original_dim.symbolic_end.subs(arguments)

                if val <= mapper[k].iteration.extent(start, end):
                    at_arguments[k] = val
                else:
                    # Block size cannot be larger than actual dimension
                    illegal = True
                    break
        if illegal:
            continue

        # Make sure we remain within stack bounds, otherwise skip block size
        dim_sizes = {}
        for k, v in at_arguments.items():
            if k in bs:
                dim_sizes[mapper[k].argument.symbolic_size] = bs[k]
            elif k in dim_mapper:
                dim_sizes[dim_mapper[k].symbolic_size] = v
        try:
            bs_stack_space = stack_space.xreplace(dim_sizes)
        except AttributeError:
            bs_stack_space = stack_space
        try:
            if int(bs_stack_space) > options['at_stack_limit']:
                continue
        except TypeError:
            # We should never get here
            warning(
                "AutoTuner: Couldn't determine stack size; skipping block shape %s"
                % str(bs))
            continue

        # Use AutoTuner-specific profiler structs
        timer = operator.profiler.timer.reset()
        at_arguments[operator.profiler.name] = timer

        operator.cfunction(*list(at_arguments.values()))
        elapsed = sum(getattr(timer._obj, i) for i, _ in timer._obj._fields_)
        timings[tuple(bs.items())] = elapsed
        perf("AutoTuner: Block shape <%s> took %f (s) in %d timesteps" %
             (','.join('%d' % i for i in bs.values()), elapsed, timesteps))

    try:
        best = dict(min(timings, key=timings.get))
        perf("AutoTuner: Selected block shape %s" % best)
    except ValueError:
        warning("AutoTuner: Couldn't find legal block shapes")
        return arguments

    # Build the new argument list
    tuned = OrderedDict()
    for k, v in arguments.items():
        tuned[k] = best[k] if k in mapper else v

    # Reset the profiling struct
    assert operator.profiler.name in tuned
    tuned[operator.profiler.name] = operator.profiler.timer.reset()

    return tuned
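
The selection at the end works because each key of `timings` is a tuple of (dimension name, block size) pairs, so `min(timings, key=timings.get)` returns the fastest block shape and `dict()` turns it back into a mapping. A toy illustration with hypothetical dimension names and runtimes:

from collections import OrderedDict

# Hypothetical timings: block shape (tuple of (name, size) pairs) -> seconds
timings = OrderedDict([
    ((('x0_blk_size', 8),  ('y0_blk_size', 8)),  0.91),
    ((('x0_blk_size', 16), ('y0_blk_size', 16)), 0.64),
    ((('x0_blk_size', 32), ('y0_blk_size', 32)), 0.78),
])

# The block shape with the smallest recorded runtime wins
best = dict(min(timings, key=timings.get))
print(best)  # {'x0_blk_size': 16, 'y0_blk_size': 16}
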
Example #7
def log(msg):
    perf("AutoTuner: %s" % msg)
Example #8
    def _emit_apply_profiling(self, args):
        """Produce a performance summary of the profiled sections."""
        # Rounder to 2 decimal places
        fround = lambda i: ceil(i * 100) / 100

        info("Operator `%s` ran in %.2f s" %
             (self.name, fround(self._profiler.py_timers['apply'])))

        summary = self._profiler.summary(args,
                                         self._dtype,
                                         reduce_over='apply')

        if not is_log_enabled_for('PERF'):
            # Do not waste time
            return summary

        if summary.globals:
            # Note that with MPI enabled, the global performance indicators
            # represent "cross-rank" performance data
            metrics = []

            v = summary.globals.get('vanilla')
            if v is not None:
                metrics.append("OI=%.2f" % fround(v.oi))
                metrics.append("%.2f GFlops/s" % fround(v.gflopss))

            v = summary.globals.get('fdlike')
            if v is not None:
                metrics.append("%.2f GPts/s" % fround(v.gpointss))

            if metrics:
                perf("Global performance: [%s]" % ', '.join(metrics))

            perf("Local performance:")
            indent = " " * 2
        else:
            indent = ""

        # Emit local, i.e. "per-rank" performance. Without MPI, this is the only
        # thing that will be emitted
        for k, v in summary.items():
            rank = "[rank%d]" % k.rank if k.rank is not None else ""
            oi = "OI=%.2f" % fround(v.oi)
            gflopss = "%.2f GFlops/s" % fround(v.gflopss)
            gpointss = "%.2f GPts/s" % fround(
                v.gpointss) if v.gpointss else None
            metrics = ", ".join(i for i in [oi, gflopss, gpointss]
                                if i is not None)
            itershapes = [
                ",".join(str(i) for i in its) for its in v.itershapes
            ]
            if len(itershapes) > 1:
                itershapes = ",".join("<%s>" % i for i in itershapes)
            elif len(itershapes) == 1:
                itershapes = itershapes[0]
            else:
                itershapes = ""
            name = "%s%s<%s>" % (k.name, rank, itershapes)

            perf("%s* %s ran in %.2f s [%s]" %
                 (indent, name, fround(v.time), metrics))
            for n, time in summary.subsections.get(k.name, {}).items():
                perf("%s+ %s ran in %.2f s [%.2f%%]" %
                     (indent * 2, n, time, fround(time / v.time * 100)))

        # Emit performance mode and arguments
        perf_args = {}
        for i in self.input + self.dimensions:
            if not i.is_PerfKnob:
                continue
            try:
                perf_args[i.name] = args[i.name]
            except KeyError:
                # Try with the aliases
                for a in i._arg_names:
                    if a in args:
                        perf_args[a] = args[a]
                        break
        perf("Performance[mode=%s] arguments: %s" %
             (self._state['optimizations'], perf_args))

        return summary
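
The final loop above falls back to an argument's aliases when its canonical name is not among the runtime arguments. A minimal sketch of that lookup with hypothetical knob and argument names (the `is_PerfKnob` filter is omitted):

# Hypothetical runtime arguments and performance knobs
args = {'nthreads_nonaffine': 4, 'par-collapse-ncores': 8}


class Knob:
    def __init__(self, name, aliases=()):
        self.name = name
        self._arg_names = (name,) + tuple(aliases)


knobs = [Knob('nthreads', aliases=('nthreads_nonaffine',)),
         Knob('par-collapse-ncores')]

perf_args = {}
for i in knobs:
    try:
        perf_args[i.name] = args[i.name]
    except KeyError:
        # Canonical name missing: fall back to the knob's aliases
        for a in i._arg_names:
            if a in args:
                perf_args[a] = args[a]
                break

print(perf_args)  # {'nthreads_nonaffine': 4, 'par-collapse-ncores': 8}
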
Example #9
    def _emit_apply_profiling(self, args):
        """Produce a performance summary of the profiled sections."""
        # Rounder to 2 decimal places
        fround = lambda i: ceil(i * 100) / 100

        info("Operator `%s` run in %.2f s" %
             (self.name, fround(self._profiler.py_timers['apply'])))

        summary = self._profiler.summary(args,
                                         self._dtype,
                                         reduce_over='apply')

        if not is_log_enabled_for('PERF'):
            # Do not waste time
            return summary

        if summary.globals:
            indent = " " * 2

            perf("Global performance indicators")

            # With MPI enabled, the 'vanilla' entry contains "cross-rank" performance data
            v = summary.globals.get('vanilla')
            if v is not None:
                gflopss = "%.2f GFlops/s" % fround(v.gflopss)
                gpointss = "%.2f GPts/s" % fround(
                    v.gpointss) if v.gpointss else None
                metrics = ", ".join(i for i in [gflopss, gpointss]
                                    if i is not None)
                perf(
                    "%s* Operator `%s` with OI=%.2f computed in %.2f s [%s]" %
                    (indent, self.name, fround(v.oi), fround(v.time), metrics))

            v = summary.globals.get('fdlike')
            if v is not None:
                perf("%s* Achieved %.2f FD-GPts/s" % (indent, v.gpointss))

            perf("Local performance indicators")
        else:
            indent = ""

        # Emit local, i.e. "per-rank" performance. Without MPI, this is the only
        # thing that will be emitted
        for k, v in summary.items():
            rank = "[rank%d]" % k.rank if k.rank is not None else ""
            gflopss = "%.2f GFlops/s" % fround(v.gflopss)
            gpointss = "%.2f GPts/s" % fround(
                v.gpointss) if v.gpointss else None
            metrics = ", ".join(i for i in [gflopss, gpointss]
                                if i is not None)
            itershapes = [
                ",".join(str(i) for i in its) for its in v.itershapes
            ]
            if len(itershapes) > 1:
                name = "%s%s<%s>" % (k.name, rank, ",".join(
                    "<%s>" % i for i in itershapes))
                perf("%s* %s with OI=%.2f computed in %.2f s [%s]" %
                     (indent, name, fround(v.oi), fround(v.time), metrics))
            elif len(itershapes) == 1:
                name = "%s%s<%s>" % (k.name, rank, itershapes[0])
                perf("%s* %s with OI=%.2f computed in %.2f s [%s]" %
                     (indent, name, fround(v.oi), fround(v.time), metrics))
            else:
                name = k.name
                perf("%s* %s%s computed in %.2f s" %
                     (indent, name, rank, fround(v.time)))

        # Emit relevant configuration values
        perf("Configuration:  %s" % self._state['optimizations'])

        # Emit relevant performance arguments
        perf_args = {}
        for i in self.input + self.dimensions:
            if not i.is_PerfKnob:
                continue
            try:
                perf_args[i.name] = args[i.name]
            except KeyError:
                # Try with the aliases
                for a in i._arg_names:
                    if a in args:
                        perf_args[a] = args[a]
                        break
        perf("Performance arguments:  %s" % perf_args)

        return summary
Example #11
    def _profile_output(self, args):
        """Produce a performance summary of the profiled sections."""
        # Rounder to 2 decimal places
        fround = lambda i: ceil(i * 100) / 100

        info("Operator `%s` run in %.2f s" %
             (self.name, fround(self._profiler.py_timers['apply'])))

        summary = self._profiler.summary(args,
                                         self._dtype,
                                         reduce_over='apply')

        if summary.globals:
            indent = " " * 2

            perf("Global performance indicators")

            # With MPI enabled, the 'vanilla' entry contains "cross-rank" performance data
            v = summary.globals.get('vanilla')
            if v is not None:
                gflopss = "%.2f GFlops/s" % fround(v.gflopss)
                gpointss = "%.2f GPts/s" % fround(
                    v.gpointss) if v.gpointss else None
                metrics = ", ".join(i for i in [gflopss, gpointss]
                                    if i is not None)
                perf(
                    "%s* Operator `%s` with OI=%.2f computed in %.2f s [%s]" %
                    (indent, self.name, fround(v.oi), fround(v.time), metrics))

            v = summary.globals.get('fdlike')
            if v is not None:
                perf("%s* Achieved %.2f FD-GPts/s" % (indent, v.gpointss))

            perf("Local performance indicators")
        else:
            indent = ""

        # Emit local, i.e. "per-rank" performance. Without MPI, this is the only
        # thing that will be emitted
        for k, v in summary.items():
            rank = "[rank%d]" % k.rank if k.rank is not None else ""
            gflopss = "%.2f GFlops/s" % fround(v.gflopss)
            gpointss = "%.2f GPts/s" % fround(
                v.gpointss) if v.gpointss else None
            metrics = ", ".join(i for i in [gflopss, gpointss]
                                if i is not None)
            itershapes = [
                ",".join(str(i) for i in its) for its in v.itershapes
            ]
            if len(itershapes) > 1:
                name = "%s%s<%s>" % (k.name, rank, ",".join(
                    "<%s>" % i for i in itershapes))
                perf("%s* %s with OI=%.2f computed in %.2f s [%s]" %
                     (indent, name, fround(v.oi), fround(v.time), metrics))
            elif len(itershapes) == 1:
                name = "%s%s<%s>" % (k.name, rank, itershapes[0])
                perf("%s* %s with OI=%.2f computed in %.2f s [%s]" %
                     (indent, name, fround(v.oi), fround(v.time), metrics))
            else:
                name = k.name
                perf("%s* %s%s computed in %.2f s" %
                     (indent, name, rank, fround(v.time)))

        perf("Configuration:  %s" % self._state['optimizations'])

        return summary