Example 1
def instrument(graph, **kwargs):
    track_subsections(graph, **kwargs)

    # Construct a fresh Timer object
    profiler = kwargs['profiler']
    timer = Timer(profiler.name, list(profiler.all_sections))

    instrument_sections(graph, timer=timer, **kwargs)
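For illustration only, here is a self-contained sketch of the two-phase pattern used above: first discover which code sections should be profiled, then hand every section one shared timer. None of the names below belong to the actual library; SectionTimer, discover_sections and attach_timing are stand-ins for Timer, track_subsections and instrument_sections.

import time


class SectionTimer(object):
    """Stand-in timer: one accumulated elapsed time per named section."""

    def __init__(self, name, sections):
        self.name = name
        self.timings = {s: 0.0 for s in sections}

    def add(self, section, elapsed):
        self.timings[section] += elapsed


def discover_sections(graph):
    # Phase 1: collect the names of the code regions worth timing
    return list(graph)


def attach_timing(graph, timer):
    # Phase 2: wrap each region so it reports into the shared timer
    def timed(section, func):
        def wrapper(*args, **kwargs):
            tic = time.perf_counter()
            try:
                return func(*args, **kwargs)
            finally:
                timer.add(section, time.perf_counter() - tic)
        return wrapper
    return {s: timed(s, f) for s, f in graph.items()}


graph = {'section0': lambda: sum(range(10**5)),
         'section1': lambda: sorted(range(10**4), reverse=True)}
timer = SectionTimer('timers', discover_sections(graph))
for func in attach_timing(graph, timer).values():
    func()
print(timer.timings)  # per-section elapsed seconds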
Example 2
def test_timers():
    """Pickling for Timers used in Operators for C-level profiling."""
    timer = Timer('timer', ['sec0', 'sec1'])
    pkl_obj = pickle.dumps(timer)
    new_obj = pickle.loads(pkl_obj)
    assert new_obj.name == timer.name
    assert new_obj.sections == timer.sections
    assert new_obj.value._obj.sec0 == timer.value._obj.sec0 == 0.0
    assert new_obj.value._obj.sec1 == timer.value._obj.sec1 == 0.0
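The test relies on the Timer's per-section counters living behind a ctypes object (value._obj) and surviving a pickle round trip. As a minimal sketch of that mechanism only (not Devito's actual Timer implementation), the class below keeps its counters in a dynamically built ctypes Structure and serialises the raw values by hand:

import ctypes
import pickle


class PicklableTimer(object):
    """Sketch only: per-section counters held in a ctypes Structure."""

    def __init__(self, name, sections):
        self.name = name
        self.sections = sections
        fields = [(s, ctypes.c_double) for s in sections]
        self._cls = type('Timings', (ctypes.Structure,), {'_fields_': fields})
        self._obj = self._cls()  # every c_double field starts at 0.0

    def __getstate__(self):
        # ctypes objects are not picklable, so serialise the raw values instead
        values = {s: getattr(self._obj, s) for s in self.sections}
        return {'name': self.name, 'sections': self.sections, 'values': values}

    def __setstate__(self, state):
        self.__init__(state['name'], state['sections'])
        for s, v in state['values'].items():
            setattr(self._obj, s, v)


timer = PicklableTimer('timer', ['sec0', 'sec1'])
new = pickle.loads(pickle.dumps(timer))
assert new.name == timer.name and new.sections == timer.sections
assert new._obj.sec0 == timer._obj.sec0 == 0.0
assert new._obj.sec1 == timer._obj.sec1 == 0.0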
Example 3
def autotune(operator, args, level, mode):
    """
    Operator autotuning.

    Parameters
    ----------
    operator : Operator
        Input Operator.
    args : dict-like
        The runtime arguments with which `operator` is run.
    level : str
        The autotuning aggressiveness (basic, aggressive, max). More aggressive
        autotuning may result in higher runtime performance, but the autotuning
        phase itself will take longer.
    mode : str
        The autotuning mode (preemptive, destructive, runtime). In preemptive
        mode, the output runtime values supplied by the user to `operator.apply`
        are replaced with shadow copies, so the user data is left untouched. In
        runtime mode, autotuning runs directly on the user-provided data, hence
        the timesteps executed while autotuning count towards the actual run.
    """
    key = [level, mode]
    accepted = configuration._accepted['autotuning']
    if key not in accepted:
        raise ValueError("The accepted `(level, mode)` combinations are `%s`; "
                         "provided `%s` instead" % (accepted, key))

    # We get passed all the arguments, but the cfunction only requires a subset
    at_args = OrderedDict([(p.name, args[p.name])
                           for p in operator.parameters])

    # User-provided output data won't be altered in `preemptive` mode
    if mode == 'preemptive':
        output = {i.name: i for i in operator.output}
        copies = {
            k: output[k]._C_as_ndarray(v).copy()
            for k, v in args.items() if k in output
        }
        # WARNING: `copies` keeps references to the numpy arrays; this is required
        # to prevent garbage collection from kicking in during autotuning and
        # prematurely freeing the shadow copies handed over to C-land
        at_args.update(
            {k: output[k]._C_make_dataobj(v)
             for k, v in copies.items()})

    # Disable halo exchanges through MPI_PROC_NULL
    if mode in ['preemptive', 'destructive']:
        for p in operator.parameters:
            if isinstance(p, MPINeighborhood):
                at_args.update(MPINeighborhood(p.neighborhood)._arg_values())
                for i in p.fields:
                    setattr(at_args[p.name]._obj, i, MPI.PROC_NULL)
            elif isinstance(p, MPIMsgEnriched):
                at_args.update(
                    MPIMsgEnriched(p.name, p.target, p.halos)._arg_values())
                for i in at_args[p.name]:
                    i.fromrank = MPI.PROC_NULL
                    i.torank = MPI.PROC_NULL

    roots = [operator.body] + [i.root for i in operator._func_table.values()]
    trees = filter_ordered(retrieve_iteration_tree(roots),
                           key=lambda i: i.root)

    # Detect the time-stepping Iteration; shrink its iteration range so that
    # each autotuning run only takes a few iterations
    steppers = {i for i in flatten(trees) if i.dim.is_Time}
    if len(steppers) == 0:
        stepper = None
        timesteps = 1
    elif len(steppers) == 1:
        stepper = steppers.pop()
        timesteps = init_time_bounds(stepper, at_args, args)
        if timesteps is None:
            return args, {}
    else:
        warning(
            "cannot perform autotuning unless there is one time loop; skipping"
        )
        return args, {}

    # Use a fresh Timer for auto-tuning
    timer = Timer('timers', list(operator._profiler.all_sections))
    at_args.update(timer._arg_values())

    # Perform autotuning
    timings = {}
    for n, tree in enumerate(trees):
        blockable = [i.dim for i in tree if not is_integer(i.step)]

        # Tunable arguments
        try:
            tunable = []
            tunable.append(generate_block_shapes(blockable, args, level))
            tunable.append(generate_nthreads(operator.nthreads, args, level))
            tunable = list(product(*tunable))
        except ValueError:
            # Some arguments are compulsory; if they are unavailable, skip this tree
            continue

        # Symbolic number of loop-blocking blocks per thread
        nblocks_per_thread = (calculate_nblocks(tree, blockable) /
                              operator.nthreads)

        for bs, nt in tunable:
            # Can we safely autotune over the given time range?
            if not check_time_bounds(stepper, at_args, args, mode):
                break

            # Update `at_args` to use the new tunable arguments
            run = [(k, v) for k, v in bs + nt if k in at_args]
            at_args.update(dict(run))

            # Skip this run unless there is at least one block per thread
            if (not configuration['develop-mode'] and
                    nblocks_per_thread.subs(at_args) < 1):
                continue

            # Run the Operator
            operator.cfunction(*list(at_args.values()))

            # Record timing
            elapsed = timer.total
            timings.setdefault(nt, OrderedDict()).setdefault(n, {})[bs] = elapsed
            log("run <%s> took %f (s) in %d timesteps" %
                (','.join('%s=%s' % i for i in run), elapsed, timesteps))

            # Prepare for the next autotuning run
            update_time_bounds(stepper, at_args, timesteps, mode)
            timer.reset()

    # The best variant is the one that, for a given number of threads, had the
    # minimum turnaround time
    try:
        runs = 0
        mapper = {}
        for k, v in timings.items():
            for i in v.values():
                runs += len(i)
                record = mapper.setdefault(k, Record())
                record.add(min(i, key=i.get), min(i.values()))
        best = min(mapper, key=mapper.get)
        best = OrderedDict(best + tuple(mapper[best].args))
        best.pop(None, None)
        log("selected <%s>" % (','.join('%s=%s' % i for i in best.items())))
    except ValueError:
        warning("could not perform any runs")
        return args, {}

    # Update the argument list with the tuned arguments
    args.update(best)

    # In `runtime` mode, some timesteps have been executed already, so we must
    # adjust the time range
    finalize_time_bounds(stepper, at_args, args, mode)

    # Autotuning summary
    summary = {}
    summary['runs'] = runs
    summary['tpr'] = timesteps  # tpr -> timesteps per run
    summary['tuned'] = dict(best)

    return args, summary
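The selection step near the end ranks candidates by total turnaround time: `timings` is laid out as timings[nthreads_args][tree_index][block_shape_args] = seconds, and for each thread count the fastest block shape of every tree contributes to that candidate's total. Below is a simplified, self-contained sketch of that reduction, with the Record accumulator replaced by plain variables; the argument names and values are made up purely for the example.

from collections import OrderedDict

# Illustrative layout: timings[nthreads_args][tree_index][block_shape_args] = seconds
timings = {
    (('nthreads', 4),): {0: {(('x0_blk0_size', 8),): 0.31,
                             (('x0_blk0_size', 16),): 0.27}},
    (('nthreads', 8),): {0: {(('x0_blk0_size', 8),): 0.24,
                             (('x0_blk0_size', 16),): 0.29}},
}

best_args, best_time = None, float('inf')
for nt, per_tree in timings.items():
    # For a given number of threads, keep the fastest block shape of each tree
    cand_args, cand_time = list(nt), 0.0
    for bs_to_time in per_tree.values():
        bs = min(bs_to_time, key=bs_to_time.get)
        cand_args += list(bs)
        cand_time += bs_to_time[bs]
    if cand_time < best_time:
        best_args, best_time = cand_args, cand_time

best = OrderedDict(best_args)
print(best, best_time)  # fastest (nthreads, block shape) combination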