コード例 #1
ファイル: at_controller.py プロジェクト: emmapearce13/my_RTM
    def block_size(self):
        """Return the auto-tuned block size recorded in the auto-tuning report.

        The report at ``self.final_report_path`` is scanned for the first line
        that contains this operator's model description string. The sixth
        space-separated field of that line is parsed into the block sizes.

        :returns: auto tuned block size, as a list whose entries are ``int``
                  or ``None`` (for the literal text ``None`` in the report)
        :raises ValueError: if auto tuning report not found
        :raises EnvironmentError: if matching model for auto tuned block size not found
        """
        if not path.isfile(self.final_report_path):
            raise ValueError("AT report at %s not found" % self.final_report_path)

        # model description string
        # NOTE(review): the original statement was cut off after
        # `str(self.blocked_dims).replace(" ",`. It is completed here by analogy
        # with the `shape` argument above. Confirm that `model_desc_template`
        # takes exactly these three values.
        model_desc_str = self.model_desc_template % (
            self.op.getName(),
            str(self.op.shape).replace(" ", ''),
            str(self.blocked_dims).replace(" ", ''),
        )

        with open(self.final_report_path, 'r') as f:
            # Iterate the file lazily: only the first matching line is needed,
            # so there is no reason to load the whole report up front.
            for line in f:
                if model_desc_str not in line:
                    continue

                blocks_str = line.split(' ')[5]
                # Drop the first character and the last two characters of the
                # field before splitting on commas (same slice as before).
                block_split = blocks_str[1:len(blocks_str) - 2].split(',')
                picked = [int(block) if block != "None" else None
                          for block in block_split]

                info_at("Picked: %s" % picked)
                return picked

        raise EnvironmentError("Matching model with auto tuned block size not found.")
コード例 #2
def autotune(operator, arguments, tunable, mode='basic'):
    # NOTE(review): the next three lines read as the docstring text, but no
    # triple-quote delimiters are visible around them in this listing.
    Acting as a high-order function, take as input an operator and a list of
    operator arguments to perform empirical autotuning. Some of the operator
    arguments are marked as tunable.
    # Overview of the visible code:
    #   * ``arguments`` is copied so the caller's mapping is not modified;
    #     values belonging to the operator's outputs are copied as well.
    #   * one candidate mapping {tunable argument name -> block size} is built
    #     per entry of options['at_blocksize']; with mode == 'aggressive' the
    #     list is passed through more_heuristic_attempts().
    #   * for each candidate an elapsed time is recorded, and the candidate with
    #     the smallest time is selected.
    # Returns an OrderedDict with the same keys as ``arguments``; entries whose
    # key is a tunable argument name carry the selected block size, all other
    # entries carry the caller's original value.
    at_arguments = arguments.copy()

    # User-provided output data must not be altered
    output = [i.name for i in operator.output]
    for k, v in arguments.items():
        if k in output:
            at_arguments[k] = v.copy()

    # Squeeze dimensions to minimize auto-tuning time
    iterations = FindNodes(Iteration).visit(operator.body)
    squeezable = [
        i.dim.parent.name for i in iterations
        if i.is_Sequential and i.dim.is_Buffered
    # NOTE(review): the list opened at "squeezable = [" is not closed in this
    # listing.

    # Attempted block sizes
    # mapper: tunable argument name -> tunable object, in the order of ``tunable``
    mapper = OrderedDict([(i.argument.name, i) for i in tunable])
    # Every candidate assigns the same value ``v`` to all tunable arguments.
    blocksizes = [
        OrderedDict([(i, v) for i in mapper]) for v in options['at_blocksize']
    # NOTE(review): the list opened at "blocksizes = [" is not closed in this
    # listing.
    if mode == 'aggressive':
        blocksizes = more_heuristic_attempts(blocksizes)

    # Note: there is only a single loop over 'blocksize' because only
    # square blocks are tested
    timings = OrderedDict()
    for blocksize in blocksizes:
        illegal = False
        for k, v in at_arguments.items():
            if k in blocksize:
                val = blocksize[k]
                handle = at_arguments.get(mapper[k].original_dim.name)
                if val <= mapper[k].iteration.end(handle):
                    at_arguments[k] = val
                    # NOTE(review): as listed, ``illegal`` is set on the same
                    # branch that accepts the block size. The comment below
                    # suggests an "else:" line is missing here; confirm
                    # against the upstream file.
                    # Block size cannot be larger than actual dimension
                    illegal = True
            elif k in squeezable:
                at_arguments[k] = options['at_squeezer']
        if illegal:
        # NOTE(review): the body of "if illegal:" is missing from this listing
        # (presumably it skips the current block size; confirm).

        # Add profiler structs
        # NOTE(review): no statement follows the comment above, and nothing
        # visible here runs the operator before its profiler timings are read.

        elapsed = sum(operator.profiler.timings.values())
        # Key is a tuple of (name, size) pairs so that it is hashable.
        timings[tuple(blocksize.items())] = elapsed
        info_at("<%s>: %f" % (','.join('%d' % i
                                       for i in blocksize.values()), elapsed))

    # min() returns the key with the smallest elapsed time; dict() turns that
    # tuple of pairs back into a name -> size mapping. min() raises ValueError
    # when ``timings`` is empty.
    best = dict(min(timings, key=timings.get))
    info('Auto-tuned block shape: %s' % best)

    # Build the new argument list
    tuned = OrderedDict()
    for k, v in arguments.items():
        tuned[k] = best[k] if k in mapper else v

    return tuned
コード例 #3
def autotune(operator, arguments, tunable):
    # NOTE(review): the next three lines read as the docstring text, but no
    # triple-quote delimiters are visible around them in this listing.
    Acting as a high-order function, take as input an operator and a list of
    operator arguments to perform empirical autotuning. Some of the operator
    arguments are marked as tunable.
    # Overview of the visible code:
    #   * ``arguments`` is copied; values belonging to operator outputs are
    #     copied as well, so user data is not modified.
    #   * the start/end arguments of the single sequential iteration (if any)
    #     are overwritten in the copy to shorten each trial run.
    #   * candidate block sizes come from options['at_blocksize'], and are
    #     extended by more_heuristic_attempts() when
    #     configuration.core['autotuning'] is 'aggressive'.
    #   * each candidate is checked against the iteration extent and against
    #     options['at_stack_limit'], then an elapsed time is recorded.
    # Returns an OrderedDict built from ``arguments`` with tunable entries
    # replaced by the fastest block size and a fresh profiler struct, or
    # ``arguments`` itself on the give-up paths.
    at_arguments = arguments.copy()

    # User-provided output data must not be altered
    output = [i.name for i in operator.output]
    for k, v in arguments.items():
        if k in output:
            at_arguments[k] = v.copy()

    iterations = FindNodes(Iteration).visit(operator.body)
    # dimension name -> dimension object, used later for the stack-size check
    dim_mapper = {i.dim.name: i.dim for i in iterations}

    # Shrink the iteration space of sequential dimensions so that auto-tuner
    # runs take a negligible amount of time
    sequentials = [i for i in iterations if i.is_Sequential]
    if len(sequentials) == 0:
        timesteps = 1
    elif len(sequentials) == 1:
        sequential = sequentials[0]
        start = sequential.dim.rtargs.start.default_value
        # NOTE(review): the call below is cut off after "start=start," in this
        # listing; its remaining arguments are not visible.
        timesteps = sequential.extent(start=start,
        if timesteps < 0:
            timesteps = options['at_squeezer'] - timesteps + 1
            info_at("Adjusted auto-tuning timestep to %d" % timesteps)
        at_arguments[sequential.dim.symbolic_start.name] = start
        at_arguments[sequential.dim.symbolic_end.name] = timesteps
        if sequential.dim.is_Stepping:
            at_arguments[sequential.dim.parent.symbolic_start.name] = start
            at_arguments[sequential.dim.parent.symbolic_end.name] = timesteps
        # NOTE(review): as listed, the two lines below sit inside the
        # "exactly one sequential iteration" branch. The message suggests they
        # belong to a missing "else:" branch; confirm against the upstream file.
        info_at("Couldn't understand loop structure, giving up auto-tuning")
        return arguments

    # Attempted block sizes ...
    # mapper: name of each tunable argument's symbolic size -> tunable object
    mapper = OrderedDict([(i.argument.symbolic_size.name, i) for i in tunable])
    # ... Defaults (basic mode)
    blocksizes = [
        OrderedDict([(i, v) for i in mapper]) for v in options['at_blocksize']
    # NOTE(review): the list opened at "blocksizes = [" is not closed in this
    # listing.
    # ... Always try the entire iteration space (degenerate block)
    # NOTE(review): the subtraction below has no right-hand operand, the
    # "datashape" list is not closed, and the statement that consumes the
    # OrderedDict further down has lost its opening line in this listing.
    datashape = [
        at_arguments[mapper[i].original_dim.symbolic_end.name] -
        for i in mapper
        OrderedDict([(i, mapper[i].iteration.extent(0, j))
                     for i, j in zip(mapper, datashape)]))
    # ... More attempts if auto-tuning in aggressive mode
    if configuration.core['autotuning'] == 'aggressive':
        blocksizes = more_heuristic_attempts(blocksizes)

    # How many temporaries are allocated on the stack?
    # Will drop block sizes that might lead to a stack overflow
    # NOTE(review): the expression below is cut off after "operator.body +".
    functions = FindSymbols('symbolics').visit(operator.body +
    stack_shapes = [i.shape for i in functions if i.is_Array and i._mem_stack]
    # Sum over all stack arrays of the product of their shape entries, scaled
    # by the item size of the operator's dtype.
    stack_space = sum(reduce(mul, i, 1)
                      for i in stack_shapes) * operator.dtype().itemsize

    # Note: there is only a single loop over 'blocksize' because only
    # square blocks are tested
    timings = OrderedDict()
    for bs in blocksizes:
        illegal = False
        for k, v in at_arguments.items():
            if k in bs:
                val = bs[k]
                # NOTE(review): the subscript below is cut off in this listing.
                start = at_arguments[
                end = at_arguments[mapper[k].original_dim.symbolic_end.name]
                if val <= mapper[k].iteration.extent(start, end):
                    at_arguments[k] = val
                    # NOTE(review): as listed, ``illegal`` is set on the branch
                    # that accepts the block size; an "else:" line appears to
                    # be missing here. Confirm against the upstream file.
                    # Block size cannot be larger than actual dimension
                    illegal = True
        if illegal:
        # NOTE(review): the body of "if illegal:" is missing from this listing.

        # Make sure we remain within stack bounds, otherwise skip block size
        # dim_sizes maps symbolic sizes to the concrete values used in this trial.
        dim_sizes = {}
        for k, v in at_arguments.items():
            if k in bs:
                dim_sizes[mapper[k].argument.symbolic_size] = bs[k]
            elif k in dim_mapper:
                dim_sizes[dim_mapper[k].symbolic_size] = v
            # NOTE(review): the "try:" lines matching the two "except" clauses
            # below are missing from this listing, as is the body of the
            # stack-limit "if".
            bs_stack_space = stack_space.xreplace(dim_sizes)
        except AttributeError:
            # stack_space has no xreplace (e.g. it is a plain number): use it as is
            bs_stack_space = stack_space
            if int(bs_stack_space) > options['at_stack_limit']:
        except TypeError:
            # We should never get here
            # NOTE(review): the call below is cut off after the "%" operator.
            info_at("Couldn't determine stack size, skipping block size %s" %

        # Use AT-specific profiler structs
        at_arguments[operator.profiler.varname] = operator.profiler.setup()

        # NOTE(review): nothing visible here runs the operator before its
        # profiler timings are read.
        elapsed = sum(operator.profiler.timings.values())
        # Key is a tuple of (name, size) pairs so that it is hashable.
        timings[tuple(bs.items())] = elapsed
        info_at("Block shape <%s> took %f (s) in %d time steps" %
                (','.join('%d' % i for i in bs.values()), elapsed, timesteps))

        # NOTE(review): the "try:" matching "except ValueError" below is
        # missing from this listing. min() raises ValueError when ``timings``
        # is empty, which is the case the handler reports.
        best = dict(min(timings, key=timings.get))
        info("Auto-tuned block shape: %s" % best)
    except ValueError:
        info("Auto-tuning request, but couldn't find legal block sizes")
        return arguments

    # Build the new argument list
    tuned = OrderedDict()
    for k, v in arguments.items():
        tuned[k] = best[k] if k in mapper else v

    # Reset the profiling struct
    assert operator.profiler.varname in tuned
    tuned[operator.profiler.varname] = operator.profiler.setup()

    return tuned
コード例 #4
ファイル: at_controller.py プロジェクト: emmapearce13/my_RTM
    def auto_tune_blocks(self, minimum=5, maximum=20):
        """Auto tunes block sizes. Times all block size combinations withing given range
           and writes it into report

        :param minimum: int (optional) - minimum value for auto tuning range. Default 5
        :param maximum: int (optional) - maximum value for auto tuning range. Default 20
        :raises ValueError: if  minimum is >= maximum
        # NOTE(review): the closing delimiter of the docstring above is missing
        # from this listing.
        if minimum >= maximum:
            raise ValueError("Invalid parameters. Min tune range has to be less than Max")
        # setting time step to 3 as we don't need to iterate more than that
        # for auto tuning purposes
        at_nt = 3
        self.op.propagator.nt = at_nt
        self.op.propagator.profile = True

        info_at("Start. Mode: brute force")

        block_list = set()  # used to make sure we do not test the same block sizes
        # Falsy entries of blocked_dims become None; truthy entries are kept.
        mask = [i if i else None for i in self.blocked_dims]
        block = [None for i in mask]

        # "mask[n] and value" yields ``value`` when the dimension is blocked and
        # the (falsy) mask entry otherwise.
        # NOTE(review): nothing visible in the loops below adds ``block`` to
        # ``block_list``; those statements appear to be missing from this listing.
        for x in range(minimum, maximum):
            block[0] = mask[0] and x

            if len(block) > 1:
                # the second dimension only ranges up to the current x (inclusive)
                for y in range(minimum, x + 1):
                    block[1] = mask[1] and y

                    if len(block) > 2:
                        for z in range(minimum, maximum):
                            block[2] = mask[2] and z

        # filter off some of the block sizes, heuristically
        block_list = sorted(self._filter(block_list))

        info_at("Number of block sizes that will be attempted: %d" % len(block_list))

        # runs function for each block_size
        times = []
        # Indexing block_list[0] raises IndexError if no block size survived
        # the filter.
        self.op.propagator.cache_blocking = list(block_list[0])
        for block in block_list:
            self.op.propagator.block_sizes = list(block)
            # populate output arrays with values different than 0.0, to make sure that
            # actual computation is carried out
            # NOTE(review): the body of the loop below is missing from this
            # listing; as listed, the "times.append" line has the same
            # indentation as the "for" statement.
            for param in self.op.output_params:
            times.append((block, self.get_execution_time()))

        # sorts the list of tuples based on time
        # (ascending, so times[0] is the fastest block size)
        times = sorted(times, key=itemgetter(1))

        # Scale the best time measured over at_nt steps up to nt_full steps and
        # divide by 3600 (assumes get_execution_time() returns seconds; confirm).
        info_at("Estimated runtime for %s and %d time steps: %f hours" %
                (self.op.getName(), self.nt_full,
                 self.nt_full * times[0][1] / (at_nt * 3600)))

        self._write_block_report(times)  # writes the report