def block_size(self):
    """
    Fetch the auto-tuned block size for this model from the final report.

    The report is scanned top to bottom; the first line containing the
    model description string is the one whose block size is parsed.

    :returns: list with one entry per comma-separated item in the stored
              block size; an item is None where the report holds the
              literal "None", an int otherwise
    :raises ValueError: if the auto tuning report file does not exist
    :raises EnvironmentError: if no line in the report matches the model
    """
    report = self.final_report_path
    if not path.isfile(report):
        raise ValueError("AT report at %s not found" % report)

    # Description string identifying this model inside the report
    wanted = self.model_desc_template % (
        self.op.getName(),
        self.op.time_order,
        self.op.spc_border,
        str(self.op.shape).replace(" ", ''),
        str(self.blocked_dims).replace(" ", ''),
    )

    with open(report, 'r') as report_file:
        for entry in report_file:
            if wanted not in entry:
                continue

            # The block size is the sixth space-separated field of the line
            raw = entry.split(' ')[5]
            # Drop the first character and the last two (presumably the
            # opening bracket and the closing bracket plus newline -- confirm
            # against the code that writes the report)
            tokens = raw[1:-2].split(',')
            picked = [None if tok == "None" else int(tok) for tok in tokens]
            info_at("Picked: %s" % picked)
            return picked

    raise EnvironmentError("Matching model with auto tuned block size not found.")
def autotune(operator, arguments, tunable, mode='basic'):
    """
    Acting as a high-order function, take as input an operator and a list of
    operator arguments to perform empirical autotuning. Some of the operator
    arguments are marked as tunable.

    The operator is executed once per candidate block size on a copy of the
    arguments, and the candidate with the smallest total profiled time is
    selected.

    :param operator: the operator to tune; its ``cfunction`` is called with
                     the candidate arguments and ``profiler.timings`` is
                     read back after each run
    :param arguments: mapping from argument name to value. The mapping
                      itself is not modified, and the values of the
                      operator's outputs are copied before any run
    :param tunable: iterable describing the tunable (blocked) arguments
    :param mode: with 'aggressive', the default candidates taken from
                 ``options['at_blocksize']`` are expanded through
                 ``more_heuristic_attempts``; any other value keeps the
                 defaults only
    :returns: an OrderedDict with the same keys as ``arguments`` in which
              the tunable entries hold the best block size found. If no
              candidate block size was legal, the entries are returned
              unchanged.
    """
    at_arguments = arguments.copy()

    # User-provided output data must not be altered
    output = [i.name for i in operator.output]
    for k, v in arguments.items():
        if k in output:
            at_arguments[k] = v.copy()

    # Squeeze dimensions to minimize auto-tuning time
    iterations = FindNodes(Iteration).visit(operator.body)
    squeezable = [i.dim.parent.name for i in iterations
                  if i.is_Sequential and i.dim.is_Buffered]

    # Attempted block sizes
    mapper = OrderedDict([(i.argument.name, i) for i in tunable])
    blocksizes = [OrderedDict([(i, v) for i in mapper])
                  for v in options['at_blocksize']]
    if mode == 'aggressive':
        blocksizes = more_heuristic_attempts(blocksizes)

    # Note: there is only a single loop over 'blocksize' because only
    # square blocks are tested
    timings = OrderedDict()
    for blocksize in blocksizes:
        illegal = False
        # Only values of existing keys are reassigned here, so iterating
        # over the mapping while updating it is safe
        for k in at_arguments:
            if k in blocksize:
                val = blocksize[k]
                handle = at_arguments.get(mapper[k].original_dim.name)
                if val <= mapper[k].iteration.end(handle):
                    at_arguments[k] = val
                else:
                    # Block size cannot be larger than actual dimension
                    illegal = True
                    break
            elif k in squeezable:
                at_arguments[k] = options['at_squeezer']
        if illegal:
            continue

        # Add profiler structs
        at_arguments.update(operator._extra_arguments())

        operator.cfunction(*list(at_arguments.values()))
        elapsed = sum(operator.profiler.timings.values())
        timings[tuple(blocksize.items())] = elapsed
        info_at("<%s>: %f" %
                (','.join('%d' % i for i in blocksize.values()), elapsed))

    # Every candidate may have been rejected as illegal; min() on an empty
    # mapping would raise an unhelpful ValueError, so bail out explicitly
    # and hand back the arguments untuned
    if not timings:
        info("Auto-tuning request, but couldn't find legal block sizes")
        return OrderedDict(arguments.items())

    best = dict(min(timings, key=timings.get))
    info('Auto-tuned block shape: %s' % best)

    # Build the new argument list
    tuned = OrderedDict()
    for k, v in arguments.items():
        tuned[k] = best[k] if k in mapper else v

    return tuned
def autotune(operator, arguments, tunable):
    """
    Acting as a high-order function, take as input an operator and a list of
    operator arguments to perform empirical autotuning. Some of the operator
    arguments are marked as tunable.

    Each candidate block size is timed by running the operator on a copy of
    the arguments, with the single sequential loop (if any) shrunk so that
    a run is cheap. Candidates exceeding the iteration extent or the
    ``options['at_stack_limit']`` stack budget are skipped.

    :param operator: the operator to tune
    :param arguments: mapping from argument name to value
    :param tunable: iterable describing the tunable (blocked) arguments
    :returns: an OrderedDict of arguments carrying the fastest block size
              and a fresh profiling struct; ``arguments`` itself is returned
              when more than one sequential loop is found or when no
              candidate block size could be timed
    """
    at_arguments = arguments.copy()

    # User-provided output data must not be altered
    output_names = [out.name for out in operator.output]
    for name, value in arguments.items():
        if name in output_names:
            at_arguments[name] = value.copy()

    iterations = FindNodes(Iteration).visit(operator.body)
    dim_mapper = {it.dim.name: it.dim for it in iterations}

    # Shrink the iteration space of sequential dimensions so that auto-tuner
    # runs take a negligible amount of time
    sequentials = [it for it in iterations if it.is_Sequential]
    if len(sequentials) > 1:
        info_at("Couldn't understand loop structure, giving up auto-tuning")
        return arguments
    if not sequentials:
        timesteps = 1
    else:
        sequential = sequentials[0]
        start = sequential.dim.rtargs.start.default_value
        timesteps = sequential.extent(start=start, finish=options['at_squeezer'])
        if timesteps < 0:
            timesteps = options['at_squeezer'] - timesteps + 1
            info_at("Adjusted auto-tuning timestep to %d" % timesteps)
        at_arguments[sequential.dim.symbolic_start.name] = start
        at_arguments[sequential.dim.symbolic_end.name] = timesteps
        if sequential.dim.is_Stepping:
            at_arguments[sequential.dim.parent.symbolic_start.name] = start
            at_arguments[sequential.dim.parent.symbolic_end.name] = timesteps

    # Attempted block sizes ...
    mapper = OrderedDict([(t.argument.symbolic_size.name, t) for t in tunable])
    # ... Defaults (basic mode)
    blocksizes = []
    for size in options['at_blocksize']:
        blocksizes.append(OrderedDict([(name, size) for name in mapper]))
    # ... Always try the entire iteration space (degenerate block)
    datashape = []
    for name in mapper:
        original_dim = mapper[name].original_dim
        datashape.append(at_arguments[original_dim.symbolic_end.name] -
                         at_arguments[original_dim.symbolic_start.name])
    whole_space = OrderedDict([(name, mapper[name].iteration.extent(0, span))
                               for name, span in zip(mapper, datashape)])
    blocksizes.append(whole_space)
    # ... More attempts if auto-tuning in aggressive mode
    if configuration.core['autotuning'] == 'aggressive':
        blocksizes = more_heuristic_attempts(blocksizes)

    # How many temporaries are allocated on the stack?
    # Will drop block sizes that might lead to a stack overflow
    functions = FindSymbols('symbolics').visit(operator.body +
                                               operator.elemental_functions)
    stack_shapes = [f.shape for f in functions if f.is_Array and f._mem_stack]
    stack_items = sum(reduce(mul, shape, 1) for shape in stack_shapes)
    stack_space = stack_items * operator.dtype().itemsize

    # Note: there is only a single loop over 'blocksize' because only
    # square blocks are tested
    timings = OrderedDict()
    for bs in blocksizes:
        legal = True
        for name in at_arguments:
            if name not in bs:
                continue
            candidate = bs[name]
            original_dim = mapper[name].original_dim
            start = at_arguments[original_dim.symbolic_start.name]
            end = at_arguments[original_dim.symbolic_end.name]
            if candidate <= mapper[name].iteration.extent(start, end):
                at_arguments[name] = candidate
            else:
                # Block size cannot be larger than actual dimension
                legal = False
                break
        if not legal:
            continue

        # Make sure we remain within stack bounds, otherwise skip block size
        dim_sizes = {}
        for name, value in at_arguments.items():
            if name in bs:
                dim_sizes[mapper[name].argument.symbolic_size] = bs[name]
            elif name in dim_mapper:
                dim_sizes[dim_mapper[name].symbolic_size] = value
        try:
            bs_stack_space = stack_space.xreplace(dim_sizes)
        except AttributeError:
            # Objects without ``xreplace`` (e.g. a plain number) are used
            # as they are
            bs_stack_space = stack_space
        try:
            if int(bs_stack_space) > options['at_stack_limit']:
                continue
        except TypeError:
            # We should never get here
            info_at("Couldn't determine stack size, skipping block size %s" % str(bs))
            continue

        # Use AT-specific profiler structs
        at_arguments[operator.profiler.varname] = operator.profiler.setup()

        operator.cfunction(*list(at_arguments.values()))
        elapsed = sum(operator.profiler.timings.values())
        timings[tuple(bs.items())] = elapsed
        shape_str = ','.join('%d' % size for size in bs.values())
        info_at("Block shape <%s> took %f (s) in %d time steps" %
                (shape_str, elapsed, timesteps))

    try:
        best = dict(min(timings, key=timings.get))
        info("Auto-tuned block shape: %s" % best)
    except ValueError:
        # min() found nothing to compare: no block size was timed
        info("Auto-tuning request, but couldn't find legal block sizes")
        return arguments

    # Build the new argument list
    tuned = OrderedDict()
    for name, value in arguments.items():
        if name in mapper:
            tuned[name] = best[name]
        else:
            tuned[name] = value

    # Reset the profiling struct
    assert operator.profiler.varname in tuned
    tuned[operator.profiler.varname] = operator.profiler.setup()

    return tuned
def auto_tune_blocks(self, minimum=5, maximum=20):
    """Auto tunes block sizes.

    Times the block size combinations within the given range (after
    heuristic filtering) and writes the sorted timings into the report.

    Candidate values are drawn from ``range(minimum, maximum)``, so
    ``maximum`` itself is not attempted. The second dimension is only
    tried up to the current value of the first one. Dimensions that are
    not blocked keep None in every candidate.

    :param minimum: int (optional) - minimum value for auto tuning range. Default 5
    :param maximum: int (optional) - maximum value for auto tuning range. Default 20
    :raises ValueError: if minimum is >= maximum
    """
    if minimum >= maximum:
        raise ValueError("Invalid parameters. Min tune range has to be less than Max")

    # setting time step to 3 as we don't need to iterate more than that
    # for auto tuning purposes
    at_nt = 3

    self.op.propagator.nt = at_nt
    self.op.propagator.profile = True

    info_at("Start. Mode: brute force")

    # A set, so that the same block size is never tested twice
    candidates = set()
    enabled = [dim if dim else None for dim in self.blocked_dims]
    ndims = len(enabled)
    current = [None] * ndims

    for bx in range(minimum, maximum):
        current[0] = bx if enabled[0] else None
        if ndims <= 1:
            candidates.add(tuple(current))
            continue

        for by in range(minimum, bx + 1):
            current[1] = by if enabled[1] else None
            if ndims <= 2:
                candidates.add(tuple(current))
                continue

            for bz in range(minimum, maximum):
                current[2] = bz if enabled[2] else None
                candidates.add(tuple(current))

    # filter off some of the block sizes, heuristically
    attempts = sorted(self._filter(candidates))

    info_at("Number of block sizes that will be attempted: %d" % len(attempts))

    # runs function for each block_size
    times = []
    self.op.propagator.cache_blocking = list(attempts[0])
    for attempt in attempts:
        self.op.propagator.block_sizes = list(attempt)

        # populate output arrays with values different than 0.0, to make sure that
        # actual computation is carried out
        for param in self.op.output_params:
            param.data.fill(np.random.rand())

        times.append((attempt, self.get_execution_time()))

    # sorts the list of tuples based on time
    times.sort(key=itemgetter(1))

    info_at("Finish.")
    info_at("Estimated runtime for %s and %d time steps: %f hours" %
            (self.op.getName(), self.nt_full,
             self.nt_full * times[0][1] / (at_nt * 3600)))

    self._write_block_report(times)  # writes the report