def get_typed_and_scheduled_kernel(self, var_to_dtype_set):
    """Return ``self.kernel`` with the given dtypes applied and a schedule.

    :arg var_to_dtype_set: an iterable of ``(name, dtype)`` pairs; may be
        empty or *None*, in which case no dtypes are added. A *name* that
        is a key of ``kernel.impl_arg_to_arg`` is replaced by the ``name``
        of the argument it maps to; any other name is used unchanged.
    :returns: the resulting kernel. If the kernel has no schedule yet, it
        is preprocessed and one schedule is obtained for it.
    """
    kernel = self.kernel

    from loopy.kernel.tools import add_dtypes

    if var_to_dtype_set:
        var_to_dtype = {}
        for var, dtype in var_to_dtype_set:
            try:
                dest_name = kernel.impl_arg_to_arg[var].name
            except KeyError:
                dest_name = var

            # A dict store cannot raise KeyError, so the former
            # try/except around this assignment (which claimed to report
            # unknown names) could never trigger and has been removed.
            # Unknown names are therefore not detected here.
            var_to_dtype[dest_name] = dtype

        kernel = add_dtypes(kernel, var_to_dtype)

        from loopy.preprocess import infer_unknown_types
        kernel = infer_unknown_types(kernel, expect_completion=True)

    if kernel.schedule is None:
        from loopy.preprocess import preprocess_kernel
        kernel = preprocess_kernel(kernel)

        from loopy.schedule import get_one_scheduled_kernel
        kernel = get_one_scheduled_kernel(kernel)

    return kernel
def gather_access_footprint_bytes(kernel, ignore_uncountable=False):
    """Return a dictionary mapping ``(var_name, direction)`` to
    :class:`islpy.PwQPolynomial` instances capturing the number of bytes
    read/written (where *direction* is either ``read`` or ``write``) on
    array *var_name*.

    :arg ignore_uncountable: forwarded unchanged to
        :func:`gather_access_footprints`. It controls what happens for
        accesses on which the footprint cannot be determined (e.g.
        data-dependent or nonlinear indices).
        NOTE(review): the earlier wording said an error is raised when
        this is *True*; the flag name suggests the opposite -- confirm.
    """
    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    kernel = infer_unknown_types(kernel, expect_completion=True)

    from loopy.kernel import kernel_state
    if kernel.state < kernel_state.PREPROCESSED:
        kernel = preprocess_kernel(kernel)

    bytes_by_access = {}
    footprints = gather_access_footprints(
            kernel, ignore_uncountable=ignore_uncountable)

    for access, index_set in footprints.items():
        array_name, direction = access

        # bytes = size of one element * number of indices touched
        descriptor = kernel.get_var_descriptor(array_name)
        itemsize = int(descriptor.dtype.numpy_dtype.itemsize)
        nbytes = itemsize * count(kernel, index_set)

        if access not in bytes_by_access:
            bytes_by_access[access] = nbytes
        else:
            bytes_by_access[access] += nbytes

    return bytes_by_access
def get_typed_and_scheduled_kernel(self, var_to_dtype_set):
    """Return ``self.kernel`` with the given dtypes applied and a schedule.

    :arg var_to_dtype_set: an iterable of ``(name, dtype)`` pairs; may be
        empty or *None*, in which case no dtypes are added. A *name* that
        is a key of ``kernel.impl_arg_to_arg`` is replaced by the ``name``
        of the argument it maps to; any other name is used unchanged.
    :returns: the resulting kernel. If the kernel has no schedule yet, it
        is preprocessed and one schedule is obtained for it.
    """
    kernel = self.kernel

    from loopy.kernel.tools import add_dtypes

    if var_to_dtype_set:
        var_to_dtype = {}
        for var, dtype in var_to_dtype_set:
            try:
                dest_name = kernel.impl_arg_to_arg[var].name
            except KeyError:
                dest_name = var

            # A dict store cannot raise KeyError, so the former
            # try/except around this assignment (which claimed to report
            # unknown names) could never trigger and has been removed.
            # Unknown names are therefore not detected here.
            var_to_dtype[dest_name] = dtype

        kernel = add_dtypes(kernel, var_to_dtype)

        from loopy.preprocess import infer_unknown_types
        kernel = infer_unknown_types(kernel, expect_completion=True)

    if kernel.schedule is None:
        from loopy.preprocess import preprocess_kernel
        kernel = preprocess_kernel(kernel)

        from loopy.schedule import get_one_scheduled_kernel
        kernel = get_one_scheduled_kernel(kernel)

    return kernel
def get_op_poly(knl, numpy_types=True):
    """Count the number of operations in a loopy kernel.

    :parameter knl: A :class:`loopy.LoopKernel` whose operations are to be
        counted.

    :parameter numpy_types: If *True* (the default), the *type* in each key
        of the result is converted via its ``numpy_dtype`` attribute;
        otherwise the internal loopy type is kept.

    :return: A mapping of **{(** *type* **,** :class:`string` **)**
        **:** :class:`islpy.PwQPolynomial` **}**.

        - The *type* specifies the type of the data being accessed. This
          can be a :class:`numpy.dtype` if *numpy_types* is True,
          otherwise the internal loopy type.

        - The string specifies the operation type as *add*, *sub*, *mul*,
          *div*, *pow*, *shift*, *bw* (bitwise), etc.

        - The :class:`islpy.PwQPolynomial` holds the number of operations
          of the kind specified in the key (in terms of the
          :class:`loopy.LoopKernel` *parameter inames*).

    Example usage::

        # (first create loopy kernel and specify array data types)

        poly = get_op_poly(knl)
        params = {'n': 512, 'm': 256, 'l': 128}
        f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params)
        f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)

        # (now use these counts to predict performance)

    """
    from loopy.preprocess import preprocess_kernel, infer_unknown_types

    knl = preprocess_kernel(infer_unknown_types(knl, expect_completion=True))

    total = ToCountMap()
    counter = ExpressionOpCounter(knl)

    for insn in knl.instructions:
        # The number of times the instruction runs is the size of its
        # iname domain with everything but its own inames projected out.
        inames = knl.insn_inames(insn)
        exec_domain = knl.get_inames_domain(inames).project_out_except(
                inames, [dim_type.set])

        insn_ops = counter(insn.assignee) + counter(insn.expression)
        total = total + insn_ops*count(knl, exec_domain)

    if not numpy_types:
        return total.dict

    converted = {}
    for (dtype, kind), poly in six.iteritems(total.dict):
        converted[(dtype.numpy_dtype, kind)] = poly
    return converted
def get_op_poly(knl, numpy_types=True):
    """Count the number of operations in a loopy kernel.

    :parameter knl: A :class:`loopy.LoopKernel` whose operations are to be
        counted.

    :parameter numpy_types: If *True* (the default), the *type* in each key
        of the result is converted via its ``numpy_dtype`` attribute;
        otherwise the internal loopy type is kept.

    :return: A mapping of **{(** *type* **,** :class:`string` **)**
        **:** :class:`islpy.PwQPolynomial` **}**.

        - The *type* specifies the type of the data being accessed. This
          can be a :class:`numpy.dtype` if *numpy_types* is True,
          otherwise the internal loopy type.

        - The string specifies the operation type as *add*, *sub*, *mul*,
          *div*, *pow*, *shift*, *bw* (bitwise), etc.

        - The :class:`islpy.PwQPolynomial` holds the number of operations
          of the kind specified in the key (in terms of the
          :class:`loopy.LoopKernel` *parameter inames*).

    Example usage::

        # (first create loopy kernel and specify array data types)

        poly = get_op_poly(knl)
        params = {'n': 512, 'm': 256, 'l': 128}
        f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params)
        f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)

        # (now use these counts to predict performance)

    """
    from loopy.preprocess import preprocess_kernel, infer_unknown_types

    knl = preprocess_kernel(infer_unknown_types(knl, expect_completion=True))

    total = ToCountMap()
    counter = ExpressionOpCounter(knl)

    for insn in knl.instructions:
        # The number of times the instruction runs is the size of its
        # iname domain with everything but its own inames projected out.
        inames = knl.insn_inames(insn)
        exec_domain = knl.get_inames_domain(inames).project_out_except(
                inames, [dim_type.set])

        insn_ops = counter(insn.assignee) + counter(insn.expression)
        total = total + insn_ops*count(knl, exec_domain)

    if not numpy_types:
        return total.dict

    converted = {}
    for (dtype, kind), poly in six.iteritems(total.dict):
        converted[(dtype.numpy_dtype, kind)] = poly
    return converted
def add_and_infer_dtypes(knl, dtype_dict):
    """Add the dtypes in *dtype_dict* to *knl*, then infer remaining types.

    :arg dtype_dict: a mapping from names to dtypes. A key may hold several
        comma-separated names (e.g. ``"a, b"``); each non-empty name, with
        surrounding whitespace stripped, is given the key's dtype.
    :returns: the kernel returned by ``infer_unknown_types`` called with
        ``expect_completion=True``.
    """
    per_name_dtypes = {}
    for names, dtype in six.iteritems(dtype_dict):
        for raw_name in names.split(","):
            name = raw_name.strip()
            if not name:
                # e.g. a trailing comma or doubled separator
                continue
            per_name_dtypes[name] = dtype

    typed_knl = add_dtypes(knl, per_name_dtypes)

    from loopy.preprocess import infer_unknown_types
    return infer_unknown_types(typed_knl, expect_completion=True)
def gather_access_footprints(kernel, ignore_uncountable=False):
    """Return a dictionary mapping ``(var_name, direction)`` to
    :class:`islpy.Set` instances capturing which indices of the array
    *var_name* are read/written (where *direction* is either ``read`` or
    ``write``).

    Instructions that are not instances of ``MultiAssignmentBase`` are
    skipped with a ``count_non_assignment`` warning.

    :arg ignore_uncountable: forwarded unchanged to
        ``AccessFootprintGatherer``. It controls what happens for accesses
        on which the footprint cannot be determined (e.g. data-dependent
        or nonlinear indices).
        NOTE(review): the earlier wording said an error is raised when
        this is *True*; the flag name suggests the opposite -- confirm.
    """
    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    kernel = infer_unknown_types(kernel, expect_completion=True)
    kernel = preprocess_kernel(kernel)

    write_footprints = []
    read_footprints = []

    for insn in kernel.instructions:
        if not isinstance(insn, MultiAssignmentBase):
            warn(kernel, "count_non_assignment",
                    "Non-assignment instruction encountered in "
                    "gather_access_footprints, not counted")
            continue

        insn_inames = kernel.insn_inames(insn)
        inames_domain = kernel.get_inames_domain(insn_inames)
        domain = (inames_domain.project_out_except(insn_inames,
                                                   [dim_type.set]))

        afg = AccessFootprintGatherer(kernel, domain,
                ignore_uncountable=ignore_uncountable)

        # The gatherer is applied to the whole assignee tuple. This used
        # to sit in a loop over the assignees whose loop variable was
        # never used, so the same footprint was computed and appended
        # once per assignee. One call yields the same combined result;
        # the guard keeps the old behavior of adding nothing when there
        # are no assignees.
        if insn.assignees:
            write_footprints.append(afg(insn.assignees))
        read_footprints.append(afg(insn.expression))

    write_footprints = AccessFootprintGatherer.combine(write_footprints)
    read_footprints = AccessFootprintGatherer.combine(read_footprints)

    result = {}

    for vname, footprint in six.iteritems(write_footprints):
        result[(vname, "write")] = footprint

    for vname, footprint in six.iteritems(read_footprints):
        result[(vname, "read")] = footprint

    return result
def gather_access_footprints(kernel, ignore_uncountable=False):
    """Return a dictionary mapping ``(var_name, direction)`` to
    :class:`islpy.Set` instances capturing which indices of the array
    *var_name* are read/written (where *direction* is either ``read`` or
    ``write``).

    Instructions that are not instances of ``MultiAssignmentBase`` are
    skipped with a ``count_non_assignment`` warning.

    :arg ignore_uncountable: forwarded unchanged to
        ``AccessFootprintGatherer``. It controls what happens for accesses
        on which the footprint cannot be determined (e.g. data-dependent
        or nonlinear indices).
        NOTE(review): the earlier wording said an error is raised when
        this is *True*; the flag name suggests the opposite -- confirm.
    """
    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    kernel = infer_unknown_types(kernel, expect_completion=True)
    kernel = preprocess_kernel(kernel)

    write_footprints = []
    read_footprints = []

    for insn in kernel.instructions:
        if not isinstance(insn, MultiAssignmentBase):
            warn(kernel, "count_non_assignment",
                    "Non-assignment instruction encountered in "
                    "gather_access_footprints, not counted")
            continue

        insn_inames = kernel.insn_inames(insn)
        inames_domain = kernel.get_inames_domain(insn_inames)
        domain = (inames_domain.project_out_except(insn_inames,
                                                   [dim_type.set]))

        afg = AccessFootprintGatherer(kernel, domain,
                ignore_uncountable=ignore_uncountable)

        # The gatherer is applied to the whole assignee tuple. This used
        # to sit in a loop over the assignees whose loop variable was
        # never used, so the same footprint was computed and appended
        # once per assignee. One call yields the same combined result;
        # the guard keeps the old behavior of adding nothing when there
        # are no assignees.
        if insn.assignees:
            write_footprints.append(afg(insn.assignees))
        read_footprints.append(afg(insn.expression))

    write_footprints = AccessFootprintGatherer.combine(write_footprints)
    read_footprints = AccessFootprintGatherer.combine(read_footprints)

    result = {}

    for vname, footprint in six.iteritems(write_footprints):
        result[(vname, "write")] = footprint

    for vname, footprint in six.iteritems(read_footprints):
        result[(vname, "read")] = footprint

    return result
def get_barrier_poly(knl):
    """Count the number of barriers each thread encounters in a loopy kernel.

    :parameter knl: A :class:`loopy.LoopKernel` whose barriers are to be
        counted.

    :return: An :class:`islpy.PwQPolynomial` holding the number of barrier
        calls made (in terms of the :class:`loopy.LoopKernel` *inames*).

    Example usage::

        # (first create loopy kernel and specify array data types)

        barrier_poly = get_barrier_poly(knl)
        params = {'n': 512, 'm': 256, 'l': 128}
        barrier_count = barrier_poly.eval_with_dict(params)

        # (now use this count to predict performance)

    """
    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    from loopy.schedule import EnterLoop, LeaveLoop, Barrier

    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)

    # inames of the loops that are open at the current schedule position
    iname_list = []
    barrier_poly = isl.PwQPolynomial('{ 0 }')

    for sched_item in knl.schedule:
        if isinstance(sched_item, EnterLoop):
            if sched_item.iname:  # (if not empty)
                iname_list.append(sched_item.iname)
        elif isinstance(sched_item, LeaveLoop):
            if sched_item.iname:  # (if not empty)
                iname_list.pop()
        elif isinstance(sched_item, Barrier):
            if iname_list:  # (if iname_list is not empty)
                # A barrier inside loops is hit once per point of the
                # enclosing iname domain. The count used to be wrapped in
                # a one-element tuple and folded with reduce(mul, ...),
                # which returns that single element unchanged.
                barrier_poly += count(
                        knl,
                        knl.get_inames_domain(iname_list)
                        .project_out_except(iname_list, [dim_type.set]))
            else:
                barrier_poly += isl.PwQPolynomial('{ 1 }')

    return barrier_poly
def generate_body(kernel):
    """Generate and return the body code string for *kernel*.

    A kernel without a schedule is scheduled first. Unknown types are then
    inferred, pre-codegen checks are run, and code generation is delegated
    to ``kernel.target.generate_body``.

    :raises LoopyError: if the kernel is not in the ``SCHEDULED`` state.
    :returns: the code string produced by the target.
    """
    if kernel.schedule is None:
        from loopy.schedule import get_one_scheduled_kernel
        kernel = get_one_scheduled_kernel(kernel)

    from loopy.kernel import kernel_state
    if kernel.state != kernel_state.SCHEDULED:
        raise LoopyError("cannot generate code for a kernel that has not been "
                "scheduled")

    from loopy.preprocess import infer_unknown_types
    kernel = infer_unknown_types(kernel, expect_completion=True)

    from loopy.check import pre_codegen_checks
    pre_codegen_checks(kernel)

    logger.info("%s: generate code: start" % kernel.name)

    # Complex support is enabled as soon as any argument or temporary has
    # a complex dtype. The list (rather than a generator) makes sure every
    # variable's dtype is inspected.
    all_vars = kernel.args + list(six.itervalues(kernel.temporary_variables))
    allow_complex = any([var.dtype.kind == "c" for var in all_vars])

    codegen_state = CodeGenerationState(
            kernel=kernel,
            implemented_domain=isl.BasicSet.from_params(kernel.assumptions),
            implemented_predicates=frozenset(),
            seen_dtypes=set(),
            seen_functions=set(),
            var_subst_map={},
            allow_complex=allow_complex)

    body_and_domains = kernel.target.generate_body(kernel, codegen_state)
    code_str, implemented_domains = body_and_domains

    from loopy.check import check_implemented_domains
    assert check_implemented_domains(kernel, implemented_domains,
            code_str)

    logger.info("%s: generate code: done" % kernel.name)

    return code_str
def get_op_poly(knl):
    """Return the total operation count over all instructions of *knl*.

    For each instruction, the result of ``ExpressionOpCounter`` on the
    instruction's expression is multiplied by the size of the
    instruction's iname domain, and the products are summed starting
    from ``0``.
    """
    from loopy.preprocess import preprocess_kernel, infer_unknown_types

    knl = preprocess_kernel(infer_unknown_types(knl, expect_completion=True))

    counter = ExpressionOpCounter(knl)
    total = 0

    for insn in knl.instructions:
        # The number of executions of the instruction is the size of its
        # iname domain restricted to its own inames.
        inames = knl.insn_inames(insn)
        exec_domain = knl.get_inames_domain(inames).project_out_except(
                inames, [dim_type.set])

        insn_ops = counter(insn.expression)
        total = total + insn_ops*count(knl, exec_domain)

    return total
def estimate_regs_per_thread(knl):
    """Estimate registers per thread usage by a loopy kernel.

    :parameter knl: A :class:`loopy.LoopKernel` whose reg usage will be
        estimated.

    :return: An :class:`integer` holding an estimate for the number of
        registers used per thread. This number will most likely be too
        low, but will hopefully be consistently too low by the same
        constant factor.

    """
    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    from loopy.schedule import EnterLoop, LeaveLoop, Barrier, RunInstruction  # noqa

    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)

    peak = 0

    # Two parallel stacks with one entry per open loop; the bottom entry
    # belongs to the code outside of all loops. Each level has its own
    # running total and its own estimator.
    totals = [0]
    estimators = [RegisterUsageEstimator(knl)]

    for item in knl.schedule:
        if isinstance(item, EnterLoop):
            totals.append(0)
            estimators.append(RegisterUsageEstimator(knl))
        elif isinstance(item, LeaveLoop):
            # Close the innermost block and resume the enclosing total.
            closed_total = totals.pop()
            if closed_total > peak:
                peak = closed_total
            estimators.pop()
        elif isinstance(item, RunInstruction):
            insn = knl.id_to_insn[item.insn_id]
            estimate = estimators[-1]
            totals[-1] += estimate(insn.assignee) + estimate(insn.expression)

    # The outermost block is never closed by a LeaveLoop, so check it here.
    outer_total = totals[-1]
    if outer_total > peak:
        peak = outer_total

    return peak
def get_synchronization_poly(knl):
    """Count the number of synchronization events each thread encounters in
    a loopy kernel.

    :parameter knl: A :class:`loopy.LoopKernel` whose barriers are to be
        counted.

    :return: A dictionary mapping each type of synchronization event to a
        :class:`islpy.PwQPolynomial` holding the number of such events
        per thread.

        Possible keys include ``barrier_local``, ``barrier_global``
        (if supported by the target) and ``kernel_launch``.

    :raises LoopyError: if the schedule contains an item of a type that
        is not handled here.

    Example usage::

        # (first create loopy kernel and specify array data types)

        sync_poly = get_synchronization_poly(knl)
        params = {'n': 512, 'm': 256, 'l': 128}
        barrier_count = sync_poly['barrier_local'].eval_with_dict(params)

        # (now use this count to predict performance)

    """
    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    from loopy.schedule import (EnterLoop, LeaveLoop, Barrier,
            CallKernel, ReturnFromKernel, RunInstruction)

    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)

    # inames of the loops that are open at the current schedule position
    iname_list = []

    result = ToCountMap()

    one = isl.PwQPolynomial('{ 1 }')

    def get_count_poly(iname_list):
        """Return how often a schedule item nested in *iname_list* runs."""
        if iname_list:  # (if iname_list is not empty)
            # The count used to be wrapped in a one-element tuple and
            # folded with reduce(mul, ...), which returns that single
            # element unchanged.
            return count(
                    knl,
                    knl.get_inames_domain(iname_list)
                    .project_out_except(iname_list, [dim_type.set]))
        else:
            return one

    for sched_item in knl.schedule:
        if isinstance(sched_item, EnterLoop):
            if sched_item.iname:  # (if not empty)
                iname_list.append(sched_item.iname)
        elif isinstance(sched_item, LeaveLoop):
            if sched_item.iname:  # (if not empty)
                iname_list.pop()

        elif isinstance(sched_item, Barrier):
            result = result + ToCountMap(
                    {"barrier_%s" % sched_item.kind:
                        get_count_poly(iname_list)})

        elif isinstance(sched_item, CallKernel):
            result = result + ToCountMap(
                    {"kernel_launch": get_count_poly(iname_list)})

        elif isinstance(sched_item, (ReturnFromKernel, RunInstruction)):
            pass

        else:
            raise LoopyError("unexpected schedule item: %s"
                    % type(sched_item).__name__)

    return result.dict
def generate_code_v2(kernel):
    """Generate code for *kernel*.

    A kernel in the ``INITIAL`` state is preprocessed and a kernel without
    a schedule is scheduled before code generation. If
    ``loopy.CACHING_ENABLED`` is set, the result is looked up in and
    stored to ``code_gen_cache``, keyed on the scheduled kernel.

    :raises LoopyError: if the kernel is not in the ``SCHEDULED`` state
        after the steps above.
    :raises ValueError: if an argument is neither an ``ArrayBase`` nor a
        ``ValueArg``.
    :returns: a :class:`CodeGenerationResult`
    """

    from loopy.kernel import kernel_state
    if kernel.state == kernel_state.INITIAL:
        from loopy.preprocess import preprocess_kernel
        kernel = preprocess_kernel(kernel)

    if kernel.schedule is None:
        from loopy.schedule import get_one_scheduled_kernel
        kernel = get_one_scheduled_kernel(kernel)

    if kernel.state != kernel_state.SCHEDULED:
        raise LoopyError("cannot generate code for a kernel that has not been "
                "scheduled")

    # {{{ cache retrieval

    from loopy import CACHING_ENABLED

    if CACHING_ENABLED:
        # Remember the kernel as it is now: type inference below rebinds
        # 'kernel', but the cache is keyed on the pre-inference kernel.
        input_kernel = kernel
        try:
            result = code_gen_cache[input_kernel]
            logger.debug("%s: code generation cache hit" % kernel.name)
            return result
        except KeyError:
            pass

    # }}}

    from loopy.preprocess import infer_unknown_types
    kernel = infer_unknown_types(kernel, expect_completion=True)

    from loopy.check import pre_codegen_checks
    pre_codegen_checks(kernel)

    logger.info("%s: generate code: start" % kernel.name)

    # {{{ examine arg list

    from loopy.kernel.data import ValueArg
    from loopy.kernel.array import ArrayBase

    implemented_data_info = []

    # The set of written variables does not depend on the argument, so it
    # is looked up once instead of once per argument.
    written_variables = kernel.get_written_variables()

    for arg in kernel.args:
        is_written = arg.name in written_variables
        if isinstance(arg, ArrayBase):
            implemented_data_info.extend(
                    arg.decl_info(
                        kernel.target,
                        is_written=is_written,
                        index_dtype=kernel.index_dtype))

        elif isinstance(arg, ValueArg):
            implemented_data_info.append(ImplementedDataInfo(
                target=kernel.target,
                name=arg.name,
                dtype=arg.dtype,
                arg_class=ValueArg,
                is_written=is_written))

        else:
            raise ValueError("argument type not understood: '%s'" % type(arg))

    allow_complex = False
    for var in kernel.args + list(six.itervalues(kernel.temporary_variables)):
        if var.dtype.involves_complex():
            allow_complex = True

    # }}}

    # These sets are handed to the code generation state and read again
    # below when the preambles are assembled.
    seen_dtypes = set()
    seen_functions = set()
    seen_atomic_dtypes = set()

    initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions)
    codegen_state = CodeGenerationState(
            kernel=kernel,
            implemented_data_info=implemented_data_info,
            implemented_domain=initial_implemented_domain,
            implemented_predicates=frozenset(),
            seen_dtypes=seen_dtypes,
            seen_functions=seen_functions,
            seen_atomic_dtypes=seen_atomic_dtypes,
            var_subst_map={},
            allow_complex=allow_complex,
            var_name_generator=kernel.get_var_name_generator(),
            is_generating_device_code=False,
            gen_program_name=(
                kernel.target.host_program_name_prefix
                + kernel.name
                + kernel.target.host_program_name_suffix),
            schedule_index_end=len(kernel.schedule))

    from loopy.codegen.result import generate_host_or_device_program
    codegen_result = generate_host_or_device_program(
            codegen_state,
            schedule_index=0)

    device_code_str = codegen_result.device_code()

    from loopy.check import check_implemented_domains
    # NOTE: as an assert statement, this check does not run under
    # ``python -O``.
    assert check_implemented_domains(kernel,
            codegen_result.implemented_domains,
            device_code_str)

    # {{{ handle preambles

    for arg in kernel.args:
        seen_dtypes.add(arg.dtype)
    for tv in six.itervalues(kernel.temporary_variables):
        seen_dtypes.add(tv.dtype)

    preambles = kernel.preambles[:]

    preamble_info = PreambleInfo(
            kernel=kernel,
            seen_dtypes=seen_dtypes,
            seen_functions=seen_functions,
            # a set of LoopyTypes (!)
            seen_atomic_dtypes=seen_atomic_dtypes)

    preamble_generators = (
            kernel.preamble_generators
            + kernel.target.get_device_ast_builder().preamble_generators())
    for prea_gen in preamble_generators:
        preambles.extend(prea_gen(preamble_info))

    codegen_result = codegen_result.copy(device_preambles=preambles)

    # }}}

    logger.info("%s: generate code: done" % kernel.name)

    if CACHING_ENABLED:
        code_gen_cache[input_kernel] = codegen_result

    return codegen_result
def get_synchronization_poly(knl):
    """Count the number of synchronization events each thread encounters in
    a loopy kernel.

    :parameter knl: A :class:`loopy.LoopKernel` whose barriers are to be
        counted.

    :return: A dictionary mapping each type of synchronization event to a
        :class:`islpy.PwQPolynomial` holding the number of such events
        per thread.

        Possible keys include ``barrier_local``, ``barrier_global``
        (if supported by the target) and ``kernel_launch``.

    :raises LoopyError: if the schedule contains an item of a type that
        is not handled here.

    Example usage::

        # (first create loopy kernel and specify array data types)

        sync_poly = get_synchronization_poly(knl)
        params = {'n': 512, 'm': 256, 'l': 128}
        barrier_count = sync_poly['barrier_local'].eval_with_dict(params)

        # (now use this count to predict performance)

    """
    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    from loopy.schedule import (EnterLoop, LeaveLoop, Barrier,
            CallKernel, ReturnFromKernel, RunInstruction)

    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)

    # inames of the loops that are open at the current schedule position
    iname_list = []

    result = ToCountMap()

    one = isl.PwQPolynomial('{ 1 }')

    def get_count_poly(iname_list):
        """Return how often a schedule item nested in *iname_list* runs."""
        if iname_list:  # (if iname_list is not empty)
            # The count used to be wrapped in a one-element tuple and
            # folded with reduce(mul, ...), which returns that single
            # element unchanged.
            return count(
                    knl,
                    knl.get_inames_domain(iname_list)
                    .project_out_except(iname_list, [dim_type.set]))
        else:
            return one

    for sched_item in knl.schedule:
        if isinstance(sched_item, EnterLoop):
            if sched_item.iname:  # (if not empty)
                iname_list.append(sched_item.iname)
        elif isinstance(sched_item, LeaveLoop):
            if sched_item.iname:  # (if not empty)
                iname_list.pop()

        elif isinstance(sched_item, Barrier):
            result = result + ToCountMap(
                    {"barrier_%s" % sched_item.kind:
                        get_count_poly(iname_list)})

        elif isinstance(sched_item, CallKernel):
            result = result + ToCountMap(
                    {"kernel_launch": get_count_poly(iname_list)})

        elif isinstance(sched_item, (ReturnFromKernel, RunInstruction)):
            pass

        else:
            raise LoopyError("unexpected schedule item: %s"
                    % type(sched_item).__name__)

    return result.dict
def get_gmem_access_poly(knl, numpy_types=True):  # for now just counting subscripts
    """Count the number of global memory accesses in a loopy kernel.

    :parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be
        counted.

    :parameter numpy_types: If *True* (the default), the *type* in each key
        of the result is converted via its ``numpy_dtype`` attribute;
        otherwise the internal loopy type is kept.

    :return: A mapping of **{(** *type* **,** :class:`string` **,**
        :class:`string` **)** **:** :class:`islpy.PwQPolynomial` **}**.

        - The *type* specifies the type of the data being accessed. This
          can be a :class:`numpy.dtype` if *numpy_types* is True,
          otherwise the internal loopy type.

        - The first string in the map key specifies the global memory
          access type as *consecutive*, *nonconsecutive*, or *uniform*.

        - The second string in the map key specifies the global memory
          access type as a *load*, or a *store*.

        - The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses
          with the characteristics specified in the key (in terms of the
          :class:`loopy.LoopKernel` *inames*).

    Example usage::

        # (first create loopy kernel and specify array data types)

        subscript_map = get_gmem_access_poly(knl)
        params = {'n': 512, 'm': 256, 'l': 128}

        f32_uncoalesced_load = subscript_map[
                            (np.dtype(np.float32), 'nonconsecutive', 'load')
                            ].eval_with_dict(params)
        f32_coalesced_load = subscript_map[
                            (np.dtype(np.float32), 'consecutive', 'load')
                            ].eval_with_dict(params)
        f32_coalesced_store = subscript_map[
                            (np.dtype(np.float32), 'consecutive', 'store')
                            ].eval_with_dict(params)

        # (now use these counts to predict performance)

    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types

    class CacheHolder(object):
        pass

    # Fresh holder per call, so memoized counts live only for this call.
    cache_holder = CacheHolder()

    @memoize_in(cache_holder, "insn_count")
    def get_insn_count(knl, insn_inames, uniform=False):
        if uniform:
            # For uniform accesses, inames tagged as local indices do not
            # contribute to the count.
            from loopy.kernel.data import LocalIndexTag
            insn_inames = [
                    iname for iname in insn_inames
                    if not isinstance(
                        knl.iname_to_tag.get(iname), LocalIndexTag)]
        inames_domain = knl.get_inames_domain(insn_inames)
        domain = (inames_domain.project_out_except(
                                insn_inames, [dim_type.set]))
        return count(knl, domain)

    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)

    subs_poly = ToCountMap()
    subscript_counter = GlobalSubscriptCounter(knl)
    for insn in knl.instructions:
        # count subscripts, distinguishing loads and stores
        subs_expr = subscript_counter(insn.expression)
        subs_expr = ToCountMap(dict(
            (key + ("load",), val)
            for key, val in six.iteritems(subs_expr.dict)))
        subs_assignee = subscript_counter(insn.assignee)
        subs_assignee = ToCountMap(dict(
            (key + ("store",), val)
            for key, val in six.iteritems(subs_assignee.dict)))

        insn_inames = knl.insn_inames(insn)

        # Loads first, then stores: both are accumulated the same way, so
        # one loop replaces the two copies that used to be here.
        for tagged_subs in (subs_expr, subs_assignee):
            for key in tagged_subs.dict:
                poly = ToCountMap({key: tagged_subs.dict[key]})
                # use count excluding local index tags for uniform accesses
                if key[1] == "uniform":
                    insn_count = get_insn_count(knl, insn_inames, True)
                else:
                    insn_count = get_insn_count(knl, insn_inames)
                subs_poly = subs_poly + poly*insn_count

    result = subs_poly.dict

    if numpy_types:
        result = dict(
                ((dtype.numpy_dtype, kind, direction), poly)
                for (dtype, kind, direction), poly in six.iteritems(result))

    return result
def auto_test_vs_ref(
        ref_knl, ctx, test_knl=None, op_count=[], op_label=[], parameters={},
        print_ref_code=False, print_code=True, warmup_rounds=2,
        dump_binary=False,
        fills_entire_output=None, do_check=True, check_result=None,
        max_test_kernel_count=1,
        quiet=False, blacklist_ref_vendors=[]):
    """Compare results of `ref_knl` to the kernels generated by
    scheduling *test_knl*.

    :arg test_knl: if *None*, *ref_knl* is used as the test kernel and
        result checking is turned off.
    :arg op_count: a list of operation counts; together with *op_label*
        it is used to report rates (count divided by elapsed time).
    :arg op_label: a list of labels matching *op_count*.
    :arg warmup_rounds: number of untimed runs before timing starts. The
        first timing attempt uses this many runs as well, but at least one.
    :arg fills_entire_output: deprecated; passing anything other than
        *None* only triggers a :class:`DeprecationWarning`.
    :arg check_result: a callable with :class:`numpy.ndarray` arguments
        *(result, reference_result)* returning a tuple
        (:class:`bool`, message) indicating correctness/acceptability of
        the result
    :arg max_test_kernel_count: Stop testing after this many *test_knl*

    :raises LoopyError: if the argument lists of the two kernels disagree
        or no device is usable for the reference computation.
    :raises AutomaticTestFailure: if *check_result* rejects a result.
    :returns: a dict with the keys ``elapsed_event``,
        ``elapsed_event_marker``, ``elapsed_wall`` and ``timing_rounds``
        and, if checking is enabled, ``ref_elapsed_event`` and
        ``ref_elapsed_wall``.
    """
    # NOTE(review): the list/dict defaults above are shared between calls.
    # They are only read or rebound in this function, but 'parameters' is
    # handed on to make_ref_args/make_args -- confirm those do not mutate it.

    import pyopencl as cl

    if test_knl is None:
        test_knl = ref_knl
        do_check = False

    if len(ref_knl.args) != len(test_knl.args):
        raise LoopyError("ref_knl and test_knl do not have the same number "
                "of arguments")

    for i, (ref_arg, test_arg) in enumerate(zip(ref_knl.args, test_knl.args)):
        if ref_arg.name != test_arg.name:
            raise LoopyError(
                    "ref_knl and test_knl argument lists disagree at index "
                    "%d (1-based)" % (i+1))

        if ref_arg.dtype != test_arg.dtype:
            raise LoopyError(
                    "ref_knl and test_knl argument lists disagree at index "
                    "%d (1-based)" % (i+1))

    from loopy.compiled import CompiledKernel, get_highlighted_cl_code

    if isinstance(op_count, (int, float)):
        warn("op_count should be a list", stacklevel=2)
        op_count = [op_count]
    if isinstance(op_label, str):
        warn("op_label should be a list", stacklevel=2)
        op_label = [op_label]

    from time import time

    if check_result is None:
        check_result = _default_check_result

    if fills_entire_output is not None:
        warn("fills_entire_output is deprecated", DeprecationWarning,
                stacklevel=2)

    # {{{ compile and run reference code

    from loopy.preprocess import infer_unknown_types
    ref_knl = infer_unknown_types(ref_knl, expect_completion=True)

    found_ref_device = False

    ref_errors = []

    for dev in _enumerate_cl_devices_for_ref_test(blacklist_ref_vendors):
        ref_ctx = cl.Context([dev])
        ref_queue = cl.CommandQueue(
                ref_ctx,
                properties=cl.command_queue_properties.PROFILING_ENABLE)

        pp_ref_knl = lp.preprocess_kernel(ref_knl)

        # only the first generated schedule is used for the reference
        for knl in lp.generate_loop_schedules(pp_ref_knl):
            ref_sched_kernel = knl
            break

        logger.info("%s (ref): trying %s for the reference calculation" % (
            ref_knl.name, dev))

        ref_compiled = CompiledKernel(ref_ctx, ref_sched_kernel)
        if not quiet and print_ref_code:
            print(75*"-")
            print("Reference Code:")
            print(75*"-")
            print(get_highlighted_cl_code(ref_compiled.code))
            print(75*"-")

        ref_cl_kernel_info = ref_compiled.cl_kernel_info(frozenset())

        try:
            ref_args, ref_arg_data = \
                    make_ref_args(ref_sched_kernel,
                            ref_cl_kernel_info.implemented_data_info,
                            ref_queue, parameters)
            ref_args["out_host"] = False
        except cl.RuntimeError as e:
            if e.code == cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED:
                # record the failure and try the next device
                import traceback
                ref_errors.append("\n".join([
                    75*"-",
                    "On %s:" % dev,
                    75*"-",
                    traceback.format_exc(),
                    75*"-"]))

                continue
            else:
                raise

        found_ref_device = True

        if not do_check:
            # without checking, only the reference arguments are needed
            break

        ref_queue.finish()

        logger.info("%s (ref): using %s for the reference calculation" % (
            ref_knl.name, dev))
        logger.info("%s (ref): run" % ref_knl.name)

        ref_start = time()

        if not AUTO_TEST_SKIP_RUN:
            ref_evt, _ = ref_compiled(ref_queue, **ref_args)
        else:
            ref_evt = cl.enqueue_marker(ref_queue)

        ref_queue.finish()
        ref_stop = time()
        ref_elapsed_wall = ref_stop-ref_start

        logger.info("%s (ref): run done" % ref_knl.name)

        ref_evt.wait()
        ref_elapsed_event = 1e-9*(ref_evt.profile.END-ref_evt.profile.START)

        break

    if not found_ref_device:
        raise LoopyError("could not find a suitable device for the "
                "reference computation.\n"
                "These errors were encountered:\n"+"\n".join(ref_errors))

    # }}}

    # {{{ compile and run parallel code

    need_check = do_check

    queue = cl.CommandQueue(
            ctx,
            properties=cl.command_queue_properties.PROFILING_ENABLE)

    args = None
    from loopy.kernel import kernel_state
    if test_knl.state not in [
            kernel_state.PREPROCESSED,
            kernel_state.SCHEDULED]:
        test_knl = lp.preprocess_kernel(test_knl)

    if not test_knl.schedule:
        test_kernels = lp.generate_loop_schedules(test_knl)
    else:
        test_kernels = [test_knl]

    test_kernel_count = 0

    from loopy.preprocess import infer_unknown_types
    for i, kernel in enumerate(test_kernels):
        test_kernel_count += 1
        if test_kernel_count > max_test_kernel_count:
            break

        kernel = infer_unknown_types(kernel, expect_completion=True)

        compiled = CompiledKernel(ctx, kernel)

        if args is None:
            # arguments are built once and reused for all test kernels
            cl_kernel_info = compiled.cl_kernel_info(frozenset())

            args = make_args(kernel,
                    cl_kernel_info.implemented_data_info,
                    queue, ref_arg_data, parameters)
        args["out_host"] = False

        if not quiet:
            print(75*"-")
            print("Kernel #%d:" % i)
            print(75*"-")
            if print_code:
                print(compiled.get_highlighted_code())
                print(75*"-")
            if dump_binary:
                print(type(compiled.cl_program))
                print(compiled.cl_program.binaries[0])
                print(75*"-")

        # These messages used to read the name from 'knl', the variable
        # left over from the reference scheduling loop above, rather than
        # from the kernel under test.
        logger.info("%s: run warmup" % (kernel.name))

        # '_' keeps the kernel index 'i' from being overwritten.
        for _ in range(warmup_rounds):
            if not AUTO_TEST_SKIP_RUN:
                compiled(queue, **args)

            if need_check and not AUTO_TEST_SKIP_RUN:
                for arg_desc in ref_arg_data:
                    if arg_desc is None:
                        continue
                    if not arg_desc.needs_checking:
                        continue

                    from pyopencl.compyte.array import as_strided
                    ref_ary = as_strided(
                            arg_desc.ref_storage_array.get(),
                            shape=arg_desc.ref_shape,
                            strides=arg_desc.ref_numpy_strides).flatten()
                    test_ary = as_strided(
                            arg_desc.test_storage_array.get(),
                            shape=arg_desc.test_shape,
                            strides=arg_desc.test_numpy_strides).flatten()
                    # compare only the part both arrays have in common
                    common_len = min(len(ref_ary), len(test_ary))
                    ref_ary = ref_ary[:common_len]
                    test_ary = test_ary[:common_len]

                    error_is_small, error = check_result(test_ary, ref_ary)
                    if not error_is_small:
                        raise AutomaticTestFailure(error)

                    need_check = False

        events = []
        queue.finish()

        logger.info("%s: warmup done" % (kernel.name))

        logger.info("%s: timing run" % (kernel.name))

        # At least one timed run is needed: with zero rounds 'events'
        # would stay empty and events[-1] below would raise IndexError.
        timing_rounds = max(warmup_rounds, 1)

        while True:
            from time import time
            start_time = time()

            evt_start = cl.enqueue_marker(queue)

            for _ in range(timing_rounds):
                if not AUTO_TEST_SKIP_RUN:
                    evt, _ = compiled(queue, **args)
                    events.append(evt)
                else:
                    events.append(cl.enqueue_marker(queue))

            evt_end = cl.enqueue_marker(queue)

            queue.finish()
            stop_time = time()

            for evt in events:
                evt.wait()
            evt_start.wait()
            evt_end.wait()

            # NOTE(review): 'events' is not reset between attempts, so
            # after a retry events[0] still belongs to the first attempt
            # while the divisor is the current round count -- confirm
            # this is intended.
            elapsed_event = (1e-9*events[-1].profile.END
                    - 1e-9*events[0].profile.START) \
                    / timing_rounds
            try:
                elapsed_event_marker = ((1e-9*evt_end.profile.START
                            - 1e-9*evt_start.profile.START)
                        / timing_rounds)
            except cl.RuntimeError:
                elapsed_event_marker = None

            elapsed_wall = (stop_time-start_time)/timing_rounds

            # retry with four times the rounds until an attempt takes at
            # least 0.3 s of wall time in total
            if elapsed_wall * timing_rounds < 0.3:
                timing_rounds *= 4
            else:
                break

        logger.info("%s: timing run done" % (kernel.name))

        rates = ""
        for cnt, lbl in zip(op_count, op_label):
            rates += " %g %s/s" % (cnt/elapsed_wall, lbl)

        if not quiet:
            def format_float_or_none(v):
                if v is None:
                    return "<unavailable>"
                else:
                    return "%g" % v

            print("elapsed: %s s event, %s s marker-event %s s wall "
                    "(%d rounds)%s" % (
                        format_float_or_none(elapsed_event),
                        format_float_or_none(elapsed_event_marker),
                        format_float_or_none(elapsed_wall),
                        timing_rounds, rates))

        if do_check:
            ref_rates = ""
            for cnt, lbl in zip(op_count, op_label):
                ref_rates += " %g %s/s" % (cnt/ref_elapsed_event, lbl)
            if not quiet:
                print("ref: elapsed: %g s event, %g s wall%s" % (
                        ref_elapsed_event, ref_elapsed_wall, ref_rates))

    # }}}

    result_dict = {}
    result_dict["elapsed_event"] = elapsed_event
    result_dict["elapsed_event_marker"] = elapsed_event_marker
    result_dict["elapsed_wall"] = elapsed_wall
    result_dict["timing_rounds"] = timing_rounds

    if do_check:
        result_dict["ref_elapsed_event"] = ref_elapsed_event
        result_dict["ref_elapsed_wall"] = ref_elapsed_wall

    return result_dict
def get_gmem_access_poly(knl):  # for now just counting subscripts
    """Count the number of global memory accesses in a loopy kernel.

    Types are inferred and the kernel is preprocessed before counting.

    :parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be
                    counted.

    :return: A mapping of **{(** :class:`numpy.dtype` **,** :class:`string`
             **,** :class:`string` **)** **:** :class:`islpy.PwQPolynomial`
             **}**.

             - The :class:`numpy.dtype` specifies the type of the data being
               accessed.

             - The first string in the map key specifies the global memory
               access type as *consecutive*, *nonconsecutive*, or *uniform*.

             - The second string in the map key specifies the global memory
               access type as a *load*, or a *store*.

             - The :class:`islpy.PwQPolynomial` holds the number of DRAM
               accesses with the characteristics specified in the key (in
               terms of the :class:`loopy.LoopKernel` *inames*).

    Example usage::

        # (first create loopy kernel and specify array data types)

        subscript_map = get_gmem_access_poly(knl)
        params = {'n': 512, 'm': 256, 'l': 128}

        # NOTE: the return value is already a plain mapping; index it
        # directly.
        f32_uncoalesced_load = subscript_map[
                            (np.dtype(np.float32), 'nonconsecutive', 'load')
                            ].eval_with_dict(params)
        f32_coalesced_load = subscript_map[
                            (np.dtype(np.float32), 'consecutive', 'load')
                            ].eval_with_dict(params)
        f32_coalesced_store = subscript_map[
                            (np.dtype(np.float32), 'consecutive', 'store')
                            ].eval_with_dict(params)

        # (now use these counts to predict performance)

    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)

    def tag_direction(count_map, direction):
        # Extend every key of *count_map* with the access direction.
        return ToCountMap(dict(
            (key + (direction,), val)
            for key, val in six.iteritems(count_map.dict)))

    subs_poly = ToCountMap()
    subscript_counter = GlobalSubscriptCounter(knl)

    for insn in knl.instructions:
        # Number of times this instruction executes: the size of its
        # iname domain with everything but its own inames projected out.
        insn_inames = knl.insn_inames(insn)
        inames_domain = knl.get_inames_domain(insn_inames)
        domain = inames_domain.project_out_except(insn_inames, [dim_type.set])

        # Right-hand side subscripts are loads, the assignee is a store.
        subs_expr = tag_direction(subscript_counter(insn.expression), "load")
        subs_assignee = tag_direction(
                subscript_counter(insn.assignee), "store")

        subs_poly = subs_poly + (subs_expr + subs_assignee)*count(knl, domain)

    return subs_poly.dict
def generate_code(kernel, device=None):
    """Generate code for *kernel*.

    The kernel is scheduled first if it has no schedule yet. When caching
    is enabled, a previously generated result for the same (scheduled)
    kernel is returned directly, and fresh results are stored in the cache.

    :arg kernel: the kernel to generate code for
    :arg device: deprecated; passing anything but *None* only triggers a
        :class:`DeprecationWarning`
    :returns: a tuple ``(code_str, impl_arg_info)`` of the generated code
        (with de-duplicated preambles prepended) and the list of
        implemented argument descriptors
    :raises LoopyError: if the kernel is not in the ``SCHEDULED`` state
    :raises ValueError: if an argument is neither an array nor a value
        argument
    """

    if device is not None:
        from warnings import warn
        warn("passing 'device' to generate_code() is deprecated",
                DeprecationWarning, stacklevel=2)

    if kernel.schedule is None:
        from loopy.schedule import get_one_scheduled_kernel
        kernel = get_one_scheduled_kernel(kernel)

    from loopy.kernel import kernel_state
    if kernel.state != kernel_state.SCHEDULED:
        raise LoopyError("cannot generate code for a kernel that has not been "
                "scheduled")

    # {{{ cache retrieval

    from loopy import CACHING_ENABLED

    if CACHING_ENABLED:
        # The cache is keyed on the kernel as it looks *before* type
        # inference; remember it for the store at the end.
        input_kernel = kernel
        try:
            cached = code_gen_cache[input_kernel]
            logger.info("%s: code generation cache hit" % kernel.name)
            return cached
        except KeyError:
            pass

    # }}}

    from loopy.preprocess import infer_unknown_types
    kernel = infer_unknown_types(kernel, expect_completion=True)

    from loopy.check import pre_codegen_checks
    pre_codegen_checks(kernel)

    logger.info("%s: generate code: start" % kernel.name)

    # {{{ examine arg list

    from loopy.kernel.data import ValueArg
    from loopy.kernel.array import ArrayBase

    arg_infos = []

    for arg in kernel.args:
        if isinstance(arg, ArrayBase):
            arg_infos.extend(
                    arg.decl_info(
                        kernel.target,
                        is_written=arg.name in kernel.get_written_variables(),
                        index_dtype=kernel.index_dtype))
            continue

        if not isinstance(arg, ValueArg):
            raise ValueError("argument type not understood: '%s'" % type(arg))

        arg_infos.append(ImplementedDataInfo(
            target=kernel.target,
            name=arg.name,
            dtype=arg.dtype,
            cgen_declarator=arg.get_arg_decl(kernel.target),
            arg_class=ValueArg))

    # A list (not a generator) so that every dtype is inspected, exactly
    # as a plain loop over all variables would.
    allow_complex = any([
        var.dtype.kind == "c"
        for var in (
            kernel.args + list(six.itervalues(kernel.temporary_variables)))])

    # }}}

    dtypes_seen = set()
    functions_seen = set()

    codegen_state = CodeGenerationState(
            kernel=kernel,
            implemented_domain=isl.BasicSet.from_params(kernel.assumptions),
            implemented_predicates=frozenset(),
            seen_dtypes=dtypes_seen,
            seen_functions=functions_seen,
            var_subst_map={},
            allow_complex=allow_complex)

    body_code, implemented_domains = kernel.target.generate_code(
            kernel, codegen_state, arg_infos)

    from loopy.check import check_implemented_domains
    assert check_implemented_domains(kernel, implemented_domains,
            body_code)

    # {{{ handle preambles

    # Argument and temporary dtypes count as "seen" for the preamble
    # generators, in addition to whatever code generation recorded.
    dtypes_seen.update(arg.dtype for arg in kernel.args)
    dtypes_seen.update(
            tv.dtype for tv in six.itervalues(kernel.temporary_variables))

    preambles = kernel.preambles[:]

    all_generators = (kernel.preamble_generators
            + kernel.target.preamble_generators())
    for generator in all_generators:
        preambles.extend(generator(kernel, dtypes_seen, functions_seen))

    # Keep only the first preamble for each tag, in tag-sorted order.
    emitted_tags = set()
    unique_preambles = []

    for tag, preamble in sorted(preambles, key=lambda tag_code: tag_code[0]):
        if tag not in emitted_tags:
            emitted_tags.add(tag)
            unique_preambles.append(preamble)

    from loopy.tools import remove_common_indentation
    preamble_text = "".join([
            remove_common_indentation(lines) + "\n"
            for lines in unique_preambles])

    full_code = preamble_text + body_code

    # }}}

    logger.info("%s: generate code: done" % kernel.name)

    result = full_code, arg_infos

    if CACHING_ENABLED:
        code_gen_cache[input_kernel] = result

    return result
def get_gmem_access_poly(knl, numpy_types=True):  # for now just counting subscripts
    """Count the number of global memory accesses in a loopy kernel.

    Types are inferred and the kernel is preprocessed before counting.

    :parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be
                    counted.

    :parameter numpy_types: If *True* (the default), the type entry of each
                    key in the result is converted to its ``numpy_dtype``;
                    otherwise the keys are left as produced by the
                    subscript counter.

    :return: A mapping of **{(** *type* **,** :class:`string` **,**
             :class:`string` **)** **:** :class:`islpy.PwQPolynomial` **}**.

             - The *type* specifies the type of the data being accessed. This
               can be a :class:`numpy.dtype` if *numpy_types* is True,
               otherwise the internal loopy type.

             - The first string in the map key specifies the global memory
               access type as *consecutive*, *nonconsecutive*, or *uniform*.

             - The second string in the map key specifies the global memory
               access type as a *load*, or a *store*.

             - The :class:`islpy.PwQPolynomial` holds the number of DRAM
               accesses with the characteristics specified in the key (in
               terms of the :class:`loopy.LoopKernel` *inames*).

    Example usage::

        # (first create loopy kernel and specify array data types)

        subscript_map = get_gmem_access_poly(knl)
        params = {'n': 512, 'm': 256, 'l': 128}

        # NOTE: the return value is already a plain mapping; index it
        # directly.
        f32_uncoalesced_load = subscript_map[
                            (np.dtype(np.float32), 'nonconsecutive', 'load')
                            ].eval_with_dict(params)
        f32_coalesced_load = subscript_map[
                            (np.dtype(np.float32), 'consecutive', 'load')
                            ].eval_with_dict(params)
        f32_coalesced_store = subscript_map[
                            (np.dtype(np.float32), 'consecutive', 'store')
                            ].eval_with_dict(params)

        # (now use these counts to predict performance)

    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types

    class CacheHolder(object):
        pass

    # The memoization cache is attached to this object, which is created
    # anew on every call, so cached counts do not outlive the call.
    cache_holder = CacheHolder()

    @memoize_in(cache_holder, "insn_count")
    def get_insn_count(knl, insn_inames, uniform=False):
        if uniform:
            # For uniform accesses, inames tagged with a local index tag
            # are left out of the count.
            from loopy.kernel.data import LocalIndexTag
            insn_inames = [iname for iname in insn_inames if not
                           isinstance(knl.iname_to_tag.get(iname), LocalIndexTag)]
        inames_domain = knl.get_inames_domain(insn_inames)
        domain = (inames_domain.project_out_except(
                                insn_inames, [dim_type.set]))
        return count(knl, domain)

    def tag_direction(count_map, direction):
        # Extend every key of *count_map* with the access direction.
        return ToCountMap(dict(
            (key + (direction,), val)
            for key, val in six.iteritems(count_map.dict)))

    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)

    subs_poly = ToCountMap()
    subscript_counter = GlobalSubscriptCounter(knl)

    for insn in knl.instructions:
        # count subscripts, distinguishing loads and stores
        subs_expr = tag_direction(subscript_counter(insn.expression), "load")
        subs_assignee = tag_direction(
                subscript_counter(insn.assignee), "store")

        insn_inames = knl.insn_inames(insn)

        # Loads first, then stores; each key is scaled by the execution
        # count of the instruction.
        for tagged in (subs_expr, subs_assignee):
            for key, val in six.iteritems(tagged.dict):
                # use count excluding local index tags for uniform accesses
                if key[1] == "uniform":
                    insn_count = get_insn_count(knl, insn_inames, True)
                else:
                    insn_count = get_insn_count(knl, insn_inames)

                subs_poly = subs_poly + ToCountMap({key: val})*insn_count

    result = subs_poly.dict

    if numpy_types:
        result = dict(
                ((dtype.numpy_dtype, kind, direction), poly)
                for (dtype, kind, direction), poly
                in six.iteritems(result))

    return result
def _add_and_infer_dtypes_overdetermined(knl, dtype_dict):
    """Apply *dtype_dict* to *knl* via :func:`_add_dtypes_overdetermined`,
    then run type inference on the result, requiring it to complete.

    :returns: the kernel returned by type inference
    """
    from loopy.preprocess import infer_unknown_types

    typed_knl = _add_dtypes_overdetermined(knl, dtype_dict)
    return infer_unknown_types(typed_knl, expect_completion=True)
def auto_test_vs_ref(
        ref_knl, ctx, test_knl=None, op_count=[], op_label=[], parameters={},
        print_ref_code=False, print_code=True, warmup_rounds=2,
        dump_binary=False,
        fills_entire_output=None, do_check=True, check_result=None,
        max_test_kernel_count=1,
        quiet=False, blacklist_ref_vendors=[]):
    """Compare results of `ref_knl` to the kernels generated by
    scheduling *test_knl*.

    :arg ref_knl: the kernel providing the reference results
    :arg ctx: the context in which the test kernels are compiled and run
    :arg test_knl: the kernel under test. If *None*, *ref_knl* is used
        and result checking is turned off.
    :arg op_count: a list of operation counts; each is divided by the
        elapsed time to print a rate
    :arg op_label: a list of labels, matched up with *op_count*
    :arg parameters: handed to the argument-construction helpers
        (presumably values for the kernel's parameters -- confirm against
        ``make_ref_args``/``make_args``)
    :arg print_ref_code: print the reference code (unless *quiet*)
    :arg print_code: print the code of each test kernel (unless *quiet*)
    :arg warmup_rounds: number of untimed runs of each test kernel. Result
        checking happens during the first warm-up run. Also the initial
        number of timing rounds (at least one).
    :arg dump_binary: print the first program binary (unless *quiet*)
    :arg fills_entire_output: deprecated, only triggers a warning
    :arg do_check: whether to run the reference kernel and compare results
    :arg check_result: a callable with :class:`numpy.ndarray` arguments
        *(result, reference_result)* returning a tuple
        (:class:`bool`, message) indicating correctness/acceptability of
        the result
    :arg max_test_kernel_count: Stop testing after this many *test_knl*
    :arg quiet: suppress all printed output
    :arg blacklist_ref_vendors: handed to the helper enumerating candidate
        devices for the reference run

    :returns: a :class:`dict` with the keys ``elapsed_event``,
        ``elapsed_event_marker``, ``elapsed_wall`` and ``timing_rounds``
        (from the last test kernel run) and, if checking was enabled,
        ``ref_elapsed_event`` and ``ref_elapsed_wall``
    :raises LoopyError: if the argument lists of the two kernels disagree
        or no device for the reference computation could be found
    :raises AutomaticTestFailure: if *check_result* rejects a result
    """

    import pyopencl as cl

    if test_knl is None:
        test_knl = ref_knl
        do_check = False

    if len(ref_knl.args) != len(test_knl.args):
        raise LoopyError("ref_knl and test_knl do not have the same number "
                "of arguments")

    for i, (ref_arg, test_arg) in enumerate(zip(ref_knl.args, test_knl.args)):
        if ref_arg.name != test_arg.name:
            raise LoopyError("ref_knl and test_knl argument lists disagree at index "
                    "%d (1-based)" % (i+1))

        if ref_arg.dtype != test_arg.dtype:
            raise LoopyError("ref_knl and test_knl argument lists disagree at index "
                    "%d (1-based)" % (i+1))

    from loopy.compiled import CompiledKernel, get_highlighted_cl_code

    if isinstance(op_count, (int, float)):
        warn("op_count should be a list", stacklevel=2)
        op_count = [op_count]
    if isinstance(op_label, str):
        warn("op_label should be a list", stacklevel=2)
        op_label = [op_label]

    from time import time

    if check_result is None:
        check_result = _default_check_result

    if fills_entire_output is not None:
        warn("fills_entire_output is deprecated", DeprecationWarning, stacklevel=2)

    # {{{ compile and run reference code

    from loopy.preprocess import infer_unknown_types
    ref_knl = infer_unknown_types(ref_knl, expect_completion=True)

    found_ref_device = False

    ref_errors = []

    for dev in _enumerate_cl_devices_for_ref_test(blacklist_ref_vendors):
        ref_ctx = cl.Context([dev])
        ref_queue = cl.CommandQueue(ref_ctx,
                properties=cl.command_queue_properties.PROFILING_ENABLE)

        pp_ref_knl = lp.preprocess_kernel(ref_knl)

        # Only the first schedule is used for the reference run.
        for ref_sched_kernel in lp.generate_loop_schedules(pp_ref_knl):
            break

        logger.info("%s (ref): trying %s for the reference calculation" % (
            ref_knl.name, dev))

        ref_compiled = CompiledKernel(ref_ctx, ref_sched_kernel)
        if not quiet and print_ref_code:
            print(75*"-")
            print("Reference Code:")
            print(75*"-")
            print(get_highlighted_cl_code(ref_compiled.code))
            print(75*"-")

        ref_cl_kernel_info = ref_compiled.cl_kernel_info(frozenset())

        try:
            ref_args, ref_arg_data = \
                    make_ref_args(ref_sched_kernel,
                            ref_cl_kernel_info.implemented_data_info,
                            ref_queue, parameters)
            ref_args["out_host"] = False
        except cl.RuntimeError as e:
            if e.code == cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED:
                # This device cannot hold the arguments; record why and
                # try the next one.
                import traceback
                ref_errors.append("\n".join([
                    75*"-",
                    "On %s:" % dev,
                    75*"-",
                    traceback.format_exc(),
                    75*"-"]))

                continue
            else:
                raise

        found_ref_device = True

        if not do_check:
            # The reference arguments are still needed below, but the
            # reference kernel itself need not run.
            break

        ref_queue.finish()

        logger.info("%s (ref): using %s for the reference calculation" % (
            ref_knl.name, dev))
        logger.info("%s (ref): run" % ref_knl.name)

        ref_start = time()

        if not AUTO_TEST_SKIP_RUN:
            ref_evt, _ = ref_compiled(ref_queue, **ref_args)
        else:
            ref_evt = cl.enqueue_marker(ref_queue)

        ref_queue.finish()
        ref_stop = time()
        ref_elapsed_wall = ref_stop-ref_start

        logger.info("%s (ref): run done" % ref_knl.name)

        ref_evt.wait()
        ref_elapsed_event = 1e-9*(ref_evt.profile.END-ref_evt.profile.START)

        break

    if not found_ref_device:
        raise LoopyError("could not find a suitable device for the "
                "reference computation.\n"
                "These errors were encountered:\n"+"\n".join(ref_errors))

    # }}}

    # {{{ compile and run parallel code

    need_check = do_check

    queue = cl.CommandQueue(ctx,
            properties=cl.command_queue_properties.PROFILING_ENABLE)

    args = None
    from loopy.kernel import kernel_state
    if test_knl.state not in [
            kernel_state.PREPROCESSED,
            kernel_state.SCHEDULED]:
        test_knl = lp.preprocess_kernel(test_knl)

    if not test_knl.schedule:
        test_kernels = lp.generate_loop_schedules(test_knl)
    else:
        test_kernels = [test_knl]

    test_kernel_count = 0

    for i, kernel in enumerate(test_kernels):
        test_kernel_count += 1
        if test_kernel_count > max_test_kernel_count:
            break

        kernel = infer_unknown_types(kernel, expect_completion=True)

        compiled = CompiledKernel(ctx, kernel)

        if args is None:
            # Arguments are built once and shared by all test kernels.
            cl_kernel_info = compiled.cl_kernel_info(frozenset())

            args = make_args(kernel,
                    cl_kernel_info.implemented_data_info,
                    queue, ref_arg_data, parameters)
        args["out_host"] = False

        if not quiet:
            print(75*"-")
            print("Kernel #%d:" % i)
            print(75*"-")
            if print_code:
                print(compiled.get_highlighted_code())
                print(75*"-")
            if dump_binary:
                print(type(compiled.cl_program))
                print(compiled.cl_program.binaries[0])
                print(75*"-")

        logger.info("%s: run warmup" % (kernel.name))

        for _ in range(warmup_rounds):
            if not AUTO_TEST_SKIP_RUN:
                compiled(queue, **args)

            if need_check and not AUTO_TEST_SKIP_RUN:
                for arg_desc in ref_arg_data:
                    if arg_desc is None:
                        continue
                    if not arg_desc.needs_checking:
                        continue

                    from pyopencl.compyte.array import as_strided
                    ref_ary = as_strided(
                            arg_desc.ref_storage_array.get(),
                            shape=arg_desc.ref_shape,
                            strides=arg_desc.ref_numpy_strides).flatten()
                    test_ary = as_strided(
                            arg_desc.test_storage_array.get(),
                            shape=arg_desc.test_shape,
                            strides=arg_desc.test_numpy_strides).flatten()
                    # Only the common prefix of the two flattened arrays
                    # is compared.
                    common_len = min(len(ref_ary), len(test_ary))
                    ref_ary = ref_ary[:common_len]
                    test_ary = test_ary[:common_len]

                    error_is_small, error = check_result(test_ary, ref_ary)
                    if not error_is_small:
                        raise AutomaticTestFailure(error)

                # Results are checked only once, on the first warm-up run.
                need_check = False

        queue.finish()

        logger.info("%s: warmup done" % (kernel.name))

        logger.info("%s: timing run" % (kernel.name))

        # At least one round is needed to have any event to time.
        timing_rounds = max(warmup_rounds, 1)

        while True:
            # Start from a clean slate on every attempt so that the event
            # timing below spans exactly *timing_rounds* runs and not the
            # runs of earlier, shorter attempts as well.
            events = []

            start_time = time()

            evt_start = cl.enqueue_marker(queue)

            for _ in range(timing_rounds):
                if not AUTO_TEST_SKIP_RUN:
                    evt, _ = compiled(queue, **args)
                    events.append(evt)
                else:
                    events.append(cl.enqueue_marker(queue))

            evt_end = cl.enqueue_marker(queue)

            queue.finish()
            stop_time = time()

            for evt in events:
                evt.wait()
            evt_start.wait()
            evt_end.wait()

            elapsed_event = (1e-9*events[-1].profile.END
                    - 1e-9*events[0].profile.START) \
                    / timing_rounds
            try:
                elapsed_event_marker = ((1e-9*evt_end.profile.START
                            - 1e-9*evt_start.profile.START)
                        / timing_rounds)
            except cl.RuntimeError:
                elapsed_event_marker = None

            elapsed_wall = (stop_time-start_time)/timing_rounds

            # Repeat with more rounds until the whole attempt takes at
            # least 0.3 s of wall time.
            if elapsed_wall * timing_rounds < 0.3:
                timing_rounds *= 4
            else:
                break

        logger.info("%s: timing run done" % (kernel.name))

        rates = "".join(
                " %g %s/s" % (cnt/elapsed_wall, lbl)
                for cnt, lbl in zip(op_count, op_label))

        if not quiet:
            def format_float_or_none(v):
                if v is None:
                    return "<unavailable>"
                else:
                    return "%g" % v

            print("elapsed: %s s event, %s s marker-event %s s wall "
                    "(%d rounds)%s" % (
                        format_float_or_none(elapsed_event),
                        format_float_or_none(elapsed_event_marker),
                        format_float_or_none(elapsed_wall), timing_rounds, rates))

        if do_check:
            ref_rates = "".join(
                    " %g %s/s" % (cnt/ref_elapsed_event, lbl)
                    for cnt, lbl in zip(op_count, op_label))
            if not quiet:
                print("ref: elapsed: %g s event, %g s wall%s" % (
                        ref_elapsed_event, ref_elapsed_wall, ref_rates))

    # }}}

    result_dict = {}
    result_dict["elapsed_event"] = elapsed_event
    result_dict["elapsed_event_marker"] = elapsed_event_marker
    result_dict["elapsed_wall"] = elapsed_wall
    result_dict["timing_rounds"] = timing_rounds

    if do_check:
        result_dict["ref_elapsed_event"] = ref_elapsed_event
        result_dict["ref_elapsed_wall"] = ref_elapsed_wall

    return result_dict
def generate_code_v2(kernel):
    """Generate code for *kernel*.

    A kernel in the ``INITIAL`` state is preprocessed and a kernel without
    a schedule is scheduled before code generation. When caching is
    enabled, a previously generated result for the same (scheduled) kernel
    is returned directly, and fresh results are stored in the cache.

    :returns: a :class:`CodeGenerationResult`
    :raises LoopyError: if the kernel is not in the ``SCHEDULED`` state
    :raises ValueError: if an argument is neither an array nor a value
        argument
    """

    from loopy.kernel import kernel_state
    if kernel.state == kernel_state.INITIAL:
        from loopy.preprocess import preprocess_kernel
        kernel = preprocess_kernel(kernel)

    if kernel.schedule is None:
        from loopy.schedule import get_one_scheduled_kernel
        kernel = get_one_scheduled_kernel(kernel)

    if kernel.state != kernel_state.SCHEDULED:
        raise LoopyError("cannot generate code for a kernel that has not been "
                "scheduled")

    # {{{ cache retrieval

    from loopy import CACHING_ENABLED

    if CACHING_ENABLED:
        # The cache is keyed on the kernel as it looks *before* type
        # inference; remember it for the store at the end.
        input_kernel = kernel
        try:
            cached = code_gen_cache[input_kernel]
            logger.info("%s: code generation cache hit" % kernel.name)
            return cached
        except KeyError:
            pass

    # }}}

    from loopy.preprocess import infer_unknown_types
    kernel = infer_unknown_types(kernel, expect_completion=True)

    from loopy.check import pre_codegen_checks
    pre_codegen_checks(kernel)

    logger.info("%s: generate code: start" % kernel.name)

    # {{{ examine arg list

    from loopy.kernel.data import ValueArg
    from loopy.kernel.array import ArrayBase

    data_infos = []

    for arg in kernel.args:
        arg_is_written = arg.name in kernel.get_written_variables()

        if isinstance(arg, ArrayBase):
            data_infos.extend(
                    arg.decl_info(
                        kernel.target,
                        is_written=arg_is_written,
                        index_dtype=kernel.index_dtype))
            continue

        if not isinstance(arg, ValueArg):
            raise ValueError("argument type not understood: '%s'" % type(arg))

        data_infos.append(ImplementedDataInfo(
            target=kernel.target,
            name=arg.name,
            dtype=arg.dtype,
            arg_class=ValueArg,
            is_written=arg_is_written))

    # A list (not a generator) so that every dtype is inspected, exactly
    # as a plain loop over all variables would.
    allow_complex = any([
        var.dtype.involves_complex()
        for var in (
            kernel.args + list(six.itervalues(kernel.temporary_variables)))])

    # }}}

    dtypes_seen = set()
    functions_seen = set()
    atomic_dtypes_seen = set()

    codegen_state = CodeGenerationState(
            kernel=kernel,
            implemented_data_info=data_infos,
            implemented_domain=isl.BasicSet.from_params(kernel.assumptions),
            implemented_predicates=frozenset(),
            seen_dtypes=dtypes_seen,
            seen_functions=functions_seen,
            seen_atomic_dtypes=atomic_dtypes_seen,
            var_subst_map={},
            allow_complex=allow_complex,
            var_name_generator=kernel.get_var_name_generator(),
            is_generating_device_code=False,
            gen_program_name=(
                kernel.target.host_program_name_prefix
                + kernel.name
                + kernel.target.host_program_name_suffix),
            schedule_index_end=len(kernel.schedule))

    from loopy.codegen.result import generate_host_or_device_program
    generated = generate_host_or_device_program(
            codegen_state,
            schedule_index=0)

    device_code_str = generated.device_code()

    from loopy.check import check_implemented_domains
    assert check_implemented_domains(kernel, generated.implemented_domains,
            device_code_str)

    # {{{ handle preambles

    # Argument and temporary dtypes count as "seen" for the preamble
    # generators, in addition to whatever code generation recorded.
    dtypes_seen.update(arg.dtype for arg in kernel.args)
    dtypes_seen.update(
            tv.dtype for tv in six.itervalues(kernel.temporary_variables))

    preambles = kernel.preambles[:]

    from pytools import Record

    class PreambleInfo(Record):
        pass

    preamble_info = PreambleInfo(
            kernel=kernel,
            seen_dtypes=dtypes_seen,
            seen_functions=functions_seen,
            # a set of LoopyTypes (!)
            seen_atomic_dtypes=atomic_dtypes_seen)

    all_generators = (kernel.preamble_generators
            + kernel.target.get_device_ast_builder().preamble_generators())
    for generator in all_generators:
        preambles.extend(generator(preamble_info))

    # Keep only the first preamble for each tag, in tag-sorted order.
    emitted_tags = set()
    unique_preambles = []

    for tag, preamble in sorted(preambles, key=lambda tag_code: tag_code[0]):
        if tag not in emitted_tags:
            emitted_tags.add(tag)
            unique_preambles.append(preamble)

    from loopy.tools import remove_common_indentation
    preamble_codes = [
            remove_common_indentation(lines) + "\n"
            for lines in unique_preambles]

    final_result = generated.copy(
            device_preambles=preamble_codes)

    # }}}

    logger.info("%s: generate code: done" % kernel.name)

    if CACHING_ENABLED:
        code_gen_cache[input_kernel] = final_result

    return final_result