def get_usable_inames_for_conditional(kernel, sched_index):
    """Return the inames a conditional at *sched_index* may refer to.

    The result starts out as whatever ``find_active_inames_at`` reports for
    *sched_index*. If the schedule items before *sched_index* leave us
    inside a subkernel (a ``CallKernel`` not yet closed by a
    ``ReturnFromKernel``), the concurrent-tagged inames of that subkernel's
    instructions are added as well, subject to the exclusions below.

    :arg kernel: the kernel whose ``schedule`` is being inspected.
    :arg sched_index: index into ``kernel.schedule``.
    :returns: a :class:`frozenset` of inames.
    """
    from loopy.schedule import (
        CallKernel, ReturnFromKernel,
        find_active_inames_at, get_insn_ids_for_block_at, has_barrier_within)
    from loopy.kernel.data import (
        ConcurrentTag, LocalIndexTagBase, VectorizeTag, IlpBaseTag)

    usable = find_active_inames_at(kernel, sched_index)
    crosses_barrier = has_barrier_within(kernel, sched_index)

    # Locate the subkernel that is still open just before sched_index.
    # None means "not inside any subkernel".
    enclosing_call_index = None
    for idx, item in enumerate(kernel.schedule[:sched_index]):
        if isinstance(item, CallKernel):
            enclosing_call_index = idx
        elif isinstance(item, ReturnFromKernel):
            enclosing_call_index = None

    if enclosing_call_index is None:
        # Outside all subkernels - use only inames available to host.
        return frozenset(usable)

    def _is_usable(iname):
        # Only concurrent (parallel) inames are defined subkernel-wide.
        if not kernel.iname_tags_of_type(iname, ConcurrentTag):
            return False
        # Vector lane indices only get defined at the innermost level of
        # nesting, so they are not available in loop bounds.
        if kernel.iname_tags_of_type(iname, VectorizeTag):
            return False
        # Local indices may not be used in conditionals that cross barriers.
        if (kernel.iname_tags_of_type(iname, LocalIndexTagBase)
                and crosses_barrier):
            return False
        # ILP indices are likewise only defined at the innermost level.
        return not kernel.iname_tags_of_type(iname, IlpBaseTag)

    subkernel_insn_ids = get_insn_ids_for_block_at(
        kernel.schedule, enclosing_call_index)
    for insn_id in subkernel_insn_ids:
        for iname in kernel.insn_inames(insn_id):
            if _is_usable(iname):
                usable.add(iname)

    return frozenset(usable)
def get_usable_inames_for_conditional(kernel, sched_index):
    """Return the inames a conditional at *sched_index* may refer to.

    Begins with the inames reported by ``find_active_inames_at`` and, when
    the schedule items up to and including *sched_index* leave a subkernel
    open, extends them with the concurrent-tagged inames used by that
    subkernel's instructions. Two kinds of concurrent inames are left out:
    ILP inames, and local-index inames when ``has_barrier_within`` reports
    a barrier for *sched_index*.

    :arg kernel: the kernel whose ``schedule`` is being inspected.
    :arg sched_index: index into ``kernel.schedule``.
    :returns: a :class:`frozenset` of inames.
    """
    from loopy.schedule import (
        CallKernel, ReturnFromKernel,
        find_active_inames_at, get_insn_ids_for_block_at, has_barrier_within)
    from loopy.kernel.data import (ConcurrentTag, LocalIndexTagBase,
            IlpBaseTag)

    available = find_active_inames_at(kernel, sched_index)
    barrier_inside = has_barrier_within(kernel, sched_index)

    # Find our containing subkernel; the item at sched_index itself takes
    # part in the scan. None means no subkernel is open.
    open_subkernel_at = None
    for position, entry in enumerate(kernel.schedule[:sched_index+1]):
        if isinstance(entry, CallKernel):
            open_subkernel_at = position
        elif isinstance(entry, ReturnFromKernel):
            open_subkernel_at = None

    if open_subkernel_at is None:
        # Outside all subkernels - use only inames available to host.
        return frozenset(available)

    for insn_id in get_insn_ids_for_block_at(
            kernel.schedule, open_subkernel_at):
        for iname in kernel.insn_inames(insn_id):
            # Parallel inames are defined within a subkernel ...
            if not kernel.iname_tags_of_type(iname, ConcurrentTag):
                continue

            # ... BUT local indices may not be used in conditionals that
            # cross barriers ...
            if (kernel.iname_tags_of_type(iname, LocalIndexTagBase)
                    and barrier_inside):
                continue

            # ... and ILP indices are not available in loop bounds, they
            # only get defined at the innermost level of nesting.
            if kernel.iname_tags_of_type(iname, IlpBaseTag):
                continue

            available.add(iname)

    return frozenset(available)
def get_admissible_conditional_inames_for(kernel, sched_index):
    """Return the inames on which a conditional at *sched_index* may depend.

    The inames reported by ``find_active_inames_at`` are always admissible.
    Every iname tagged with a ``HardwareParallelTag`` is admitted too,
    except that local-index tagged inames are disallowed if there is a
    barrier nested somewhere within (as reported by ``has_barrier_within``).

    :arg kernel: the kernel providing ``iname_to_tag``.
    :arg sched_index: index of the schedule item being considered.
    :returns: a :class:`frozenset` of inames.
    """
    from loopy.kernel.data import LocalIndexTag, HardwareParallelTag
    from loopy.schedule import find_active_inames_at, has_barrier_within

    admissible = find_active_inames_at(kernel, sched_index)
    barrier_nested = has_barrier_within(kernel, sched_index)

    for iname, tag in six.iteritems(kernel.iname_to_tag):
        if not isinstance(tag, HardwareParallelTag):
            continue

        # A conditional enclosing a barrier must not depend on a local index.
        if barrier_nested and isinstance(tag, LocalIndexTag):
            continue

        admissible.add(iname)

    return frozenset(admissible)
def get_admissible_conditional_inames_for(codegen_state, sched_index):
    """Return the inames on which a conditional at *sched_index* may depend.

    The inames reported by ``find_active_inames_at`` are always admissible.
    While ``codegen_state.is_generating_device_code`` is true, inames
    tagged with a ``HardwareParallelTag`` are admitted as well, except that
    local-index tagged inames are disallowed if there is a barrier nested
    somewhere within (as reported by ``has_barrier_within``).

    :arg codegen_state: code generation state; its ``kernel`` and
        ``is_generating_device_code`` attributes are read.
    :arg sched_index: index of the schedule item being considered.
    :returns: a :class:`frozenset` of inames.
    """
    from loopy.kernel.data import LocalIndexTag, HardwareParallelTag
    from loopy.schedule import find_active_inames_at, has_barrier_within

    kernel = codegen_state.kernel

    admissible = find_active_inames_at(kernel, sched_index)
    barrier_nested = has_barrier_within(kernel, sched_index)

    for iname, tag in six.iteritems(kernel.iname_to_tag):
        hw_parallel_on_device = (
            isinstance(tag, HardwareParallelTag)
            and codegen_state.is_generating_device_code)
        if not hw_parallel_on_device:
            continue

        # A conditional enclosing a barrier must not depend on a local index.
        if barrier_nested and isinstance(tag, LocalIndexTag):
            continue

        admissible.add(iname)

    return frozenset(admissible)
def get_admissible_conditional_inames_for(codegen_state, sched_index):
    """Return the inames on which a conditional at *sched_index* may depend.

    The inames reported by ``find_active_inames_at`` are always admissible.
    While ``codegen_state.is_generating_device_code`` is true, every iname
    carrying a ``HardwareConcurrentTag`` is admitted as well, except that
    inames carrying a ``LocalIndexTag`` are disallowed if there is a
    barrier nested somewhere within (as reported by ``has_barrier_within``).

    :arg codegen_state: code generation state; its ``kernel`` and
        ``is_generating_device_code`` attributes are read.
    :arg sched_index: index of the schedule item being considered.
    :returns: a :class:`frozenset` of inames.
    """
    from loopy.kernel.data import (LocalIndexTag, HardwareConcurrentTag,
            filter_iname_tags_by_type)
    from loopy.schedule import find_active_inames_at, has_barrier_within

    kernel = codegen_state.kernel

    admissible = find_active_inames_at(kernel, sched_index)
    barrier_nested = has_barrier_within(kernel, sched_index)

    for iname, tags in six.iteritems(kernel.iname_to_tags):
        hw_concurrent_on_device = (
            filter_iname_tags_by_type(tags, HardwareConcurrentTag)
            and codegen_state.is_generating_device_code)
        if not hw_concurrent_on_device:
            continue

        # A conditional enclosing a barrier must not depend on a local index.
        if barrier_nested and filter_iname_tags_by_type(tags, LocalIndexTag):
            continue

        admissible.add(iname)

    return frozenset(admissible)
def get_usable_inames_for_conditional(kernel, sched_index):
    """Return the inames a conditional at *sched_index* may refer to.

    The result starts out as whatever ``find_active_inames_at`` reports for
    *sched_index*. The schedule is then searched backwards from
    *sched_index* for the nearest ``CallKernel``; the parallel-tagged
    inames of that subkernel's instructions are added, except for ILP
    inames and, when ``has_barrier_within`` reports a barrier for
    *sched_index*, local-index inames.

    If no ``CallKernel`` is found at or before *sched_index*, only the
    active inames are returned.

    :arg kernel: the kernel whose ``schedule`` is being inspected.
    :arg sched_index: index into ``kernel.schedule``.
    :returns: a :class:`frozenset` of inames.
    """
    from loopy.schedule import (find_active_inames_at, get_insn_ids_for_block_at,
                                has_barrier_within, CallKernel)
    from loopy.kernel.data import ParallelTag, LocalIndexTagBase, IlpBaseTag

    result = find_active_inames_at(kernel, sched_index)
    crosses_barrier = has_barrier_within(kernel, sched_index)

    # Find our containing subkernel, grab inames for all insns from there.
    #
    # The search is bounded at index 0: letting the index go negative would
    # make Python wrap around to the end of the schedule and either pick up
    # a CallKernel that comes *after* sched_index or raise IndexError.
    #
    # NOTE(review): ReturnFromKernel items are not treated specially here, so
    # a position following an already-closed subkernel still resolves to that
    # subkernel - confirm whether callers can pass such positions.
    subkernel_index = sched_index
    while (subkernel_index >= 0
            and not isinstance(kernel.schedule[subkernel_index], CallKernel)):
        subkernel_index -= 1

    if subkernel_index < 0:
        # No CallKernel at or before sched_index: nothing beyond the active
        # inames can be offered.
        return frozenset(result)

    insn_ids_for_subkernel = get_insn_ids_for_block_at(
        kernel.schedule, subkernel_index)

    for insn_id in insn_ids_for_subkernel:
        for iname in kernel.insn_inames(insn_id):
            tag = kernel.iname_to_tag.get(iname)

            # Parallel inames are defined within a subkernel, BUT:
            #
            # - local indices may not be used in conditionals that cross
            #   barriers.
            #
            # - ILP indices are not available in loop bounds, they only get
            #   defined at the innermost level of nesting.
            if (isinstance(tag, ParallelTag)
                    and not (isinstance(tag, LocalIndexTagBase)
                             and crosses_barrier)
                    and not isinstance(tag, IlpBaseTag)):
                result.add(iname)

    return frozenset(result)