Example #1
    def _print_TemporaryMemoryAllocation(self, node):
        align = 64
        np_dtype = node.symbol.dtype.base_type.numpy_dtype
        required_size = np_dtype.itemsize * node.size + align
        size = modulo_ceil(required_size, align)
        code = "{dtype} {name}=({dtype})aligned_alloc({align}, {size}) + {offset};"
        return code.format(dtype=node.symbol.dtype,
                           name=self.sympy_printer.doprint(node.symbol.name),
                           size=self.sympy_printer.doprint(size),
                           offset=int(node.offset(align)),
                           align=align)
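Every example on this page relies on a modulo_ceil helper that rounds a size or step count up to the next multiple of a divisor; the helper itself is not shown in the snippets. A minimal sketch, assuming plain Python integers, looks like this:

def modulo_ceil(integer, divisor):
    """Round integer up to the next multiple of divisor."""
    # e.g. modulo_ceil(164, 64) == 192, modulo_ceil(128, 64) == 128
    return integer if integer % divisor == 0 else (integer // divisor + 1) * divisor

With align = 64 in Example #1, a request for 100 bytes becomes required_size = 164 and is padded to size = 192, so the returned pointer can be shifted by offset while the size still satisfies aligned_alloc's requirement of being a multiple of the alignment.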
Example #2
    def benchmark_run(self, time_steps=0, init_time_steps=0):
        """Runs the stored call sequence and returns the time per time step.

        Both step counts are rounded up to the next multiple of
        self._fixed_steps, so only whole call sequences are executed.
        """
        init_time_steps_rounded = modulo_ceil(init_time_steps,
                                              self._fixed_steps)
        time_steps_rounded = modulo_ceil(time_steps, self._fixed_steps)
        call_data = self._call_data

        self.pre_run()
        for i in range(init_time_steps_rounded // self._fixed_steps):
            for func, kwargs in call_data:
                func(**kwargs)
        self.time_steps_run += init_time_steps_rounded

        start = time.perf_counter()
        for i in range(time_steps_rounded // self._fixed_steps):
            for func, kwargs in call_data:
                func(**kwargs)
        end = time.perf_counter()
        self.time_steps_run += time_steps_rounded
        self.post_run()

        # divide by the number of steps actually executed; time_steps_rounded
        # may be larger than the requested time_steps
        time_for_one_iteration = (end - start) / time_steps_rounded
        return time_for_one_iteration
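The rounding guarantees that the timed loop always executes whole passes over the call sequence. A hypothetical use, where runner and its _fixed_steps = 10 are assumptions for illustration:

time_per_step = runner.benchmark_run(time_steps=25, init_time_steps=5)
# init_time_steps_rounded == 10, time_steps_rounded == 30: one warm-up
# pass and three timed passes over the call sequence are executed
print(f"{time_per_step * 1e6:.2f} us per time step")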
Example #3
    def _print_TemporaryMemoryAllocation(self, node):
        # align to the SIMD register width when vectorizing, otherwise to
        # the element size
        if self._vector_instruction_set:
            align = self._vector_instruction_set['bytes']
        else:
            align = node.symbol.dtype.base_type.numpy_dtype.itemsize

        np_dtype = node.symbol.dtype.base_type.numpy_dtype
        # reserve one extra alignment block so the returned pointer can be
        # shifted forward by `offset` while staying inside the allocation
        required_size = np_dtype.itemsize * node.size + align
        size = modulo_ceil(required_size, align)
        code = "#if defined(_MSC_VER)\n"
        code += "{dtype} {name}=({dtype})_aligned_malloc({size}, {align}) + {offset};\n"
        code += "#elif __cplusplus >= 201703L || __STDC_VERSION__ >= 201112L\n"
        code += "{dtype} {name}=({dtype})aligned_alloc({align}, {size}) + {offset};\n"
        code += "#else\n"
        code += "{dtype} {name};\n"
        code += "posix_memalign((void**) &{name}, {align}, {size});\n"
        code += "{name} += {offset};\n"
        code += "#endif"
        return code.format(dtype=node.symbol.dtype,
                           name=self.sympy_printer.doprint(node.symbol.name),
                           size=self.sympy_printer.doprint(size),
                           offset=int(node.offset(align)),
                           align=align)
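An allocation printed this way needs a matching release: the pointer must be shifted back by the same offset before being freed, and the MSVC branch must use _aligned_free. A sketch of such a counterpart, assuming the same node attributes as the allocation printer above (method name and structure are an assumption, not the library's confirmed API):

    def _print_TemporaryMemoryFree(self, node):
        if self._vector_instruction_set:
            align = self._vector_instruction_set['bytes']
        else:
            align = node.symbol.dtype.base_type.numpy_dtype.itemsize

        code = "#if defined(_MSC_VER)\n"
        code += "_aligned_free({name} - {offset});\n"
        code += "#else\n"
        code += "free({name} - {offset});\n"
        code += "#endif"
        return code.format(name=self.sympy_printer.doprint(node.symbol.name),
                           offset=int(node.offset(align)))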
Example #4
def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_aligned, nontemporal_fields,
                                                strided, keep_loop_stop, assume_sufficient_line_padding):
    """Goes over all innermost loops, changes increment to vector width and replaces field accesses by vector type."""
    all_loops = filtered_tree_iteration(ast_node, ast.LoopOverCoordinate, stop_type=ast.SympyAssignment)
    inner_loops = [n for n in all_loops if n.is_innermost_loop]
    zero_loop_counters = {l.loop_counter_symbol: 0 for l in all_loops}

    for loop_node in inner_loops:
        loop_range = loop_node.stop - loop_node.start

        # cut off the loop tail that is not a multiple of the vector width
        if keep_loop_stop:
            pass
        elif assume_aligned and assume_sufficient_line_padding:
            loop_range = loop_node.stop - loop_node.start
            new_stop = loop_node.start + modulo_ceil(loop_range, vector_width)
            loop_node.stop = new_stop
        else:
            cutting_point = modulo_floor(loop_range, vector_width) + loop_node.start
            loop_nodes = [l for l in cut_loop(loop_node, [cutting_point]).args if isinstance(l, ast.LoopOverCoordinate)]
            assert len(loop_nodes) in (0, 1, 2)  # 2 for main and tail loop, 1 if loop range divisible by vector width
            if len(loop_nodes) == 0:
                continue
            loop_node = loop_nodes[0]

        # Find all array accesses (indexed) that depend on the loop counter as offset
        loop_counter_symbol = ast.LoopOverCoordinate.get_loop_counter_symbol(loop_node.coordinate_to_loop_over)
        substitutions = {}
        successful = True
        for indexed in loop_node.atoms(sp.Indexed):
            base, index = indexed.args
            if loop_counter_symbol in index.atoms(sp.Symbol):
                loop_counter_is_offset = loop_counter_symbol not in (index - loop_counter_symbol).atoms()
                aligned_access = (index - loop_counter_symbol).subs(zero_loop_counters) == 0
                stride = sp.simplify(index.subs({loop_counter_symbol: loop_counter_symbol + 1}) - index)
                if not loop_counter_is_offset and (not strided or loop_counter_symbol in stride.atoms()):
                    successful = False
                    break
                typed_symbol = base.label
                assert type(typed_symbol.dtype) is PointerType, \
                    f"Type of access is {typed_symbol.dtype}, {indexed}"

                vec_type = VectorType(typed_symbol.dtype.base_type, vector_width)
                use_aligned_access = aligned_access and assume_aligned
                nontemporal = False
                if hasattr(indexed, 'field'):
                    nontemporal = (indexed.field in nontemporal_fields) or (indexed.field.name in nontemporal_fields)
                substitutions[indexed] = vector_memory_access(indexed, vec_type, use_aligned_access, nontemporal, True,
                                                              stride if strided else 1)
                if nontemporal:
                    # insert NontemporalFence after the outermost loop
                    parent = loop_node.parent
                    while type(parent.parent.parent) is not ast.KernelFunction:
                        parent = parent.parent
                    parent.parent.insert_after(NontemporalFence(), parent, if_not_exists=True)
                    # insert CachelineSize at the beginning of the kernel
                    parent.parent.insert_front(CachelineSize(), if_not_exists=True)
        if not successful:
            warnings.warn("Could not vectorize loop because of non-consecutive memory access")
            continue

        loop_node.step = vector_width
        loop_node.subs(substitutions)
        vector_int_width = ast_node.instruction_set['intwidth']
        vector_loop_counter = cast_func(loop_counter_symbol, VectorType(loop_counter_symbol.dtype, vector_int_width)) \
            + cast_func(tuple(range(vector_int_width if type(vector_int_width) is int else 2)),
                        VectorType(loop_counter_symbol.dtype, vector_int_width))

        fast_subs(loop_node, {loop_counter_symbol: vector_loop_counter},
                  skip=lambda e: isinstance(e, ast.ResolvedFieldAccess) or isinstance(e, vector_memory_access))

        mask_conditionals(loop_node)

        from pystencils.rng import RNGBase
        substitutions = {}
        for rng in loop_node.atoms(RNGBase):
            new_result_symbols = [TypedSymbol(s.name, VectorType(s.dtype, width=vector_width))
                                  for s in rng.result_symbols]
            substitutions.update({s[0]: s[1] for s in zip(rng.result_symbols, new_result_symbols)})
            rng._symbols_defined = set(new_result_symbols)
        fast_subs(loop_node, substitutions, skip=lambda e: isinstance(e, RNGBase))
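Besides modulo_ceil, the splitting branch uses modulo_floor, its counterpart that rounds down to the previous multiple. A minimal sketch under the same plain-integer assumption:

def modulo_floor(integer, divisor):
    """Round integer down to the previous multiple of divisor."""
    # e.g. modulo_floor(10, 4) == 8
    return (integer // divisor) * divisor

For a loop running from start to start + 10 with vector_width = 4, the cutting point is start + 8: the main loop covers 8 iterations in vector steps, and the tail loop handles the remaining 2 scalar iterations.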
Example #5
def vectorize_inner_loops_and_adapt_load_stores(
        ast_node, vector_width, assume_aligned, nontemporal_fields,
        assume_sufficient_line_padding):
    """Goes over all innermost loops, changes increment to vector width and replaces field accesses by vector type."""
    all_loops = filtered_tree_iteration(ast_node,
                                        ast.LoopOverCoordinate,
                                        stop_type=ast.SympyAssignment)
    inner_loops = [n for n in all_loops if n.is_innermost_loop]
    zero_loop_counters = {l.loop_counter_symbol: 0 for l in all_loops}

    for loop_node in inner_loops:
        loop_range = loop_node.stop - loop_node.start

        # cut off the loop tail that is not a multiple of the vector width
        if assume_aligned and assume_sufficient_line_padding:
            loop_range = loop_node.stop - loop_node.start
            new_stop = loop_node.start + modulo_ceil(loop_range, vector_width)
            loop_node.stop = new_stop
        else:
            cutting_point = modulo_floor(loop_range,
                                         vector_width) + loop_node.start
            loop_nodes = [
                l for l in cut_loop(loop_node, [cutting_point]).args
                if isinstance(l, ast.LoopOverCoordinate)
            ]
            assert len(loop_nodes) in (
                0, 1, 2
            )  # 2 for main and tail loop, 1 if loop range divisible by vector width
            if len(loop_nodes) == 0:
                continue
            loop_node = loop_nodes[0]

        # Find all array accesses (indexed) that depend on the loop counter as offset
        loop_counter_symbol = ast.LoopOverCoordinate.get_loop_counter_symbol(
            loop_node.coordinate_to_loop_over)
        substitutions = {}
        successful = True
        for indexed in loop_node.atoms(sp.Indexed):
            base, index = indexed.args
            if loop_counter_symbol in index.atoms(sp.Symbol):
                loop_counter_is_offset = loop_counter_symbol not in (
                    index - loop_counter_symbol).atoms()
                aligned_access = (
                    index - loop_counter_symbol).subs(zero_loop_counters) == 0
                if not loop_counter_is_offset:
                    successful = False
                    break
                typed_symbol = base.label
                assert type(typed_symbol.dtype) is PointerType, \
                    "Type of access is {}, {}".format(typed_symbol.dtype, indexed)

                vec_type = VectorType(typed_symbol.dtype.base_type,
                                      vector_width)
                use_aligned_access = aligned_access and assume_aligned
                nontemporal = False
                if hasattr(indexed, 'field'):
                    nontemporal = (indexed.field in nontemporal_fields) or (
                        indexed.field.name in nontemporal_fields)
                substitutions[indexed] = vector_memory_access(
                    indexed, vec_type, use_aligned_access, nontemporal)
        if not successful:
            warnings.warn(
                "Could not vectorize loop because of non-consecutive memory access"
            )
            continue

        loop_node.step = vector_width
        loop_node.subs(substitutions)
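Note the difference between the two branches: when alignment and sufficient line padding can be assumed, no tail loop is created at all; the stop is rounded up instead, so the last vector iteration simply runs into the padding. With the same numbers as above (assumed for illustration):

loop_range = 10                              # stop - start, with start == 0
new_stop = 0 + modulo_ceil(loop_range, 4)    # == 12
# the loop executes three full vector iterations (counters 0, 4, 8) and
# touches 2 padded elements beyond the original range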