Ejemplo n.º 1
0
 def decorate(nodes):
     """
     Attach SIMD pragmas to every vectorizable Iteration found in ``nodes``.

     Each entry of ``nodes`` is processed independently: its vectorizable
     Iterations are rebuilt with their existing pragmas, followed by
     ``ignore_deps`` (taken from the enclosing scope) and an OpenMP SIMD pragma.

     :param nodes: An iterable of Iteration/Expression trees.
     :returns: A list with one transformed tree per entry of ``nodes``.
     """
     def simd_pragma(iteration):
         # Pick the "aligned" flavour of the pragma only if at least one tensor
         # has an innermost extent that is a multiple of the SIMD item count
         symbols = FindSymbols('symbolics').visit(iteration)
         try:
             aligned = [s for s in symbols
                        if s.is_Tensor and
                        s.shape[-1] % get_simd_items(s.dtype) == 0]
         except KeyError:
             # NOTE(review): presumably raised by get_simd_items when the SIMD
             # width cannot be looked up -- confirm against its definition
             aligned = []
         if not aligned:
             return as_tuple(omplang['simd-for'])
         template = omplang['simd-for-aligned']
         names = ','.join([s.name for s in aligned])
         return as_tuple(template(names, simdinfo[get_simd_flag()]))

     processed = []
     for node in nodes:
         # A fresh mapper per node, so substitutions never leak across nodes
         mapper = {}
         for tree in retrieve_iteration_tree(node):
             vectorizable = [it for it in tree if it.is_Vectorizable]
             for it in vectorizable:
                 simd = simd_pragma(it)
                 pragmas = it.pragmas + ignore_deps + simd
                 mapper[it] = it._rebuild(pragmas=pragmas)
         processed.append(Transformer(mapper).visit(node))
     return processed
Ejemplo n.º 2
0
    def _simdize(self, nodes, state):
        """
        Decorate the vectorizable Iterations in ``nodes`` with pragmas meant to
        help the compiler emit SIMD code: the compiler decoration registered
        under ``'ignore-deps'`` followed by an OpenMP SIMD pragma.

        :param nodes: The Iteration/Expression tree to transform.
        :param state: Not used by this pass.
        :returns: A 2-tuple ``(processed, {})`` where ``processed`` is the
                  transformed tree.
        """
        ignore_deps = as_tuple(self._compiler_decoration('ignore-deps'))

        mapper = {}
        for tree in retrieve_iteration_tree(nodes):
            for iteration in [i for i in tree if i.is_Vectorizable]:
                symbols = FindSymbols('symbolics').visit(iteration)
                # Tensors whose innermost extent is a multiple of the SIMD item
                # count can be listed in the "aligned" variant of the pragma
                try:
                    aligned = [s for s in symbols
                               if s.is_Tensor and
                               s.shape[-1] % get_simd_items(s.dtype) == 0]
                except KeyError:
                    aligned = []
                if not aligned:
                    simd = as_tuple(Ompizer.lang['simd-for'])
                else:
                    template = Ompizer.lang['simd-for-aligned']
                    names = ','.join([s.name for s in aligned])
                    simd = as_tuple(template(names, simdinfo[get_simd_flag()]))
                pragmas = iteration.pragmas + ignore_deps + simd
                mapper[iteration] = iteration._rebuild(pragmas=pragmas)

        return Transformer(mapper).visit(nodes), {}
Ejemplo n.º 3
0
    def _padding(self, nodes, state):
        """
        Introduce temporary buffers padded to the nearest multiple of the vector
        length, to maximize data alignment. At the bottom of the kernel, the
        values in the padded temporaries will be copied back into the input arrays.

        :param nodes: The Iteration/Expression tree to transform.
        :param state: Not used by this pass.
        :returns: A 2-tuple ``(processed, {})``. ``processed`` is ``nodes``
                  itself, untouched, when there is nothing to pad.
        """
        # Assess feasibility of the transformation
        handle = FindSymbols('symbolics-writes').visit(nodes)
        if not handle:
            return nodes, {}
        shape = max([i.shape for i in handle], key=len)
        if not shape:
            return nodes, {}
        # Only objects sharing the innermost extent of the highest-rank object
        # are padded. Objects with an empty shape have no innermost dimension,
        # so they are skipped instead of being indexed with [-1]
        candidates = [i for i in handle if i.shape and i.shape[-1] == shape[-1]]
        if not candidates:
            return nodes, {}

        # Retrieve the maximum number of items in a SIMD register when processing
        # the expressions in /node/
        exprs = [e for e in FindNodes(Expression).visit(nodes)
                 if e.write in candidates]
        assert len(exprs) > 0
        dtype = exprs[0].dtype
        assert all(e.dtype == dtype for e in exprs)
        try:
            simd_items = get_simd_items(dtype)
        except KeyError:
            # Fallback to 16 (maximum expectable padding, for AVX512 registers).
            # Floor division keeps the item count an integer, so that the padded
            # shapes computed below stay integral
            simd_items = simdinfo['avx512f'] // np.dtype(dtype).itemsize

        # Same shape as the original, except for the rounded-up innermost extent
        shapes = {
            k: k.shape[:-1] + (roundm(k.shape[-1], simd_items), )
            for k in candidates
        }
        # Original indexed object -> indexed object of its padded buffer 'p<name>'
        mapper = OrderedDict([(k.indexed,
                               Array(name='p%s' % k.name,
                                     shape=shapes[k],
                                     dimensions=k.indices,
                                     onstack=k._mem_stack).indexed)
                              for k in candidates])

        # Substitute original arrays with padded buffers
        processed = SubstituteExpression(mapper).visit(nodes)

        # Build Iteration trees for initialization and copy-back of padded arrays
        # (only for the entries whose original function is a SymbolicFunction)
        mapper = OrderedDict([(k, v) for k, v in mapper.items()
                              if k.function.is_SymbolicFunction])
        init = copy_arrays(mapper, reverse=True)
        copyback = copy_arrays(mapper)

        processed = List(body=init + as_tuple(processed) + copyback)

        return processed, {}
Ejemplo n.º 4
0
    def _minimize_remainders(self, state, **kwargs):
        """
        Reshape temporary tensors and adjust loop trip counts to prevent as many
        compiler-generated remainder loops as possible.

        :param state: Object providing the ``nodes`` and ``elemental_functions``
                      trees to transform.
        :param kwargs: Not used by this pass.
        :returns: A dict with the transformed trees under the keys ``'nodes'``
                  and ``'elemental_functions'``.
        """

        mapper = {}
        for tree in retrieve_iteration_tree(state.nodes +
                                            state.elemental_functions):
            vector_iterations = [i for i in tree if i.is_Vectorizable]
            if not vector_iterations:
                continue
            assert len(vector_iterations) == 1
            root = vector_iterations[0]
            if root.tag is None:
                continue

            # Padding
            writes = [
                i for i in FindSymbols('symbolics-writes').visit(root)
                if i.is_TensorFunction
            ]
            paddings = []
            for i in writes:
                try:
                    simd_items = get_simd_items(i.dtype)
                except KeyError:
                    # Fallback to 16 (maximum expectable padding, for AVX512
                    # registers). Floor division keeps the item count, and thus
                    # the padded shape, an integer
                    simd_items = simdinfo['avx512f'] // np.dtype(
                        i.dtype).itemsize
                paddings.append(simd_items - i.shape[-1] % simd_items)
            if len(set(paddings)) != 1:
                # Padding must be uniform -- not the case, so giving up
                continue
            padding = paddings[0]
            for i in writes:
                i.update(shape=i.shape[:-1] + (i.shape[-1] + padding, ))

            # Dynamic trip count adjustment
            endpoint = root.end_symbolic
            if not endpoint.is_Symbol:
                continue
            condition = []
            externals = set(i.symbolic_shape[-1]
                            for i in FindSymbols().visit(root))
            for i in root.uindices:
                for j in externals:
                    condition.append(root.end_symbolic + padding < j)
            # The padded bound may be used only if it fits within *every*
            # innermost extent, hence the guards are and-ed together; or-ing
            # them would select the padded bound even when only one fits.
            # NOTE(review): if there are no uindices or no externals, the
            # guard is an empty string -- confirm this cannot happen
            condition = ' && '.join(ccode(i) for i in condition)
            endpoint_padded = endpoint.func(name='_%s' % endpoint.name)
            init = cgen.Initializer(
                cgen.Value("const int", endpoint_padded),
                cgen.Line('(%s) ? %s : %s' %
                          (condition, ccode(endpoint + padding), endpoint)))

            # Update the Iteration bound
            limits = list(root.limits)
            limits[1] = endpoint_padded.func(endpoint_padded.name)
            rebuilt = list(tree)
            rebuilt[rebuilt.index(root)] = root._rebuild(limits=limits)

            # The initializer of the padded bound is emitted ahead of the tree
            mapper[tree[0]] = List(header=init, body=compose_nodes(rebuilt))

        nodes = Transformer(mapper).visit(state.nodes)
        elemental_functions = Transformer(mapper).visit(
            state.elemental_functions)

        return {'nodes': nodes, 'elemental_functions': elemental_functions}
Ejemplo n.º 5
0
    def _minimize_remainders(self, nodes, state):
        """
        Reshape temporary tensors and adjust loop trip counts to prevent as many
        compiler-generated remainder loops as possible.

        :param nodes: The Iteration/Expression tree to transform.
        :param state: Not used by this pass.
        :returns: A 2-tuple ``(processed, {})`` where ``processed`` is the
                  transformed tree.
        """
        # The innermost dimension is the one that might get padded
        p_dim = -1

        mapper = {}
        for tree in retrieve_iteration_tree(nodes):
            vector_iterations = [i for i in tree if i.is_Vectorizable]
            # Only trees with exactly one vectorizable Iteration are handled
            if len(vector_iterations) != 1:
                continue
            root = vector_iterations[0]
            if root.tag is None:
                continue

            # Padding
            writes = [i.write for i in FindNodes(Expression).visit(root)
                      if i.write.is_Array]
            paddings = []
            for i in writes:
                try:
                    simd_items = get_simd_items(i.dtype)
                except KeyError:
                    # Fallback to 16 (maximum expectable padding, for AVX512
                    # registers). Floor division keeps the item count, and thus
                    # the padding amount, an integer
                    simd_items = simdinfo['avx512f'] // np.dtype(i.dtype).itemsize
                paddings.append(simd_items - i.shape[-1] % simd_items)
            if len(set(paddings)) != 1:
                # Padding must be uniform -- not the case, so giving up
                continue
            padding = paddings[0]
            for i in writes:
                # Grow only the second entry of the padding pair along p_dim
                padded = (i._padding[p_dim][0], i._padding[p_dim][1] + padding)
                i.update(padding=i._padding[:p_dim] + (padded,))

            # Dynamic trip count adjustment
            endpoint = root.symbolic_max
            if not endpoint.is_Symbol:
                continue
            condition = []
            externals = set(i.symbolic_shape[-1] for i in FindSymbols().visit(root)
                            if i.is_Tensor)
            for i in root.uindices:
                for j in externals:
                    condition.append(root.symbolic_max + padding < j)
            # The padded bound is used only if it fits within every innermost
            # extent collected above.
            # NOTE(review): if there are no uindices or no externals, the
            # guard is an empty string -- confirm this cannot happen
            condition = ' && '.join(ccode(i) for i in condition)
            endpoint_padded = endpoint.func('_%s' % endpoint.name)
            init = cgen.Initializer(
                cgen.Value("const int", endpoint_padded),
                cgen.Line('(%s) ? %s : %s' % (condition,
                                              ccode(endpoint + padding),
                                              endpoint))
            )

            # Update the Iteration bound
            limits = list(root.limits)
            limits[1] = endpoint_padded.func(endpoint_padded.name)
            rebuilt = list(tree)
            rebuilt[rebuilt.index(root)] = root._rebuild(limits=limits)

            # The initializer of the padded bound is emitted ahead of the tree
            mapper[tree[0]] = List(header=init, body=compose_nodes(rebuilt))

        processed = Transformer(mapper).visit(nodes)

        return processed, {}