Exemple #1
0
 def visit_FunctionDecl(self, node):
     super(StencilCTransformer, self).visit_FunctionDecl(node)
     for index, arg in enumerate(self.input_grids + (self.output_grid, )):
         defname = "_%s_array_macro" % node.params[index].name
         params = ','.join(["_d" + str(x) for x in range(arg.ndim)])
         params = "(%s)" % params
         calc = "((_d%d)" % (arg.ndim - 1)
         for x in range(arg.ndim - 1):
             ndim = str(int(arg.strides[x] / arg.itemsize))
             calc += "+((_d%s) * %s)" % (str(x), ndim)
         calc += ")"
         params = ["_d" + str(x) for x in range(arg.ndim)]
         node.defn.insert(0, CppDefine(defname, params, calc))
     abs_decl = FunctionDecl(c_int(), SymbolRef('abs'),
                             [SymbolRef('n', c_int())])
     macro = CppDefine(
         "min", [SymbolRef('_a'), SymbolRef('_b')],
         TernaryOp(Lt(SymbolRef('_a'), SymbolRef('_b')), SymbolRef('_a'),
                   SymbolRef('_b')))
     node.params.append(SymbolRef('duration', POINTER(c_float)))
     start_time = Assign(StringTemplate('clock_t start_time'),
                         FunctionCall(SymbolRef('clock')))
     node.defn.insert(0, start_time)
     end_time = Assign(
         Deref(SymbolRef('duration')),
         Div(Sub(FunctionCall(SymbolRef('clock')), SymbolRef('start_time')),
             SymbolRef('CLOCKS_PER_SEC')))
     node.defn.append(end_time)
     return [StringTemplate("#include <time.h>"), abs_decl, macro, node]
Exemple #2
0
 def _gen_reduce_for_loop(self, loop, var, size):
     looplen1 = loop.test.right
     loopincr = loop.incr.value.value
     kernel_name = self._gen_unique_kernel_name()
     kernel_src = StringTemplate("""
       __kernel void $kernel_name(__global float * $arr) {
         int x = get_global_id(0);
         float sum = $arr[x];
         #pragma unroll
         for (int i = 1; i < $batch_size; ++ i) {
           sum += $arr[i * $size + x];
         }
         $arr[x] = sum;
       }
     """, {'batch_size': C.Constant(self.batch_size),
           'arr': C.SymbolRef(var),
           'size': C.Constant(size),
           'kernel_name': C.SymbolRef(kernel_name)})
     program = cl.clCreateProgramWithSource(
         latte.config.cl_ctx, kernel_src.codegen()).build()
     kernel = program[kernel_name]
     self.kernels[kernel_name] = kernel
     kernel.setarg(0, self.cl_buffers[var], ctypes.sizeof(cl.cl_mem))
     return StringTemplate(
         """
         size_t global_size_{kernel_name}[1] = {{{looplen1}}};
         clEnqueueNDRangeKernel(queue, {kernel_name}, 1, NULL, global_size_{kernel_name}, NULL, 0, NULL, NULL);
         clFinish(queue);
         """.format(
             kernel_name=kernel_name, 
             looplen1=size)
     )
Exemple #3
0
    def transform(self, py_ast, program_cfg):
        arg_cfg, tune_cfg = program_cfg
        tree = PyBasicConversions().visit(py_ast)
        param_dict = {}
        tree.body[0].params.append(C.SymbolRef("retval", arg_cfg[0]()))
        # Annotate arguments
        for param, type in zip(tree.body[0].params, arg_cfg):
            param.type = type()
            param_dict[param.name] = type._dtype_

        length = np.prod(arg_cfg[0]._shape_)
        transformer = MapTransformer("i", param_dict, "retval")
        body = list(map(transformer.visit, tree.body[0].defn))

        tree.body[0].defn = [C.For(
                C.Assign(C.SymbolRef("i", ct.c_int()), C.Constant(0)),
                C.Lt(C.SymbolRef("i"), C.Constant(length)),
                C.PostInc(C.SymbolRef("i")),
                body=body,
                pragma="ivdep"
            )]

        tree = DeclarationFiller().visit(tree)
        defns = []
        tree = HwachaVectorize(param_dict, defns).visit(tree)
        file_body = [
            StringTemplate("#include <stdlib.h>"),
            StringTemplate("#include <stdint.h>"),
            StringTemplate("#include <assert.h>"),
            StringTemplate("extern \"C\" void __hwacha_body(void);"),
        ]
        file_body.extend(defns)
        file_body.append(tree)
        return [CFile("generated", file_body)]
Exemple #4
0
 def _gen_reduce_for_loop(self, loop, var, size):
     looplen1 = loop.test.right
     loopincr = loop.incr.value.value
     return StringTemplate("""
         //{
         //  ContinueNode *$reduce_node = new ContinueNode(&graph, [=]() {
           parallel_for(0,$size,
             [=](int low, int high) {
               #pragma simd
               for (int x = low; x < high; ++x) {
                 float sum = _$arr[x];
                 #pragma unroll
                 for (int i = 1; i < $batch_size; ++ i) {
                   sum += _$arr[i * $size + x];
                 }
                 _$arr[x] = sum;
               }
             });
         //  });
         //  for (int i = 0; i < $looplen1; i+=$loopincr) {
         //    make_edge($node_list[i], $reduce_node);
         //  }
         //};
         """, {'size': C.Constant(size),
               'batch_size': C.Constant(self.batch_size),
               'arr': C.SymbolRef(var),
               'node_list': C.SymbolRef("node_list_"),
               'reduce_node': C.SymbolRef("reduce_node_"),
               'looplen1': C.Constant(looplen1.value),  
               'loopincr': C.Constant(loopincr)
               })
Exemple #5
0
def insert_malloc(body, shape, name, dtype, _global=False):
    shape_str = "".join("[{}]".format(d) for d in shape[1:])
    size = 1
    for d in shape:
        size *= d

    body.insert(
        0,
        StringTemplate(
            "$global$type (* $arg_name0)$shape = ($global$type (*)$cast) $arg_name1;",
            {
                "arg_name0":
                C.SymbolRef(name),
                "arg_name1":
                C.FunctionCall(C.SymbolRef('_mm_malloc'), [
                    C.Mul(
                        C.Constant(size),
                        C.FunctionCall(C.SymbolRef('sizeof'), [
                            ctree.types.codegen_type(
                                ctree.types.get_c_type_from_numpy_dtype(dtype)
                                ())
                        ])),
                    C.Constant(64)
                ]),
                "shape":
                C.SymbolRef(shape_str),
                "cast":
                C.SymbolRef(shape_str),
                "type":
                C.SymbolRef(
                    ctree.types.codegen_type(
                        ctree.types.get_c_type_from_numpy_dtype(dtype)())),
                "global":
                C.SymbolRef("__global " if _global else "")
            }))
Exemple #6
0
 def test_child_nested_list(self):
     d = {'stmts': [[Constant(1)], Constant(2)]}
     t = """$stmts"""
     tree = StringTemplate(t, d)
     self._check(tree, """\
     1;
     2;""")
Exemple #7
0
 def test_child_single_list(self):
     d = {
         'stmts': [Constant(1)]
     }
     t = """$stmts"""
     tree = StringTemplate(t, d)
     self._check(tree, "1")
Exemple #8
0
    def transform(self, py_ast, program_config):
        """
        Convert the Python AST to a C AST according to the directions
        given in program_config.
        """
        tune_cfg = program_config[1]
        x, y = tune_cfg['x'], tune_cfg['y']

        template_args = {
            'min_x': Constant(self.min_x),
            'min_y': Constant(self.min_y),
            'x': Constant(x),
            'y': Constant(y),
        }

        cfile = CFile("generated", [
            StringTemplate(
                """\
                // paraboloid with global min of 1 at ($min_x, $min_y)
                float get_height() {
                    return ($x-$min_x)*($x-$min_x) +
                           ($y-$min_y)*($y-$min_y) + 1.0;
                }""", template_args)
        ])

        entry_typesig = FuncType(Float(), [])
        return Project([cfile]), entry_typesig.as_ctype()
Exemple #9
0
def insert_free(name):

    return (StringTemplate(
        "$free;", {
            "free": C.FunctionCall(C.SymbolRef('_mm_free'),
                                   [C.SymbolRef(name)]),
        }))
Exemple #10
0
 def _gen_k_rank1_updates(self, rx, ry, cx, cy, unroll, lda):
     stmts = [StringTemplate("// do K rank-1 updates")]
     for i in range(ry / 4):
         stmts.append(SymbolRef("a%d" % i, m256d()))
     stmts.append(SymbolRef("b", m256d()))
     stmts.extend(
         self._gen_rank1_update(i, rx, ry, cx, cy, lda)
         for i in range(unroll))
     return Block(stmts)
Exemple #11
0
def scalar_init(scalar):
    name = "".join(number_dict[digit] for digit in str(scalar))
    return StringTemplate("""
union {{
    float f;
    uint32_t i;
}} {name};
{name}.f = {scalar}f;
    """.format(name=name, scalar=scalar))
Exemple #12
0
    def transform(self, py_ast, program_config):
        """
        Convert the Python AST to a C AST according to the directions
        given in program_config.
        """
        A = program_config[0]
        len_A = np.prod(A._shape_)
        inner_type = A._dtype_.type()
        # browser_show_ast(py_ast,'tmp.png')
        apply_one = PyBasicConversions().visit(py_ast.body[0])
        apply_one.return_type = inner_type
        apply_one.params[0].type = inner_type

        apply_kernel = FunctionDecl(
            None,
            "apply_kernel",
            params=[SymbolRef("A", A()).set_global()],
            defn=[
                Assign(SymbolRef("i", ct.c_int()), get_global_id(0)),
                If(Lt(SymbolRef("i"), Constant(len_A)), [
                    Assign(
                        ArrayRef(SymbolRef("A"), SymbolRef("i")),
                        FunctionCall(
                            SymbolRef("apply"),
                            [ArrayRef(SymbolRef("A"), SymbolRef("i"))])),
                ], []),
            ]).set_kernel()

        kernel = OclFile("kernel", [apply_one, apply_kernel])

        control = StringTemplate(
            r"""
        #ifdef __APPLE__
        #include <OpenCL/opencl.h>
        #else
        #include <CL/cl.h>
        #endif
        void apply_all(cl_command_queue queue, cl_kernel kernel, cl_mem buf) {
            size_t global = $n;
            size_t local = 32;
            clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf);
            clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);

        }
        """, {'n': Constant(len_A + 32 - (len_A % 32))})

        proj = Project([kernel, CFile("generated", [control])])
        fn = OpFunction()

        program = cl.clCreateProgramWithSource(fn.context,
                                               kernel.codegen()).build()
        apply_kernel_ptr = program['apply_kernel']

        entry_type = ct.CFUNCTYPE(None, cl.cl_command_queue, cl.cl_kernel,
                                  cl.cl_mem)
        return fn.finalize(apply_kernel_ptr, proj, "apply_all", entry_type)
Exemple #13
0
 def visit_CFile(self, node):
     self.defns = []
     self.includes = set()
     node = super(Backend, self).generic_visit(node)
     for defn in self.defns:
         node.body.insert(0, defn)
     for include in self.includes:
         node.body.insert(0,
                          StringTemplate("#include <{}>".format(include)))
     return node
Exemple #14
0
 def test_indent_1(self):
     d = {'cond': Constant(1)}
     t = """\
     while($cond)
         printf("hello");
     """
     tree = StringTemplate(t, d)
     self._check(tree, """\
     while(1)
         printf("hello");""")
Exemple #15
0
    def test_template_parent_pointers(self):
        from ctree.c.nodes import SymbolRef

        symbol = SymbolRef("hello")
        template = "char *str = $val"
        template_args = {
            'val': symbol,
        }
        node = StringTemplate(template, template_args)
        self.assertIs(symbol.parent, node)
Exemple #16
0
 def get_load_a_block(self, transpose, template_args):
     if transpose:
         raise NotImplementedError()
     else:
         return StringTemplate(
             """
             //  make a local aligned copy of A's block
             for( int j = 0; j < K; j++ )
                 for( int i = 0; i < M; i++ )
                     a[i+j*$CY] = A[i+j*$lda];
             """, template_args)
Exemple #17
0
 def transform(self, tree, program_cfg):
     arg_cfg, tune_cfg = program_cfg
     # tree = Desugar().visit(tree)
     inliner = InlineEnvironment(self.symbol_table)
     tree = inliner.visit(tree)
     tree = PyBasicConversions().visit(tree)
     tree.body = inliner.files + tree.body
     # tree.find(C.For).pragma = 'omp parallel for'
     tree.name = self.original_tree.body[0].name
     tree.body.insert(0, StringTemplate("#include <math.h>"))
     # print(tree)
     return [tree]
Exemple #18
0
 def _gen_store_c_block(self, rx, ry, lda):
     """
     Return a subtree that loads a block of 'c'.
     """
     stmts = [StringTemplate("// Store the c block")]
     for j in range(rx):
         for i in range(ry / 4):
             stmt = mm256_storeu_pd(
                 Add(SymbolRef("C"), Constant(i * 4 + j * lda)),
                 MultiArrayRef("c", i, j))
             stmts.append(stmt)
     return Block(stmts)
Exemple #19
0
 def test_indent_0(self):
     d = {'cond': Constant(1)}
     t = """\
     while($cond)
         printf("hello");
     """
     tree = While(Constant(0), [StringTemplate(t, d)])
     self._check(tree, """\
     while (0) {
         while(1)
             printf("hello");
     }""")
Exemple #20
0
 def _gen_load_c_block(self, rx, ry, lda):
     """
     Return a subtree that loads a block of 'c'.
     """
     stmts = [StringTemplate("// Load a block of c", {})]
     for j in range(rx):
         for i in range(ry / 4):
             stmt = Assign(
                 MultiArrayRef("c", i, j),
                 mm256_loadu_pd(
                     Add(SymbolRef("C"), Constant(i * 4 + j * lda))))
             stmts.append(stmt)
     return Block(stmts)
Exemple #21
0
    def transform(self, tree, program_cfg):
        arg_cfg, tune_cfg = program_cfg
        channels, height, width = arg_cfg[0]
        cfg = {
            'pad_h': C.Constant(self.pad_h),
            'pad_w': C.Constant(self.pad_w),
            'stride_h': C.Constant(self.stride_h),
            'stride_w': C.Constant(self.stride_w),
            'kernel_h': C.Constant(self.kernel_h),
            'kernel_w': C.Constant(self.kernel_w),
            'channels': C.Constant(channels),
            'height': C.Constant(height),
            'width': C.Constant(width),
        }
        im2col = C.FunctionDecl(
            None,
            C.SymbolRef("im2col"),
            [C.SymbolRef("data_im", arg_cfg[1]()),
             C.SymbolRef("data_col", arg_cfg[1]())],
            [StringTemplate("""
int stride_h = $stride_h;
int stride_w = $stride_w;
int pad_h = $pad_h;
int pad_w = $pad_w;
int kernel_h = $kernel_h;
int kernel_w = $kernel_w;
int channels = $channels;
int height = $height;
int width = $width;
int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
int channels_col = channels * kernel_h * kernel_w;
for (int c = 0; c < channels_col; ++c) {
    int w_offset = c % kernel_w;
    int h_offset = (c / kernel_w) % kernel_h;
    int c_im = c / kernel_h / kernel_w;
    for (int h = 0; h < height_col; ++h) {
        for (int w = 0; w < width_col; ++w) {
            int h_pad = h * stride_h - pad_h + h_offset;
            int w_pad = w * stride_w - pad_w + w_offset;
            if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
            data_col[(c * height_col + h) * width_col + w] =
                data_im[(c_im * height + h_pad) * width + w_pad];
            else
                data_col[(c * height_col + h) * width_col + w] = 0;
        }
    }
} """, cfg)])
        return [C.CFile('im2col', [im2col])]
Exemple #22
0
def get_asm_body(node, scalars, refs, type_map):
    body = """
__asm__ volatile (
".align 3\\n"
"__hwacha_body:\\n"
    """
    asm_body = []
    translator = HwachaASMTranslator(scalars, refs, asm_body, type_map)
    for s in node.body:
        translator.visit(s)
    for s in asm_body:
        body += "\"" + s + "\"\n"
    body += "\"  vstop\\n\"\n"
    body += "  );"
    return StringTemplate(body)
Exemple #23
0
    def test_template_with_transformer(self):
        from ctree.visitors import NodeTransformer
        from ctree.c.nodes import String, SymbolRef

        template = "char *str = $val"
        template_args = {
            'val': SymbolRef("hello"),
        }
        tree = StringTemplate(template, template_args)
        self._check(tree, 'char *str = hello')

        class SymbolsToStrings(NodeTransformer):
            def visit_SymbolRef(self, node):
                return String(node.name)

        tree = SymbolsToStrings().visit(tree)
        self._check(tree, 'char *str = "hello"')
Exemple #24
0
    def test_template_parent_pointers_with_transformer(self):
        from ctree.visitors import NodeTransformer
        from ctree.c.nodes import String, SymbolRef

        template = "char *str = $val"
        template_args = {
            'val': SymbolRef("hello"),
        }

        class SymbolsToStrings(NodeTransformer):
            def visit_SymbolRef(self, node):
                return String(node.name)

        tree = StringTemplate(template, template_args)
        tree = SymbolsToStrings().visit(tree)

        template_node, string = tree, tree.val
        self.assertIs(string.parent, template_node)
Exemple #25
0
    def build_kernel(self, kernel_src, kernel_name, kernel_args):
        kernel_src = C.CFile('generated', [StringTemplate(
"""
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
"""
            ), kernel_src])
        try:
            program = cl.clCreateProgramWithSource(
                latte.config.cl_ctx, kernel_src.codegen()).build()
            kernel = program[kernel_name]
        except cl.BuildProgramFailureError as e:
            logger.error("Failed build program:\n %s", kernel_src.codegen())
            raise e
        self.kernels[kernel_name] = kernel
        for index, arg in enumerate(kernel_args):
            kernel.setarg(index, self.cl_buffers[arg], ctypes.sizeof(cl.cl_mem))
        logger.debug(kernel_src)
Exemple #26
0
    def visit_FunctionCall(self, node):
        if isinstance(node.func, C.SymbolRef):
            if node.func.name == "rand":
                if "OPENCL" in latte.config.parallel_strategy:
                    import struct
                    platform_c_maxint = 2 ** (struct.Struct('i').size * 8 - 1) - 1
                    return StringTemplate("((({} + get_global_id(0)) * 0x5DEECE66DL + 0xBL) & ((1L << 48) - 1)) >> 16".format(int(random.random() * platform_c_maxint)))
                return C.Div(node, C.Cast(ctypes.c_float(), C.SymbolRef("RAND_MAX")))
        #ANAND: 10/11/2016 Adding following uutility to convert python max to c max
        if isinstance(node.func, C.SymbolRef):
            if node.func.name == "max":
                return C.FunctionCall(C.SymbolRef("MAX"),
                [node.args[0],node.args[1]])
 




        return node
    def transform(self, py_ast, program_config):

        # Get the initial data
        input_data = program_config[0]
        length = np.prod(input_data.size)
        pointer = np.ctypeslib.ndpointer(input_data.dtype, input_data.ndim,
                                         input_data.shape)
        data_type = get_c_type_from_numpy_dtype(input_data.dtype)()
        scalar_data_type = get_c_type_from_numpy_dtype(
            np.dtype(input_data.scalar_type))()

        apply_one = PyBasicConversions().visit(py_ast.body[0])
        apply_one.name = 'apply'
        apply_one.params[0].type = data_type
        apply_one.params[1].type = scalar_data_type
        apply_one.return_type = data_type  # TODO: figure out which data type to actually preserve

        # TODO: MAKE A CLASS THAT HANDLES SUPPORTED TYPES (INT, FLOAT, DOUBLE)

        array_add_template = StringTemplate(
            r"""
            #pragma omp parallel for
            for (int i = 0; i < $length; i++) {
                output[i] = apply(arr[i], scalar);
            }
        """, {'length': Constant(length)})

        array_op = CFile("generated", [
            CppInclude("omp.h"),
            CppInclude("stdio.h"), apply_one,
            FunctionDecl(None,
                         FUNC_NAME,
                         params=[
                             SymbolRef("arr", pointer()),
                             SymbolRef("scalar", scalar_data_type),
                             SymbolRef("output", pointer())
                         ],
                         defn=[array_add_template])
        ], 'omp')

        return [array_op]
Exemple #28
0
def insert_cast(body, shape, name, dtype, _global=False):
    shape_str = "".join("[{}]".format(d) for d in shape)

    body.insert(
        0,
        StringTemplate(
            "$global$type (* $arg_name)$shape = ($global$type (*)$cast) _$arg_name;",
            {
                "arg_name":
                C.SymbolRef(name),
                "shape":
                C.SymbolRef(shape_str),
                "cast":
                C.SymbolRef(shape_str),
                "type":
                C.SymbolRef(
                    ctree.types.codegen_type(
                        ctree.types.get_c_type_from_numpy_dtype(dtype)())),
                "global":
                C.SymbolRef("__global " if _global else "")
            }))
Exemple #29
0
 def _gen_reduce_for_loop(self, loop, var, size):
     looplen1 = loop.test.right
     loopincr = loop.incr.value.value
     return StringTemplate("""
           #pragma omp parallel for simd
           for (int x = 0; x < $size; ++x) {
             float sum = _$arr[x];
             #pragma unroll
             for (int i = 1; i < $batch_size; ++ i) {
               sum += _$arr[i * $size + x];
             }
             _$arr[x] = sum;
           }
         """, {'size': C.Constant(size),
               'batch_size': C.Constant(self.batch_size),
               'arr': C.SymbolRef(var),
               'node_list': C.SymbolRef("node_list_"),
               'reduce_node': C.SymbolRef("reduce_node_"),
               'looplen1': C.Constant(looplen1.value),  
               'loopincr': C.Constant(loopincr)
               })
    def transform(self, py_ast, program_config):

        # Get the initial data
        input_data = program_config[0]
        length = np.prod(input_data.size)
        pointer = np.ctypeslib.ndpointer(input_data.dtype, input_data.ndim, input_data.shape)
        data_type = get_c_type_from_numpy_dtype(input_data.dtype)()

        apply_one = PyBasicConversions().visit(py_ast.body[0])
        apply_one.name = 'apply'
        apply_one.params[0].type = data_type
        apply_one.params[1].type = data_type
        apply_one.return_type = data_type

        array_add_template = StringTemplate(r"""
            #pragma omp parallel for
            for (int i = 0; i < $length; i++) {
                output[i] = apply(input1[i], input2[i]);
            }
        """, {
            'length': Constant(length)
        })

        array_op = CFile("generated", [
            CppInclude("omp.h"),
            CppInclude("stdio.h"),
            apply_one,
            FunctionDecl(None, FUNC_NAME,
                         params=[
                             SymbolRef("input1", pointer()),
                             SymbolRef("input2", pointer()),
                             SymbolRef("output", pointer())
                         ],
                         defn=[
                             array_add_template
                         ])
        ], 'omp')

        return [array_op]
Exemple #31
0
 def test_dotgen(self):
     tree = StringTemplate("return $one $two", {"one": Constant(1), "two": Constant(2)})
     dot = tree.to_dot()
Exemple #32
0
 def test_dotgen(self):
     tree = StringTemplate("return $one $two", {
         'one': Constant(1),
         'two': Constant(2),
     })
     dot = tree.to_dot()