Esempio n. 1
0
 def visit_AugAssign(self, node):
     node.value = self.visit(node.value)
     if not self.vectorize:
         node.target = self.visit(node.target)
         return node
     if util.contains_symbol(node.target, self.loop_var):
         return simd_macros.mm256_store_ps(
             node.target,
             C.BinaryOp(self.visit(node.target), node.op, node.value))
     elif isinstance(node.op, C.Op.Add) and isinstance(node.value, C.BinaryOp) and \
             isinstance(node.value.op, C.Op.Mul):
         # if not isinstance(node.target, C.SymbolRef):
         #     node.value = C.FunctionCall(C.SymbolRef("vsum"), [node.value])
         #     return node
         # else:
         return C.Assign(
             node.target,
             C.FunctionCall(
                 C.SymbolRef("_mm256_fmadd_ps"),
                 [node.value.left, node.value.right, node.target]))
     elif isinstance(node.op, C.Op.Add) and isinstance(
             node.value, C.FunctionCall):
         # TODO: Verfiy it's a vector intrinsic
         return C.Assign(
             node.target,
             C.FunctionCall(C.SymbolRef("_mm256_add_ps"),
                            [node.value, node.target]))
     elif isinstance(node.target, C.BinaryOp) and isinstance(
             node.target.op, C.Op.ArrayRef):
         raise NotImplementedError()
     node.target = self.visit(node.target)
     return node
Esempio n. 2
0
def insert_malloc(body, shape, name, dtype, _global=False):
    shape_str = "".join("[{}]".format(d) for d in shape[1:])
    size = 1
    for d in shape:
        size *= d

    body.insert(
        0,
        StringTemplate(
            "$global$type (* $arg_name0)$shape = ($global$type (*)$cast) $arg_name1;",
            {
                "arg_name0":
                C.SymbolRef(name),
                "arg_name1":
                C.FunctionCall(C.SymbolRef('_mm_malloc'), [
                    C.Mul(
                        C.Constant(size),
                        C.FunctionCall(C.SymbolRef('sizeof'), [
                            ctree.types.codegen_type(
                                ctree.types.get_c_type_from_numpy_dtype(dtype)
                                ())
                        ])),
                    C.Constant(64)
                ]),
                "shape":
                C.SymbolRef(shape_str),
                "cast":
                C.SymbolRef(shape_str),
                "type":
                C.SymbolRef(
                    ctree.types.codegen_type(
                        ctree.types.get_c_type_from_numpy_dtype(dtype)())),
                "global":
                C.SymbolRef("__global " if _global else "")
            }))
Esempio n. 3
0
    def visit_For(self, node):
        node.body = util.flatten([s for s in node.body])
        new_body = []
        for stmt in node.body:
          if isinstance(stmt, C.FunctionCall) and "_mm" in stmt.func.name \
             and "_store" in stmt.func.name and inReplaceMapSource(stmt.args[0], self.replace_map):
                  
                  if isinstance(stmt.args[1], C.SymbolRef):
                    sym_arr_ref = extract_reference(stmt.args)  
                    store_in_du_map(sym_arr_ref)  
                    reg = stmt.args[1]
                    self.seen[reg.name] = None
                    new_body.append(stmt)

                  elif isinstance(stmt.args[1], C.FunctionCall) and "_mm" in stmt.func.name:
                      tmp = self._gen_register()
                      new_body.append(C.Assign(C.SymbolRef(tmp, get_simd_type()()), deepcopy(stmt.args[1])))
                      new_body.append(C.FunctionCall(C.SymbolRef(stmt.func.name),  [stmt.args[0],C.SymbolRef(tmp, None)]))
                      sym_arr_ref = extract_reference(C.FunctionCall(C.SymbolRef(stmt.func.name),  [stmt.args[0],C.SymbolRef(tmp, None)]).args)  
                      store_in_du_map(sym_arr_ref)
                  # if stmt.args[0].type:
                  #    self.seen[reg.name] = stmt.args[0].type     
                  #else:
                      self.seen[tmp] = None

          elif isinstance(stmt, C.BinaryOp) and \
             isinstance(stmt.op, C.Op.Assign) and \
             isinstance(stmt.left, C.SymbolRef) and \
             isinstance(stmt.right, C.FunctionCall) and "_mm" in stmt.right.func.name and "_load" in stmt.right.func.name and inReplaceMapSink(stmt.right.args[0], self.replace_map): 
                  #print(stmt.right.args[0])                         
                  source = get_alias(stmt.right.args, self.replace_map)
                  #print(source)      
                  if (source is not None):
                    sym_arr_ref = construct_arr_reference(source, deepcopy(stmt.right.args))
                    if in_du_map(sym_arr_ref):
                       reg = get_register(sym_arr_ref)
                       #print(reg.name)   
                       if str(reg.name) in self.seen: 
                          #print(reg.name)  
                          sym_map[stmt.left.name] = reg
                       else:
                          new_body.append(stmt) 
                    else:
                       new_body.append(stmt)    
                  else:
                      new_body.append(stmt)
                            
          else:
              new_body.append(stmt)  
        node.body = util.flatten([self.visit(s) for s in new_body])
        return node
Esempio n. 4
0
def load_ps(arg):
    return C.FunctionCall(
        C.SymbolRef({
            "AVX": "_mm256_load_ps",
            "AVX-2": "_mm256_load_ps",
            "AVX-512": "_mm512_load_ps",
        }[latte.config.vec_config]), [arg])
Esempio n. 5
0
def set_zero_ps():
    return C.FunctionCall(
        C.SymbolRef({
            "AVX": "_mm256_setzero_ps",
            "AVX-2": "_mm256_setzero_ps",
            "AVX-512": "_mm512_setzero_ps"
        }[latte.config.vec_config]), [])
Esempio n. 6
0
def store_epi32(target, value):
    return C.FunctionCall(
        C.SymbolRef({
            "AVX": "_mm256_store_epi32",
            "AVX-2": "_mm256_store_epi32",
            "AVX-512": "_mm512_store_epi32",
        }[latte.config.vec_config]), [target, value])
Esempio n. 7
0
def simd_add(left, right):
    func = {
        "AVX": "_mm256_add_ps",
        "AVX-2": "_mm256_add_ps",
        "AVX-512": "_mm512_add_ps",
    }[latte.config.vec_config]
    return C.FunctionCall(C.SymbolRef(func), [left, right])
Esempio n. 8
0
def insert_free(name):

    return (StringTemplate(
        "$free;", {
            "free": C.FunctionCall(C.SymbolRef('_mm_free'),
                                   [C.SymbolRef(name)]),
        }))
Esempio n. 9
0
 def visit(self, node):
     node = super().visit(node)
     if hasattr(node, 'body'):
         # [collector.visit(s) for s in node.body]
         new_body = []
         seen = {}
         stores = []
         collector = VectorLoadCollector()
         for s in node.body:
             collector.visit(s)
             for stmt in collector.loads.keys():
                 if stmt not in seen:
                     reg = self._gen_register()
                     load_node, number, func = collector.loads[stmt]
                     seen[stmt] = (reg, load_node, func)
                     self.sym[reg] = get_simd_type()()
                     new_body.append(
                         C.Assign(
                             C.SymbolRef(reg,
                                         get_simd_type()()),
                             C.FunctionCall(C.SymbolRef(func),
                                            [load_node])))
             if isinstance(
                     s, C.FunctionCall
             ) and "_mm" in s.func.name and "_store" in s.func.name:
                 if s.args[0].codegen() in seen:
                     stores.append((s.args[0], seen[s.args[0].codegen()][0],
                                    s.func.name))
                     s = C.Assign(C.SymbolRef(seen[s.args[0].codegen()][0]),
                                  s.args[1])
             for stmt in seen.keys():
                 reg, load_node, func = seen[stmt]
                 replacer = VectorLoadReplacer(
                     C.FunctionCall(C.SymbolRef(func),
                                    [load_node]).codegen(),
                     C.SymbolRef(reg))
                 s = replacer.visit(s)
             new_body.append(s)
         for target, value, name in stores:
             if "epi32" in name:
                 new_body.append(store_epi32(target, C.SymbolRef(value)))
             elif "ps" in name:
                 new_body.append(store_ps(target, C.SymbolRef(value)))
             else:
                 assert (false)
         node.body = util.flatten(new_body)
     return node
Esempio n. 10
0
def simd_fma(*args):
    assert len(args) == 3
    fma_func = {
        "AVX": "_mm256_fmadd_ps",
        "AVX-2": "_mm256_fmadd_ps",
        "AVX-512": "_mm512_fmadd_ps",
    }[latte.config.vec_config]
    return C.FunctionCall(C.SymbolRef(fma_func), list(args))
Esempio n. 11
0
def broadcast_ss(arg):
    if latte.config.vec_config == "AVX-512":
        # AVX-512 doesn't support broadcast, use set1_ps and remove Ref node
        assert isinstance(arg, C.UnaryOp) and isinstance(arg.op, C.Op.Ref)
        arg = arg.arg
    return C.FunctionCall(
        C.SymbolRef({
            "AVX": "_mm256_broadcast_ss",
            "AVX-2": "_mm256_broadcast_ss",
            "AVX-512": "_mm512_set1_ps",
        }[latte.config.vec_config]), [arg])
Esempio n. 12
0
 def visit_AugAssign(self, node):
     node.value = self.visit(node.value)
     if util.contains_symbol(node.target, self.loop_var):
         if not util.contains_symbol(node.target.right, self.loop_var):
             target = self.visit(deepcopy(node.target))
             curr_node = node.target
             idx = 1
             while curr_node.left.right.name != self.loop_var:
                 curr_node = curr_node.left
                 idx += 1
             curr_node.left = curr_node.left.left
             node.target = C.ArrayRef(node.target,
                                      C.SymbolRef(self.loop_var))
             while not isinstance(curr_node, C.SymbolRef):
                 curr_node = curr_node.left
             if curr_node.name in self.transposed_buffers and self.transposed_buffers[
                     curr_node.name] != idx:
                 raise NotImplementedError()
             self.transposed_buffers[curr_node.name] = idx
             curr_node.name += "_transposed"
             if isinstance(node.target.right,
                           C.Constant) and node.target.value == 0.0:
                 return store_ps(node.target.left,
                                 C.BinaryOp(target, node.op, node.value))
             else:
                 return store_ps(C.Ref(node.target),
                                 C.BinaryOp(target, node.op, node.value))
         else:
             if isinstance(node.target.right,
                           C.Constant) and node.target.value == 0.0:
                 return store_ps(
                     node.target.left,
                     C.BinaryOp(self.visit(node.target), node.op,
                                node.value))
             else:
                 return store_ps(
                     C.Ref(node.target),
                     C.BinaryOp(self.visit(node.target), node.op,
                                node.value))
     elif isinstance(node.op, C.Op.Add) and isinstance(
             node.value, C.FunctionCall):
         # TODO: Verfiy it's a vector intrinsic
         return C.Assign(
             node.target,
             C.FunctionCall(C.SymbolRef("_mm256_add_ps"),
                            [node.value, node.target]))
     elif isinstance(node.target, C.BinaryOp) and isinstance(
             node.target.op, C.Op.ArrayRef):
         raise NotImplementedError(node)
     node.target = self.visit(node.target)
     return node
Esempio n. 13
0
 def visit_FunctionDecl(self, node):
     new_body = []
     count = 0
     for statement in node.defn:
                
         if isinstance(statement, ast.For) or isinstance(statement, C.For):
             pre =  C.SubAssign(C.ArrayRef(C.SymbolRef('times'), C.Constant(count)),C.FunctionCall('omp_get_wtime', []))
             post =  C.AddAssign(C.ArrayRef(C.SymbolRef('times'), C.Constant(count)),C.FunctionCall('omp_get_wtime', []))
             new_body.append(pre)
             new_body.append(statement)
             new_body.append(post)
             count = count + 1
         else:
             new_body.append(statement)
     
     memset = C.Assign(C.SymbolRef('times'), C.FunctionCall(C.SymbolRef('calloc_doubles'),[C.Constant(count)]))
     new_body.insert(0,  memset)
     new_body.insert(0, C.Assign(C.SymbolRef("*times", ctypes.c_double()), C.Constant(0)))
     for i in range(0,count):
       print_stmt = C.FunctionCall(C.SymbolRef('printf'),[C.String("\ttimes[%d] = %g\\n"), C.Constant(i), C.ArrayRef(C.SymbolRef('times'), C.Constant(i))])
       new_body.append(print_stmt)
     node.defn = new_body     
     return node
Esempio n. 14
0
def gen_vector_cmp_instruction(dest, src1, src2, type_map, symbol_map):

    src1_type = get_type(src1, type_map, symbol_map)
    src2_type = get_type(src2, type_map, symbol_map)

    assert (src1_type is not None)
    assert (src2_type is not None)
    if isinstance(src1_type, simd.types.m256) and isinstance(
            src2_type, simd.types.m256):
        return C.Assign(
            dest,
            C.FunctionCall(
                C.SymbolRef("_mm256_cmp_ps_mask"),
                [src1, src2, C.SymbolRef("_MM_CMPINT_GT", None)]))
    elif isinstance(src1_type, simd.types.m512) and isinstance(
            src2_type, simd.types.m512):
        return C.Assign(
            dest,
            C.FunctionCall(
                C.SymbolRef("_mm512_cmp_ps_mask"),
                [src1, src2, C.SymbolRef("_MM_CMPINT_GT", None)]))
    elif isinstance(src1_type, simd.types.m256i) and isinstance(
            src2_type, simd.types.m256i):
        return C.Assign(
            dest,
            C.FunctionCall(
                C.SymbolRef("_mm256_cmp_epi32_mask"),
                [src1, src2, C.SymbolRef("_MM_CMPINT_GT", None)]))
    elif isinstance(src1_type, simd.types.m512i) and isinstance(
            src2_type, simd.types.m512i):
        return C.Assign(
            dest,
            C.FunctionCall(
                C.SymbolRef("_mm512_cmp_epi32_mask"),
                [src1, src2, C.SymbolRef("_MM_CMPINT_GT", None)]))
    else:
        assert (False)
Esempio n. 15
0
def gen_mask_move_instruction(dest, src1, selector, src2, type_map,
                              symbol_map):

    src1_type = get_type(src1, type_map, symbol_map)
    src2_type = get_type(src2, type_map, symbol_map)

    #assert(src1_type == src2_type)

    assert (src1_type is not None)
    assert (src2_type is not None)
    if isinstance(src1_type, simd.types.m256) and isinstance(
            src2_type, simd.types.m256):
        return C.Assign(
            dest,
            C.FunctionCall(C.SymbolRef("_mm256_mask_mov_ps"),
                           [src1, selector, src2]))
    elif isinstance(src1_type, simd.types.m512) and isinstance(
            src2_type, simd.types.m512):
        return C.Assign(
            dest,
            C.FunctionCall(C.SymbolRef("_mm512_mask_mov_ps"),
                           [src1, selector, src2]))
    elif isinstance(src1_type, simd.types.m256i) and isinstance(
            src2_type, simd.types.m256i):
        return C.Assign(
            dest,
            C.FunctionCall(C.SymbolRef("_mm256_mask_mov_epi32"),
                           [src1, selector, src2]))
    elif isinstance(src1_type, simd.types.m512i) and isinstance(
            src2_type, simd.types.m512i):
        return C.Assign(
            dest,
            C.FunctionCall(C.SymbolRef("_mm512_mask_mov_epi32"),
                           [src1, selector, src2]))
    else:
        assert (False)
Esempio n. 16
0
def broadcast_ss(arg, type_):

    if isinstance(type_, ctypes.c_int):
        if latte.config.vec_config == "AVX-512":
            # AVX-512 doesn't support broadcast, use set1_ps and remove Ref node
            if isinstance(arg, C.UnaryOp) and isinstance(arg.op, C.Op.Ref):
                arg = arg.arg
        return C.FunctionCall(
            C.SymbolRef({
                "AVX-2": "_mm256_broadcastd_epi32",
                "AVX-512": "_mm512_set1_epi32",
            }[latte.config.vec_config]), [arg])

    else:
        if latte.config.vec_config == "AVX-512":
            # AVX-512 doesn't support broadcast, use set1_ps and remove Ref node
            if isinstance(arg, C.UnaryOp) and isinstance(arg.op, C.Op.Ref):
                arg = arg.arg
        return C.FunctionCall(
            C.SymbolRef({
                "AVX": "_mm256_broadcast_ss",
                "AVX-2": "_mm256_broadcast_ss",
                "AVX-512": "_mm512_set1_ps",
            }[latte.config.vec_config]), [arg])
Esempio n. 17
0
 def block_loop(self, node):
     loopvar = node.init.left.name
     loopvar += loopvar
     self.nest.insert(
         0,
         C.For(
             C.Assign(C.SymbolRef(loopvar, node.init.left.type),
                      node.init.right),
             C.Lt(C.SymbolRef(loopvar), node.test.right),
             C.AddAssign(C.SymbolRef(loopvar),
                         C.Constant(self.block_factor)), [None]))
     node.init.right = C.SymbolRef(loopvar)
     node.test.right = C.FunctionCall(C.SymbolRef("fmin"), [
         C.Add(C.SymbolRef(loopvar), C.Constant(self.block_factor)),
         node.test.right
     ])
Esempio n. 18
0
    def visit_FunctionCall(self, node):
        if isinstance(node.func, C.SymbolRef):
            if node.func.name == "rand":
                if "OPENCL" in latte.config.parallel_strategy:
                    import struct
                    platform_c_maxint = 2 ** (struct.Struct('i').size * 8 - 1) - 1
                    return StringTemplate("((({} + get_global_id(0)) * 0x5DEECE66DL + 0xBL) & ((1L << 48) - 1)) >> 16".format(int(random.random() * platform_c_maxint)))
                return C.Div(node, C.Cast(ctypes.c_float(), C.SymbolRef("RAND_MAX")))
        #ANAND: 10/11/2016 Adding following uutility to convert python max to c max
        if isinstance(node.func, C.SymbolRef):
            if node.func.name == "max":
                return C.FunctionCall(C.SymbolRef("MAX"),
                [node.args[0],node.args[1]])
 




        return node
Esempio n. 19
0
    def visit_For(self, node):
        node.body = util.flatten([self.visit(s) for s in node.body])
        if node.init.left.name == self.enclosing_loop_var:
            new_body = []
            added_code = False
            prefetch_count = self.prefetch_count
            for stmt in node.body:
                new_body.append(stmt)
                if prefetch_count > 0 and isinstance(stmt, C.BinaryOp) and isinstance(stmt.op, C.Op.Assign) and \
                   isinstance(stmt.right, C.FunctionCall) and "_mm" in stmt.right.func.name \
                   and ("_load_" in stmt.right.func.name or "_set1" in stmt.right.func.name or "_broadcast" in stmt.right.func.name):
                    ast.dump(stmt.right.args[0])
                    if check_name(stmt.right.args[0], self.prefetch_field):
                        array_ref = deepcopy(stmt.right.args[0])
                        new_array_ref = self.rewrite_arg(array_ref)
                        where_to_add = new_body
                        prefetch_count -= 1
                        if node.init.left.name != self.prefetch_dest_loop:
                            where_to_add = HoistPrefetch.escape_body
                        added_code = True
                        where_to_add.append(
                            C.FunctionCall(
                                C.SymbolRef(prefetch_symbol_table[
                                    self.cacheline_hint]),
                                [
                                    C.Add(new_array_ref,
                                          C.SymbolRef("prefetch_offset_var"))
                                ]))
                        where_to_add.append(
                            C.Assign(
                                C.SymbolRef("prefetch_offset_var"),
                                C.Add(C.SymbolRef("prefetch_offset_var"),
                                      C.Constant(self.prefetch_offset))))

            if added_code:
                InitPrefetcher.init_body.append(
                    C.Assign(
                        C.SymbolRef("prefetch_offset_var", ctypes.c_int()),
                        C.Constant(0)))
            node.body = new_body
        return node
Esempio n. 20
0
 def visit_For(self, node):
     node.body = util.flatten([self.visit(s) for s in node.body])
     if node.init.left.name == self.enclosing_loop_var:
         new_body = []
         prefetch_count = self.prefetch_count
         for stmt in node.body:
             new_body.append(stmt)
             if prefetch_count > 0 and isinstance(stmt, C.BinaryOp) and isinstance(stmt.op, C.Op.Assign) and \
                isinstance(stmt.right, C.FunctionCall) and "_mm" in stmt.right.func.name \
                and ("_load_" in stmt.right.func.name or "_set1" in stmt.right.func.name or "_broadcast" in stmt.right.func.name):
                 ast.dump(stmt.right.args[0])
                 if check_name(stmt.right.args[0], self.prefetch_field):
                     array_ref = deepcopy(stmt.right.args[0])
                     new_array_ref = self.rewrite_arg(array_ref)
                     prefetch_count -= 1
                     new_body.append(
                         C.FunctionCall(
                             C.SymbolRef(prefetch_symbol_table[
                                 self.cacheline_hint]), [new_array_ref]))
         node.body = new_body
     return node
Esempio n. 21
0
    def visit_FunctionCall(self, node):

        new_args = [deepcopy(self.visit(a)) for a in node.args]

        return C.FunctionCall(node.func, new_args)
Esempio n. 22
0
    def visit_FunctionDecl(self, node):
        new_body = []
        pre = []
        count = 0
        _id = 2
        for statement in node.defn:
            self.seen = set()
            #self.visit(statement)

            if isinstance(statement, ast.For) or isinstance(statement, C.For):

                temp = []

                if hasattr(statement,
                           'pre_trans') and statement.pre_trans is not None:
                    #new_body.extend(stmt.pre_trans)
                    pre.extend(statement.pre_trans)
                # self.visit(temp)

                #self.seen = set();
                self.visit(statement)

                args = []
                args2 = []
                args3 = []
                for var in self.seen:
                    args.append(C.SymbolRef(var))
                    args2.append(var)
                    args3.append(C.SymbolRef("_" + var))
                # create function call

                func_name = self.name + str(_id)
                _id = _id + 1

                arg_bufs = [self.buffers[var] for var in args2]
                #arg_bufs.sort()

                type_sig = [
                    np.ctypeslib.ndpointer(buf.dtype, buf.ndim, buf.shape)
                    for buf in arg_bufs
                ]
                params = [
                    C.SymbolRef("_" + arg, typ())
                    for arg, typ in zip(args2, type_sig)
                ]

                outlined_func_call = C.FunctionCall(C.SymbolRef(func_name),
                                                    args3)

                for arg in args2:
                    name = arg
                    buf = self.buffers[name]

                new_body2 = []
                for arg in args2:
                    name = arg
                    buf = self.buffers[name]
                    new_body2.insert(
                        0,
                        StringTemplate(
                            "__assume_aligned({}, 64);\n".format(name)))
                    util.insert_cast(new_body2, buf.shape[1:], name, buf.dtype)

                new_body2.append(statement)
                func_decl = C.FunctionDecl(None, C.SymbolRef(func_name),
                                           params, new_body2)

                if len(args2) > 0:
                    #shape_str = "{}* ".format(self.buffers[args2[0]].dtype) + args2[0].join(", {}* ".format(self.buffers[d].dtype) + "{}".format(d) for d in args2[1:])
                    shape_str = "{}* ".format(   ctree.types.codegen_type(ctree.types.get_c_type_from_numpy_dtype(self.buffers[args2[0]].dtype)())) + \
                            args2[0].join(", {}* ".format( ctree.types.codegen_type(ctree.types.get_c_type_from_numpy_dtype(self.buffers[d].dtype)())) + "{}".format(d) for d in args2[1:])

                else:
                    shape_str = ""

                self.func_headers.append(
                    StringTemplate(
                        "void  $func ($args);", {
                            "func": C.SymbolRef(func_name),
                            "args": C.SymbolRef(shape_str)
                        }))
                self.new_funcs.append(func_decl)

                new_body.append(outlined_func_call)

            else:
                new_body.append(statement)
        new_body = pre + new_body
        node.defn = new_body
        return node
Esempio n. 23
0
    def visit_For(self, node):
        if hasattr(node, 'parallel') and node.parallel:
            # Supports depth one nesting with collapse
            loopvar1 = node.init.left.name
            looplen1 = node.test.right
            to_return = []
            if all(isinstance(s, C.For) and hasattr(s, 'parallel') and s.parallel for s in node.body):
                for s in node.body:
                    body = s.body
                    kernel_args = set()
                    loopvar2 = s.init.left.name
                    looplen2 = s.test.right
                    kernel_name = self._gen_unique_kernel_name()
                    params = self.collect_args_and_insert_casts(kernel_args, body)
                    body.insert(0, C.Assign(
                        C.SymbolRef(loopvar1, ctypes.c_int()), 
                        C.FunctionCall(C.SymbolRef("get_global_id"), [C.Constant(0)])
                    ))
                    body.insert(0, C.Assign(
                        C.SymbolRef(loopvar2, ctypes.c_int()), 
                        C.FunctionCall(C.SymbolRef("get_global_id"), [C.Constant(1)])
                    ))
                    kernel_src = C.FunctionDecl(None, C.SymbolRef(kernel_name), params, body)
                    kernel_src.set_kernel()
                    self.build_kernel(kernel_src, kernel_name, kernel_args)
                    to_return.append(StringTemplate(
                        """
                        size_t global_size_{kernel_name}[2] = {{{looplen1}, {looplen2}}};
                        clEnqueueNDRangeKernel(queue, {kernel_name}, 2, NULL, global_size_{kernel_name}, NULL, 0, NULL, NULL);
                        clFinish(queue);
                        """.format(
                            kernel_name=kernel_name, 
                            looplen1=looplen1,
                            looplen2=looplen2)
                    ))
            else:
                kernel_args = set()
                body = node.body
                kernel_name = self._gen_unique_kernel_name()
                params = self.collect_args_and_insert_casts(kernel_args, body)
                body.insert(0, C.Assign(
                    C.SymbolRef(loopvar1, ctypes.c_int()), 
                    C.FunctionCall(C.SymbolRef("get_global_id"), [C.Constant(0)])
                ))
                kernel_src = C.FunctionDecl(None, C.SymbolRef(kernel_name), params, body)
                kernel_src.set_kernel()
                self.build_kernel(kernel_src, kernel_name, kernel_args)
                to_return.append(StringTemplate(
                    """
                    size_t global_size_{kernel_name}[1] = {{{looplen1}}};
                    clEnqueueNDRangeKernel(queue, {kernel_name}, 1, NULL, global_size_{kernel_name}, NULL, 0, NULL, NULL);
                    clFinish(queue);
                    """.format(
                        kernel_name=kernel_name, 
                        looplen1=looplen1)
                ))
            if hasattr(node, 'reduce_vars') and len(node.reduce_vars) > 0:
                for var in node.reduce_vars:
                    size = np.prod(self.buffers[var].shape[1:])
                    to_return.append(self._gen_reduce_for_loop(node, var, size))
            return to_return

        else:
            raise NotImplementedError(node)
        return node