def test1(self): typingctx = typing.Context() targetctx = cpu.CPUContext(typingctx) test_ir = compiler.run_frontend(test_will_propagate) with cpu_target.nested_context(typingctx, targetctx): typingctx.refresh() targetctx.refresh() args = (types.int64, types.int64, types.int64) typemap, return_type, calltypes = type_inference_stage( typingctx, test_ir, args, None) type_annotation = type_annotations.TypeAnnotation( func_ir=test_ir, typemap=typemap, calltypes=calltypes, lifted=(), lifted_from=None, args=args, return_type=return_type, html_output=config.HTML) remove_dels(test_ir.blocks) in_cps, out_cps = copy_propagate(test_ir.blocks, typemap) apply_copy_propagate(test_ir.blocks, in_cps, get_name_var_table(test_ir.blocks), typemap, calltypes) remove_dead(test_ir.blocks, test_ir.arg_names, test_ir) self.assertFalse(findLhsAssign(test_ir, "x"))
def run_pass(self, state): """ Back-end: Generate LLVM IR from Numba IR, compile to machine code """ lowered = state["cr"] signature = typing.signature(state.return_type, *state.args) from numba.core.compiler import compile_result state.cr = compile_result( typing_context=state.typingctx, target_context=state.targetctx, entry_point=lowered.cfunc, typing_error=state.status.fail_reason, type_annotation=state.type_annotation, library=state.library, call_helper=lowered.call_helper, signature=signature, objectmode=False, lifted=state.lifted, fndesc=lowered.fndesc, environment=lowered.env, metadata=state.metadata, reload_init=state.reload_init, ) remove_dels(state.func_ir.blocks) return True
def run_pass(self, state): parfor_pass = numba.parfors.parfor.ParforPass( state.func_ir, state.type_annotation.typemap, state.type_annotation.calltypes, state.return_type, state.typingctx, state.flags.auto_parallel, state.flags, state.parfor_diagnostics, ) remove_dels(state.func_ir.blocks) parfor_pass.array_analysis.run(state.func_ir.blocks) parfor_pass._convert_loop(state.func_ir.blocks) remove_dead( state.func_ir.blocks, state.func_ir.arg_names, state.func_ir, state.type_annotation.typemap, ) numba.parfors.parfor.get_parfor_params( state.func_ir.blocks, parfor_pass.options.fusion, parfor_pass.nested_fusion_info, ) return True
def run_pass(self, state): """ Convert data-parallel computations into Parfor nodes """ # Ensure we have an IR and type information. assert state.func_ir parfor_pass = _parfor_ParforPass( state.func_ir, state.type_annotation.typemap, state.type_annotation.calltypes, state.return_type, state.typingctx, state.flags.auto_parallel, state.flags, state.parfor_diagnostics, ) parfor_pass.run() remove_dels(state.func_ir.blocks) # check the parfor pass worked and warn if it didn't has_parfor = False for blk in state.func_ir.blocks.values(): for stmnt in blk.body: if isinstance(stmnt, Parfor): has_parfor = True break else: continue break if not has_parfor: # parfor calls the compiler chain again with a string if not ( config.DISABLE_PERFORMANCE_WARNINGS or state.func_ir.loc.filename == "<string>" ): url = ( "http://numba.pydata.org/numba-doc/latest/user/" "parallel.html#diagnostics" ) msg = ( "\nThe keyword argument 'parallel=True' was specified " "but no transformation for parallel execution was " "possible.\n\nTo find out why, try turning on parallel " "diagnostics, see %s for help." % url ) warnings.warn(errors.NumbaPerformanceWarning(msg, state.func_ir.loc)) # Add reload function to initialize the parallel backend. state.reload_init.append(_reload_parfors) return True
def test1(self): typingctx = typing.Context() targetctx = cpu.CPUContext(typingctx) test_ir = compiler.run_frontend(test_will_propagate) with cpu_target.nested_context(typingctx, targetctx): typingctx.refresh() targetctx.refresh() args = (types.int64, types.int64, types.int64) typemap, _, calltypes, _ = type_inference_stage( typingctx, targetctx, test_ir, args, None) remove_dels(test_ir.blocks) in_cps, out_cps = copy_propagate(test_ir.blocks, typemap) apply_copy_propagate(test_ir.blocks, in_cps, get_name_var_table(test_ir.blocks), typemap, calltypes) remove_dead(test_ir.blocks, test_ir.arg_names, test_ir) self.assertFalse(findLhsAssign(test_ir, "x"))
def _stencil_wrapper(self, result, sigret, return_type, typemap, calltypes, *args): # Overall approach: # 1) Construct a string containing a function definition for the stencil function # that will execute the stencil kernel. This function definition includes a # unique stencil function name, the parameters to the stencil kernel, loop # nests across the dimensions of the input array. Those loop nests use the # computed stencil kernel size so as not to try to compute elements where # elements outside the bounds of the input array would be needed. # 2) The but of the loop nest in this new function is a special sentinel # assignment. # 3) Get the IR of this new function. # 4) Split the block containing the sentinel assignment and remove the sentinel # assignment. Insert the stencil kernel IR into the stencil function IR # after label and variable renaming of the stencil kernel IR to prevent # conflicts with the stencil function IR. # 5) Compile the combined stencil function IR + stencil kernel IR into existence. # Copy the kernel so that our changes for this callsite # won't effect other callsites. (kernel_copy, copy_calltypes) = self.copy_ir_with_calltypes(self.kernel_ir, calltypes) # The stencil kernel body becomes the body of a loop, for which args aren't needed. ir_utils.remove_args(kernel_copy.blocks) first_arg = kernel_copy.arg_names[0] in_cps, out_cps = ir_utils.copy_propagate(kernel_copy.blocks, typemap) name_var_table = ir_utils.get_name_var_table(kernel_copy.blocks) ir_utils.apply_copy_propagate(kernel_copy.blocks, in_cps, name_var_table, typemap, copy_calltypes) if "out" in name_var_table: raise ValueError( "Cannot use the reserved word 'out' in stencil kernels.") sentinel_name = ir_utils.get_unused_var_name("__sentinel__", name_var_table) if config.DEBUG_ARRAY_OPT >= 1: print("name_var_table", name_var_table, sentinel_name) the_array = args[0] if config.DEBUG_ARRAY_OPT >= 1: print("_stencil_wrapper", return_type, return_type.dtype, type(return_type.dtype), args) ir_utils.dump_blocks(kernel_copy.blocks) # We generate a Numba function to execute this stencil and here # create the unique name of this function. stencil_func_name = "__numba_stencil_%s_%s" % (hex( id(the_array)).replace("-", "_"), self.id) # We will put a loop nest in the generated function for each # dimension in the input array. Here we create the name for # the index variable for each dimension. index0, index1, ... index_vars = [] for i in range(the_array.ndim): index_var_name = ir_utils.get_unused_var_name( "index" + str(i), name_var_table) index_vars += [index_var_name] # Create extra signature for out and neighborhood. out_name = ir_utils.get_unused_var_name("out", name_var_table) neighborhood_name = ir_utils.get_unused_var_name( "neighborhood", name_var_table) sig_extra = "" if result is not None: sig_extra += ", {}=None".format(out_name) if "neighborhood" in dict(self.kws): sig_extra += ", {}=None".format(neighborhood_name) # Get a list of the standard indexed array names. standard_indexed = self.options.get("standard_indexing", []) if first_arg in standard_indexed: raise ValueError("The first argument to a stencil kernel must " "use relative indexing, not standard indexing.") if len(set(standard_indexed) - set(kernel_copy.arg_names)) != 0: raise ValueError("Standard indexing requested for an array name " "not present in the stencil kernel definition.") # Add index variables to getitems in the IR to transition the accesses # in the kernel from relative to regular Python indexing. Returns the # computed size of the stencil kernel and a list of the relatively indexed # arrays. kernel_size, relatively_indexed = self.add_indices_to_kernel( kernel_copy, index_vars, the_array.ndim, self.neighborhood, standard_indexed, typemap, copy_calltypes) if self.neighborhood is None: self.neighborhood = kernel_size if config.DEBUG_ARRAY_OPT >= 1: print("After add_indices_to_kernel") ir_utils.dump_blocks(kernel_copy.blocks) # The return in the stencil kernel becomes a setitem for that # particular point in the iteration space. ret_blocks = self.replace_return_with_setitem(kernel_copy.blocks, index_vars, out_name) if config.DEBUG_ARRAY_OPT >= 1: print("After replace_return_with_setitem", ret_blocks) ir_utils.dump_blocks(kernel_copy.blocks) # Start to form the new function to execute the stencil kernel. func_text = "def {}({}{}):\n".format(stencil_func_name, ",".join(kernel_copy.arg_names), sig_extra) # Get loop ranges for each dimension, which could be either int # or variable. In the latter case we'll use the extra neighborhood # argument to the function. ranges = [] for i in range(the_array.ndim): if isinstance(kernel_size[i][0], int): lo = kernel_size[i][0] hi = kernel_size[i][1] else: lo = "{}[{}][0]".format(neighborhood_name, i) hi = "{}[{}][1]".format(neighborhood_name, i) ranges.append((lo, hi)) # If there are more than one relatively indexed arrays, add a call to # a function that will raise an error if any of the relatively indexed # arrays are of different size than the first input array. if len(relatively_indexed) > 1: func_text += " raise_if_incompatible_array_sizes(" + first_arg for other_array in relatively_indexed: if other_array != first_arg: func_text += "," + other_array func_text += ")\n" # Get the shape of the first input array. shape_name = ir_utils.get_unused_var_name("full_shape", name_var_table) func_text += " {} = {}.shape\n".format(shape_name, first_arg) # If we have to allocate the output array (the out argument was not used) # then us numpy.full if the user specified a cval stencil decorator option # or np.zeros if they didn't to allocate the array. if result is None: return_type_name = numpy_support.as_dtype( return_type.dtype).type.__name__ if "cval" in self.options: cval = self.options["cval"] if return_type.dtype != typing.typeof.typeof(cval): raise ValueError( "cval type does not match stencil return type.") out_init = "{} = np.full({}, {}, dtype=np.{})\n".format( out_name, shape_name, cval, return_type_name) else: out_init = "{} = np.zeros({}, dtype=np.{})\n".format( out_name, shape_name, return_type_name) func_text += " " + out_init else: # result is present, if cval is set then use it if "cval" in self.options: cval = self.options["cval"] cval_ty = typing.typeof.typeof(cval) if not self._typingctx.can_convert(cval_ty, return_type.dtype): msg = "cval type does not match stencil return type." raise ValueError(msg) out_init = "{}[:] = {}\n".format(out_name, cval) func_text += " " + out_init offset = 1 # Add the loop nests to the new function. for i in range(the_array.ndim): for j in range(offset): func_text += " " # ranges[i][0] is the minimum index used in the i'th dimension # but minimum's greater than 0 don't preclude any entry in the array. # So, take the minimum of 0 and the minimum index found in the kernel # and this will be a negative number (potentially -0). Then, we do # unary - on that to get the positive offset in this dimension whose # use is precluded. # ranges[i][1] is the maximum of 0 and the observed maximum index # in this dimension because negative maximums would not cause us to # preclude any entry in the array from being used. func_text += ("for {} in range(-min(0,{})," "{}[{}]-max(0,{})):\n").format( index_vars[i], ranges[i][0], shape_name, i, ranges[i][1]) offset += 1 for j in range(offset): func_text += " " # Put a sentinel in the code so we can locate it in the IR. We will # remove this sentinel assignment and replace it with the IR for the # stencil kernel body. func_text += "{} = 0\n".format(sentinel_name) func_text += " return {}\n".format(out_name) if config.DEBUG_ARRAY_OPT >= 1: print("new stencil func text") print(func_text) # Force the new stencil function into existence. exec(func_text) in globals(), locals() stencil_func = eval(stencil_func_name) if sigret is not None: pysig = utils.pysignature(stencil_func) sigret.pysig = pysig # Get the IR for the newly created stencil function. from numba.core import compiler stencil_ir = compiler.run_frontend(stencil_func) ir_utils.remove_dels(stencil_ir.blocks) # rename all variables in stencil_ir afresh var_table = ir_utils.get_name_var_table(stencil_ir.blocks) new_var_dict = {} reserved_names = ( [sentinel_name, out_name, neighborhood_name, shape_name] + kernel_copy.arg_names + index_vars) for name, var in var_table.items(): if not name in reserved_names: new_var_dict[name] = ir_utils.mk_unique_var(name) ir_utils.replace_var_names(stencil_ir.blocks, new_var_dict) stencil_stub_last_label = max(stencil_ir.blocks.keys()) + 1 # Shift labels in the kernel copy so they are guaranteed unique # and don't conflict with any labels in the stencil_ir. kernel_copy.blocks = ir_utils.add_offset_to_labels( kernel_copy.blocks, stencil_stub_last_label) new_label = max(kernel_copy.blocks.keys()) + 1 # Adjust ret_blocks to account for addition of the offset. ret_blocks = [x + stencil_stub_last_label for x in ret_blocks] if config.DEBUG_ARRAY_OPT >= 1: print("ret_blocks w/ offsets", ret_blocks, stencil_stub_last_label) print("before replace sentinel stencil_ir") ir_utils.dump_blocks(stencil_ir.blocks) print("before replace sentinel kernel_copy") ir_utils.dump_blocks(kernel_copy.blocks) # Search all the block in the stencil outline for the sentinel. for label, block in stencil_ir.blocks.items(): for i, inst in enumerate(block.body): if (isinstance(inst, ir.Assign) and inst.target.name == sentinel_name): # We found the sentinel assignment. loc = inst.loc scope = block.scope # split block across __sentinel__ # A new block is allocated for the statements prior to the # sentinel but the new block maintains the current block # label. prev_block = ir.Block(scope, loc) prev_block.body = block.body[:i] # The current block is used for statements after sentinel. block.body = block.body[i + 1:] # But the current block gets a new label. body_first_label = min(kernel_copy.blocks.keys()) # The previous block jumps to the minimum labelled block of # the parfor body. prev_block.append(ir.Jump(body_first_label, loc)) # Add all the parfor loop body blocks to the gufunc # function's IR. for (l, b) in kernel_copy.blocks.items(): stencil_ir.blocks[l] = b stencil_ir.blocks[new_label] = block stencil_ir.blocks[label] = prev_block # Add a jump from all the blocks that previously contained # a return in the stencil kernel to the block # containing statements after the sentinel. for ret_block in ret_blocks: stencil_ir.blocks[ret_block].append( ir.Jump(new_label, loc)) break else: continue break stencil_ir.blocks = ir_utils.rename_labels(stencil_ir.blocks) ir_utils.remove_dels(stencil_ir.blocks) assert (isinstance(the_array, types.Type)) array_types = args new_stencil_param_types = list(array_types) if config.DEBUG_ARRAY_OPT >= 1: print("new_stencil_param_types", new_stencil_param_types) ir_utils.dump_blocks(stencil_ir.blocks) # Compile the combined stencil function with the replaced loop # body in it. new_func = compiler.compile_ir(self._typingctx, self._targetctx, stencil_ir, new_stencil_param_types, None, compiler.DEFAULT_FLAGS, {}) return new_func
def remove_dels(self): """ Strips the IR of Del nodes """ ir_utils.remove_dels(self.func_ir.blocks)
def get_stencil_ir(sf, typingctx, args, scope, loc, input_dict, typemap, calltypes): """get typed IR from stencil bytecode """ from numba.core.cpu import CPUContext from numba.core.registry import cpu_target from numba.core.annotations import type_annotations from numba.core.typed_passes import type_inference_stage # get untyped IR stencil_func_ir = sf.kernel_ir.copy() # copy the IR nodes to avoid changing IR in the StencilFunc object stencil_blocks = copy.deepcopy(stencil_func_ir.blocks) stencil_func_ir.blocks = stencil_blocks name_var_table = ir_utils.get_name_var_table(stencil_func_ir.blocks) if "out" in name_var_table: raise ValueError( "Cannot use the reserved word 'out' in stencil kernels.") # get typed IR with a dummy pipeline (similar to test_parfors.py) targetctx = CPUContext(typingctx) with cpu_target.nested_context(typingctx, targetctx): tp = DummyPipeline(typingctx, targetctx, args, stencil_func_ir) rewrites.rewrite_registry.apply('before-inference', tp.state) tp.state.typemap, tp.state.return_type, tp.state.calltypes = type_inference_stage( tp.state.typingctx, tp.state.func_ir, tp.state.args, None) type_annotations.TypeAnnotation(func_ir=tp.state.func_ir, typemap=tp.state.typemap, calltypes=tp.state.calltypes, lifted=(), lifted_from=None, args=tp.state.args, return_type=tp.state.return_type, html_output=config.HTML) # make block labels unique stencil_blocks = ir_utils.add_offset_to_labels(stencil_blocks, ir_utils.next_label()) min_label = min(stencil_blocks.keys()) max_label = max(stencil_blocks.keys()) ir_utils._max_label = max_label if config.DEBUG_ARRAY_OPT >= 1: print("Initial stencil_blocks") ir_utils.dump_blocks(stencil_blocks) # rename variables, var_dict = {} for v, typ in tp.state.typemap.items(): new_var = ir.Var(scope, mk_unique_var(v), loc) var_dict[v] = new_var typemap[new_var.name] = typ # add new var type for overall function ir_utils.replace_vars(stencil_blocks, var_dict) if config.DEBUG_ARRAY_OPT >= 1: print("After replace_vars") ir_utils.dump_blocks(stencil_blocks) # add call types to overall function for call, call_typ in tp.state.calltypes.items(): calltypes[call] = call_typ arg_to_arr_dict = {} # replace arg with arr for block in stencil_blocks.values(): for stmt in block.body: if isinstance(stmt, ir.Assign) and isinstance(stmt.value, ir.Arg): if config.DEBUG_ARRAY_OPT >= 1: print("input_dict", input_dict, stmt.value.index, stmt.value.name, stmt.value.index in input_dict) arg_to_arr_dict[stmt.value.name] = input_dict[ stmt.value.index].name stmt.value = input_dict[stmt.value.index] if config.DEBUG_ARRAY_OPT >= 1: print("arg_to_arr_dict", arg_to_arr_dict) print("After replace arg with arr") ir_utils.dump_blocks(stencil_blocks) ir_utils.remove_dels(stencil_blocks) stencil_func_ir.blocks = stencil_blocks return stencil_func_ir, sf.get_return_type(args)[0], arg_to_arr_dict
def _create_gufunc_for_parfor_body( lowerer, parfor, typemap, typingctx, targetctx, flags, loop_ranges, locals, has_aliases, index_var_typ, races, ): """ Takes a parfor and creates a gufunc function for its body. There are two parts to this function: 1) Code to iterate across the iteration space as defined by the schedule. 2) The parfor body that does the work for a single point in the iteration space. Part 1 is created as Python text for simplicity with a sentinel assignment to mark the point in the IR where the parfor body should be added. This Python text is 'exec'ed into existence and its IR retrieved with run_frontend. The IR is scanned for the sentinel assignment where that basic block is split and the IR for the parfor body inserted. """ loc = parfor.init_block.loc # The parfor body and the main function body share ir.Var nodes. # We have to do some replacements of Var names in the parfor body # to make them legal parameter names. If we don't copy then the # Vars in the main function also would incorrectly change their name. loop_body = copy.copy(parfor.loop_body) remove_dels(loop_body) parfor_dim = len(parfor.loop_nests) loop_indices = [l.index_variable.name for l in parfor.loop_nests] # Get all the parfor params. parfor_params = parfor.params for start, stop, step in loop_ranges: if isinstance(start, ir.Var): parfor_params.add(start.name) if isinstance(stop, ir.Var): parfor_params.add(stop.name) # Get just the outputs of the parfor. parfor_outputs = numba.parfors.parfor.get_parfor_outputs( parfor, parfor_params) # Get all parfor reduction vars, and operators. typemap = lowerer.fndesc.typemap parfor_redvars, parfor_reddict = numba.parfors.parfor.get_parfor_reductions( lowerer.func_ir, parfor, parfor_params, lowerer.fndesc.calltypes) has_reduction = False if len(parfor_redvars) == 0 else True if has_reduction: _create_gufunc_for_reduction_parfor() # Compute just the parfor inputs as a set difference. parfor_inputs = sorted(list(set(parfor_params) - set(parfor_outputs))) for race in races: msg = ("Variable %s used in parallel loop may be written " "to simultaneously by multiple workers and may result " "in non-deterministic or unintended results." % race) warnings.warn(NumbaParallelSafetyWarning(msg, loc)) replace_var_with_array(races, loop_body, typemap, lowerer.fndesc.calltypes) if config.DEBUG_ARRAY_OPT >= 1: print("parfor_params = ", parfor_params, type(parfor_params)) print("parfor_outputs = ", parfor_outputs, type(parfor_outputs)) print("parfor_inputs = ", parfor_inputs, type(parfor_inputs)) # Reorder all the params so that inputs go first then outputs. parfor_params = parfor_inputs + parfor_outputs def addrspace_from(params, def_addr): addrspaces = [] for p in params: if isinstance(to_scalar_from_0d(typemap[p]), types.npytypes.Array): addrspaces.append(def_addr) else: addrspaces.append(None) return addrspaces addrspaces = addrspace_from(parfor_params, address_space.GLOBAL) if config.DEBUG_ARRAY_OPT >= 1: print("parfor_params = ", parfor_params, type(parfor_params)) print("loop_indices = ", loop_indices, type(loop_indices)) print("loop_body = ", loop_body, type(loop_body)) _print_body(loop_body) # Some Var are not legal parameter names so create a dict of # potentially illegal param name to guaranteed legal name. param_dict = legalize_names_with_typemap(parfor_params, typemap) if config.DEBUG_ARRAY_OPT >= 1: print("param_dict = ", sorted(param_dict.items()), type(param_dict)) # Some loop_indices are not legal parameter names so create a dict # of potentially illegal loop index to guaranteed legal name. ind_dict = legalize_names_with_typemap(loop_indices, typemap) # Compute a new list of legal loop index names. legal_loop_indices = [ind_dict[v] for v in loop_indices] if config.DEBUG_ARRAY_OPT >= 1: print("ind_dict = ", sorted(ind_dict.items()), type(ind_dict)) print( "legal_loop_indices = ", legal_loop_indices, type(legal_loop_indices), ) for pd in parfor_params: print("pd = ", pd) print("pd type = ", typemap[pd], type(typemap[pd])) # Get the types of each parameter. param_types = [to_scalar_from_0d(typemap[v]) for v in parfor_params] param_types_addrspaces = copy.copy(param_types) # Calculate types of args passed to gufunc. func_arg_types = [typemap[v] for v in (parfor_inputs + parfor_outputs)] assert len(param_types_addrspaces) == len(addrspaces) for i in range(len(param_types_addrspaces)): if addrspaces[i] is not None: # Convert Numba's npytype.Array to DPPYArray data type. DPPYArray # allows us to specify an address space for the data and other # pointer arguments for the array. param_types_addrspaces[i] = npytypes_array_to_dppy_array( param_types_addrspaces[i], addrspaces[i]) def print_arg_with_addrspaces(args): for a in args: print(a, type(a)) if isinstance(a, types.npytypes.Array): print("addrspace:", a.addrspace) if config.DEBUG_ARRAY_OPT >= 1: print_arg_with_addrspaces(param_types) print("func_arg_types = ", func_arg_types, type(func_arg_types)) # Replace illegal parameter names in the loop body with legal ones. replace_var_names(loop_body, param_dict) # remember the name before legalizing as the actual arguments parfor_args = parfor_params # Change parfor_params to be legal names. parfor_params = [param_dict[v] for v in parfor_params] parfor_params_orig = parfor_params parfor_params = [] ascontig = False for pindex in range(len(parfor_params_orig)): if (ascontig and pindex < len(parfor_inputs) and isinstance(param_types[pindex], types.npytypes.Array)): parfor_params.append(parfor_params_orig[pindex] + "param") else: parfor_params.append(parfor_params_orig[pindex]) # Change parfor body to replace illegal loop index vars with legal ones. replace_var_names(loop_body, ind_dict) loop_body_var_table = get_name_var_table(loop_body) sentinel_name = get_unused_var_name("__sentinel__", loop_body_var_table) if config.DEBUG_ARRAY_OPT >= 1: print("legal parfor_params = ", parfor_params, type(parfor_params)) # Determine the unique names of the scheduling and gufunc functions. gufunc_name = "__numba_parfor_gufunc_%s" % (parfor.id) if config.DEBUG_ARRAY_OPT: # print("sched_func_name ", type(sched_func_name), sched_func_name) print("gufunc_name ", type(gufunc_name), gufunc_name) gufunc_txt = "" # Create the gufunc function. gufunc_txt += "def " + gufunc_name gufunc_txt += "(" + (", ".join(parfor_params)) + "):\n" gufunc_txt += _schedule_loop(parfor_dim, legal_loop_indices, loop_ranges, param_dict) # Add the sentinel assignment so that we can find the loop body position # in the IR. gufunc_txt += " " gufunc_txt += sentinel_name + " = 0\n" # gufunc returns nothing gufunc_txt += " return None\n" if config.DEBUG_ARRAY_OPT: print("gufunc_txt = ", type(gufunc_txt), "\n", gufunc_txt) sys.stdout.flush() # Force gufunc outline into existence. globls = {"np": np, "numba": numba, "dppy": dppy} locls = {} exec(gufunc_txt, globls, locls) gufunc_func = locls[gufunc_name] if config.DEBUG_ARRAY_OPT: print("gufunc_func = ", type(gufunc_func), "\n", gufunc_func) # Get the IR for the gufunc outline. gufunc_ir = compiler.run_frontend(gufunc_func) if config.DEBUG_ARRAY_OPT: print("gufunc_ir dump ", type(gufunc_ir)) gufunc_ir.dump() print("loop_body dump ", type(loop_body)) _print_body(loop_body) # rename all variables in gufunc_ir afresh var_table = get_name_var_table(gufunc_ir.blocks) new_var_dict = {} reserved_names = ([sentinel_name] + list(param_dict.values()) + legal_loop_indices) for name, var in var_table.items(): if not (name in reserved_names): new_var_dict[name] = mk_unique_var(name) replace_var_names(gufunc_ir.blocks, new_var_dict) if config.DEBUG_ARRAY_OPT: print("gufunc_ir dump after renaming ") gufunc_ir.dump() prs_dict = {} pss_dict = {} pspmd_dict = {} gufunc_param_types = param_types if config.DEBUG_ARRAY_OPT: print( "gufunc_param_types = ", type(gufunc_param_types), "\n", gufunc_param_types, ) gufunc_stub_last_label = max(gufunc_ir.blocks.keys()) + 1 # Add gufunc stub last label to each parfor.loop_body label to prevent # label conflicts. loop_body = add_offset_to_labels(loop_body, gufunc_stub_last_label) # new label for splitting sentinel block new_label = max(loop_body.keys()) + 1 # If enabled, add a print statement after every assignment. if config.DEBUG_ARRAY_OPT_RUNTIME: _dbgprint_after_each_array_assignments(lowerer, loop_body, typemap) if config.DEBUG_ARRAY_OPT: print("parfor loop body") _print_body(loop_body) wrapped_blocks = wrap_loop_body(loop_body) # hoisted, not_hoisted = hoist(parfor_params, loop_body, # typemap, wrapped_blocks) setitems = set() find_setitems_body(setitems, loop_body, typemap) hoisted = [] not_hoisted = [] start_block = gufunc_ir.blocks[min(gufunc_ir.blocks.keys())] start_block.body = start_block.body[:-1] + hoisted + [start_block.body[-1]] unwrap_loop_body(loop_body) # store hoisted into diagnostics diagnostics = lowerer.metadata["parfor_diagnostics"] diagnostics.hoist_info[parfor.id] = { "hoisted": hoisted, "not_hoisted": not_hoisted, } lowerer.metadata["parfor_diagnostics"].extra_info[str(parfor.id)] = str( dpctl.get_current_queue().get_sycl_device().name) if config.DEBUG_ARRAY_OPT: print("After hoisting") _print_body(loop_body) # Search all the block in the gufunc outline for the sentinel assignment. for label, block in gufunc_ir.blocks.items(): for i, inst in enumerate(block.body): if (isinstance(inst, ir.Assign) and inst.target.name == sentinel_name): # We found the sentinel assignment. loc = inst.loc scope = block.scope # split block across __sentinel__ # A new block is allocated for the statements prior to the # sentinel but the new block maintains the current block label. prev_block = ir.Block(scope, loc) prev_block.body = block.body[:i] # The current block is used for statements after the sentinel. block.body = block.body[i + 1:] # But the current block gets a new label. body_first_label = min(loop_body.keys()) # The previous block jumps to the minimum labelled block of the # parfor body. prev_block.append(ir.Jump(body_first_label, loc)) # Add all the parfor loop body blocks to the gufunc function's # IR. for (l, b) in loop_body.items(): gufunc_ir.blocks[l] = b body_last_label = max(loop_body.keys()) gufunc_ir.blocks[new_label] = block gufunc_ir.blocks[label] = prev_block # Add a jump from the last parfor body block to the block # containing statements after the sentinel. gufunc_ir.blocks[body_last_label].append( ir.Jump(new_label, loc)) break else: continue break if config.DEBUG_ARRAY_OPT: print("gufunc_ir last dump before renaming") gufunc_ir.dump() gufunc_ir.blocks = rename_labels(gufunc_ir.blocks) remove_dels(gufunc_ir.blocks) if config.DEBUG_ARRAY_OPT: sys.stdout.flush() if config.DEBUG_ARRAY_OPT: print("gufunc_ir last dump") gufunc_ir.dump() print("flags", flags) print("typemap", typemap) old_alias = flags.noalias if not has_aliases: if config.DEBUG_ARRAY_OPT: print("No aliases found so adding noalias flag.") flags.noalias = True remove_dead(gufunc_ir.blocks, gufunc_ir.arg_names, gufunc_ir, typemap) if config.DEBUG_ARRAY_OPT: print("gufunc_ir after remove dead") gufunc_ir.dump() kernel_sig = signature(types.none, *gufunc_param_types) if config.DEBUG_ARRAY_OPT: sys.stdout.flush() if config.DEBUG_ARRAY_OPT: print("before DUFunc inlining".center(80, "-")) gufunc_ir.dump() # Inlining all DUFuncs dufunc_inliner( gufunc_ir, lowerer.fndesc.calltypes, typemap, lowerer.context.typing_context, lowerer.context, ) if config.DEBUG_ARRAY_OPT: print("after DUFunc inline".center(80, "-")) gufunc_ir.dump() kernel_func = dppy.compiler.compile_kernel_parfor( dpctl.get_current_queue(), gufunc_ir, gufunc_param_types, param_types_addrspaces, debug=flags.debuginfo, ) flags.noalias = old_alias if config.DEBUG_ARRAY_OPT: print("kernel_sig = ", kernel_sig) return kernel_func, parfor_args, kernel_sig, func_arg_types, setitems