def _analyze_call_set_REP(self, lhs, args, array_dists, fdef=None):
    """Force replicated distribution for an unrecognized call site.

    Every argument that is an array, an array container, or a DataFrame is
    marked ``Distribution.REP`` in ``array_dists``, and so is the call's
    output ``lhs`` — nothing is known about how the callee ``fdef``
    partitions data, so replication is the only safe assumption.
    """
    def needs_rep(varname):
        # arrays, array containers and DataFrames all carry a distribution
        return (is_array(self.typemap, varname)
                or is_array_container(self.typemap, varname)
                or isinstance(self.typemap[varname], DataFrameType))

    for arg_var in args:
        if needs_rep(arg_var.name):
            dprint("dist setting call arg REP {} in {}".format(
                arg_var.name, fdef))
            array_dists[arg_var.name] = Distribution.REP

    if needs_rep(lhs):
        dprint("dist setting call out REP {} in {}".format(lhs, fdef))
        array_dists[lhs] = Distribution.REP
def _set_REP(self, var_list, array_dists):
    """Mark every distributable variable in ``var_list`` as replicated.

    Array, array-container, Series and DataFrame typed variables get
    ``Distribution.REP`` in ``array_dists``. Variables defined by a
    ``build_tuple`` are unpacked recursively so tuples of arrays are
    replicated as well.
    """
    for var in var_list:
        name = var.name
        # SeriesType comes from Arg nodes and may carry a user-defined
        # distribution, so it is treated like an array here
        if (is_array(self.typemap, name)
                or is_array_container(self.typemap, name)
                or isinstance(self.typemap[name],
                              (SeriesType, DataFrameType))):
            dprint("dist setting REP {}".format(name))
            array_dists[name] = Distribution.REP
        # a build_tuple may pack arrays; recurse over its items
        # (isinstance is False for None, so no separate None check needed)
        defn = guard(get_definition, self.func_ir, var)
        if isinstance(defn, ir.Expr) and defn.op == 'build_tuple':
            self._set_REP(defn.items, array_dists)
def _analyze_call(self, lhs, rhs, func_var, args, array_dists):
    """analyze array distributions in function calls

    Resolves the callee of ``rhs`` and dispatches on (func_name, func_mod),
    updating ``array_dists`` in place.  Recognized calls either keep, meet,
    or constrain distributions; anything unrecognized falls through to
    ``_analyze_call_set_REP`` which replicates all array-like args and the
    output ``lhs``.
    """
    func_name = ""
    func_mod = ""
    fdef = guard(find_callname, self.func_ir, rhs, self.typemap)
    if fdef is None:
        # check ObjModeLiftedWith, we assume distribution doesn't change
        # blocks of data are passed in, TODO: document
        # NOTE(review): numba.dispatcher is the pre-0.49 numba layout while
        # the assert_equiv check below uses numba.parfors (post-0.49) —
        # verify both paths against the pinned numba version.
        func_def = guard(get_definition, self.func_ir, rhs.func)
        if isinstance(func_def, ir.Const) and isinstance(
                func_def.value, numba.dispatcher.ObjModeLiftedWith):
            return
        warnings.warn(
            "function call couldn't be found for distributed analysis")
        self._analyze_call_set_REP(lhs, args, array_dists, fdef)
        return
    else:
        func_name, func_mod = fdef

    # allocation calls (np.empty etc.) start as 1D-distributable
    if is_alloc_callname(func_name, func_mod):
        if lhs not in array_dists:
            array_dists[lhs] = Distribution.OneD
        return

    # numpy direct functions
    if isinstance(func_mod, str) and func_mod == 'numpy':
        self._analyze_call_np(lhs, func_name, args, array_dists)
        return

    # handle array.func calls
    if isinstance(func_mod, ir.Var) and is_array(self.typemap,
                                                 func_mod.name):
        self._analyze_call_array(lhs, func_mod, func_name, args,
                                 array_dists)
        return

    # handle df.func calls
    if isinstance(func_mod, ir.Var) and isinstance(
            self.typemap[func_mod.name], DataFrameType):
        self._analyze_call_df(lhs, func_mod, func_name, args, array_dists)
        return

    # sdc.distributed_api functions
    if isinstance(func_mod, str) and func_mod == 'sdc.distributed_api':
        self._analyze_call_hpat_dist(lhs, func_name, args, array_dists)
        return

    # len() — scalar result, no distribution impact
    if func_name == 'len' and func_mod in ('__builtin__', 'builtins'):
        return

    if fdef == ('quantile', 'sdc.hiframes.api'):
        # quantile doesn't affect input's distribution
        return

    if fdef == ('nunique', 'sdc.hiframes.api'):
        # nunique doesn't affect input's distribution
        return

    if fdef == ('unique', 'sdc.hiframes.api'):
        # doesn't affect distribution of input since input can stay 1D
        # output is 1D_Var at best; meet it with the input's distribution
        if lhs not in array_dists:
            array_dists[lhs] = Distribution.OneD_Var
        new_dist = Distribution(
            min(array_dists[lhs].value,
                array_dists[rhs.args[0].name].value))
        array_dists[lhs] = new_dist
        return

    if fdef == ('rolling_fixed', 'sdc.hiframes.rolling'):
        self._meet_array_dists(lhs, rhs.args[0].name, array_dists)
        return

    if fdef == ('rolling_variable', 'sdc.hiframes.rolling'):
        # lhs, in_arr, on_arr should have the same distribution
        new_dist = self._meet_array_dists(lhs, rhs.args[0].name,
                                          array_dists)
        new_dist = self._meet_array_dists(lhs, rhs.args[1].name,
                                          array_dists, new_dist)
        array_dists[rhs.args[0].name] = new_dist
        return

    if fdef == ('shift', 'sdc.hiframes.rolling'):
        self._meet_array_dists(lhs, rhs.args[0].name, array_dists)
        return

    if fdef == ('pct_change', 'sdc.hiframes.rolling'):
        self._meet_array_dists(lhs, rhs.args[0].name, array_dists)
        return

    if fdef == ('nlargest', 'sdc.hiframes.api'):
        # output of nlargest is REP
        array_dists[lhs] = Distribution.REP
        return

    if fdef == ('median', 'sdc.hiframes.api'):
        return

    if fdef == ('concat', 'sdc.hiframes.api'):
        # hiframes concat is similar to np.concatenate
        self._analyze_call_np_concatenate(lhs, args, array_dists)
        return

    if fdef == ('isna', 'sdc.hiframes.api'):
        return

    if fdef == ('get_series_name', 'sdc.hiframes.api'):
        return

    # dummy hiframes functions — pass-through, output meets first arg
    if func_mod == 'sdc.hiframes.api' and func_name in (
            'get_series_data', 'get_series_index', 'to_arr_from_series',
            'ts_series_to_arr_typ', 'to_date_series_type',
            'dummy_unbox_series', 'parallel_fix_df_array'):
        # TODO: support Series type similar to Array
        self._meet_array_dists(lhs, rhs.args[0].name, array_dists)
        return

    if fdef == ('init_series', 'sdc.hiframes.api'):
        # lhs, in_arr, and index should have the same distribution
        new_dist = self._meet_array_dists(lhs, rhs.args[0].name,
                                          array_dists)
        # index (arg 1) participates only when present and not none
        if len(rhs.args) > 1 and self.typemap[
                rhs.args[1].name] != types.none:
            new_dist = self._meet_array_dists(lhs, rhs.args[1].name,
                                              array_dists, new_dist)
        array_dists[rhs.args[0].name] = new_dist
        return

    if fdef == ('init_dataframe', 'sdc.hiframes.pd_dataframe_ext'):
        # lhs, data arrays, and index should have the same distribution
        df_typ = self.typemap[lhs]
        n_cols = len(df_typ.columns)
        for i in range(n_cols):
            new_dist = self._meet_array_dists(lhs, rhs.args[i].name,
                                              array_dists)
        # handle index
        if len(rhs.args) > n_cols and self.typemap[
                rhs.args[n_cols].name] != types.none:
            new_dist = self._meet_array_dists(lhs, rhs.args[n_cols].name,
                                              array_dists, new_dist)
        # propagate the final meet result back to every column array
        for i in range(n_cols):
            array_dists[rhs.args[i].name] = new_dist
        return

    if fdef == ('get_dataframe_data', 'sdc.hiframes.pd_dataframe_ext'):
        self._meet_array_dists(lhs, rhs.args[0].name, array_dists)
        return

    if fdef == ('compute_split_view', 'sdc.hiframes.split_impl'):
        self._meet_array_dists(lhs, rhs.args[0].name, array_dists)
        return

    if fdef == ('get_split_view_index', 'sdc.hiframes.split_impl'):
        # just used in str.get() implementation for now so we know it is
        # parallel
        # TODO: handle index similar to getitem to support more cases
        return

    if fdef == ('get_split_view_data_ptr', 'sdc.hiframes.split_impl'):
        return

    if fdef == ('setitem_str_arr_ptr', 'sdc.str_arr_ext'):
        return

    if fdef == ('num_total_chars', 'sdc.str_arr_ext'):
        return

    if fdef == ('_series_dropna_str_alloc_impl_inner',
                'sdc.hiframes.series_kernels'):
        # dropna output is 1D_Var at best; meet with the input
        if lhs not in array_dists:
            array_dists[lhs] = Distribution.OneD_Var
        in_dist = array_dists[rhs.args[0].name]
        out_dist = array_dists[lhs]
        out_dist = Distribution(min(out_dist.value, in_dist.value))
        array_dists[lhs] = out_dist
        # output can cause input REP
        if out_dist != Distribution.OneD_Var:
            array_dists[rhs.args[0].name] = out_dist
        return

    if (fdef == ('copy_non_null_offsets', 'sdc.str_arr_ext')
            or fdef == ('copy_data', 'sdc.str_arr_ext')):
        # args are (out_arr, in_arr); their distributions must meet
        out_arrname = rhs.args[0].name
        in_arrname = rhs.args[1].name
        self._meet_array_dists(out_arrname, in_arrname, array_dists)
        return

    if fdef == ('str_arr_item_to_numeric', 'sdc.str_arr_ext'):
        # args: (out_arr, out_index, in_arr, ...) — meet out with in
        out_arrname = rhs.args[0].name
        in_arrname = rhs.args[2].name
        self._meet_array_dists(out_arrname, in_arrname, array_dists)
        return

    # np.fromfile()
    if fdef == ('file_read', 'sdc.io.np_io'):
        return

    if sdc.config._has_pyarrow and fdef == ('read_parquet',
                                            'sdc.io.parquet_pio'):
        return

    if sdc.config._has_pyarrow and fdef == ('read_parquet_str',
                                            'sdc.io.parquet_pio'):
        # string read creates array in output
        if lhs not in array_dists:
            array_dists[lhs] = Distribution.OneD
        return

    # TODO: make sure assert_equiv is not generated unnecessarily
    # TODO: fix assert_equiv for np.stack from df.value
    if fdef == ('assert_equiv', 'numba.parfors.parfor.array_analysis'):
        return

    # we perform call-analysis from external at the end
    # _extra_call maps (receiver type, method name) to a user-registered
    # analysis hook; a truthy return means the hook handled the call
    if isinstance(func_mod, ir.Var):
        ky = (self.typemap[func_mod.name], func_name)
        if ky in DistributedAnalysis._extra_call:
            if DistributedAnalysis._extra_call[ky](lhs, func_mod, *ky,
                                                   args, array_dists):
                return

    # set REP if not found
    self._analyze_call_set_REP(lhs, args, array_dists, fdef)
def _analyze_call_np(self, lhs, func_name, args, array_dists):
    """analyze distributions of numpy functions (np.func_name)

    Updates ``array_dists`` in place for recognized numpy calls; anything
    unrecognized falls through to ``_analyze_call_set_REP`` which forces
    replication.
    """
    if func_name == 'ascontiguousarray':
        self._meet_array_dists(lhs, args[0].name, array_dists)
        return

    if func_name == 'ravel':
        self._meet_array_dists(lhs, args[0].name, array_dists)
        return

    if func_name == 'concatenate':
        self._analyze_call_np_concatenate(lhs, args, array_dists)
        return

    # np.array on an existing array behaves like a copy
    if func_name == 'array' and is_array(self.typemap, args[0].name):
        self._meet_array_dists(lhs, args[0].name, array_dists)
        return

    # sum over the first axis is distributed, A.sum(0)
    # (result is a reduced array, replicated on all ranks; note that when
    # the axis is not a constant 0 this intentionally falls through)
    if func_name == 'sum' and len(args) == 2:
        axis_def = guard(get_definition, self.func_ir, args[1])
        if isinstance(axis_def, ir.Const) and axis_def.value == 0:
            array_dists[lhs] = Distribution.REP
            return

    if func_name == 'dot':
        self._analyze_call_np_dot(lhs, args, array_dists)
        return

    # used in df.values
    if func_name == 'stack':
        seq_info = guard(find_build_sequence, self.func_ir, args[0])
        if seq_info is None:
            # can't see the stacked sequence statically: replicate
            self._analyze_call_set_REP(lhs, args, array_dists,
                                       'np.' + func_name)
            return
        in_arrs, _ = seq_info
        axis = 0
        # TODO: support kws
        # if 'axis' in kws:
        #     axis = find_const(self.func_ir, kws['axis'])
        if len(args) > 1:
            # find_const may return None if the axis isn't a constant
            axis = find_const(self.func_ir, args[1])
        # parallel if args are 1D and output is 2D and axis == 1
        if axis is not None and axis == 1 and self.typemap[lhs].ndim == 2:
            for v in in_arrs:
                self._meet_array_dists(lhs, v.name, array_dists)
            return

    # elementwise/shape-preserving ops: output meets the input
    if (func_name in [
            'cumsum', 'cumprod', 'empty_like', 'zeros_like', 'ones_like',
            'full_like', 'copy'
    ]):
        in_arr = args[0].name
        self._meet_array_dists(lhs, in_arr, array_dists)
        return

    # set REP if not found
    self._analyze_call_set_REP(lhs, args, array_dists, 'np.' + func_name)
def _analyze_assign(self, inst, array_dists, parfor_dists):
    """Analyze one IR assignment and update distributions in place.

    Dispatches on the RHS expression kind; branch ORDER matters here
    (e.g. the ``getattr .T`` case must precede the generic attribute
    whitelist).  ``parfor_dists`` is unused in this method — presumably
    kept for signature parity with other analyzers; TODO confirm.
    """
    lhs = inst.target.name
    rhs = inst.value
    # treat return casts like assignments
    if isinstance(rhs, ir.Expr) and rhs.op == 'cast':
        rhs = rhs.value

    # plain copy: lhs and rhs share a distribution
    if isinstance(rhs, ir.Var) and (is_array(self.typemap, lhs)
                                    or isinstance(self.typemap[lhs],
                                                  (SeriesType,
                                                   DataFrameType))
                                    or is_array_container(self.typemap,
                                                          lhs)):
        self._meet_array_dists(lhs, rhs.name, array_dists)
        return
    elif (is_array(self.typemap, lhs) and isinstance(rhs, ir.Expr)
            and rhs.op == 'inplace_binop'):
        # distributions of all 3 variables should meet (lhs, arg1, arg2)
        arg1 = rhs.lhs.name
        arg2 = rhs.rhs.name
        dist = self._meet_array_dists(arg1, arg2, array_dists)
        dist = self._meet_array_dists(arg1, lhs, array_dists, dist)
        # re-meet to push the final dist back onto arg2 as well
        self._meet_array_dists(arg1, arg2, array_dists, dist)
        return
    elif isinstance(rhs, ir.Expr) and rhs.op in ['getitem',
                                                 'static_getitem']:
        self._analyze_getitem(inst, lhs, rhs, array_dists)
        return
    elif isinstance(rhs, ir.Expr) and rhs.op == 'build_tuple':
        # parallel arrays can be packed and unpacked from tuples
        # e.g. boolean array index in test_getitem_multidim
        return
    elif (isinstance(rhs, ir.Expr) and rhs.op == 'getattr'
            and rhs.attr == 'T' and is_array(self.typemap, lhs)):
        # array and its transpose have same distributions
        arr = rhs.value.name
        self._meet_array_dists(lhs, arr, array_dists)
        # keep lhs in table for dot() handling
        self._T_arrs.add(lhs)
        return
    elif (isinstance(rhs, ir.Expr) and rhs.op == 'getattr'
            and isinstance(self.typemap[rhs.value.name], DataFrameType)
            and rhs.attr == 'to_csv'):
        # bound df.to_csv method itself has no distribution impact
        return
    elif (isinstance(rhs, ir.Expr) and rhs.op == 'getattr'
            and rhs.attr in [
                'shape', 'ndim', 'size', 'strides', 'dtype', 'itemsize',
                'astype', 'reshape', 'ctypes', 'transpose', 'tofile',
                'copy'
            ]):
        pass  # X.shape doesn't affect X distribution
    elif isinstance(rhs, ir.Expr) and rhs.op == 'call':
        self._analyze_call(lhs, rhs, rhs.func.name, rhs.args, array_dists)
    # handle for A in arr_container: ...
    # A = pair_first(iternext(getiter(arr_container)))
    # TODO: support getitem of container
    elif isinstance(rhs, ir.Expr) and rhs.op == 'pair_first' and is_array(
            self.typemap, lhs):
        arr_container = guard(_get_pair_first_container, self.func_ir, rhs)
        if arr_container is not None:
            self._meet_array_dists(lhs, arr_container.name, array_dists)
            return
    elif isinstance(rhs, ir.Expr) and rhs.op in ('getiter', 'iternext'):
        # analyze array container access in pair_first
        return
    elif isinstance(rhs, ir.Arg):
        # function argument: distribution comes from user metadata
        distributed_key = 'distributed'
        threaded_key = 'threaded'
        if distributed_key not in self.metadata.keys():
            self.metadata[distributed_key] = {}
        if threaded_key not in self.metadata.keys():
            self.metadata[threaded_key] = {}
        if rhs.name in self.metadata[distributed_key]:
            if lhs not in array_dists:
                array_dists[lhs] = Distribution.OneD
        elif rhs.name in self.metadata[threaded_key]:
            if lhs not in array_dists:
                array_dists[lhs] = Distribution.Thread
        else:
            # unannotated arguments are replicated
            dprint("replicated input ", rhs.name, lhs)
            self._set_REP([inst.target], array_dists)
    else:
        # unknown RHS: conservatively replicate everything involved
        self._set_REP(inst.list_vars(), array_dists)
    return