Beispiel #1
0
 def _analyze_call_set_REP(self, lhs, args, array_dists, fdef=None):
     """Set the distributable arguments and output of a call to REP.

     Each variable in ``args``, and the output name ``lhs``, is assigned
     ``Distribution.REP`` in ``array_dists`` when its type is an array, an
     array container or a ``DataFrameType``. ``fdef`` is only used in the
     debug messages.
     """
     typemap = self.typemap

     def _carries_distribution(name):
         # arrays, containers of arrays and dataframes are tracked
         return (is_array(typemap, name)
                 or is_array_container(typemap, name)
                 or isinstance(typemap[name], DataFrameType))

     for arg in args:
         arg_name = arg.name
         if not _carries_distribution(arg_name):
             continue
         dprint("dist setting call arg REP {} in {}".format(arg_name, fdef))
         array_dists[arg_name] = Distribution.REP

     if _carries_distribution(lhs):
         dprint("dist setting call out REP {} in {}".format(lhs, fdef))
         array_dists[lhs] = Distribution.REP
Beispiel #2
0
 def _set_REP(self, var_list, array_dists):
     """Set every distributable variable in ``var_list`` to REP.

     A variable is updated in ``array_dists`` when its type is an array, an
     array container, a ``SeriesType`` or a ``DataFrameType``. Variables
     defined by a ``build_tuple`` expression are followed recursively so
     that the items of the tuple are processed as well.
     """
     for item in var_list:
         name = item.name
         # SeriesType is handled here since it comes from Arg node and it
         # could have user-defined distribution
         is_distributable = (
             is_array(self.typemap, name)
             or is_array_container(self.typemap, name)
             or isinstance(self.typemap[name], (SeriesType, DataFrameType)))
         if is_distributable:
             dprint("dist setting REP {}".format(name))
             array_dists[name] = Distribution.REP

         # tuples of arrays: descend into the items of the tuple
         definition = guard(get_definition, self.func_ir, item)
         if not isinstance(definition, ir.Expr):
             continue
         if definition.op == 'build_tuple':
             self._set_REP(definition.items, array_dists)
Beispiel #3
0
    def _analyze_call(self, lhs, rhs, func_var, args, array_dists):
        """Analyze array distributions for a call expression.

        The callee is resolved with ``find_callname`` and compared against a
        sequence of known functions. Each recognized case either leaves
        ``array_dists`` untouched, gives the output a default distribution,
        or meets the distributions of the output and selected inputs. The
        checks are evaluated in order and the first match returns. Calls
        that match nothing get their arguments and output set to REP via
        ``_analyze_call_set_REP``.

        Parameters
        ----------
        lhs : str
            Name of the variable the call result is assigned to.
        rhs : ir.Expr
            The call expression; ``rhs.func`` and ``rhs.args`` are read.
        func_var :
            Not referenced in this method.
        args : list
            Call arguments (IR variables; their ``.name`` is used).
        array_dists : dict
            Mapping of variable name to ``Distribution``; updated in place.
        """
        func_name = ""
        func_mod = ""
        # guard() yields None when find_callname cannot resolve the callee
        fdef = guard(find_callname, self.func_ir, rhs, self.typemap)
        if fdef is None:
            # check ObjModeLiftedWith, we assume distribution doesn't change
            # blocks of data are passed in, TODO: document
            func_def = guard(get_definition, self.func_ir, rhs.func)
            if isinstance(func_def, ir.Const) and isinstance(
                    func_def.value, numba.dispatcher.ObjModeLiftedWith):
                return
            warnings.warn(
                "function call couldn't be found for distributed analysis")
            # unknown callee: fall back to REP (fdef is None here)
            self._analyze_call_set_REP(lhs, args, array_dists, fdef)
            return
        else:
            func_name, func_mod = fdef

        # allocation calls: output defaults to OneD unless already set
        if is_alloc_callname(func_name, func_mod):
            if lhs not in array_dists:
                array_dists[lhs] = Distribution.OneD
            return

        # numpy direct functions
        if isinstance(func_mod, str) and func_mod == 'numpy':
            self._analyze_call_np(lhs, func_name, args, array_dists)
            return

        # handle array.func calls
        if isinstance(func_mod, ir.Var) and is_array(self.typemap,
                                                     func_mod.name):
            self._analyze_call_array(lhs, func_mod, func_name, args,
                                     array_dists)
            return

        # handle df.func calls
        if isinstance(func_mod, ir.Var) and isinstance(
                self.typemap[func_mod.name], DataFrameType):
            self._analyze_call_df(lhs, func_mod, func_name, args, array_dists)
            return

        # sdc.distributed_api functions
        if isinstance(func_mod, str) and func_mod == 'sdc.distributed_api':
            self._analyze_call_hpat_dist(lhs, func_name, args, array_dists)
            return

        # len()
        if func_name == 'len' and func_mod in ('__builtin__', 'builtins'):
            return

        if fdef == ('quantile', 'sdc.hiframes.api'):
            # quantile doesn't affect input's distribution
            return

        if fdef == ('nunique', 'sdc.hiframes.api'):
            # nunique doesn't affect input's distribution
            return

        if fdef == ('unique', 'sdc.hiframes.api'):
            # doesn't affect distribution of input since input can stay 1D
            if lhs not in array_dists:
                array_dists[lhs] = Distribution.OneD_Var

            # output takes the lower-valued of its own and the input's
            # distribution; the input entry itself is not modified
            # NOTE(review): assumes the input already has an entry in
            # array_dists, otherwise this lookup raises KeyError - confirm
            new_dist = Distribution(
                min(array_dists[lhs].value,
                    array_dists[rhs.args[0].name].value))
            array_dists[lhs] = new_dist
            return

        if fdef == ('rolling_fixed', 'sdc.hiframes.rolling'):
            self._meet_array_dists(lhs, rhs.args[0].name, array_dists)
            return

        if fdef == ('rolling_variable', 'sdc.hiframes.rolling'):
            # lhs, in_arr, on_arr should have the same distribution
            new_dist = self._meet_array_dists(lhs, rhs.args[0].name,
                                              array_dists)
            new_dist = self._meet_array_dists(lhs, rhs.args[1].name,
                                              array_dists, new_dist)
            # write the final result back to the first input as well, since
            # it was met before the second input was taken into account
            array_dists[rhs.args[0].name] = new_dist
            return

        if fdef == ('shift', 'sdc.hiframes.rolling'):
            self._meet_array_dists(lhs, rhs.args[0].name, array_dists)
            return

        if fdef == ('pct_change', 'sdc.hiframes.rolling'):
            self._meet_array_dists(lhs, rhs.args[0].name, array_dists)
            return

        if fdef == ('nlargest', 'sdc.hiframes.api'):
            # output of nlargest is REP
            array_dists[lhs] = Distribution.REP
            return

        if fdef == ('median', 'sdc.hiframes.api'):
            return

        if fdef == ('concat', 'sdc.hiframes.api'):
            # hiframes concat is similar to np.concatenate
            self._analyze_call_np_concatenate(lhs, args, array_dists)
            return

        if fdef == ('isna', 'sdc.hiframes.api'):
            return

        if fdef == ('get_series_name', 'sdc.hiframes.api'):
            return

        # dummy hiframes functions
        if func_mod == 'sdc.hiframes.api' and func_name in (
                'get_series_data', 'get_series_index', 'to_arr_from_series',
                'ts_series_to_arr_typ', 'to_date_series_type',
                'dummy_unbox_series', 'parallel_fix_df_array'):
            # TODO: support Series type similar to Array
            self._meet_array_dists(lhs, rhs.args[0].name, array_dists)
            return

        if fdef == ('init_series', 'sdc.hiframes.api'):
            # lhs, in_arr, and index should have the same distribution
            new_dist = self._meet_array_dists(lhs, rhs.args[0].name,
                                              array_dists)
            # the index is the optional second argument; skipped when its
            # type is none
            if len(rhs.args) > 1 and self.typemap[
                    rhs.args[1].name] != types.none:
                new_dist = self._meet_array_dists(lhs, rhs.args[1].name,
                                                  array_dists, new_dist)
                array_dists[rhs.args[0].name] = new_dist
            return

        if fdef == ('init_dataframe', 'sdc.hiframes.pd_dataframe_ext'):
            # lhs, data arrays, and index should have the same distribution
            df_typ = self.typemap[lhs]
            n_cols = len(df_typ.columns)
            # the first n_cols arguments are read as the column data arrays
            # NOTE(review): if n_cols is 0, new_dist is never bound and the
            # uses below raise NameError - confirm whether a dataframe
            # with no columns can reach this point
            for i in range(n_cols):
                new_dist = self._meet_array_dists(lhs, rhs.args[i].name,
                                                  array_dists)
            # handle index
            if len(rhs.args) > n_cols and self.typemap[
                    rhs.args[n_cols].name] != types.none:
                new_dist = self._meet_array_dists(lhs, rhs.args[n_cols].name,
                                                  array_dists, new_dist)
            # propagate the final distribution back to every data array
            for i in range(n_cols):
                array_dists[rhs.args[i].name] = new_dist
            return

        if fdef == ('get_dataframe_data', 'sdc.hiframes.pd_dataframe_ext'):
            self._meet_array_dists(lhs, rhs.args[0].name, array_dists)
            return

        if fdef == ('compute_split_view', 'sdc.hiframes.split_impl'):
            self._meet_array_dists(lhs, rhs.args[0].name, array_dists)
            return

        if fdef == ('get_split_view_index', 'sdc.hiframes.split_impl'):
            # just used in str.get() implementation for now so we know it is
            # parallel
            # TODO: handle index similar to getitem to support more cases
            return

        if fdef == ('get_split_view_data_ptr', 'sdc.hiframes.split_impl'):
            return

        if fdef == ('setitem_str_arr_ptr', 'sdc.str_arr_ext'):
            return

        if fdef == ('num_total_chars', 'sdc.str_arr_ext'):
            return

        if fdef == ('_series_dropna_str_alloc_impl_inner',
                    'sdc.hiframes.series_kernels'):
            # output defaults to OneD_Var and is then lowered to the
            # lower-valued of the output and input distributions
            if lhs not in array_dists:
                array_dists[lhs] = Distribution.OneD_Var
            in_dist = array_dists[rhs.args[0].name]
            out_dist = array_dists[lhs]
            out_dist = Distribution(min(out_dist.value, in_dist.value))
            array_dists[lhs] = out_dist
            # output can cause input REP
            if out_dist != Distribution.OneD_Var:
                array_dists[rhs.args[0].name] = out_dist
            return

        if (fdef == ('copy_non_null_offsets', 'sdc.str_arr_ext')
                or fdef == ('copy_data', 'sdc.str_arr_ext')):
            # the output array is passed as the first argument here, so the
            # call's lhs is not involved
            out_arrname = rhs.args[0].name
            in_arrname = rhs.args[1].name
            self._meet_array_dists(out_arrname, in_arrname, array_dists)
            return

        if fdef == ('str_arr_item_to_numeric', 'sdc.str_arr_ext'):
            # output array is argument 0, input array is argument 2
            out_arrname = rhs.args[0].name
            in_arrname = rhs.args[2].name
            self._meet_array_dists(out_arrname, in_arrname, array_dists)
            return

        # np.fromfile()
        if fdef == ('file_read', 'sdc.io.np_io'):
            return

        if sdc.config._has_pyarrow and fdef == ('read_parquet',
                                                'sdc.io.parquet_pio'):
            return

        if sdc.config._has_pyarrow and fdef == ('read_parquet_str',
                                                'sdc.io.parquet_pio'):
            # string read creates array in output
            if lhs not in array_dists:
                array_dists[lhs] = Distribution.OneD
            return

        # TODO: make sure assert_equiv is not generated unnecessarily
        # TODO: fix assert_equiv for np.stack from df.value
        if fdef == ('assert_equiv', 'numba.array_analysis'):
            return

        # we perform call-analysis from external at the end
        if isinstance(func_mod, ir.Var):
            # handlers are looked up by (type of the receiver, method name)
            ky = (self.typemap[func_mod.name], func_name)
            if ky in DistributedAnalysis._extra_call:
                # a truthy result means the handler took care of the call
                if DistributedAnalysis._extra_call[ky](lhs, func_mod, *ky,
                                                       args, array_dists):
                    return

        # set REP if not found
        self._analyze_call_set_REP(lhs, args, array_dists, fdef)
Beispiel #4
0
    def _analyze_call_np(self, lhs, func_name, args, array_dists):
        """Analyze distributions of numpy functions (np.func_name).

        Parameters
        ----------
        lhs : str
            Name of the variable receiving the call result.
        func_name : str
            Name of the numpy function being called.
        args : list
            Positional call arguments (IR variables).
        array_dists : dict
            Mapping of variable name to ``Distribution``; updated in place.

        Recognized functions meet the distribution of the output with that
        of their array inputs. Anything else, including recognized functions
        whose arguments cannot be analyzed, falls back to setting the
        arguments and output to REP.
        """
        # functions whose output has the same distribution as the first arg
        if func_name in ('ascontiguousarray', 'ravel', 'cumsum', 'cumprod',
                         'empty_like', 'zeros_like', 'ones_like',
                         'full_like', 'copy'):
            self._meet_array_dists(lhs, args[0].name, array_dists)
            return

        if func_name == 'concatenate':
            self._analyze_call_np_concatenate(lhs, args, array_dists)
            return

        # np.array(arr) of an existing array behaves like a copy
        if func_name == 'array' and is_array(self.typemap, args[0].name):
            self._meet_array_dists(lhs, args[0].name, array_dists)
            return

        # sum over the first axis is distributed, A.sum(0)
        if func_name == 'sum' and len(args) == 2:
            axis_def = guard(get_definition, self.func_ir, args[1])
            if isinstance(axis_def, ir.Const) and axis_def.value == 0:
                # only the output is set to REP; the input is left as is
                array_dists[lhs] = Distribution.REP
                return

        if func_name == 'dot':
            self._analyze_call_np_dot(lhs, args, array_dists)
            return

        # used in df.values
        if func_name == 'stack':
            seq_info = guard(find_build_sequence, self.func_ir, args[0])
            if seq_info is None:
                self._analyze_call_set_REP(lhs, args, array_dists,
                                           'np.' + func_name)
                return
            in_arrs, _ = seq_info

            axis = 0
            # TODO: support kws
            # if 'axis' in kws:
            #     axis = find_const(self.func_ir, kws['axis'])
            if len(args) > 1:
                # find_const raises GuardException for a non-constant axis;
                # guard() turns that into None so the call falls through to
                # the REP fallback below instead of aborting the analysis
                axis = guard(find_const, self.func_ir, args[1])

            # parallel if args are 1D and output is 2D and axis == 1
            if axis is not None and axis == 1 and self.typemap[lhs].ndim == 2:
                for v in in_arrs:
                    self._meet_array_dists(lhs, v.name, array_dists)
                return

        # set REP if not found
        self._analyze_call_set_REP(lhs, args, array_dists, 'np.' + func_name)
Beispiel #5
0
    def _analyze_assign(self, inst, array_dists, parfor_dists):
        """Update ``array_dists`` for a single assignment instruction.

        The right-hand side of ``inst`` is classified by IR node kind
        (variable, expression, function argument) and the distributions of
        the variables involved are met, defaulted or set to REP accordingly.
        Assignments that match no known pattern set all variables of the
        instruction to REP. ``parfor_dists`` is not used in this method.
        """
        target_name = inst.target.name
        value = inst.value
        # treat return casts like assignments
        if isinstance(value, ir.Expr) and value.op == 'cast':
            value = value.value

        typemap = self.typemap
        target_is_array = is_array(typemap, target_name)

        # plain variable copy of a distributable value
        if isinstance(value, ir.Var) and (
                target_is_array
                or isinstance(typemap[target_name],
                              (SeriesType, DataFrameType))
                or is_array_container(typemap, target_name)):
            self._meet_array_dists(target_name, value.name, array_dists)
            return

        if isinstance(value, ir.Expr):
            op = value.op

            if op == 'inplace_binop' and target_is_array:
                # distributions of all 3 variables should meet
                # (lhs, arg1, arg2)
                first = value.lhs.name
                second = value.rhs.name
                met = self._meet_array_dists(first, second, array_dists)
                met = self._meet_array_dists(first, target_name, array_dists,
                                             met)
                self._meet_array_dists(first, second, array_dists, met)
                return

            if op in ('getitem', 'static_getitem'):
                self._analyze_getitem(inst, target_name, value, array_dists)
                return

            if op == 'build_tuple':
                # parallel arrays can be packed and unpacked from tuples
                # e.g. boolean array index in test_getitem_multidim
                return

            if op == 'getattr':
                attr = value.attr
                if attr == 'T' and target_is_array:
                    # array and its transpose have same distributions
                    self._meet_array_dists(target_name, value.value.name,
                                           array_dists)
                    # keep lhs in table for dot() handling
                    self._T_arrs.add(target_name)
                    return
                if (isinstance(typemap[value.value.name], DataFrameType)
                        and attr == 'to_csv'):
                    return
                if attr in ('shape', 'ndim', 'size', 'strides', 'dtype',
                            'itemsize', 'astype', 'reshape', 'ctypes',
                            'transpose', 'tofile', 'copy'):
                    # X.shape doesn't affect X distribution
                    return
                # any other attribute falls through to the REP fallback
            elif op == 'call':
                self._analyze_call(target_name, value, value.func.name,
                                   value.args, array_dists)
                return
            elif op == 'pair_first' and target_is_array:
                # handle for A in arr_container: ...
                # A = pair_first(iternext(getiter(arr_container)))
                # TODO: support getitem of container
                container = guard(_get_pair_first_container, self.func_ir,
                                  value)
                if container is not None:
                    self._meet_array_dists(target_name, container.name,
                                           array_dists)
                # nothing is changed when the container cannot be found
                return
            elif op in ('getiter', 'iternext'):
                # analyze array container access in pair_first
                return

        elif isinstance(value, ir.Arg):
            arg_name = value.name
            if arg_name in self.metadata['distributed']:
                if target_name not in array_dists:
                    array_dists[target_name] = Distribution.OneD
            elif arg_name in self.metadata['threaded']:
                if target_name not in array_dists:
                    array_dists[target_name] = Distribution.Thread
            else:
                dprint("replicated input ", arg_name, target_name)
                self._set_REP([inst.target], array_dists)
            return

        # unrecognized assignment: set every variable involved to REP
        self._set_REP(inst.list_vars(), array_dists)