Example #1: _build_index_cache
    def _build_index_cache(self):
        assert isinstance(self._op, FrameNode)

        if self._partitions.size == 0:
            self._index_cache = Index.__new__(Index)
        else:
            assert self._partitions.size == 1
            obj = self._partitions[0][0].get()
            if isinstance(obj, (pd.DataFrame, pd.Series)):
                self._index_cache = obj.index
            else:
                assert isinstance(obj, pyarrow.Table)
                if self._index_cols is None:
                    self._index_cache = Index.__new__(RangeIndex,
                                                      data=range(obj.num_rows))
                else:
                    index_at = obj.drop([f"F_{col}" for col in self.columns])
                    index_df = index_at.to_pandas()
                    index_df.set_index(
                        [f"F_{col}" for col in self._index_cols], inplace=True)
                    index_df.index.rename(self._index_names(self._index_cols),
                                          inplace=True)
                    self._index_cache = index_df.index
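The method above materializes the frame's index from its single partition. When the partition holds a pyarrow.Table, the index columns live under mangled "F_<name>" labels and have to be converted back. A minimal standalone sketch of that conversion pattern (not Modin's code; column names are assumed for illustration):

import pandas as pd
import pyarrow

# Build an Arrow table whose columns use the mangled "F_<name>" scheme.
table = pyarrow.Table.from_pandas(
    pd.DataFrame({"F_key": [1, 2, 3], "F_val": [10, 20, 30]}),
    preserve_index=False,
)
index_cols = ["key"]   # logical index column names (assumed)
data_cols = ["val"]    # logical data column names (assumed)

# Drop the data columns, then rebuild a pandas index from what remains.
index_at = table.drop([f"F_{col}" for col in data_cols])
index_df = index_at.to_pandas()
index_df.set_index([f"F_{col}" for col in index_cols], inplace=True)
index_df.index.rename(index_cols, inplace=True)   # demangle the names
print(index_df.index)   # an index named 'key' with values [1, 2, 3]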
Example #2: join
    def join(self,
             other,
             how="inner",
             on=None,
             sort=False,
             suffixes=("_x", "_y")):
        assert (
            on is not None
        ), "Merge with unspecified 'on' parameter is not supported in the engine"

        for col in on:
            assert (
                col in self.columns and col in other.columns
            ), "Only cases when both frames contain key column are supported"

        new_columns = []
        new_dtypes = []

        conflicting_cols = set(self.columns) & set(other.columns) - set(on)
        for c in self.columns:
            suffix = suffixes[0] if c in conflicting_cols else ""
            new_columns.append(c + suffix)
            new_dtypes.append(self._dtypes[c])
        for c in other.columns:
            if c not in on:
                suffix = suffixes[1] if c in conflicting_cols else ""
                new_columns.append(c + suffix)
                new_dtypes.append(other._dtypes[c])

        op = JoinNode(
            self,
            other,
            how=how,
            on=on,
            sort=sort,
            suffixes=suffixes,
        )

        new_columns = Index.__new__(Index,
                                    data=new_columns,
                                    dtype=self.columns.dtype)
        return self.__constructor__(
            dtypes=new_dtypes,
            columns=new_columns,
            op=op,
            force_execution_mode=self._force_execution_mode,
        )
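The join above only computes the result's metadata (columns and dtypes) and defers execution to a lazy JoinNode. A standalone sketch (assumed helper, not part of Modin) of how the resulting column list is derived, with suffixes applied only to shared non-key columns:

def joined_columns(left_cols, right_cols, on, suffixes=("_x", "_y")):
    # Columns present in both frames, excluding the join keys, conflict
    # and receive a suffix; join keys appear only once.
    conflicting = set(left_cols) & set(right_cols) - set(on)
    result = [c + suffixes[0] if c in conflicting else c for c in left_cols]
    result += [c + suffixes[1] if c in conflicting else c
               for c in right_cols if c not in on]
    return result

print(joined_columns(["id", "a", "b"], ["id", "b", "c"], on=["id"]))
# ['id', 'a', 'b_x', 'b_y', 'c']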
Example #3: _concat
    def _concat(self,
                axis,
                other_modin_frames,
                join="outer",
                sort=False,
                ignore_index=False):
        if not other_modin_frames:
            return self

        if axis == 0:
            return self._union_all(axis, other_modin_frames, join, sort,
                                   ignore_index)

        base = self
        for frame in other_modin_frames:
            base = base._find_common_projections_base(frame)
            if base is None:
                raise NotImplementedError(
                    "concat requiring join is not supported yet")

        exprs = self._index_exprs()
        new_columns = self.columns.tolist()
        for col in self.columns:
            exprs[col] = self.ref(col)
        for frame in other_modin_frames:
            for col in frame.columns:
                if col == "" or col in exprs:
                    new_col = f"__col{len(exprs)}__"
                else:
                    new_col = col
                exprs[new_col] = frame.ref(col)
                new_columns.append(new_col)

        exprs = translate_exprs_to_base(exprs, base)
        new_columns = Index.__new__(Index,
                                    data=new_columns,
                                    dtype=self.columns.dtype)
        new_frame = self.__constructor__(
            columns=new_columns,
            dtypes=self._dtypes_for_exprs(exprs),
            op=TransformNode(base, exprs),
            index_cols=self._index_cols,
            force_execution_mode=self._force_execution_mode,
        )
        return new_frame
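For axis=1 concatenation the frames must share a common projections base; the result is then expressed as a single TransformNode over that base. Empty and duplicate column names coming from the other frames are replaced with generated "__col<N>__" labels. A simplified standalone sketch of that renaming rule (assumed helper; the real method also seeds exprs with index expressions, so the generated numbers can differ):

from collections import OrderedDict

def concat_columns(*frames_columns):
    exprs = OrderedDict()
    result = []
    for frame_cols in frames_columns:
        for col in frame_cols:
            # Rename empty or already-used names to a generated label.
            new_col = f"__col{len(exprs)}__" if col == "" or col in exprs else col
            exprs[new_col] = None  # placeholder for the column expression
            result.append(new_col)
    return result

print(concat_columns(["a", "b"], ["b", ""]))
# ['a', 'b', '__col2__', '__col3__']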
Example #4: reset_index
    def reset_index(self, drop):
        if drop:
            exprs = OrderedDict()
            for c in self.columns:
                exprs[c] = self.ref(c)
            return self.__constructor__(
                columns=self.columns,
                dtypes=self._dtypes_for_exprs(exprs),
                op=TransformNode(self, exprs),
                index_cols=None,
                force_execution_mode=self._force_execution_mode,
            )
        else:
            if self._index_cols is None:
                raise NotImplementedError(
                    "default index reset with no drop is not supported")
            # Need to demangle index names.
            exprs = OrderedDict()
            for i, c in enumerate(self._index_cols):
                name = self._index_name(c)
                if name is None:
                    name = f"level_{i}"
                if name in exprs:
                    raise ValueError(f"cannot insert {name}, already exists")
                exprs[name] = self.ref(c)
            for c in self.columns:
                if c in exprs:
                    raise ValueError(f"cannot insert {c}, already exists")
                exprs[c] = self.ref(c)
            new_columns = Index.__new__(Index, data=exprs.keys(), dtype="O")
            return self.__constructor__(
                columns=new_columns,
                dtypes=self._dtypes_for_exprs(exprs),
                op=TransformNode(self, exprs),
                index_cols=None,
                force_execution_mode=self._force_execution_mode,
            )
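The non-drop branch reproduces pandas' reset_index naming rules: index levels become leading columns, unnamed levels are called "level_<i>", and a clash with an existing column raises. A plain-pandas illustration of those semantics (example data assumed):

import pandas as pd

df = pd.DataFrame(
    {"a": [1, 2]},
    index=pd.MultiIndex.from_tuples([(0, "x"), (1, "y")], names=["key", None]),
)
# The named level keeps its name, the unnamed one becomes "level_1".
print(df.reset_index().columns.tolist())
# ['key', 'level_1', 'a']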
Example #5: groupby_agg
    def groupby_agg(self, by, axis, agg, groupby_args, **kwargs):
        # Currently we only expect 'by' to be a projection of the same frame.
        # If 'by' holds a list of columns/series, then we create such projection
        # to re-use code.
        if not isinstance(by, DFAlgQueryCompiler):
            if is_list_like(by):
                by_cols = []
                by_frames = []
                for obj in by:
                    if isinstance(obj, str):
                        by_cols.append(obj)
                    elif hasattr(obj, "_query_compiler"):
                        by_frames.append(obj._query_compiler._modin_frame)
                    else:
                        raise NotImplementedError("unsupported groupby args")
                by_cols = Index.__new__(Index,
                                        data=by_cols,
                                        dtype=self.columns.dtype)
                by_frame = self.mask(col_indices=by_cols)
                if by_frames:
                    by_frame = by_frame._concat(axis=1,
                                                other_modin_frames=by_frames,
                                                ignore_index=True)
            else:
                raise NotImplementedError("unsupported groupby args")
        else:
            by_frame = by._modin_frame

        if axis != 0:
            raise NotImplementedError("groupby is supported for axis = 0 only")

        base = by_frame._find_common_projections_base(self)
        if base is None:
            raise NotImplementedError("unsupported groupby args")

        if groupby_args["level"] is not None:
            raise NotImplementedError("levels are not supported for groupby")

        groupby_cols = by_frame.columns.tolist()
        agg_cols = [col for col in self.columns if col not in by_frame.columns]

        # Create new base where all required columns are computed. We don't allow
        # complex expressions to be a group key or an aggregate operand.
        assert isinstance(by_frame._op, TransformNode), "unexpected by_frame"
        exprs = OrderedDict(((col, by_frame.ref(col)) for col in groupby_cols))
        exprs.update(((col, self.ref(col)) for col in agg_cols))
        exprs = translate_exprs_to_base(exprs, base)
        base_cols = Index.__new__(Index,
                                  data=list(exprs.keys()),
                                  dtype=self.columns.dtype)
        base = self.__constructor__(
            columns=base_cols,
            dtypes=self._dtypes_for_exprs(exprs),
            op=TransformNode(base, exprs, fold=True),
            index_cols=None,
            force_execution_mode=self._force_execution_mode,
        )

        new_columns = []
        index_cols = None

        if groupby_args["as_index"]:
            index_cols = groupby_cols.copy()
        else:
            new_columns = groupby_cols.copy()

        new_dtypes = by_frame._dtypes[groupby_cols].tolist()

        agg_exprs = OrderedDict()
        if isinstance(agg, str):
            for col in agg_cols:
                agg_exprs[col] = AggregateExpr(agg, base.ref(col))
        else:
            assert isinstance(agg, dict), "unsupported aggregate type"
            multiindex = any(isinstance(v, list) for v in agg.values())
            for k, v in agg.items():
                if isinstance(v, list):
                    for item in v:
                        agg_exprs[(k, item)] = AggregateExpr(item, base.ref(k))
                else:
                    col_name = (k, v) if multiindex else k
                    agg_exprs[col_name] = AggregateExpr(v, base.ref(k))
        new_columns.extend(agg_exprs.keys())
        new_dtypes.extend((x._dtype for x in agg_exprs.values()))
        new_columns = Index.__new__(Index,
                                    data=new_columns,
                                    dtype=self.columns.dtype)

        new_op = GroupbyAggNode(base, groupby_cols, agg_exprs, groupby_args)
        new_frame = self.__constructor__(
            columns=new_columns,
            dtypes=new_dtypes,
            op=new_op,
            index_cols=index_cols,
            force_execution_mode=self._force_execution_mode,
        )

        return new_frame
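The shape of the aggregation result depends on the 'agg' argument: a plain string aggregates every non-key column under its original name, while a dict that maps any column to a list of functions produces (column, func) tuples for a MultiIndex. A standalone sketch of that naming rule (assumed helper, not Modin's API):

def agg_result_columns(agg, agg_cols):
    # A single function name keeps the original column labels.
    if isinstance(agg, str):
        return list(agg_cols)
    # With a dict, any list value switches the result to MultiIndex labels.
    multiindex = any(isinstance(v, list) for v in agg.values())
    names = []
    for k, v in agg.items():
        if isinstance(v, list):
            names.extend((k, item) for item in v)
        else:
            names.append((k, v) if multiindex else k)
    return names

print(agg_result_columns({"a": ["sum", "max"], "b": "min"}, ["a", "b"]))
# [('a', 'sum'), ('a', 'max'), ('b', 'min')]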