def _build_index_cache(self):
    """Materialize the index of an already executed frame into ``_index_cache``."""
    assert isinstance(self._op, FrameNode)

    if self._partitions.size == 0:
        self._index_cache = Index.__new__(Index)
    else:
        assert self._partitions.size == 1
        obj = self._partitions[0][0].get()
        if isinstance(obj, (pd.DataFrame, pd.Series)):
            self._index_cache = obj.index
        else:
            assert isinstance(obj, pyarrow.Table)
            if self._index_cols is None:
                self._index_cache = Index.__new__(
                    RangeIndex, data=range(obj.num_rows)
                )
            else:
                # Arrow tables store columns under mangled "F_<name>" labels.
                # Drop the data columns and rebuild the index from the rest.
                index_at = obj.drop([f"F_{col}" for col in self.columns])
                index_df = index_at.to_pandas()
                index_df.set_index(
                    [f"F_{col}" for col in self._index_cols], inplace=True
                )
                index_df.index.rename(
                    self._index_names(self._index_cols), inplace=True
                )
                self._index_cache = index_df.index
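# Illustrative sketch, not part of the frame class: the pyarrow branch above
# rebuilds the index by dropping the mangled "F_<name>" data columns, converting
# the remainder to pandas, and promoting the index columns. The table, column
# names, and values below are invented for illustration only.
def _example_rebuild_index_from_arrow():
    import pandas as pd
    import pyarrow

    table = pyarrow.Table.from_pandas(
        pd.DataFrame({"F_key": [1, 2], "F_a": [10, 20], "F_b": [30, 40]}),
        preserve_index=False,
    )
    index_cols = ["key"]  # unmangled index column names (assumed)
    data_cols = ["a", "b"]  # unmangled data column names (assumed)

    index_df = table.drop([f"F_{col}" for col in data_cols]).to_pandas()
    index_df.set_index([f"F_{col}" for col in index_cols], inplace=True)
    index_df.index = index_df.index.rename(index_cols)
    assert list(index_df.index) == [1, 2]
    return index_df.index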
def join(self, other, how="inner", on=None, sort=False, suffixes=("_x", "_y")):
    """Join two frames on the given key columns."""
    assert (
        on is not None
    ), "Merge with unspecified 'on' parameter is not supported in the engine"

    for col in on:
        assert (
            col in self.columns and col in other.columns
        ), "Only cases when both frames contain key column are supported"

    new_columns = []
    new_dtypes = []
    # Columns present in both frames, other than the join keys, get a suffix.
    conflicting_cols = set(self.columns) & (set(other.columns) - set(on))
    for c in self.columns:
        suffix = suffixes[0] if c in conflicting_cols else ""
        new_columns.append(c + suffix)
        new_dtypes.append(self._dtypes[c])
    for c in other.columns:
        if c not in on:
            suffix = suffixes[1] if c in conflicting_cols else ""
            new_columns.append(c + suffix)
            new_dtypes.append(other._dtypes[c])

    op = JoinNode(
        self,
        other,
        how=how,
        on=on,
        sort=sort,
        suffixes=suffixes,
    )

    new_columns = Index.__new__(Index, data=new_columns, dtype=self.columns.dtype)
    return self.__constructor__(
        dtypes=new_dtypes,
        columns=new_columns,
        op=op,
        force_execution_mode=self._force_execution_mode,
    )
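# Illustrative sketch of the column naming rule above, using invented column
# names: only columns that occur in both frames and are not join keys receive a
# suffix; key columns and unique columns keep their names.
def _example_join_column_names():
    left_cols, right_cols, on = ["id", "a", "b"], ["id", "b", "c"], ["id"]
    suffixes = ("_x", "_y")

    conflicting = set(left_cols) & (set(right_cols) - set(on))
    new_columns = [c + (suffixes[0] if c in conflicting else "") for c in left_cols]
    new_columns += [
        c + (suffixes[1] if c in conflicting else "")
        for c in right_cols
        if c not in on
    ]
    assert new_columns == ["id", "a", "b_x", "b_y", "c"]
    return new_columns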
def _concat(
    self, axis, other_modin_frames, join="outer", sort=False, ignore_index=False
):
    """Concatenate frames along the given axis."""
    if not other_modin_frames:
        return self

    if axis == 0:
        return self._union_all(axis, other_modin_frames, join, sort, ignore_index)

    # axis=1: all frames must be projections of a common base frame.
    base = self
    for frame in other_modin_frames:
        base = base._find_common_projections_base(frame)
        if base is None:
            raise NotImplementedError("concat requiring join is not supported yet")

    exprs = self._index_exprs()
    new_columns = self.columns.tolist()
    for col in self.columns:
        exprs[col] = self.ref(col)
    for frame in other_modin_frames:
        for col in frame.columns:
            if col == "" or col in exprs:
                new_col = f"__col{len(exprs)}__"
            else:
                new_col = col
            exprs[new_col] = frame.ref(col)
            new_columns.append(new_col)

    exprs = translate_exprs_to_base(exprs, base)
    new_columns = Index.__new__(Index, data=new_columns, dtype=self.columns.dtype)
    new_frame = self.__constructor__(
        columns=new_columns,
        dtypes=self._dtypes_for_exprs(exprs),
        op=TransformNode(base, exprs),
        index_cols=self._index_cols,
        force_execution_mode=self._force_execution_mode,
    )
    return new_frame
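# Illustrative sketch of the axis=1 column renaming above, with invented names:
# a column coming from another frame keeps its own name unless it is empty or
# already taken, in which case it becomes "__col{n}__" where n is the current
# number of collected expressions (index expressions count too).
def _example_concat_column_names():
    exprs = {"__index__": "...", "a": "...", "b": "..."}  # index expr + self columns
    new_columns = ["a", "b"]
    for col in ["b", "c", ""]:  # columns of the other frame
        new_col = f"__col{len(exprs)}__" if col == "" or col in exprs else col
        exprs[new_col] = "..."
        new_columns.append(new_col)
    assert new_columns == ["a", "b", "__col3__", "c", "__col5__"]
    return new_columns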
def reset_index(self, drop):
    """Drop the index columns or turn them into regular data columns."""
    if drop:
        exprs = OrderedDict()
        for c in self.columns:
            exprs[c] = self.ref(c)
        return self.__constructor__(
            columns=self.columns,
            dtypes=self._dtypes_for_exprs(exprs),
            op=TransformNode(self, exprs),
            index_cols=None,
            force_execution_mode=self._force_execution_mode,
        )
    else:
        if self._index_cols is None:
            raise NotImplementedError(
                "default index reset with no drop is not supported"
            )
        # Need to demangle index names.
        exprs = OrderedDict()
        for i, c in enumerate(self._index_cols):
            name = self._index_name(c)
            if name is None:
                name = f"level_{i}"
            if name in exprs:
                raise ValueError(f"cannot insert {name}, already exists")
            exprs[name] = self.ref(c)
        for c in self.columns:
            if c in exprs:
                raise ValueError(f"cannot insert {c}, already exists")
            exprs[c] = self.ref(c)
        new_columns = Index.__new__(Index, data=exprs.keys(), dtype="O")
        return self.__constructor__(
            columns=new_columns,
            dtypes=self._dtypes_for_exprs(exprs),
            op=TransformNode(self, exprs),
            index_cols=None,
            force_execution_mode=self._force_execution_mode,
        )
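# Illustrative sketch of the name demangling in the no-drop branch above, with
# invented names: unnamed index levels fall back to "level_{i}", mirroring the
# naming pandas itself uses in reset_index.
def _example_reset_index_names():
    demangled = [None, "key"]  # demangled index level names, None if unnamed
    new_names = [
        name if name is not None else f"level_{i}"
        for i, name in enumerate(demangled)
    ]
    assert new_names == ["level_0", "key"]
    return new_names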
def groupby_agg(self, by, axis, agg, groupby_args, **kwargs):
    """Group the frame by the given columns and apply the aggregates."""
    # Currently we only expect 'by' to be a projection of the same frame.
    # If 'by' holds a list of columns/series, then we create such a projection
    # to re-use code.
    if not isinstance(by, DFAlgQueryCompiler):
        if is_list_like(by):
            by_cols = []
            by_frames = []
            for obj in by:
                if isinstance(obj, str):
                    by_cols.append(obj)
                elif hasattr(obj, "_query_compiler"):
                    by_frames.append(obj._query_compiler._modin_frame)
                else:
                    raise NotImplementedError("unsupported groupby args")
            by_cols = Index.__new__(Index, data=by_cols, dtype=self.columns.dtype)
            by_frame = self.mask(col_indices=by_cols)
            if by_frames:
                by_frame = by_frame._concat(
                    axis=1, other_modin_frames=by_frames, ignore_index=True
                )
        else:
            raise NotImplementedError("unsupported groupby args")
    else:
        by_frame = by._modin_frame

    if axis != 0:
        raise NotImplementedError("groupby is supported for axis = 0 only")

    base = by_frame._find_common_projections_base(self)
    if base is None:
        raise NotImplementedError("unsupported groupby args")

    if groupby_args["level"] is not None:
        raise NotImplementedError("levels are not supported for groupby")

    groupby_cols = by_frame.columns.tolist()
    agg_cols = [col for col in self.columns if col not in by_frame.columns]

    # Create new base where all required columns are computed. We don't allow
    # complex expressions to be a group key or an aggregate operand.
    assert isinstance(by_frame._op, TransformNode), "unexpected by_frame"
    exprs = OrderedDict(((col, by_frame.ref(col)) for col in groupby_cols))
    exprs.update(((col, self.ref(col)) for col in agg_cols))
    exprs = translate_exprs_to_base(exprs, base)
    base_cols = Index.__new__(
        Index, data=list(exprs.keys()), dtype=self.columns.dtype
    )
    base = self.__constructor__(
        columns=base_cols,
        dtypes=self._dtypes_for_exprs(exprs),
        op=TransformNode(base, exprs, fold=True),
        index_cols=None,
        force_execution_mode=self._force_execution_mode,
    )

    new_columns = []
    index_cols = None

    if groupby_args["as_index"]:
        index_cols = groupby_cols.copy()
    else:
        new_columns = groupby_cols.copy()

    new_dtypes = by_frame._dtypes[groupby_cols].tolist()

    agg_exprs = OrderedDict()
    if isinstance(agg, str):
        for col in agg_cols:
            agg_exprs[col] = AggregateExpr(agg, base.ref(col))
    else:
        assert isinstance(agg, dict), "unsupported aggregate type"
        multiindex = any(isinstance(v, list) for v in agg.values())
        for k, v in agg.items():
            if isinstance(v, list):
                for item in v:
                    agg_exprs[(k, item)] = AggregateExpr(item, base.ref(k))
            else:
                col_name = (k, v) if multiindex else k
                agg_exprs[col_name] = AggregateExpr(v, base.ref(k))
    new_columns.extend(agg_exprs.keys())
    new_dtypes.extend((x._dtype for x in agg_exprs.values()))
    new_columns = Index.__new__(Index, data=new_columns, dtype=self.columns.dtype)

    new_op = GroupbyAggNode(base, groupby_cols, agg_exprs, groupby_args)
    new_frame = self.__constructor__(
        columns=new_columns,
        dtypes=new_dtypes,
        op=new_op,
        index_cols=index_cols,
        force_execution_mode=self._force_execution_mode,
    )

    return new_frame
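# Illustrative sketch of the dict-aggregate naming above, with an invented agg
# spec: if any value in the dict is a list, every result column becomes a
# (column, func) tuple (later materialized as a MultiIndex); otherwise the
# original column names are kept.
def _example_groupby_agg_columns():
    agg = {"a": ["sum", "max"], "b": "min"}
    multiindex = any(isinstance(v, list) for v in agg.values())

    result_cols = []
    for col, func in agg.items():
        for item in func if isinstance(func, list) else [func]:
            result_cols.append((col, item) if multiindex else col)
    assert result_cols == [("a", "sum"), ("a", "max"), ("b", "min")]
    return result_cols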