def union(
    self,
    df1: DataFrame,
    df2: DataFrame,
    distinct: bool = True,
    metadata: Any = None,
) -> DataFrame:
    """Concatenate ``df1`` and ``df2`` row-wise; the schemas must be equal.

    :param distinct: True gives SQL ``UNION`` (duplicates removed),
        False gives ``UNION ALL``
    :param metadata: metadata attached to the result dataframe
    :raises ValueError: if the two schemas differ
    """
    assert_or_throw(
        df1.schema == df2.schema,
        ValueError(f"{df1.schema} != {df2.schema}"),
    )
    combined = self.pl_utils.union(
        df1.as_pandas(), df2.as_pandas(), unique=distinct
    )
    return PandasDataFrame(
        combined.reset_index(drop=True), df1.schema, metadata
    )
# Example #2
 def join(
     self,
     df1: DataFrame,
     df2: DataFrame,
     how: str,
     on: List[str] = _DEFAULT_JOIN_KEYS,
     metadata: Any = None,
 ) -> DataFrame:
     """Join ``df1`` with ``df2`` using join type ``how`` on columns ``on``.

     The join key and output schemas are derived by ``get_join_schemas``;
     the actual join is delegated to ``self.pl_utils``.
     """
     key_schema, output_schema = get_join_schemas(df1, df2, how=how, on=on)
     joined = self.pl_utils.join(
         df1.as_pandas(),
         df2.as_pandas(),
         join_type=how,
         on=key_schema.names,
     )
     return PandasDataFrame(
         joined.reset_index(drop=True), output_schema, metadata
     )
# Example #3
 def fillna(
     self,
     df: DataFrame,
     value: Any,
     subset: Optional[List[str]] = None,
     metadata: Any = None,
 ) -> DataFrame:
     """Fill NA/NaN values in ``df``.

     :param value: either a scalar applied to every column in ``subset``,
         or a dict mapping column name -> fill value (``subset`` is then
         ignored); must not be None or a list
     :param subset: columns to fill when ``value`` is a scalar; defaults
         to all columns
     :param metadata: metadata attached to the result dataframe
     :raises ValueError: if ``value`` is None, a list, or a dict that is
         empty or contains None values
     """
     assert_or_throw(
         (not isinstance(value, list)) and (value is not None),
         # fixed message: previously read "can not None or a list"
         ValueError("fillna value can not be None or a list"),
     )
     if isinstance(value, dict):
         assert_or_throw(
             # Previously `any(value.values())`, which wrongly rejected
             # legitimate falsy fill values such as 0 or ""; the intent
             # is only that the dict is non-empty.
             (None not in value.values()) and (len(value) > 0),
             ValueError(
                 "fillna dict can not contain None and needs at least one value"
             ),
         )
         mapping = value
     else:
         # If subset is none, apply to all columns
         subset = subset or df.schema.names
         mapping = {col: value for col in subset}
     d = df.as_pandas().fillna(mapping, inplace=False)
     return PandasDataFrame(d.reset_index(drop=True), df.schema, metadata)
# Example #4
 def distinct(
     self,
     df: DataFrame,
     metadata: Any = None,
 ) -> DataFrame:
     """Return ``df`` with duplicate rows removed, keeping the schema."""
     deduped = self.pl_utils.drop_duplicates(df.as_pandas())
     deduped = deduped.reset_index(drop=True)
     return PandasDataFrame(deduped, df.schema, metadata)
 def intersect(
     self,
     df1: DataFrame,
     df2: DataFrame,
     distinct: bool = True,
     metadata: Any = None,
 ) -> DataFrame:
     """Return rows that appear in both ``df1`` and ``df2`` (SQL INTERSECT).

     Only ``distinct=True`` is supported by this engine; the two schemas
     must be identical.

     :raises NotImplementedError: if ``distinct`` is False
     :raises ValueError: if the two schemas differ
     """
     assert_or_throw(
         distinct,
         NotImplementedError("INTERSECT ALL for NativeExecutionEngine"),
     )
     assert_or_throw(
         df1.schema == df2.schema,
         ValueError(f"{df1.schema} != {df2.schema}"),
     )
     common = self.pl_utils.intersect(
         df1.as_pandas(), df2.as_pandas(), unique=distinct
     )
     return PandasDataFrame(
         common.reset_index(drop=True), df1.schema, metadata
     )
# Example #6
 def subtract(
     self,
     df1: DataFrame,
     df2: DataFrame,
     distinct: bool = True,
     metadata: Any = None,
 ) -> DataFrame:
     """Return rows of ``df1`` that do not appear in ``df2`` (SQL EXCEPT).

     Only ``distinct=True`` is supported by this engine; the two schemas
     must be identical.

     :raises NotImplementedError: if ``distinct`` is False
     :raises ValueError: if the two schemas differ
     """
     assert_or_throw(
         distinct,
         NotImplementedError("EXCEPT ALL for NativeExecutionEngine"),
     )
     assert_or_throw(
         df1.schema == df2.schema,
         lambda: ValueError(f"{df1.schema} != {df2.schema}"),
     )
     remaining = self.pl_utils.except_df(
         df1.as_pandas(), df2.as_pandas(), unique=distinct
     )
     return PandasDataFrame(
         remaining.reset_index(drop=True), df1.schema, metadata
     )
# Example #7
    def map(
        self,
        df: DataFrame,
        map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
        output_schema: Any,
        partition_spec: PartitionSpec,
        metadata: Any = None,
        on_init: Optional[Callable[[int, DataFrame], Any]] = None,
    ) -> DataFrame:
        """Apply ``map_func`` to ``df``, one partition at a time.

        :param df: input dataframe
        :param map_func: called with a partition cursor and the local
            dataframe of one partition; must return the mapped local
            dataframe
        :param output_schema: schema the mapped output must match
        :param partition_spec: partitioning config; only ``partition_by``
            and ``presort`` are honored — ``num_partitions`` is ignored
            with a warning
        :param metadata: metadata attached to the result dataframe
        :param on_init: optional callback invoked once (with 0 and ``df``)
            before any mapping happens
        :return: mapped dataframe with ``output_schema``
        """
        # Single-machine engine: a requested partition count can't be honored
        if partition_spec.num_partitions != "0":
            self.log.warning(
                "%s doesn't respect num_partitions %s",
                self,
                partition_spec.num_partitions,
            )
        cursor = partition_spec.get_cursor(df.schema, 0)
        if on_init is not None:
            on_init(0, df)
        if len(partition_spec.partition_by) == 0:  # no partition
            df = to_local_df(df)
            cursor.set(df.peek_array(), 0, 0)
            output_df = map_func(cursor, df)
            # If map_func returned a PandasDataFrame whose schema disagrees,
            # rewrap its native frame under the declared output schema
            if (isinstance(output_df, PandasDataFrame)
                    and output_df.schema != output_schema):
                output_df = PandasDataFrame(output_df.native, output_schema)
            assert_or_throw(
                output_df.schema == output_schema,
                lambda: f"map output {output_df.schema} "
                f"mismatches given {output_schema}",
            )
            # Attach read-only metadata directly to the result
            output_df._metadata = ParamDict(metadata, deep=True)
            output_df._metadata.set_readonly()
            return self.to_df(output_df)
        presort = partition_spec.presort
        presort_keys = list(presort.keys())
        presort_asc = list(presort.values())
        output_schema = Schema(output_schema)

        # Per-partition worker: presort if requested, advance the shared
        # cursor to this partition's first row, then run map_func
        def _map(pdf: pd.DataFrame) -> pd.DataFrame:
            if len(presort_keys) > 0:
                pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
            input_df = PandasDataFrame(pdf.reset_index(drop=True),
                                       df.schema,
                                       pandas_df_wrapper=True)
            cursor.set(input_df.peek_array(), cursor.partition_no + 1, 0)
            output_df = map_func(cursor, input_df)
            return output_df.as_pandas()

        result = self.pl_utils.safe_groupby_apply(df.as_pandas(),
                                                  partition_spec.partition_by,
                                                  _map)
        return PandasDataFrame(result, output_schema, metadata)
# Example #8
 def dropna(
     self,
     df: DataFrame,
     how: str = "any",
     thresh: int = None,
     subset: List[str] = None,
     metadata: Any = None,
 ) -> DataFrame:
     """Drop rows of ``df`` containing NA values.

     :param how: "any" drops rows with any NA, "all" only all-NA rows
     :param thresh: if set, keep rows with at least this many non-NA values
     :param subset: restrict the NA check to these columns
     """
     kept = df.as_pandas().dropna(
         axis=0,
         how=how,
         thresh=thresh,
         subset=subset,
         inplace=False,
     )
     return PandasDataFrame(kept.reset_index(drop=True), df.schema, metadata)
# Example #9
 def sample(
     self,
     df: DataFrame,
     n: Optional[int] = None,
     frac: Optional[float] = None,
     replace: bool = False,
     seed: Optional[int] = None,
     metadata: Any = None,
 ) -> DataFrame:
     """Randomly sample rows from ``df``.

     Exactly one of ``n`` (row count) and ``frac`` (fraction) must be set.

     :param replace: whether to sample with replacement
     :param seed: random seed passed to pandas for reproducibility
     :raises ValueError: if neither or both of ``n`` and ``frac`` are set
     """
     # XOR on None-ness: exactly one of the two must be provided
     assert_or_throw(
         (n is None) != (frac is None),
         ValueError("one and only one of n and frac should be set"),
     )
     sampled = df.as_pandas().sample(
         n=n, frac=frac, replace=replace, random_state=seed
     )
     return PandasDataFrame(
         sampled.reset_index(drop=True), df.schema, metadata
     )
# Example #10
    def take(
        self,
        df: DataFrame,
        n: int,
        presort: str,
        na_position: str = "last",
        partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC,
        metadata: Any = None,
    ) -> DataFrame:
        """Return the first ``n`` rows of ``df`` — globally, or per group
        when ``partition_spec.partition_by`` is set.

        :param presort: presort expression applied before taking rows; when
            non-empty it overrides ``partition_spec.presort``
        :param na_position: where NA keys sort, "first" or "last"
        :raises ValueError: if ``n`` is not an integer
        """
        assert_or_throw(
            isinstance(n, int),
            ValueError("n needs to be an integer"),
        )
        pdf = df.as_pandas()

        # Use presort over partition_spec.presort if possible
        parsed = parse_presort_exp(presort) if presort else presort
        _presort: IndexedOrderedDict = parsed or partition_spec.presort

        if len(_presort) > 0:
            pdf = pdf.sort_values(
                list(_presort.keys()),
                ascending=list(_presort.values()),
                na_position=na_position,
            )

        if len(partition_spec.partition_by) == 0:
            pdf = pdf.head(n)
        else:
            pdf = pdf.groupby(
                by=partition_spec.partition_by, dropna=False
            ).head(n)

        return PandasDataFrame(
            pdf.reset_index(drop=True),
            df.schema,
            metadata,
            pandas_df_wrapper=True,
        )
 def to_input_data(self, df: DataFrame) -> Iterable[pd.DataFrame]:
     """Yield the pandas dataframe(s) backing ``df``.

     An iterable dataframe yields one pandas frame per sub-dataframe;
     anything else yields a single converted frame.
     """
     if isinstance(df, LocalDataFrameIterableDataFrame):
         for part in df.native:
             yield part.as_pandas()
     else:
         yield df.as_pandas()
 def to_input_data(self, df: DataFrame) -> pd.DataFrame:
     """Convert ``df`` to a single pandas dataframe."""
     return df.as_pandas()
 def join(
     self,
     df1: DataFrame,
     df2: DataFrame,
     how: str,
     on: List[str] = _DEFAULT_JOIN_KEYS,
     metadata: Any = None,
 ) -> DataFrame:
     """Join ``df1`` and ``df2`` natively via pandas ``merge``.

     Supported join types (``how`` is case/underscore/space insensitive):
     ``cross``, ``semi``/``leftsemi``, ``anti``/``leftanti``, plus the
     types pandas accepts directly (e.g. ``inner``) and the mapped
     ``left_outer``/``right_outer``/``full_outer`` variants.

     :param on: join key columns; key and output schemas are resolved by
         ``get_join_schemas``
     :param metadata: metadata attached to the result dataframe
     """
     key_schema, output_schema = get_join_schemas(df1, df2, how=how, on=on)
     # Normalize e.g. "LEFT OUTER" / "left_outer" -> "leftouter"
     how = how.lower().replace("_", "").replace(" ", "")
     if how == "cross":
         # Emulate a cross join with a constant helper key on both sides
         d1 = df1.as_pandas()
         d2 = df2.as_pandas()
         d1["__cross_join_index__"] = 1
         d2["__cross_join_index__"] = 1
         d = d1.merge(d2, on=("__cross_join_index__")).drop(
             "__cross_join_index__", axis=1)
         return PandasDataFrame(d.reset_index(drop=True), output_schema,
                                metadata)
     if how in ["semi", "leftsemi"]:
         # Keep df1 rows whose keys exist in df2; only df2's key columns
         # participate so no df2 data columns leak into the output
         d1 = df1.as_pandas()
         d2 = df2.as_pandas()[key_schema.names]
         d = d1.merge(d2, on=key_schema.names, how="inner")
         return PandasDataFrame(d.reset_index(drop=True), output_schema,
                                metadata)
     if how in ["anti", "leftanti"]:
         # Keep df1 rows whose keys do NOT exist in df2: left-join against
         # df2's keys plus a dummy column, then keep rows where the dummy
         # (last column) stayed null, i.e. rows that found no match
         d1 = df1.as_pandas()
         d2 = df2.as_pandas()[key_schema.names]
         d2["__anti_join_dummy__"] = 1.0
         d = d1.merge(d2, on=key_schema.names, how="left")
         d = d[d.iloc[:, -1].isnull()]
         return PandasDataFrame(
             d.drop(["__anti_join_dummy__"], axis=1).reset_index(drop=True),
             output_schema,
             metadata,
         )
     # Outer joins can pad one side's columns with NaN; track which side
     # needs post-processing by _fix_nan
     fix_left, fix_right = False, False
     if how in ["leftouter"]:
         how = "left"
         self._validate_outer_joinable(df2.schema, key_schema)
         fix_right = True
     if how in ["rightouter"]:
         how = "right"
         self._validate_outer_joinable(df1.schema, key_schema)
         fix_left = True
     if how in ["fullouter"]:
         how = "outer"
         self._validate_outer_joinable(df1.schema, key_schema)
         self._validate_outer_joinable(df2.schema, key_schema)
         fix_left, fix_right = True, True
     d1 = df1.as_pandas()
     d2 = df2.as_pandas()
     d = d1.merge(d2, on=key_schema.names, how=how)
     if fix_left:
         # _fix_nan is applied to the columns that come only from df1;
         # presumably it normalizes the NaN padding against output_schema
         # — behavior defined by the helper elsewhere in this class
         d = self._fix_nan(
             d, output_schema,
             df1.schema.exclude(list(df2.schema.keys())).keys())
     if fix_right:
         d = self._fix_nan(
             d, output_schema,
             df2.schema.exclude(list(df1.schema.keys())).keys())
     return PandasDataFrame(d.reset_index(drop=True), output_schema,
                            metadata)