コード例 #1
0
    def map(
        self,
        df: DataFrame,
        map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
        output_schema: Any,
        partition_spec: PartitionSpec,
        metadata: Any = None,
        on_init: Optional[Callable[[int, DataFrame], Any]] = None,
    ) -> DataFrame:
        """Apply ``map_func`` to ``df`` partition by partition on this
        pandas-based engine.

        :param df: input dataframe
        :param map_func: function receiving the partition cursor and the
            local dataframe of one partition, returning the mapped frame
        :param output_schema: schema the mapped output must match
            (anything ``Schema`` accepts)
        :param partition_spec: partition keys, presort, and partition count
            (the count is not respected by this engine)
        :param metadata: metadata attached to the returned dataframe
        :param on_init: optional hook invoked once before mapping, with
            partition number 0 and the input dataframe
        :return: the mapped dataframe carrying ``output_schema``
        """
        # num_partitions is compared as a string; "0" presumably means
        # "unspecified" — confirm. This engine cannot honor an explicit
        # partition count, so it only warns.
        if partition_spec.num_partitions != "0":
            self.log.warning(
                "%s doesn't respect num_partitions %s",
                self,
                partition_spec.num_partitions,
            )
        cursor = partition_spec.get_cursor(df.schema, 0)
        if on_init is not None:
            on_init(0, df)
        if len(partition_spec.partition_by) == 0:  # no partition
            # Whole dataframe is a single partition: run map_func once.
            df = to_local_df(df)
            cursor.set(df.peek_array(), 0, 0)
            output_df = map_func(cursor, df)
            # NOTE(review): a pandas-backed result with a differing schema is
            # rebound to output_schema instead of rejected — presumably to
            # tolerate pandas dtype drift; confirm intent.
            if (isinstance(output_df, PandasDataFrame)
                    and output_df.schema != output_schema):
                output_df = PandasDataFrame(output_df.native, output_schema)
            assert_or_throw(
                output_df.schema == output_schema,
                lambda: f"map output {output_df.schema} "
                f"mismatches given {output_schema}",
            )
            output_df._metadata = ParamDict(metadata, deep=True)
            output_df._metadata.set_readonly()
            return self.to_df(output_df)
        presort = partition_spec.presort
        presort_keys = list(presort.keys())
        presort_asc = list(presort.values())
        output_schema = Schema(output_schema)

        def _map(pdf: pd.DataFrame) -> pd.DataFrame:
            # Runs once per partition group: apply presort, wrap the group
            # as a PandasDataFrame, advance the cursor, then map.
            if len(presort_keys) > 0:
                pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
            input_df = PandasDataFrame(pdf.reset_index(drop=True),
                                       df.schema,
                                       pandas_df_wrapper=True)
            cursor.set(input_df.peek_array(), cursor.partition_no + 1, 0)
            output_df = map_func(cursor, input_df)
            return output_df.as_pandas()

        result = self.pl_utils.safe_groupby_apply(df.as_pandas(),
                                                  partition_spec.partition_by,
                                                  _map)
        return PandasDataFrame(result, output_schema, metadata)
コード例 #2
0
 def union(
     self,
     df1: DataFrame,
     df2: DataFrame,
     distinct: bool = True,
     metadata: Any = None,
 ) -> DataFrame:
     """Union two dataframes that share the same schema.

     :param df1: first dataframe
     :param df2: second dataframe
     :param distinct: drop duplicate rows when True (SQL ``UNION``),
         keep them when False (``UNION ALL``)
     :param metadata: metadata attached to the result
     :return: the unioned dataframe, carrying ``df1``'s schema
     """
     assert_or_throw(df1.schema == df2.schema,
                     ValueError(f"{df1.schema} != {df2.schema}"))
     combined = self.pl_utils.union(
         df1.as_pandas(), df2.as_pandas(), unique=distinct
     )
     return PandasDataFrame(
         combined.reset_index(drop=True), df1.schema, metadata
     )
コード例 #3
0
 def join(
     self,
     df1: DataFrame,
     df2: DataFrame,
     how: str,
     on: List[str] = _DEFAULT_JOIN_KEYS,
     metadata: Any = None,
 ) -> DataFrame:
     """Join two dataframes, delegating the actual merge to ``pl_utils``.

     :param how: join type string understood by ``get_join_schemas``
     :param on: join keys; the effective keys are the ones resolved by
         ``get_join_schemas``
     :param metadata: metadata attached to the result
     :return: joined dataframe with the resolved output schema
     """
     key_schema, output_schema = get_join_schemas(df1, df2, how=how, on=on)
     joined = self.pl_utils.join(
         df1.as_pandas(),
         df2.as_pandas(),
         join_type=how,
         on=key_schema.names,
     )
     return PandasDataFrame(
         joined.reset_index(drop=True), output_schema, metadata
     )
コード例 #4
0
 def fillna(
     self,
     df: DataFrame,
     value: Any,
     subset: Optional[List[str]] = None,
     metadata: Any = None,
 ) -> DataFrame:
     """Fill NULL/NaN values in ``df``.

     :param value: the fill value, or a dict mapping column name to fill
         value; must not be None or a list
     :param subset: columns to fill when ``value`` is a scalar; all
         columns when None (ignored when ``value`` is a dict)
     :param metadata: metadata attached to the result
     :return: the filled dataframe, same schema as ``df``
     """
     assert_or_throw(
         (not isinstance(value, list)) and (value is not None),
         # Fixed message grammar: "can not be None"
         ValueError("fillna value can not be None or a list"),
     )
     if isinstance(value, dict):
         # BUGFIX: the original used any(value.values()), which rejected
         # legitimate falsy fill values such as 0, "" or False. The check
         # only needs to ensure the dict is non-empty.
         assert_or_throw(
             (None not in value.values()) and (len(value) > 0),
             ValueError(
                 "fillna dict can not contain None and needs at least one value"
             ),
         )
         mapping = value
     else:
         # If subset is none, apply to all columns
         subset = subset or df.schema.names
         mapping = {col: value for col in subset}
     d = df.as_pandas().fillna(mapping, inplace=False)
     return PandasDataFrame(d.reset_index(drop=True), df.schema, metadata)
コード例 #5
0
 def intersect(
     self,
     df1: DataFrame,
     df2: DataFrame,
     distinct: bool = True,
     metadata: Any = None,
 ) -> DataFrame:
     """Return rows that appear in both dataframes (SQL ``INTERSECT``).

     Only ``distinct=True`` is supported by this engine;
     ``INTERSECT ALL`` raises ``NotImplementedError``.
     """
     assert_or_throw(
         distinct,
         NotImplementedError("INTERSECT ALL for NativeExecutionEngine"))
     assert_or_throw(df1.schema == df2.schema,
                     ValueError(f"{df1.schema} != {df2.schema}"))
     common = self.pl_utils.intersect(
         df1.as_pandas(), df2.as_pandas(), unique=distinct
     )
     return PandasDataFrame(
         common.reset_index(drop=True), df1.schema, metadata
     )
コード例 #6
0
ファイル: processors.py プロジェクト: gityow/fugue
 def on_init(self, partition_no: int, df: DataFrame) -> None:
     """Prepare the wrapped transformer for a partition: install a fresh
     cursor for ``partition_no``, attach this processor's metadata to
     ``df``, then delegate to the transformer's own ``on_init``.
     """
     spec = self.transformer.partition_spec
     self.transformer._cursor = spec.get_cursor(  # type: ignore
         self.schema, partition_no
     )
     df._metadata = self.metadata
     self.transformer.on_init(df)
コード例 #7
0
 def distinct(
     self,
     df: DataFrame,
     metadata: Any = None,
 ) -> DataFrame:
     """Drop duplicate rows from ``df``, keeping its schema.

     :param metadata: metadata attached to the result
     """
     deduped = self.pl_utils.drop_duplicates(df.as_pandas())
     return PandasDataFrame(
         deduped.reset_index(drop=True), df.schema, metadata
     )
コード例 #8
0
 def subtract(
     self,
     df1: DataFrame,
     df2: DataFrame,
     distinct: bool = True,
     metadata: Any = None,
 ) -> DataFrame:
     """Return rows of ``df1`` that are absent from ``df2`` (SQL ``EXCEPT``).

     Only ``distinct=True`` is supported; ``EXCEPT ALL`` raises
     ``NotImplementedError``. Both inputs must share the same schema.
     """
     assert_or_throw(
         distinct,
         NotImplementedError("EXCEPT ALL for NativeExecutionEngine"))
     assert_or_throw(
         df1.schema == df2.schema,
         lambda: ValueError(f"{df1.schema} != {df2.schema}"),
     )
     remaining = self.pl_utils.except_df(
         df1.as_pandas(), df2.as_pandas(), unique=distinct
     )
     return PandasDataFrame(
         remaining.reset_index(drop=True), df1.schema, metadata
     )
コード例 #9
0
 def dropna(
     self,
     df: DataFrame,
     how: str = "any",
     thresh: Optional[int] = None,
     subset: Optional[List[str]] = None,
     metadata: Any = None,
 ) -> DataFrame:
     """Drop rows containing NA values.

     :param how: "any" drops a row when any value is NA, "all" only when
         every value is NA
     :param thresh: keep rows with at least this many non-NA values;
         when set, it replaces ``how``
     :param subset: columns to consider; all columns when None
     :param metadata: metadata attached to the result
     :return: dataframe with offending rows removed, same schema
     """
     # BUGFIX: pandas >= 1.5 raises when both `how` and `thresh` are
     # passed. Forward only one of them (thresh takes precedence, which
     # matches older-pandas behavior where thresh overrode how).
     kwargs = dict(axis=0, subset=subset, inplace=False)
     if thresh is None:
         kwargs["how"] = how
     else:
         kwargs["thresh"] = thresh
     d = df.as_pandas().dropna(**kwargs)
     return PandasDataFrame(d.reset_index(drop=True), df.schema, metadata)
コード例 #10
0
 def sample(
     self,
     df: DataFrame,
     n: Optional[int] = None,
     frac: Optional[float] = None,
     replace: bool = False,
     seed: Optional[int] = None,
     metadata: Any = None,
 ) -> DataFrame:
     """Random sample of ``df``: either exactly ``n`` rows or a fraction
     ``frac`` of the rows. Exactly one of ``n``/``frac`` must be given.

     :param replace: sample with replacement when True
     :param seed: random seed for reproducibility
     :param metadata: metadata attached to the result
     """
     # XOR: exactly one of n / frac must be provided.
     assert_or_throw(
         (n is None) != (frac is None),
         ValueError("one and only one of n and frac should be set"),
     )
     sampled = df.as_pandas().sample(
         n=n, frac=frac, replace=replace, random_state=seed
     )
     return PandasDataFrame(
         sampled.reset_index(drop=True), df.schema, metadata
     )
コード例 #11
0
ファイル: execution_engine.py プロジェクト: WangCHX/fugue
 def repartition(self, df: DataFrame,
                 partition_spec: PartitionSpec) -> DaskDataFrame:
     """Repartition a Dask dataframe to the requested partition count.

     Partitioning by columns is not supported here: when the spec is
     empty or has partition keys, the input is returned unchanged.
     """
     df = self.to_df(df)
     # Nothing to do for an empty spec or a by-column spec.
     if partition_spec.empty or len(partition_spec.partition_by) > 0:
         return df
     num = partition_spec.get_num_partitions(
         **{
             KEYWORD_ROWCOUNT: lambda: df.persist().count(),  # type: ignore
             KEYWORD_CORECOUNT: lambda: 2,  # TODO: remove this hard code
         })
     if num <= 0:
         return df
     return DaskDataFrame(
         df.native.repartition(npartitions=num),
         schema=df.schema,
         metadata=df.metadata,
         type_safe=False,
     )
コード例 #12
0
    def take(
        self,
        df: DataFrame,
        n: int,
        presort: str,
        na_position: str = "last",
        partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC,
        metadata: Any = None,
    ) -> DataFrame:
        """Return the first ``n`` rows of ``df`` — per partition when
        ``partition_spec`` has partition keys — optionally after sorting.

        :param n: number of rows to keep (must be an int)
        :param presort: presort expression string; when non-empty it takes
            precedence over ``partition_spec.presort``
        :param na_position: where NaNs sort, "first" or "last"
        :param metadata: metadata attached to the result
        """
        assert_or_throw(
            isinstance(n, int),
            ValueError("n needs to be an integer"),
        )
        pdf = df.as_pandas()

        # An explicit presort expression wins over the partition spec's;
        # fall back to the spec when the expression is empty.
        parsed = parse_presort_exp(presort) if presort else presort
        ordering: IndexedOrderedDict = parsed or partition_spec.presort

        if len(ordering.keys()) > 0:
            pdf = pdf.sort_values(
                list(ordering.keys()),
                ascending=list(ordering.values()),
                na_position=na_position,
            )

        keys = partition_spec.partition_by
        if len(keys) == 0:
            pdf = pdf.head(n)
        else:
            pdf = pdf.groupby(by=keys, dropna=False).head(n)

        return PandasDataFrame(pdf.reset_index(drop=True),
                               df.schema,
                               metadata,
                               pandas_df_wrapper=True)
コード例 #13
0
 def to_input_data(self, df: DataFrame) -> Iterable[Dict[str, Any]]:
     """Expose ``df`` to the wrapped function as an iterable of row dicts."""
     return df.as_dict_iterable()
コード例 #14
0
 def to_input_data(self, df: DataFrame) -> EmptyAwareIterable[List[Any]]:
     """Expose ``df`` as a type-safe, empty-aware iterable of row arrays."""
     return make_empty_aware(df.as_array_iterable(type_safe=True))
コード例 #15
0
 def to_input_data(self, df: DataFrame) -> Iterable[List[Any]]:
     """Expose ``df`` as a type-safe iterable of row arrays."""
     return df.as_array_iterable(type_safe=True)
コード例 #16
0
 def to_input_data(self, df: DataFrame) -> List[List[Any]]:
     """Materialize ``df`` as a type-safe list of row arrays."""
     return df.as_array(type_safe=True)
コード例 #17
0
 def count(self, df: DataFrame) -> int:
     """Return the number of rows in ``df``.

     Bounded dataframes report their own count; unbounded ones must be
     exhausted to count their rows.
     """
     if not df.is_bounded:
         return sum(1 for _ in df.as_array_iterable())
     return df.count()
コード例 #18
0
def t3(df1: DataFrame, df2: DataFrame, a) -> DataFrame:
    """Sum the row counts of both inputs plus ``a`` into a 1x1 dataframe
    with schema ``a:int``."""
    total = df1.count() + df2.count() + a
    return ArrayDataFrame([[total]], "a:int")
コード例 #19
0
ファイル: test_dataframe.py プロジェクト: zywillc/fugue
 def __init__(self, df=None, schema=None, metadata=None):
     # Deliberate double initialization: after the normal superclass init,
     # DataFrame.__init__ is re-run with a lazy schema callable —
     # presumably to exercise the deferred-schema code path in this test
     # fixture; confirm against the test's intent.
     super().__init__(df=df, schema=schema, metadata=metadata)
     DataFrame.__init__(self, lambda: Schema(schema))
コード例 #20
0
 def to_input_data(self, df: DataFrame) -> Iterable[pd.DataFrame]:
     """Yield ``df`` as one or more pandas dataframes.

     A ``LocalDataFrameIterableDataFrame`` is streamed sub-frame by
     sub-frame; any other dataframe is yielded whole.
     """
     if isinstance(df, LocalDataFrameIterableDataFrame):
         for sub in df.native:
             yield sub.as_pandas()
     else:
         yield df.as_pandas()
コード例 #21
0
 def join(
     self,
     df1: DataFrame,
     df2: DataFrame,
     how: str,
     on: List[str] = _DEFAULT_JOIN_KEYS,
     metadata: Any = None,
 ) -> DataFrame:
     """Join two dataframes via pandas ``merge``, covering cross, semi,
     anti, and the standard inner/left/right/full joins.

     :param how: join type; normalized by lowercasing and stripping
         underscores/spaces (e.g. ``left_outer`` -> ``leftouter``)
     :param on: join keys (effective keys come from ``get_join_schemas``)
     :param metadata: metadata attached to the result
     :return: joined dataframe with the resolved output schema
     """
     key_schema, output_schema = get_join_schemas(df1, df2, how=how, on=on)
     how = how.lower().replace("_", "").replace(" ", "")
     if how == "cross":
         # Cross join: add a constant key to both sides, then inner-merge.
         # NOTE(review): the dummy column is assigned in place on the
         # frames returned by as_pandas(); if as_pandas returns the
         # underlying frame this mutates the inputs — confirm it copies.
         d1 = df1.as_pandas()
         d2 = df2.as_pandas()
         d1["__cross_join_index__"] = 1
         d2["__cross_join_index__"] = 1
         d = d1.merge(d2, on=("__cross_join_index__")).drop(
             "__cross_join_index__", axis=1)
         return PandasDataFrame(d.reset_index(drop=True), output_schema,
                                metadata)
     if how in ["semi", "leftsemi"]:
         # Semi join: keep df1 rows whose keys appear in df2.
         # NOTE(review): duplicate keys in df2 would duplicate df1 rows in
         # this inner merge — confirm keys are unique or dedupe upstream.
         d1 = df1.as_pandas()
         d2 = df2.as_pandas()[key_schema.names]
         d = d1.merge(d2, on=key_schema.names, how="inner")
         return PandasDataFrame(d.reset_index(drop=True), output_schema,
                                metadata)
     if how in ["anti", "leftanti"]:
         # Anti join: left-merge a marker column and keep only rows whose
         # marker is null, i.e. rows with no match in df2.
         d1 = df1.as_pandas()
         d2 = df2.as_pandas()[key_schema.names]
         d2["__anti_join_dummy__"] = 1.0
         d = d1.merge(d2, on=key_schema.names, how="left")
         d = d[d.iloc[:, -1].isnull()]
         return PandasDataFrame(
             d.drop(["__anti_join_dummy__"], axis=1).reset_index(drop=True),
             output_schema,
             metadata,
         )
     # Outer joins can introduce NaN into columns coming only from the
     # side that may not match; _fix_nan presumably restores proper null
     # handling for those columns — confirm its contract.
     fix_left, fix_right = False, False
     if how in ["leftouter"]:
         how = "left"
         self._validate_outer_joinable(df2.schema, key_schema)
         fix_right = True
     if how in ["rightouter"]:
         how = "right"
         self._validate_outer_joinable(df1.schema, key_schema)
         fix_left = True
     if how in ["fullouter"]:
         how = "outer"
         self._validate_outer_joinable(df1.schema, key_schema)
         self._validate_outer_joinable(df2.schema, key_schema)
         fix_left, fix_right = True, True
     d1 = df1.as_pandas()
     d2 = df2.as_pandas()
     d = d1.merge(d2, on=key_schema.names, how=how)
     if fix_left:
         d = self._fix_nan(
             d, output_schema,
             df1.schema.exclude(list(df2.schema.keys())).keys())
     if fix_right:
         d = self._fix_nan(
             d, output_schema,
             df2.schema.exclude(list(df1.schema.keys())).keys())
     return PandasDataFrame(d.reset_index(drop=True), output_schema,
                            metadata)
コード例 #22
0
def my_show(df: DataFrame) -> DataFrame:
    """Print ``df`` and return it unchanged (pass-through for pipelines)."""
    df.show()
    return df
コード例 #23
0
 def to_input_data(self,
                   df: DataFrame) -> EmptyAwareIterable[Dict[str, Any]]:
     """Expose ``df`` as an empty-aware iterable of row dicts."""
     return make_empty_aware(df.as_dict_iterable())
コード例 #24
0
 def to_input_data(self, df: DataFrame) -> pd.DataFrame:
     """Expose ``df`` to the wrapped function as a pandas dataframe."""
     return df.as_pandas()
コード例 #25
0
 def _persist_and_count(df: DataFrame) -> int:
     # Closure helper: `self` is captured from the enclosing scope (the
     # enclosing method is not visible here). Persists first — presumably
     # so counting does not trigger recomputation; confirm engine
     # semantics.
     df = self.persist(df)
     return df.count()
コード例 #26
0
ファイル: test_interfaceless.py プロジェクト: gityow/fugue
def f25(e: DataFrame, a: LocalDataFrame) -> List[Dict[str, Any]]:
    """Concatenate the rows of both inputs and return them as dicts under
    the single-column schema ``a:int``."""
    rows = list(e.as_array())
    rows.extend(a.as_array())
    return list(ArrayDataFrame(rows, "a:int").as_dict_iterable())
コード例 #27
0
def t3(df1: DataFrame, df2: DataFrame, a, b) -> None:
    """Store the combined row count of both inputs plus ``a`` on
    ``b.value`` (side-effect only; returns nothing)."""
    total = df1.count() + df2.count() + a
    b.value = total