Example #1
 def _map(pdf: pd.DataFrame) -> pd.DataFrame:
     if len(presort_keys) > 0:
         pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
     input_df = PandasDataFrame(pdf.reset_index(drop=True),
                                df.schema,
                                pandas_df_wrapper=True)
     cursor.set(input_df.peek_array(), cursor.partition_no + 1, 0)
     output_df = map_func(cursor, input_df)
     return output_df.as_pandas()
Example #2
def test_nested():
    #data = [[dict(a=1, b=[3, 4], d=1.0)], [json.dumps(dict(b=[30, "40"]))]]
    #df = PandasDataFrame(data, "a:{a:str,b:[int]}")
    #a = df.as_array(type_safe=True)
    #assert [[dict(a="1", b=[3, 4])], [dict(a=None, b=[30, 40])]] == a

    data = [[[json.dumps(dict(b=[30, "40"]))]]]
    df = PandasDataFrame(data, "a:[{a:str,b:[int]}]")
    a = df.as_array(type_safe=True)
    assert [[[dict(a=None, b=[30, 40])]]] == a
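A smaller, flat-schema sketch of the same type_safe conversion (a hedged illustration: the expected output is inferred from the assertions above, and the import path assumes a standard fugue installation):

from fugue import PandasDataFrame

df = PandasDataFrame([["1", "2.5"]], "a:int,b:double")
print(df.as_array(type_safe=True))  # expected: [[1, 2.5]]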
Example #3
 def head(self, n: int, columns: Optional[List[str]] = None) -> List[Any]:
     """Get first n rows of the dataframe as 2-dimensional array
     :param n: number of rows
     :param columns: selected columns, defaults to None (all columns)
     :return: 2-dimensional array
     """
     tdf = PandasDataFrame(self.native.head(n, compute=True,
                                            npartitions=-1),
                           schema=self.schema)
     return tdf.head(n, columns=columns)
Example #4
    def map(
        self,
        df: DataFrame,
        map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
        output_schema: Any,
        partition_spec: PartitionSpec,
        metadata: Any = None,
        on_init: Optional[Callable[[int, DataFrame], Any]] = None,
    ) -> DataFrame:
        if partition_spec.num_partitions != "0":
            self.log.warning(
                "%s doesn't respect num_partitions %s",
                self,
                partition_spec.num_partitions,
            )
        cursor = partition_spec.get_cursor(df.schema, 0)
        if on_init is not None:
            on_init(0, df)
        if len(partition_spec.partition_by) == 0:  # no partition
            df = to_local_df(df)
            cursor.set(df.peek_array(), 0, 0)
            output_df = map_func(cursor, df)
            if (isinstance(output_df, PandasDataFrame)
                    and output_df.schema != output_schema):
                output_df = PandasDataFrame(output_df.native, output_schema)
            assert_or_throw(
                output_df.schema == output_schema,
                lambda: f"map output {output_df.schema} "
                f"mismatches given {output_schema}",
            )
            output_df._metadata = ParamDict(metadata, deep=True)
            output_df._metadata.set_readonly()
            return self.to_df(output_df)
        presort = partition_spec.presort
        presort_keys = list(presort.keys())
        presort_asc = list(presort.values())
        output_schema = Schema(output_schema)

        def _map(pdf: pd.DataFrame) -> pd.DataFrame:
            if len(presort_keys) > 0:
                pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
            input_df = PandasDataFrame(pdf.reset_index(drop=True),
                                       df.schema,
                                       pandas_df_wrapper=True)
            cursor.set(input_df.peek_array(), cursor.partition_no + 1, 0)
            output_df = map_func(cursor, input_df)
            return output_df.as_pandas()

        result = self.pl_utils.safe_groupby_apply(df.as_pandas(),
                                                  partition_spec.partition_by,
                                                  _map)
        return PandasDataFrame(result, output_schema, metadata)
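A minimal usage sketch of the map() method above (hedged: NativeExecutionEngine and the import paths assume a standard fugue installation, and the first_row helper is hypothetical):

import pandas as pd
from fugue import NativeExecutionEngine, PandasDataFrame, PartitionSpec

engine = NativeExecutionEngine()
df = engine.to_df(pd.DataFrame(dict(a=[1, 1, 2], b=[3, 4, 5])), "a:int,b:int")

def first_row(cursor, data):
    # keep only the first row of each partition
    return PandasDataFrame(data.as_pandas().head(1), "a:int,b:int")

res = engine.map(df, first_row, "a:int,b:int", PartitionSpec(by=["a"]))
print(res.as_pandas())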
Example #5
def test_simple_methods():
    df = PandasDataFrame([], "a:str,b:int")
    assert df.as_pandas() is df.native
    assert df.empty
    assert 0 == df.count()
    assert df.is_local

    df = PandasDataFrame([["a", 1], ["b", "2"]], "x:str,y:double")
    assert df.as_pandas() is df.native
    assert not df.empty
    assert 2 == df.count()
    assert ["a", 1.0] == df.peek_array()
    assert dict(x="a", y=1.0) == df.peek_dict()
Example #6
 def _map(pdf: Any) -> pd.DataFrame:
     if pdf.shape[0] == 0:
         return PandasDataFrame([], output_schema).as_pandas()
     if len(presort_keys) > 0:
         pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
     input_df = PandasDataFrame(pdf.reset_index(drop=True),
                                input_schema,
                                pandas_df_wrapper=True)
     if on_init_once is not None:
         on_init_once(0, input_df)
     cursor = partition_spec.get_cursor(input_schema, 0)
     cursor.set(input_df.peek_array(), 0, 0)
     output_df = map_func(cursor, input_df)
     return output_df.as_pandas()
Example #7
 def fillna(
     self,
     df: DataFrame,
     value: Any,
     subset: Optional[List[str]] = None,
     metadata: Any = None,
 ) -> DataFrame:
     assert_or_throw(
         (not isinstance(value, list)) and (value is not None),
         ValueError("fillna value can not None or a list"),
     )
     if isinstance(value, dict):
         assert_or_throw(
             (None not in value.values()) and (any(value.values())),
             ValueError(
                 "fillna dict can not contain None and needs at least one value"
             ),
         )
         mapping = value
     else:
         # If subset is none, apply to all columns
         subset = subset or df.schema.names
         mapping = {col: value for col in subset}
     d = df.as_pandas().fillna(mapping, inplace=False)
     return PandasDataFrame(d.reset_index(drop=True), df.schema, metadata)
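A hedged usage sketch of the fillna() method above (engine construction assumes fugue's NativeExecutionEngine):

import pandas as pd
from fugue import NativeExecutionEngine

engine = NativeExecutionEngine()
df = engine.to_df(pd.DataFrame(dict(a=[1.0, None], b=["x", None])), "a:double,b:str")
# a scalar value fills every column; a dict fills per column
print(engine.fillna(df, value={"a": 0.0, "b": "missing"}).as_pandas())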
Example #8
 def distinct(
     self,
     df: DataFrame,
     metadata: Any = None,
 ) -> DataFrame:
     d = self.pl_utils.drop_duplicates(df.as_pandas())
     return PandasDataFrame(d.reset_index(drop=True), df.schema, metadata)
Example #9
    def test_map_in_pandas(self):
        if not hasattr(ps.DataFrame, "mapInPandas"):
            return

        def add(cursor, data):
            assert isinstance(data, LocalDataFrameIterableDataFrame)

            def get_dfs():
                for df in data.native:
                    pdf = df.as_pandas()
                    pdf["zz"] = pdf["xx"] + pdf["yy"]
                    yield PandasDataFrame(pdf)

            return LocalDataFrameIterableDataFrame(get_dfs())

        e = self.engine
        np.random.seed(0)
        df = pd.DataFrame(np.random.randint(0, 5, (100000, 2)),
                          columns=["xx", "yy"])
        expected = PandasDataFrame(df.assign(zz=df.xx + df.yy),
                                   "xx:int,yy:int,zz:int")
        a = e.to_df(df)
        # no partition
        c = e.map(a, add, "xx:int,yy:int,zz:int", PartitionSpec(num=16))
        df_eq(c, expected, throw=True)
Example #10
        def _udf(
            dfs: Iterable[pd.DataFrame],
        ) -> Iterable[pd.DataFrame]:  # pragma: no cover
            def get_dfs() -> Iterable[LocalDataFrame]:
                for df in dfs:
                    if df.shape[0] > 0:
                        yield PandasDataFrame(
                            df.reset_index(drop=True),
                            input_schema,
                            pandas_df_wrapper=True,
                        )

            input_df = LocalDataFrameIterableDataFrame(get_dfs(), input_schema)
            if input_df.empty:
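                # note: inside a generator this "return <value>" discards the
                # value; an empty input simply produces no output batches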
                return PandasDataFrame([], output_schema).as_pandas()
            if on_init_once is not None:
                on_init_once(0, input_df)
            cursor = partition_spec.get_cursor(input_schema, 0)
            cursor.set(input_df.peek_array(), 0, 0)
            output_df = map_func(cursor, input_df)
            if isinstance(output_df, LocalDataFrameIterableDataFrame):
                for res in output_df.native:
                    yield res.as_pandas()
            else:
                yield output_df.as_pandas()
Example #11
 def as_local(self) -> LocalDataFrame:
     # TODO: does it make sense to also include the metadata?
     if any(pa.types.is_nested(t) for t in self.schema.types):
         data = list(to_type_safe_input(self.native.collect(), self.schema))
         return ArrayDataFrame(data, self.schema, self.metadata)
     return PandasDataFrame(self.native.toPandas(), self.schema,
                            self.metadata)
Example #12
 def as_array(self,
              columns: Optional[List[str]] = None,
              type_safe: bool = False) -> List[Any]:
     df: DataFrame = self
     if columns is not None:
         df = df[columns]
     return PandasDataFrame(df.as_pandas(),
                            schema=df.schema).as_array(type_safe=type_safe)
Example #13
 def transform(self, df: LocalDataFrame) -> LocalDataFrame:
     assert 1 == self.on_init_called
     assert "test" in self.workflow_conf
     assert "x" in df.metadata
     pdf = df.as_pandas()
     pdf["p"] = self.params.get("p", 1)
     pdf["ct"] = pdf.shape[0]
     return PandasDataFrame(pdf, self.output_schema)
Example #14
 def get_dfs() -> Iterable[LocalDataFrame]:
     for df in dfs:
         if df.shape[0] > 0:
             yield PandasDataFrame(
                 df.reset_index(drop=True),
                 input_schema,
                 pandas_df_wrapper=True,
             )
Example #15
def test_nan_none():
    df = ArrayDataFrame([[None, None]], "b:str,c:double")
    assert df.as_pandas().iloc[0, 0] is None
    arr = PandasDataFrame(df.as_pandas(), df.schema).as_array()[0]
    assert arr[0] is None
    assert math.isnan(arr[1])

    df = ArrayDataFrame([[None, None]], "b:int,c:bool")
    arr = PandasDataFrame(df.as_pandas(),
                          df.schema).as_array(type_safe=True)[0]
    assert arr[0] is None
    assert arr[1] is None

    df = ArrayDataFrame([["a", 1.1], [None, None]], "b:str,c:double")
    arr = PandasDataFrame(df.as_pandas(),
                          df.schema).as_array(type_safe=True)[1]
    assert arr[0] is None
    assert arr[1] is None
Example #16
 def select(self, dfs: DataFrames, statement: str) -> DataFrame:
     sql_engine = create_engine("sqlite:///:memory:")
     for k, v in dfs.items():
         v.as_pandas().to_sql(k,
                              sql_engine,
                              if_exists="replace",
                              index=False)
     df = pd.read_sql_query(statement, sql_engine)
     return PandasDataFrame(df)
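The same pattern works standalone with pandas and an in-memory SQLite database (a sketch independent of the engine class above):

import pandas as pd
from sqlalchemy import create_engine

sql_engine = create_engine("sqlite:///:memory:")
pd.DataFrame(dict(x=[1, 2, 3])).to_sql("t", sql_engine, if_exists="replace", index=False)
print(pd.read_sql_query("SELECT x FROM t WHERE x > 1", sql_engine))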
Example #17
def _test_as_array_perf():
    s = Schema()
    arr = []
    for i in range(100):
        s.append(f"a{i}:int")
        arr.append(i)
    for i in range(100):
        s.append(f"b{i}:int")
        arr.append(float(i))
    for i in range(100):
        s.append(f"c{i}:str")
        arr.append(str(i))
    data = []
    for i in range(5000):
        data.append(list(arr))
    df = PandasDataFrame(data, s)
    res = df.as_array()
    res = df.as_array(type_safe=True)
    nts, ts = 0.0, 0.0
    for i in range(10):
        t = datetime.now()
        res = df.as_array()
        nts += (datetime.now() - t).total_seconds()
        t = datetime.now()
        res = df.as_array(type_safe=True)
        ts += (datetime.now() - t).total_seconds()
    print(nts, ts)
Example #18
 def union(
     self,
     df1: DataFrame,
     df2: DataFrame,
     distinct: bool = True,
     metadata: Any = None,
 ) -> DataFrame:
     assert_or_throw(df1.schema == df2.schema,
                     ValueError(f"{df1.schema} != {df2.schema}"))
     d = self.pl_utils.union(df1.as_pandas(),
                             df2.as_pandas(),
                             unique=distinct)
     return PandasDataFrame(d.reset_index(drop=True), df1.schema, metadata)
Example #19
 def dropna(
     self,
     df: DataFrame,
     how: str = "any",
     thresh: Optional[int] = None,
     subset: Optional[List[str]] = None,
     metadata: Any = None,
 ) -> DataFrame:
     d = df.as_pandas().dropna(axis=0,
                               how=how,
                               thresh=thresh,
                               subset=subset,
                               inplace=False)
     return PandasDataFrame(d.reset_index(drop=True), df.schema, metadata)
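A hedged usage sketch of the dropna() method above (assuming fugue's NativeExecutionEngine):

import pandas as pd
from fugue import NativeExecutionEngine

engine = NativeExecutionEngine()
df = engine.to_df(pd.DataFrame(dict(a=[1.0, None, None], b=["x", "y", None])),
                  "a:double,b:str")
print(engine.dropna(df, how="any").as_pandas())  # keeps only the fully non-null row
print(engine.dropna(df, how="all").as_pandas())  # drops only the all-null row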
Example #20
 def join(
     self,
     df1: DataFrame,
     df2: DataFrame,
     how: str,
     on: List[str] = _DEFAULT_JOIN_KEYS,
     metadata: Any = None,
 ) -> DataFrame:
     key_schema, output_schema = get_join_schemas(df1, df2, how=how, on=on)
     d = self.pl_utils.join(df1.as_pandas(),
                            df2.as_pandas(),
                            join_type=how,
                            on=key_schema.names)
     return PandasDataFrame(d.reset_index(drop=True), output_schema,
                            metadata)
Example #21
 def intersect(
     self,
     df1: DataFrame,
     df2: DataFrame,
     distinct: bool = True,
     metadata: Any = None,
 ) -> DataFrame:
     assert_or_throw(
         distinct,
         NotImplementedError("INTERSECT ALL for NativeExecutionEngine"))
     assert_or_throw(df1.schema == df2.schema,
                     ValueError(f"{df1.schema} != {df2.schema}"))
     d = self.pl_utils.intersect(df1.as_pandas(),
                                 df2.as_pandas(),
                                 unique=distinct)
     return PandasDataFrame(d.reset_index(drop=True), df1.schema, metadata)
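A hedged sketch exercising the union() and intersect() methods above with their default distinct semantics:

import pandas as pd
from fugue import NativeExecutionEngine

engine = NativeExecutionEngine()
df1 = engine.to_df(pd.DataFrame(dict(a=[1, 2, 2])), "a:int")
df2 = engine.to_df(pd.DataFrame(dict(a=[2, 3])), "a:int")
print(engine.union(df1, df2).as_pandas())      # distinct union: 1, 2, 3
print(engine.intersect(df1, df2).as_pandas())  # rows in both: 2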
Example #22
def load_df(
    uri: Union[str, List[str]],
    format_hint: Optional[str] = None,
    columns: Any = None,
    fs: Optional[FileSystem] = None,
    **kwargs: Any,
) -> LocalBoundedDataFrame:
    if isinstance(uri, str):
        fp = [FileParser(uri, format_hint)]
    else:
        fp = [FileParser(u, format_hint) for u in uri]
    dfs: List[pd.DataFrame] = []
    schema: Any = None
    for f in fp:
        df, schema = _FORMAT_LOAD[f.file_format](f, columns, **kwargs)
        dfs.append(df)
    return PandasDataFrame(pd.concat(dfs), schema)
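A hedged usage sketch of load_df above (assuming the function is in scope; the temporary path is hypothetical and parquet support needs pyarrow):

import pandas as pd

pd.DataFrame(dict(a=[1, 2])).to_parquet("/tmp/example.parquet")
df = load_df("/tmp/example.parquet")  # format inferred from the file extension
print(df.as_pandas())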
Example #23
def test_init():
    df = IterableDataFrame(schema="a:str,b:int")
    assert df.empty
    assert df.schema == "a:str,b:int"
    assert not df.is_bounded

    data = [["a", 1], ["b", 2]]
    df = IterableDataFrame(data, "a:str,b:str")
    assert [["a", "1"], ["b", "2"]] == df.as_array(type_safe=True)
    assert df.empty  # after iterating all items
    df = IterableDataFrame(data, "a:str,b:int")
    assert [["a", 1], ["b", 2]] == df.as_array(type_safe=True)
    df = IterableDataFrame(data, "a:str,b:double")
    assert [["a", 1.0], ["b", 2.0]] == df.as_array(type_safe=True)

    df = IterableDataFrame(data, "a:str,b:double")
    ddf = IterableDataFrame(df)
    assert [["a", 1.0], ["b", 2.0]] == ddf.as_array(type_safe=True)
    df = IterableDataFrame(data, "a:str,b:double")
    ddf = IterableDataFrame(df, "a:str,b:float64")
    assert [["a", 1.0], ["b", 2.0]] == ddf.as_array(type_safe=True)
    df = IterableDataFrame(data, "a:str,b:double")
    ddf = IterableDataFrame(df, "b:str,a:str")
    assert [["1", "a"], ["2", "b"]] == ddf.as_array(type_safe=True)
    df = IterableDataFrame(data, "a:str,b:double")
    ddf = IterableDataFrame(df, ["b"])
    assert ddf.schema == "b:double"
    assert [[1.0], [2.0]] == ddf.as_array(type_safe=True)
    df = IterableDataFrame(data, "a:str,b:double")
    ddf = IterableDataFrame(df, ["a:str,b:str"])
    assert [["a", "1"], ["b", "2"]] == ddf.as_array(type_safe=True)
    df = IterableDataFrame(data, "a:str,b:double")
    ddf = IterableDataFrame(df, ["b:str"])
    assert [["1"], ["2"]] == ddf.as_array(type_safe=True)

    pdf = PandasDataFrame(data, "a:str,b:double")
    df = IterableDataFrame(pdf, "a:str,b:double")
    assert [["a", 1.0], ["b", 2.0]] == df.as_array(type_safe=True)
    df = IterableDataFrame(pdf, "b:str,a:str")
    assert [["1.0", "a"], ["2.0", "b"]] == df.as_array(type_safe=True)

    df = IterableDataFrame([], "x:str,y:double")
    assert df.empty
    assert df.is_local

    raises(FugueDataFrameInitError, lambda: IterableDataFrame(123))
Example #24
 def __init__(  # noqa: C901
     self,
     df: Any = None,
     schema: Any = None,
     metadata: Any = None,
     num_partitions: int = 0,
     type_safe: bool = True,
 ):
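     # note: in this module "pd" evidently aliases dask.dataframe (see the
     # pd.from_pandas calls below), while "pandas" is the plain pandas package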
     try:
         if num_partitions <= 0:
             num_partitions = FUGUE_DASK_DEFAULT_CONF[
                 FUGUE_DASK_CONF_DATAFRAME_DEFAULT_PARTITIONS]
         if df is None:
             schema = _input_schema(schema).assert_not_empty()
             df = []
         if isinstance(df, DaskDataFrame):
             super().__init__(df.schema,
                              df.metadata if metadata is None else metadata)
             self._native: pd.DataFrame = df._native
             return
         elif isinstance(df, (pd.DataFrame, pd.Series)):
             if isinstance(df, pd.Series):
                 df = df.to_frame()
             pdf = df
             schema = None if schema is None else _input_schema(schema)
         elif isinstance(df, (pandas.DataFrame, pandas.Series)):
             if isinstance(df, pandas.Series):
                 df = df.to_frame()
             pdf = pd.from_pandas(df,
                                  npartitions=num_partitions,
                                  sort=False)
             schema = None if schema is None else _input_schema(schema)
         elif isinstance(df, Iterable):
             schema = _input_schema(schema).assert_not_empty()
             t = PandasDataFrame(df, schema)
             pdf = pd.from_pandas(t.native,
                                  npartitions=num_partitions,
                                  sort=False)
             type_safe = False
         else:
             raise ValueError(f"{df} is incompatible with DaskDataFrame")
         pdf, schema = self._apply_schema(pdf, schema, type_safe)
         super().__init__(schema, metadata)
         self._native = pdf
     except Exception as e:
         raise FugueDataFrameInitError from e
Example #25
 def subtract(
     self,
     df1: DataFrame,
     df2: DataFrame,
     distinct: bool = True,
     metadata: Any = None,
 ) -> DataFrame:
     assert_or_throw(
         distinct,
         NotImplementedError("EXCEPT ALL for NativeExecutionEngine"))
     assert_or_throw(
         df1.schema == df2.schema,
         lambda: ValueError(f"{df1.schema} != {df2.schema}"),
     )
     d = self.pl_utils.except_df(df1.as_pandas(),
                                 df2.as_pandas(),
                                 unique=distinct)
     return PandasDataFrame(d.reset_index(drop=True), df1.schema, metadata)
Example #26
 def sample(
     self,
     df: DataFrame,
     n: Optional[int] = None,
     frac: Optional[float] = None,
     replace: bool = False,
     seed: Optional[int] = None,
     metadata: Any = None,
 ) -> DataFrame:
     assert_or_throw(
         (n is None and frac is not None)
         or (n is not None and frac is None),
         ValueError("one and only one of n and frac should be set"),
     )
     d = df.as_pandas().sample(n=n,
                               frac=frac,
                               replace=replace,
                               random_state=seed)
     return PandasDataFrame(d.reset_index(drop=True), df.schema, metadata)
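A hedged usage sketch of the sample() method above; per the assertion, exactly one of n and frac may be set:

import pandas as pd
from fugue import NativeExecutionEngine

engine = NativeExecutionEngine()
df = engine.to_df(pd.DataFrame(dict(a=range(10))), "a:int")
print(engine.sample(df, n=3, seed=0).as_pandas())       # exactly 3 rows
print(engine.sample(df, frac=0.5, seed=0).as_pandas())  # about half the rows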
Example #27
    def test_to_df(self):
        e = self.engine
        o = ArrayDataFrame(
            [[1, 2], [None, 3]],
            "a:double,b:int",
            dict(a=1),
        )
        a = e.to_df(o)
        assert a is not o
        res = a.native.collect()
        assert res[0][0] == 1.0 or res[0][0] is None
        assert res[1][0] == 1.0 or res[1][0] is None
        df_eq(a, o, throw=True)

        o = ArrowDataFrame(
            [[1, 2], [None, 3]],
            "a:double,b:int",
            dict(a=1),
        )
        a = e.to_df(o)
        assert a is not o
        res = a.native.collect()
        assert res[0][0] == 1.0 or res[0][0] is None
        assert res[1][0] == 1.0 or res[1][0] is None

        a = e.to_df([[1, None]], "a:int,b:int", dict(a=1))
        df_eq(a, [[1, None]], "a:int,b:int", dict(a=1), throw=True)

        o = PandasDataFrame(
            [[{"a": "b"}, 2]],
            "a:{a:str},b:int",
            dict(a=1),
        )
        a = e.to_df(o)
        assert a is not o
        res = a.as_array(type_safe=True)
        assert res[0][0] == {"a": "b"}
Example #28
    def take(
        self,
        df: DataFrame,
        n: int,
        presort: str,
        na_position: str = "last",
        partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC,
        metadata: Any = None,
    ) -> DataFrame:
        assert_or_throw(
            isinstance(n, int),
            ValueError("n needs to be an integer"),
        )
        d = df.as_pandas()

        # Use presort over partition_spec.presort if possible
        if presort:
            presort = parse_presort_exp(presort)
        _presort: IndexedOrderedDict = presort or partition_spec.presort

        if len(_presort.keys()) > 0:
            d = d.sort_values(
                list(_presort.keys()),
                ascending=list(_presort.values()),
                na_position=na_position,
            )

        if len(partition_spec.partition_by) == 0:
            d = d.head(n)
        else:
            d = d.groupby(by=partition_spec.partition_by, dropna=False).head(n)

        return PandasDataFrame(d.reset_index(drop=True),
                               df.schema,
                               metadata,
                               pandas_df_wrapper=True)
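A hedged usage sketch of the take() method above (imports assume a standard fugue installation):

import pandas as pd
from fugue import NativeExecutionEngine, PartitionSpec

engine = NativeExecutionEngine()
df = engine.to_df(pd.DataFrame(dict(a=[1, 1, 2], b=[3, 4, 5])), "a:int,b:int")
# top row per group of "a", each group ordered by b descending
res = engine.take(df, 1, presort="b desc", partition_spec=PartitionSpec(by=["a"]))
print(res.as_pandas())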
Example #29
 def dfs():
     for df in output:
         yield PandasDataFrame(df, schema)
Example #30
 def to_output_df(self, output: pd.DataFrame, schema: Any) -> DataFrame:
     return PandasDataFrame(output, schema)