Example No. 1
def _test_nested():
    # TODO: nested type doesn't work in dask
    # data = [[dict(a=1, b=[3, 4], d=1.0)], [json.dumps(dict(b=[30, "40"]))]]
    # df = DaskDataFrame(data, "a:{a:str,b:[int]}")
    # a = df.as_array(type_safe=True)
    # assert [[dict(a="1", b=[3, 4])], [dict(a=None, b=[30, 40])]] == a

    data = [[[json.dumps(dict(b=[30, "40"]))]]]
    df = DaskDataFrame(data, "a:[{a:str,b:[int]}]")
    a = df.as_array(type_safe=True)
    assert [[[dict(a=None, b=[30, 40])]]] == a
Example No. 2
    def to_df(self,
              df: Any,
              schema: Any = None,
              metadata: Any = None) -> DaskDataFrame:
        """Convert a data structure to :class:`~fugue_dask.dataframe.DaskDataFrame`

        :param df: :class:`~fugue.dataframe.dataframe.DataFrame`,
          :class:`dask:dask.dataframe.DataFrame`,
          pandas DataFrame, or a list or iterable of arrays
        :param schema: |SchemaLikeObject|, defaults to None
        :param metadata: |ParamsLikeObject|, defaults to None
        :return: engine compatible dataframe

        :Notice:

        * If the input is already a :class:`~fugue_dask.dataframe.DaskDataFrame`,
          it should return itself
        * For a list or iterable of arrays, ``schema`` must be specified
        * When ``schema`` is not None, a type cast may happen to make the
          dataframe conform to that schema
        * All other methods in the engine can take arbitrary dataframes and
          call this method to convert them before doing anything
        """
        default_partitions = self.conf.get_or_throw(
            FUGUE_DASK_CONF_DATAFRAME_DEFAULT_PARTITIONS, int)
        if isinstance(df, DataFrame):
            assert_or_throw(
                schema is None and metadata is None,
                ValueError(
                    "schema and metadata must be None when df is a DataFrame"),
            )
            if isinstance(df, DaskDataFrame):
                return df
            if isinstance(df, PandasDataFrame):
                return DaskDataFrame(df.native,
                                     df.schema,
                                     df.metadata,
                                     num_partitions=default_partitions)
            return DaskDataFrame(
                df.as_array(type_safe=True),
                df.schema,
                df.metadata,
                num_partitions=default_partitions,
            )
        return DaskDataFrame(df,
                             schema,
                             metadata,
                             num_partitions=default_partitions)
Example No. 3
def test_csv_io(tmpdir):
    fs = FileSystem()
    df1 = DaskDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    path = os.path.join(tmpdir, "a.csv")
    # without header
    save_df(df1, path)
    # assert fs.readtext(path).startswith("1,2,3")
    raises(InvalidOperationError, lambda: load_df(path, header=False))
    actual = load_df(path,
                     columns=["a", "b", "c"],
                     header=False,
                     infer_schema=True)
    assert [[1, 2, 3]] == actual.as_array()
    assert actual.schema == "a:long,b:long,c:long"
    actual = load_df(path, columns="a:double,b:str,c:str", header=False)
    assert [[1.0, "2", "3"]] == actual.as_array()
    assert actual.schema == "a:double,b:str,c:str"
    # with header
    save_df(df1, path, header=True)
    # assert fs.readtext(path).startswith("a,b,c")
    actual = load_df(path, header=True)
    assert [["1", "2", "3"]] == actual.as_array()
    actual = load_df(path, header=True, infer_schema=True)
    assert [[1, 2, 3]] == actual.as_array()
    actual = load_df(path, columns=["b", "a"], header=True, infer_schema=True)
    assert [[2, 1]] == actual.as_array()
    actual = load_df(path, columns="b:str,a:double", header=True)
    assert [["2", 1.0]] == actual.as_array()
    raises(KeyError,
           lambda: load_df(path, columns="b:str,x:double", header=True))

    raises(NotImplementedError,
           lambda: load_df(path, columns="b:str,x:double", header=2))
Example No. 4
 def distinct(
     self,
     df: DataFrame,
     metadata: Any = None,
 ) -> DataFrame:
     d = self.pl_utils.drop_duplicates(self.to_df(df).native)
     return DaskDataFrame(d, df.schema, metadata)
Example No. 5
 def select(self, dfs: DataFrames, statement: str) -> DataFrame:
     dask_dfs = {
         k: self.execution_engine.to_df(v).native  # type: ignore
         for k, v in dfs.items()
     }
     df = run_sql_on_dask(statement, dask_dfs)
     return DaskDataFrame(df)
Example No. 6
 def fillna(
     self,
     df: DataFrame,
     value: Any,
     subset: List[str] = None,
     metadata: Any = None,
 ) -> DataFrame:
     assert_or_throw(
         (not isinstance(value, list)) and (value is not None),
         ValueError("fillna value can not be a list or None"),
     )
     if isinstance(value, dict):
         assert_or_throw(
             (None not in value.values()) and (any(value.values())),
             ValueError(
                 "fillna dict can not contain None and needs at least one value"
             ),
         )
         mapping = value
     else:
         # If subset is none, apply to all columns
         subset = subset or df.schema.names
         mapping = {col: value for col in subset}
     d = self.to_df(df).native.fillna(mapping)
     return DaskDataFrame(d, df.schema, metadata)
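
A short usage sketch of the fillna contract enforced above, reusing the hypothetical `engine` from the to_df sketch after Example No. 2:

# sketch only; `engine` is the assumed DaskExecutionEngine instance from above
df = engine.to_df([["a", None], [None, 1.5]], schema="x:str,y:double")
# a scalar fills the given subset (or every column when subset is None)
filled = engine.fillna(df, value=0.0, subset=["y"])
# a dict gives per-column fill values; None values are rejected
filled = engine.fillna(df, value={"x": "missing", "y": 0.0})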
Example No. 7
 def dropna(
     self,
     df: DataFrame,
     metadata: Any = None,
     how: str = "any",
     thresh: int = None,
     subset: List[str] = None,
 ) -> DataFrame:
     d = self.to_df(df).native.dropna(how=how, thresh=thresh, subset=subset)
     return DaskDataFrame(d, df.schema, metadata)
Example No. 8
def test_json(tmpdir):
    df1 = DaskDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    path = os.path.join(tmpdir, "a.json")
    save_df(df1, path)
    actual = load_df(path)
    df_eq(actual, [[1, 2, 3]], "a:long,b:long,c:long")
    actual = load_df(path, columns=["b", "a"])
    df_eq(actual, [[2, "1"]], "b:int,a:str")
    actual = load_df(path, columns="b:str,a:int")
    df_eq(actual, [["2", 1]], "b:str,a:int")
    raises(KeyError, lambda: load_df(path, columns="bb:str,a:int"))
Example No. 9
    def take(
        self,
        df: DataFrame,
        n: int,
        presort: str,
        na_position: str = "last",
        partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC,
        metadata: Any = None,
    ) -> DataFrame:
        assert_or_throw(
            isinstance(n, int),
            ValueError("n needs to be an integer"),
        )
        d = self.to_df(df).native
        meta = [(d[x].name, d[x].dtype) for x in d.columns]

        if presort:
            presort = parse_presort_exp(presort)
        # Use presort over partition_spec.presort if possible
        _presort: IndexedOrderedDict = presort or partition_spec.presort

        def _partition_take(partition, n, presort):
            if len(presort.keys()) > 0:
                partition = partition.sort_values(
                    list(presort.keys()),
                    ascending=list(presort.values()),
                    na_position=na_position,
                )
            return partition.head(n)

        if len(partition_spec.partition_by) == 0:
            if len(_presort.keys()) == 0:
                d = d.head(n)
            else:
                # No partition_by: take the sorted head of each existing partition
                d = (d.map_partitions(
                    _partition_take, n, _presort,
                    meta=meta).reset_index(drop=True).compute())
                # compute() collects the result into a local pandas DataFrame,
                # so the final sort and head(n) run in pandas
                d = d.sort_values(
                    list(_presort.keys()),
                    ascending=list(_presort.values()),
                    na_position=na_position,
                ).head(n)

        else:
            d = (d.groupby(partition_spec.partition_by,
                           dropna=False).apply(
                               _partition_take,
                               n=n,
                               presort=_presort,
                               meta=meta).reset_index(drop=True))

        return DaskDataFrame(d, df.schema, metadata)
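
A short usage sketch of take as implemented above, again with the hypothetical `engine`; PartitionSpec comes from fugue:

from fugue import PartitionSpec

# sketch only; `engine` is the assumed DaskExecutionEngine instance from above
df = engine.to_df([["a", 1], ["a", 2], ["b", 3]], schema="g:str,v:long")
# top row overall, ordered by the presort expression
top = engine.take(df, n=1, presort="v desc")
# top row per group when partition_by is set
top_per_group = engine.take(
    df, n=1, presort="v desc", partition_spec=PartitionSpec(by=["g"])
)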
Example No. 10
def _test_as_array_perf():
    s = Schema()
    arr = []
    for i in range(100):
        s.append(f"a{i}:int")
        arr.append(i)
    for i in range(100):
        s.append(f"b{i}:int")
        arr.append(float(i))
    for i in range(100):
        s.append(f"c{i}:str")
        arr.append(str(i))
    data = []
    for i in range(5000):
        data.append(list(arr))
    df = DaskDataFrame(data, s)
    res = df.as_array()
    res = df.as_array(type_safe=True)
    nts, ts = 0.0, 0.0
    for i in range(10):
        t = datetime.now()
        res = df.as_array()
        nts += (datetime.now() - t).total_seconds()
        t = datetime.now()
        res = df.as_array(type_safe=True)
        ts += (datetime.now() - t).total_seconds()
    print(nts, ts)
Example No. 11
def test_simple_methods():
    df = DaskDataFrame([], "a:str,b:int")
    assert df.empty
    assert 0 == df.count()
    assert not df.is_local

    df = DaskDataFrame([["a", 1], ["b", "2"]], "x:str,y:double")
    assert not df.empty
    assert 2 == df.count()
    assert ["a", 1.0] == df.peek_array()
    assert dict(x="a", y=1.0) == df.peek_dict()

    df_eq(
        PandasDataFrame(df.as_pandas()),
        [["a", 1.0], ["b", 2.0]],
        "x:str,y:double",
        throw=True,
    )
Example No. 12
 def union(
     self,
     df1: DataFrame,
     df2: DataFrame,
     distinct: bool = True,
     metadata: Any = None,
 ) -> DataFrame:
     assert_or_throw(df1.schema == df2.schema,
                     ValueError(f"{df1.schema} != {df2.schema}"))
     d = self.pl_utils.union(self.to_df(df1).native,
                             self.to_df(df2).native,
                             unique=distinct)
     return DaskDataFrame(d, df1.schema, metadata)
Example No. 13
    def map(
        self,
        df: DataFrame,
        map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
        output_schema: Any,
        partition_spec: PartitionSpec,
        metadata: Any = None,
        on_init: Optional[Callable[[int, DataFrame], Any]] = None,
    ) -> DataFrame:
        presort = partition_spec.presort
        presort_keys = list(presort.keys())
        presort_asc = list(presort.values())
        output_schema = Schema(output_schema)
        input_schema = df.schema
        on_init_once: Any = (
            None
            if on_init is None
            else RunOnce(
                on_init, lambda *args, **kwargs: to_uuid(id(on_init), id(args[0]))
            )
        )

        def _map(pdf: Any) -> pd.DataFrame:
            if pdf.shape[0] == 0:
                return PandasDataFrame([], output_schema).as_pandas()
            if len(presort_keys) > 0:
                pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
            input_df = PandasDataFrame(
                pdf.reset_index(drop=True), input_schema, pandas_df_wrapper=True
            )
            if on_init_once is not None:
                on_init_once(0, input_df)
            cursor = partition_spec.get_cursor(input_schema, 0)
            cursor.set(input_df.peek_array(), 0, 0)
            output_df = map_func(cursor, input_df)
            return output_df.as_pandas()

        df = self.to_df(df)
        if len(partition_spec.partition_by) == 0:
            pdf = self.repartition(df, partition_spec)
            result = pdf.native.map_partitions(_map, meta=output_schema.pandas_dtype)
        else:
            df = self.repartition(df, PartitionSpec(num=partition_spec.num_partitions))
            result = self.pl_utils.safe_groupby_apply(
                df.native,
                partition_spec.partition_by,
                _map,
                meta=output_schema.pandas_dtype,
            )
        return DaskDataFrame(result, output_schema, metadata)
Example No. 14
def test_nan_none():
    # TODO: on dask, these tests can't pass
    # df = ArrayDataFrame([[None, None]], "b:str,c:double")
    # assert df.as_pandas().iloc[0, 0] is None
    # arr = DaskDataFrame(df.as_pandas(), df.schema).as_array()[0]
    # assert arr[0] is None
    # assert math.isnan(arr[1])

    # df = ArrayDataFrame([[None, None]], "b:int,c:bool")
    # arr = DaskDataFrame(df.as_pandas(), df.schema).as_array(type_safe=True)[0]
    # assert np.isnan(arr[0])  # TODO: this will cause inconsistent behavior cross engine
    # assert np.isnan(arr[1])  # TODO: this will cause inconsistent behavior cross engine

    df = ArrayDataFrame([["a", 1.1], [None, None]], "b:str,c:double")
    arr = DaskDataFrame(df.as_pandas(), df.schema).as_array()[1]
    assert arr[0] is None
    assert math.isnan(arr[1])

    arr = DaskDataFrame(df.as_array(), df.schema).as_array()[1]
    assert arr[0] is None
    assert math.isnan(arr[1])

    arr = DaskDataFrame(df.as_pandas()["b"], "b:str").as_array()[1]
    assert arr[0] is None
Example No. 15
 def intersect(
     self,
     df1: DataFrame,
     df2: DataFrame,
     distinct: bool = True,
     metadata: Any = None,
 ) -> DataFrame:
     assert_or_throw(
         distinct,
         NotImplementedError("INTERSECT ALL for DaskExecutionEngine"))
     assert_or_throw(df1.schema == df2.schema,
                     ValueError(f"{df1.schema} != {df2.schema}"))
     d = self.pl_utils.intersect(self.to_df(df1).native,
                                 self.to_df(df2).native,
                                 unique=distinct)
     return DaskDataFrame(d, df1.schema, metadata)
Example No. 16
 def join(
     self,
     df1: DataFrame,
     df2: DataFrame,
     how: str,
     on: List[str] = _DEFAULT_JOIN_KEYS,
     metadata: Any = None,
 ) -> DataFrame:
     key_schema, output_schema = get_join_schemas(df1, df2, how=how, on=on)
     d = self.pl_utils.join(
         self.to_df(df1).native,
         self.to_df(df2).native,
         join_type=how,
         on=key_schema.names,
     )
     return DaskDataFrame(d, output_schema, metadata)
Example No. 17
def load_df(
    uri: Union[str, List[str]],
    format_hint: Optional[str] = None,
    columns: Any = None,
    fs: Optional[FileSystem] = None,
    **kwargs: Any,
) -> DaskDataFrame:
    if isinstance(uri, str):
        fp = [FileParser(uri, format_hint)]
    else:
        fp = [FileParser(u, format_hint) for u in uri]
    dfs: List[dd.DataFrame] = []
    schema: Any = None
    for f in _get_single_files(fp, fs):
        df, schema = _FORMAT_LOAD[f.file_format](f, columns, **kwargs)
        dfs.append(df)
    return DaskDataFrame(dd.concat(dfs), schema)
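
A short usage sketch of load_df as defined above; the paths are placeholders:

# sketch only; the paths below are placeholders
single = load_df("/data/a.parquet")
# a list of files is loaded and concatenated via dd.concat
combined = load_df(["/data/1.parquet", "/data/2.parquet"], format_hint="parquet")
# extra keyword arguments are forwarded to the format-specific loader
subset = load_df("/data/a.csv", columns=["a", "b"], header=True)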
Example No. 18
 def subtract(
     self,
     df1: DataFrame,
     df2: DataFrame,
     distinct: bool = True,
     metadata: Any = None,
 ) -> DataFrame:
     assert_or_throw(
         distinct,
         NotImplementedError("EXCEPT ALL for DaskExecutionEngine"))
     assert_or_throw(
         df1.schema == df2.schema,
         lambda: ValueError(f"{df1.schema} != {df2.schema}"),
     )
     d = self.pl_utils.except_df(self.to_df(df1).native,
                                 self.to_df(df2).native,
                                 unique=distinct)
     return DaskDataFrame(d, df1.schema, metadata)
Example No. 19
 def sample(
     self,
     df: DataFrame,
     n: Optional[int] = None,
     frac: Optional[float] = None,
     replace: bool = False,
     seed: Optional[int] = None,
     metadata: Any = None,
 ) -> DataFrame:
     assert_or_throw(
         (n is None and frac is not None)
         or (n is not None and frac is None),
         ValueError("one and only one of n and frac should be set"),
     )
     # TODO: dask does not support sample by number of rows
     d = self.to_df(df).native.sample(n=n,
                                      frac=frac,
                                      replace=replace,
                                      random_state=seed)
     return DaskDataFrame(d, df.schema, metadata)
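
A short usage sketch of the sample contract above (exactly one of n and frac must be set), again with the hypothetical `engine`:

# sketch only; `engine` and `df` are the assumed objects from the sketches above
sampled = engine.sample(df, frac=0.5, seed=0)
# passing n instead of frac is forwarded to Dask, which does not support
# sampling by row count (see the TODO above), so prefer frac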
Example No. 20
 def repartition(self, df: DataFrame,
                 partition_spec: PartitionSpec) -> DaskDataFrame:
     df = self.to_df(df)
     if partition_spec.empty:
         return df
     if len(partition_spec.partition_by) > 0:
         return df
     p = partition_spec.get_num_partitions(
         **{
             KEYWORD_ROWCOUNT: lambda: df.persist().count(),  # type: ignore
             KEYWORD_CORECOUNT: lambda: 2,  # TODO: remove this hard code
         })
     if p > 0:
         return DaskDataFrame(
             df.native.repartition(npartitions=p),
             schema=df.schema,
             metadata=df.metadata,
             type_safe=False,
         )
     return df
Example No. 21
def test_as_dict_iterable():
    df = DaskDataFrame([["2020-01-01", 1.1]], "a:datetime,b:int")
    assert [dict(a=datetime(2020, 1, 1), b=1)] == list(df.as_dict_iterable())
Example No. 22
 def join(
     self,
     df1: DataFrame,
     df2: DataFrame,
     how: str,
     on: List[str] = _DEFAULT_JOIN_KEYS,
     metadata: Any = None,
 ) -> DataFrame:
     key_schema, output_schema = get_join_schemas(df1, df2, how=how, on=on)
     how = how.lower().replace("_", "").replace(" ", "")
     if how == "cross":
         d1 = self.to_df(df1).native
         d2 = self.to_df(df2).native
         d1["__cross_join_index__"] = 1
         d2["__cross_join_index__"] = 1
         d = d1.merge(d2, on=("__cross_join_index__")).drop(
             "__cross_join_index__", axis=1)
         return DaskDataFrame(d.reset_index(drop=True), output_schema,
                              metadata)
     if how in ["semi", "leftsemi"]:
         d1 = self.to_df(df1).native
         d2 = self.to_df(df2).native[key_schema.names]
         d = d1.merge(d2, on=key_schema.names, how="inner")
         return DaskDataFrame(d.reset_index(drop=True), output_schema,
                              metadata)
     if how in ["anti", "leftanti"]:
         d1 = self.to_df(df1).native
         d2 = self.to_df(df2).native[key_schema.names]
         if DASK_UTILS.empty(d1) or DASK_UTILS.empty(d2):
             return df1
         d2["__anti_join_dummy__"] = 1.0
         d = d1.merge(d2, on=key_schema.names, how="left")
         d = d[d["__anti_join_dummy__"].isnull()]
         return DaskDataFrame(
             d.drop(["__anti_join_dummy__"], axis=1).reset_index(drop=True),
             output_schema,
             metadata,
         )
     fix_left, fix_right = False, False
     if how in ["leftouter"]:
         how = "left"
         self._validate_outer_joinable(df2.schema, key_schema)
         fix_right = True
     if how in ["rightouter"]:
         how = "right"
         self._validate_outer_joinable(df1.schema, key_schema)
         fix_left = True
     if how in ["fullouter"]:
         how = "outer"
         self._validate_outer_joinable(df1.schema, key_schema)
         self._validate_outer_joinable(df2.schema, key_schema)
         fix_left, fix_right = True, True
     d1 = self.to_df(df1).native
     d2 = self.to_df(df2).native
     d = d1.merge(d2, on=key_schema.names, how=how)
     if fix_left:
         d = self._fix_nan(
             d, output_schema,
             df1.schema.exclude(list(df2.schema.keys())).keys())
     if fix_right:
         d = self._fix_nan(
             d, output_schema,
             df2.schema.exclude(list(df1.schema.keys())).keys())
     return DaskDataFrame(d.reset_index(drop=True), output_schema, metadata)
Example No. 23
 def df(self,
        data: Any = None,
        schema: Any = None,
        metadata: Any = None) -> DaskDataFrame:
     return DaskDataFrame(data, schema, metadata)
Example No. 24
def test_avro_io(tmpdir):
    df1 = DaskDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    path = os.path.join(tmpdir, "a.avro")
    save_df(df1, path)
    actual = load_df(path)

    df_eq(actual, [["1", 2, 3]], "a:str,b:long,c:long")
    actual = load_df(path, columns=["a", "b"])
    df_eq(actual, [["1", 3]], "a:str,b:long")

    actual = load_df(path, columns="a:str,b:int,c:long")
    df_eq(actual, [["1", 2, 3]], "a:str,b:int,c:long")

    actual = load_df(path, columns=["b", "c"], infer_schema=True)
    df_eq(actual, [[2, 3]], "b:long,c:long")

    # provide schema and columns -> throw error
    raises(
        Exception,
        lambda: save_df(
            path,
            columns="a:str,b:int,c:long",
            schema={
                "type": "record",
                "name": "Root",
                "fields": [
                    {"name": "station", "type": "string"},
                    {"name": "time", "type": "long"},
                    {"name": "temp", "type": "int"},
                ],
            },
        ),
    )

    # provide schema and infer_schema is True -> throw error
    raises(
        Exception,
        lambda: save_df(
            path,
            columns=None,
            schema={
                "type": "record",
                "name": "Root",
                "fields": [
                    {"name": "station", "type": "string"},
                    {"name": "time", "type": "long"},
                    {"name": "temp", "type": "int"},
                ],
            },
            infer_schema=True,
        ),
    )
Example No. 25
def test_parquet_io(tmpdir):
    df1 = DaskDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    df2 = DaskDataFrame([[[1, 2]]], "a:[int]")
    # {a:int} will become {a:long} because of a pyarrow issue
    df3 = DaskDataFrame([[dict(a=1)]], "a:{a:long}")
    for df in [df1, df2, df3]:
        path = os.path.join(tmpdir, "a.parquet")
        save_df(df, path)
        actual = load_df(path)
        df_eq(df, actual, throw=True)

    save_df(df1, path)
    actual = load_df(path, columns=["b", "a"])
    df_eq(actual, [[2, "1"]], "b:int,a:str")
    actual = load_df(path, columns="b:str,a:int")
    df_eq(actual, [["2", 1]], "b:str,a:int")
    # can't specify wrong columns
    raises(Exception, lambda: load_df(path, columns="bb:str,a:int"))

    # load directory
    fs = FileSystem()
    for name in ["folder.parquet", "folder"]:
        folder = os.path.join(tmpdir, name)
        fs.makedirs(folder)
        f0 = os.path.join(folder, "_SUCCESS")
        f1 = os.path.join(folder, "1.parquet")
        f2 = os.path.join(folder, "3.parquet")
        fs.touch(f0)
        pd_save_df(df1, f1)
        pd_save_df(df1, f2)

    actual = load_df(folder, "parquet")
    df_eq(actual, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")

    # load multiple paths
    actual = load_df([f1, f2], "parquet")
    df_eq(actual, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")

    # load folder
    actual = load_df(folder, "parquet")
    df_eq(actual, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")

    actual = load_df(os.path.join(tmpdir, "folder.parquet"))
    df_eq(actual, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")

    # load pattern
    actual = load_df(os.path.join(tmpdir, "folder", "*.parquet"))
    df_eq(actual, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")

    # overwrite folder with single file
    save_df(actual, os.path.join(tmpdir, "folder.parquet"), mode="overwrite")
    actual = load_df(os.path.join(tmpdir, "folder.parquet"))
    df_eq(actual, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")

    # overwrite = False
    raises(FileExistsError, lambda: save_df(df1, f1, mode="error"))
    raises(
        FileExistsError,
        lambda: save_df(
            df1, os.path.join(tmpdir, "folder.parquet"), mode="error"),
    )

    # wrong mode
    raises(NotImplementedError, lambda: save_df(df1, f1, mode="dummy"))
Example No. 26
def test_init():
    df = DaskDataFrame(schema="a:str,b:int")
    assert df.is_bounded
    assert df.count() == 0
    assert df.schema == "a:str,b:int"

    pdf = pandas.DataFrame([["a", 1], ["b", 2]])
    raises(FugueDataFrameInitError, lambda: DaskDataFrame(pdf))
    df = DaskDataFrame(pdf, "a:str,b:str")
    assert [["a", "1"], ["b", "2"]] == df.as_pandas().values.tolist()
    df = DaskDataFrame(pdf, "a:str,b:int")
    assert [["a", 1], ["b", 2]] == df.as_pandas().values.tolist()
    df = DaskDataFrame(pdf, "a:str,b:double")
    assert [["a", 1.0], ["b", 2.0]] == df.as_pandas().values.tolist()

    pdf = DaskDataFrame([["a", 1], ["b", 2]], "a:str,b:int").native["b"]
    assert isinstance(pdf, pd.Series)
    df = DaskDataFrame(pdf, "b:str")
    assert [["1"], ["2"]] == df.as_pandas().values.tolist()
    df = DaskDataFrame(pdf, "b:double")
    assert [[1.0], [2.0]] == df.as_pandas().values.tolist()

    pdf = DaskDataFrame([["a", 1], ["b", 2]], "x:str,y:long").native
    df = DaskDataFrame(pdf)
    assert df.schema == "x:str,y:long"
    df = DaskDataFrame(pdf, "y:str,x:str")
    assert [["1", "a"], ["2", "b"]] == df.as_pandas().values.tolist()
    ddf = DaskDataFrame(df)
    assert [["1", "a"], ["2", "b"]] == ddf.as_pandas().values.tolist()
    assert df.native is ddf.native  # no real copy happened

    df = DaskDataFrame([["a", 1], ["b", "2"]], "x:str,y:double")
    assert [["a", 1.0], ["b", 2.0]] == df.as_pandas().values.tolist()

    df = DaskDataFrame([], "x:str,y:double")
    assert [] == df.as_pandas().values.tolist()

    raises(FugueDataFrameInitError, lambda: DaskDataFrame(123))
Example No. 27
def test_as_array():
    df = DaskDataFrame([], "a:str,b:int")
    assert [] == df.as_array()
    assert [] == df.as_array(type_safe=True)
    assert [] == list(df.as_array_iterable())
    assert [] == list(df.as_array_iterable(type_safe=True))

    df = DaskDataFrame([["a", 1]], "a:str,b:int")
    assert [["a", 1]] == df.as_array()
    assert [["a", 1]] == df.as_array(["a", "b"])
    assert [[1, "a"]] == df.as_array(["b", "a"])

    # prevent pandas auto type casting
    df = DaskDataFrame([[1.0, 1.1]], "a:double,b:int")
    assert [[1.0, 1]] == df.as_array()
    assert isinstance(df.as_array()[0][0], float)
    assert isinstance(df.as_array()[0][1], int)
    assert [[1.0, 1]] == df.as_array(["a", "b"])
    assert [[1, 1.0]] == df.as_array(["b", "a"])

    df = DaskDataFrame([[np.float64(1.0), 1.1]], "a:double,b:int")
    assert [[1.0, 1]] == df.as_array()
    assert isinstance(df.as_array()[0][0], float)
    assert isinstance(df.as_array()[0][1], int)

    df = DaskDataFrame([[pandas.Timestamp("2020-01-01"), 1.1]],
                       "a:datetime,b:int")
    df.native["a"] = pd.to_datetime(df.native["a"])
    assert [[datetime(2020, 1, 1), 1]] == df.as_array()
    assert isinstance(df.as_array()[0][0], datetime)
    assert isinstance(df.as_array()[0][1], int)

    df = DaskDataFrame([[pandas.NaT, 1.1]], "a:datetime,b:int")
    df.native["a"] = pd.to_datetime(df.native["a"])
    assert isinstance(df.as_array()[0][0], datetime)
    assert isinstance(df.as_array()[0][1], int)

    df = DaskDataFrame([[1.0, 1.1]], "a:double,b:int")
    assert [[1.0, 1]] == df.as_array(type_safe=True)
    assert isinstance(df.as_array()[0][0], float)
    assert isinstance(df.as_array()[0][1], int)