Example #1
def _udf(pdf: Any) -> pd.DataFrame:  # pragma: no cover
    # Short-circuit empty partitions: return an empty frame with the output schema.
    if pdf.shape[0] == 0:
        return PandasDataFrame([], output_schema).as_pandas()
    # Apply the partition presort before wrapping the data.
    if len(presort_keys) > 0:
        pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
    input_df = PandasDataFrame(
        pdf.reset_index(drop=True), input_schema, pandas_df_wrapper=True
    )
    if on_init_once is not None:
        on_init_once(0, input_df)
    cursor = partition_spec.get_cursor(input_schema, 0)
    cursor.set(input_df.peek_array(), 0, 0)
    output_df = map_func(cursor, input_df)
    return output_df.as_pandas()
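
This UDF is an inner function that closes over `presort_keys`, `presort_asc`, `input_schema`, `output_schema`, `on_init_once`, `partition_spec`, and `map_func`. Below is a minimal, self-contained sketch of the same empty-guard-then-presort pattern, using hypothetical stand-ins for those closure variables:

import pandas as pd

# Hypothetical stand-ins for the closure variables _udf captures.
presort_keys = ["a"]
presort_asc = True

def _transform(pdf: pd.DataFrame) -> pd.DataFrame:
    # Same guard order as _udf: handle the empty partition first, then presort.
    if pdf.shape[0] == 0:
        return pdf
    return pdf.sort_values(presort_keys, ascending=presort_asc).reset_index(drop=True)

assert _transform(pd.DataFrame({"a": [3, 1, 2]}))["a"].tolist() == [1, 2, 3]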
Example #2
def _udf(
    dfs: Iterable[pd.DataFrame],
) -> Iterable[pd.DataFrame]:  # pragma: no cover
    def get_dfs() -> Iterable[LocalDataFrame]:
        for df in dfs:
            if df.shape[0] > 0:
                yield PandasDataFrame(
                    df.reset_index(drop=True),
                    input_schema,
                    pandas_df_wrapper=True,
                )

    input_df = LocalDataFrameIterableDataFrame(get_dfs(), input_schema)
    if input_df.empty:
        # _udf is a generator, so the empty result must be yielded, not
        # returned (a value returned inside a generator is discarded).
        yield PandasDataFrame([], output_schema).as_pandas()
        return
    if on_init_once is not None:
        on_init_once(0, input_df)
    cursor = partition_spec.get_cursor(input_schema, 0)
    cursor.set(input_df.peek_array(), 0, 0)
    output_df = map_func(cursor, input_df)
    if isinstance(output_df, LocalDataFrameIterableDataFrame):
        for res in output_df.native:
            yield res.as_pandas()
    else:
        yield output_df.as_pandas()
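
This streaming variant consumes an iterable of pandas chunks and skips empty ones before wrapping. A minimal sketch of that filter, independent of the Fugue wrappers:

from typing import Iterable, List

import pandas as pd

def chunks() -> Iterable[pd.DataFrame]:
    yield pd.DataFrame({"a": [1, 2]})
    yield pd.DataFrame({"a": []})  # empty chunk: dropped, as in get_dfs
    yield pd.DataFrame({"a": [3]})

non_empty: List[pd.DataFrame] = [df for df in chunks() if df.shape[0] > 0]
assert len(non_empty) == 2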
Example #3
def to_local_df(df: Any, schema: Any = None, metadata: Any = None) -> LocalDataFrame:
    """Convert a data structure to :class:`~fugue.dataframe.dataframe.LocalDataFrame`

    :param df: :class:`~fugue.dataframe.dataframe.DataFrame`, pandas DataFrame, or
      a list or iterable of arrays
    :param schema: |SchemaLikeObject|, defaults to None; it must not be set when
      ``df`` is a :class:`~fugue.dataframe.dataframe.DataFrame`
    :param metadata: dict-like object with string keys, defaults to None
    :raises ValueError: if ``df`` is :class:`~fugue.dataframe.dataframe.DataFrame`
      but you set ``schema`` or ``metadata``
    :raises TypeError: if ``df`` is not compatible
    :return: the dataframe itself if it's
      :class:`~fugue.dataframe.dataframe.LocalDataFrame` else a converted one

    :Examples:

    >>> a = to_local_df([[0,'a'],[1,'b']],"a:int,b:str")
    >>> assert to_local_df(a) is a
    >>> to_local_df(SparkDataFrame([[0,'a'],[1,'b']],"a:int,b:str"))
    """
    assert_arg_not_none(df, "df")
    if isinstance(df, DataFrame):
        aot(
            schema is None and metadata is None,
            ValueError("schema and metadata must be None when df is a DataFrame"),
        )
        return df.as_local()
    if isinstance(df, pd.DataFrame):
        return PandasDataFrame(df, schema, metadata)
    if isinstance(df, List):
        return ArrayDataFrame(df, schema, metadata)
    if isinstance(df, Iterable):
        return IterableDataFrame(df, schema, metadata)
    raise TypeError(f"{df} cannot convert to a LocalDataFrame")
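
A minimal usage sketch based on the docstring examples above; the import path is an assumption (older Fugue versions exposed this via `fugue.dataframe.utils`):

from fugue.dataframe.utils import to_local_df  # assumed import path

df = to_local_df([[0, "a"], [1, "b"]], "a:int,b:str")
assert to_local_df(df) is df  # already local, returned unchanged
assert df.as_array() == [[0, "a"], [1, "b"]]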
Example #4
def test_iterable_pandas_dataframe():
    p = _IterablePandasParam(None)
    pdf = pd.DataFrame([[0]], columns=["a"])
    df = PandasDataFrame(pdf)
    data = list(p.to_input_data(df, ctx=None))
    assert 1 == len(data)
    assert data[0] is pdf  # this is to guarantee no copy in any wrapping logic
    assert data[0].values.tolist() == [[0]]

    dfs = LocalDataFrameIterableDataFrame([df, df])
    data = list(p.to_input_data(dfs, ctx=None))
    assert 2 == len(data)
    assert data[0] is pdf
    assert data[1] is pdf

    def get_pdfs():
        yield pdf
        yield pdf

    # without schema change, there is no copy
    odf = p.to_output_df(get_pdfs(), df.schema, ctx=None)
    data = list(odf.native)
    assert 2 == len(data)
    assert data[0].native is pdf
    assert data[1].native is pdf

    # with schema change, there is copy
    odf = p.to_output_df(get_pdfs(), "a:double", ctx=None)
    data = list(odf.native)
    assert 2 == len(data)
    assert data[0].native is not pdf
    assert data[1].native is not pdf
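
The identity assertions above pin down a copy contract: wrapping without a schema change must pass the same pandas object through, while a schema change forces a conversion. The same distinction in plain pandas:

import pandas as pd

pdf = pd.DataFrame({"a": [0]})
same = pdf                            # no schema change: identity preserved
cast = pdf.astype({"a": "float64"})   # schema change: astype returns a new frame
assert same is pdf
assert cast is not pdf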
Example #5
def _df(data, schema=None, metadata=None):
    session = SparkSession.builder.getOrCreate()
    if schema is not None:
        pdf = PandasDataFrame(data, to_schema(schema), metadata)
        return session.createDataFrame(pdf.native, to_spark_schema(schema))
    else:
        return session.createDataFrame(data)
Example #6
def test_csv_io(tmpdir):
    fs = FileSystem()
    df1 = PandasDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    path = os.path.join(tmpdir, "a.csv")
    # without header
    save_df(df1, path)
    assert fs.readtext(path).startswith("1,2,3")
    raises(InvalidOperationError, lambda: load_df(path, header=False))
    actual = load_df(path,
                     columns=["a", "b", "c"],
                     header=False,
                     infer_schema=True)
    assert [[1, 2, 3]] == actual.as_array()
    assert actual.schema == "a:long,b:long,c:long"
    actual = load_df(path, columns="a:double,b:str,c:str", header=False)
    assert [[1.0, "2", "3"]] == actual.as_array()
    assert actual.schema == "a:double,b:str,c:str"
    # with header
    save_df(df1, path, header=True)
    assert fs.readtext(path).startswith("a,b,c")
    actual = load_df(path, header=True)
    assert [["1", "2", "3"]] == actual.as_array()
    actual = load_df(path, header=True, infer_schema=True)
    assert [[1, 2, 3]] == actual.as_array()
    actual = load_df(path, columns=["b", "a"], header=True, infer_schema=True)
    assert [[2, 1]] == actual.as_array()
    actual = load_df(path, columns="b:str,a:double", header=True)
    assert [["2", 1.0]] == actual.as_array()
    raises(KeyError,
           lambda: load_df(path, columns="b:str,x:double", header=True))

    raises(NotImplementedError,
           lambda: load_df(path, columns="b:str,x:double", header=2))
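
For reference, the header-less load with explicit column names corresponds to this pandas-level behavior (a sketch, not Fugue's implementation):

import io

import pandas as pd

csv = io.StringIO("1,2,3\n")
# Analogue of load_df(path, columns=["a", "b", "c"], header=False, infer_schema=True)
pdf = pd.read_csv(csv, header=None, names=["a", "b", "c"])
assert pdf.values.tolist() == [[1, 2, 3]]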
Example #8
def test_to_df(self):
    e = self.engine
    a = e.to_df([[1, 2], [3, 4]], "a:int,b:int", dict(a=1))
    df_eq(a, [[1, 2], [3, 4]], "a:int,b:int", dict(a=1), throw=True)
    a = e.to_df(PandasDataFrame([[1, 2], [3, 4]], "a:int,b:int", dict(a=1)))
    df_eq(a, [[1, 2], [3, 4]], "a:int,b:int", dict(a=1), throw=True)
    assert a is e.to_df(a)
Example #9
def test_serialize_df(tmpdir):
    def assert_eq(df, df_expected=None, raw=False):
        if df_expected is None:
            df_expected = df
        df_actual = deserialize_df(serialize_df(df))
        if raw:
            assert df_expected.native == df_actual.native
        else:
            df_eq(df_expected, df_actual, throw=True)

    fs = FileSystem()
    assert deserialize_df(serialize_df(None)) is None
    assert_eq(ArrayDataFrame([], "a:int,b:int"))
    assert_eq(ArrayDataFrame([[None, None]], "a:int,b:int"))
    assert_eq(ArrayDataFrame([[None, "abc"]], "a:int,b:str"))
    assert_eq(ArrayDataFrame([[None, [1, 2], dict(x=1)]],
                             "a:int,b:[int],c:{x:int}"),
              raw=True)
    assert_eq(
        IterableDataFrame([[None, [1, 2], dict(x=1)]],
                          "a:int,b:[int],c:{x:int}"),
        ArrayDataFrame([[None, [1, 2], dict(x=1)]], "a:int,b:[int],c:{x:int}"),
        raw=True,
    )
    assert_eq(PandasDataFrame([[None, None]], "a:int,b:int"))
    assert_eq(PandasDataFrame([[None, "abc"]], "a:int,b:str"))

    raises(
        InvalidOperationError,
        lambda: serialize_df(ArrayDataFrame([], "a:int,b:int"), 0),
    )

    path = os.path.join(tmpdir, "1.pkl")

    df = ArrayDataFrame([[None, None]], "a:int,b:int")
    s = serialize_df(df, 0, path, fs)
    df_eq(df, deserialize_df(s, fs), throw=True)
    df_eq(df, deserialize_df(s), throw=True)

    s = serialize_df(df, 0, path)
    df_eq(df, deserialize_df(s), throw=True)

    raises(ValueError, lambda: deserialize_df('{"x":1}'))
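
A minimal round trip using only the calls the test exercises (assuming the same imports as the test module; below the size threshold no path or FileSystem is needed):

df = ArrayDataFrame([[1, "x"]], "a:int,b:str")
assert deserialize_df(serialize_df(df)).as_array() == [[1, "x"]]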
Example #10
def test_json(tmpdir):
    fs = FileSystem()
    df1 = PandasDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    path = os.path.join(tmpdir, "a.json")
    save_df(df1, path)
    actual = load_df(path)
    df_eq(actual, [[1, 2, 3]], "a:long,b:long,c:long")
    actual = load_df(path, columns=["b", "a"])
    df_eq(actual, [[2, "1"]], "b:int,a:str")
    actual = load_df(path, columns="b:str,a:int")
    df_eq(actual, [["2", 1]], "b:str,a:int")
    raises(KeyError, lambda: load_df(path, columns="bb:str,a:int"))
Example #11
def test_pickle_df():
    def assert_eq(df, df_expected=None, raw=False):
        if df_expected is None:
            df_expected = df
        df_actual = unpickle_df(pickle_df(df))
        if raw:
            assert df_expected.native == df_actual.native
        else:
            df_eq(df_expected, df_actual, throw=True)

    assert_eq(ArrayDataFrame([], "a:int,b:int"))
    assert_eq(ArrayDataFrame([[None, None]], "a:int,b:int"))
    assert_eq(ArrayDataFrame([[None, "abc"]], "a:int,b:str"))
    assert_eq(ArrayDataFrame([[None, [1, 2], dict(x=1)]],
                             "a:int,b:[int],c:{x:int}"),
              raw=True)
    assert_eq(IterableDataFrame([[None, [1, 2], dict(x=1)]],
                                "a:int,b:[int],c:{x:int}"),
              ArrayDataFrame([[None, [1, 2], dict(x=1)]],
                             "a:int,b:[int],c:{x:int}"),
              raw=True)
    assert_eq(PandasDataFrame([[None, None]], "a:int,b:int"))
    assert_eq(PandasDataFrame([[None, "abc"]], "a:int,b:str"))
Example #12
def test_parquet_io(tmpdir):
    df1 = PandasDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    df2 = ArrayDataFrame([[[1, 2]]], "a:[int]")
    # {a:int} will become {a:long} because of a pyarrow type-inference issue
    df3 = ArrayDataFrame([[dict(a=1)]], "a:{a:long}")
    for df in [df1, df2, df3]:
        path = os.path.join(tmpdir, "a.parquet")
        save_df(df, path)
        actual = load_df(path)
        df_eq(df, actual, throw=True)

    save_df(df1, path)
    actual = load_df(path, columns=["b", "a"])
    df_eq(actual, [[2, "1"]], "b:int,a:str")
    actual = load_df(path, columns="b:str,a:int")
    df_eq(actual, [["2", 1]], "b:str,a:int")
    # can't specify wrong columns
    raises(Exception, lambda: load_df(path, columns="bb:str,a:int"))

    # load directory
    fs = FileSystem()
    folder = os.path.join(tmpdir, "folder")
    fs.makedirs(folder)
    f0 = os.path.join(folder, "_SUCCESS")
    f1 = os.path.join(folder, "1.parquet")
    f2 = os.path.join(folder, "3.parquet")
    fs.touch(f0)
    save_df(df1, f1)
    save_df(df1, f2)
    actual = load_df(folder, "parquet")
    df_eq(actual, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")

    # load multiple paths
    actual = load_df([f1, f2], "parquet")
    df_eq(actual, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")

    # mode="error" refuses to overwrite an existing file
    raises(FileExistsError, lambda: save_df(df1, f1, mode="error"))
    # can't overwrite directory
    raises(
        IsADirectoryError,
        lambda: save_df(df1, folder, format_hint="parquet", mode="overwrite"),
    )
    # wrong mode
    raises(NotImplementedError, lambda: save_df(df1, f1, mode="dummy"))
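
The column-pruning loads above mirror plain pandas/pyarrow behavior; a self-contained sketch (requires pyarrow):

import os
import tempfile

import pandas as pd

with tempfile.TemporaryDirectory() as d:
    path = os.path.join(d, "a.parquet")
    pd.DataFrame({"a": ["1"], "b": [2], "c": [3]}).to_parquet(path)
    # Column pruning on load, analogous to load_df(path, columns=[...])
    assert pd.read_parquet(path, columns=["b"]).values.tolist() == [[2]]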
Example #13
def unpickle_df(stream: bytes) -> LocalBoundedDataFrame:
    """Unpickles a dataframe from bytes array.

    :param stream: binary data
    :return: unpickled dataframe

    .. note::

        The data must have been serialized by :func:`.pickle_df`.
    """
    o = pickle.loads(stream)
    schema = o[0]
    if o[1] == "p":
        return PandasDataFrame(o[2], schema)
    if o[1] == "a":
        return ArrayDataFrame(o[2], schema)
    raise NotImplementedError(  # pragma: no cover
        f"{o[1]} is not supported for unpickle")
Example #14
def df(self,
       data: Any = None,
       schema: Any = None,
       metadata: Any = None) -> SparkDataFrame:
    session = SparkSession.builder.getOrCreate()
    if data is None:
        df = None
    else:
        if schema is not None:
            pdf = PandasDataFrame(data, to_schema(schema), metadata)
            df = session.createDataFrame(pdf.native,
                                         to_spark_schema(schema))
        else:
            try:
                df = session.createDataFrame(data)
            except Exception:
                raise FugueDataFrameInitError("schema error")
    return SparkDataFrame(df, schema, metadata)
Example #15
def test_simple_methods():
    df = DaskDataFrame([], "a:str,b:int")
    assert df.empty
    assert 0 == df.count()
    assert not df.is_local

    df = DaskDataFrame([["a", 1], ["b", "2"]], "x:str,y:double")
    assert not df.empty
    assert 2 == df.count()
    assert ["a", 1.0] == df.peek_array()
    assert dict(x="a", y=1.0) == df.peek_dict()

    df_eq(
        PandasDataFrame(df.as_pandas()),
        [["a", 1.0], ["b", 2.0]],
        "x:str,y:double",
        throw=True,
    )
Example #16
def test_to_local_df():
    df = ArrayDataFrame([[0, 1]], "a:int,b:int")
    pdf = PandasDataFrame(df.as_pandas(), "a:int,b:int")
    idf = IterableDataFrame([[0, 1]], "a:int,b:int")
    assert to_local_df(df) is df
    assert to_local_df(pdf) is pdf
    assert to_local_df(idf) is idf
    assert isinstance(to_local_df(df.native, "a:int,b:int"), ArrayDataFrame)
    assert isinstance(to_local_df(pdf.native, "a:int,b:int"), PandasDataFrame)
    assert isinstance(to_local_df(idf.native, "a:int,b:int"),
                      IterableDataFrame)
    raises(TypeError, lambda: to_local_df(123))

    metadata = dict(a=1)
    assert to_local_df(df.native, df.schema, metadata).metadata == metadata

    raises(NoneArgumentError, lambda: to_local_df(None))
    raises(ValueError, lambda: to_local_df(df, "a:int,b:int", None))
Example #17
def f35(e: pd.DataFrame, a: LocalDataFrame) -> Iterable[pd.DataFrame]:
    e = PandasDataFrame(e, "a:int").as_pandas()
    a = ArrayDataFrame(a, "a:int").as_pandas()
    return iter([e, a])
Example #18
def f26(e: pd.DataFrame, a: LocalDataFrame) -> Iterable[Dict[str, Any]]:
    e = list(PandasDataFrame(e).as_array())
    e += list(a.as_array())
    return ArrayDataFrame(e, "a:int").as_dict_iterable()
Example #19
def with_nat(cursor, data):
    df = data.as_pandas()
    df["nat"] = pd.NaT
    schema = data.schema + "nat:datetime"
    return PandasDataFrame(df, schema)
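
The broadcast assignment of `pd.NaT` fills an entire column with missing timestamps; in plain pandas:

import pandas as pd

df = pd.DataFrame({"a": [1, 2]})
df["nat"] = pd.NaT  # scalar broadcast: every row gets a missing timestamp
assert df["nat"].isna().all()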
Example #20
def test_avro_io(tmpdir):
    df1 = PandasDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    path = os.path.join(tmpdir, "a.avro")
    save_df(df1, path)
    actual = load_df(path)

    df_eq(actual, [["1", 2, 3]], "a:str,b:long,c:long")
    actual = load_df(path, columns=["a", "b"])
    df_eq(actual, [["1", 3]], "a:str,b:long")

    actual = load_df(path, columns="a:str,b:int,c:long")
    df_eq(actual, [["1", 2, 3]], "a:str,b:int,c:long")

    actual = load_df(path, columns=["b", "c"], infer_schema=True)
    df_eq(actual, [[2, 3]], "b:long,c:long")

    # provide schema and columns -> throw error
    raises(
        Exception,
        lambda: load_df(
            path,
            columns="a:str,b:int,c:long",
            schema={
                "type": "record",
                "name": "Root",
                "fields": [
                    {"name": "station", "type": "string"},
                    {"name": "time", "type": "long"},
                    {"name": "temp", "type": "int"},
                ],
            },
        ),
    )

    # provide schema and infer_schema is True -> throw error
    raises(
        Exception,
        lambda: load_df(
            path,
            columns=None,
            schema={
                "type": "record",
                "name": "Root",
                "fields": [
                    {"name": "station", "type": "string"},
                    {"name": "time", "type": "long"},
                    {"name": "temp", "type": "int"},
                ],
            },
            infer_schema=True,
        ),
    )
Example #21
def test_avro_io(tmpdir):
    df1 = PandasDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    df2 = PandasDataFrame([["hello", 2, 3]], "a:str,b:int,c:long")
    path1 = os.path.join(tmpdir, "df1.avro")
    path2 = os.path.join(tmpdir, "df2.avro")
    save_df(df1, path1)
    actual = load_df(path1)

    df_eq(actual, [["1", 2, 3]], "a:str,b:long,c:long")
    actual = load_df(path1, columns=["a", "b"])
    df_eq(actual, [["1", 3]], "a:str,b:long")

    actual = load_df(path1, columns="a:str,b:int,c:long")
    df_eq(actual, [["1", 2, 3]], "a:str,b:int,c:long")

    actual = load_df(path1, columns="a:str,b:int,c:long",
                     infer_schema=True)  # TODO raise error when both provided?
    df_eq(actual, [["1", 2, 3]], "a:str,b:int,c:long")

    actual = load_df(path1, columns=["b", "c"], infer_schema=True)
    df_eq(actual, [[2, 3]], "b:long,c:long")

    # save in append mode
    path3 = os.path.join(tmpdir, "append.avro")
    save_df(df1, path3)
    save_df(df2, path3, append=True)
    actual = load_df(path3, columns="a:str,b:int,c:long")
    df_eq(actual, [['1', 2, 3], ['hello', 2, 3]], "a:str,b:int,c:long")

    # save with times_as_micros=False (i.e., milliseconds instead of microseconds)
    df4 = PandasDataFrame([["2021-05-04", 2, 3]], "a:datetime,b:int,c:long")
    path4 = os.path.join(tmpdir, "df4.avro")
    save_df(df4, path4)
    actual = load_df(path4, columns="a:datetime,b:int,c:long")
    df_eq(actual, [["2021-05-04", 2, 3]], "a:datetime,b:int,c:long")
    save_df(df4, path4, times_as_micros=False)
    actual = load_df(path4, columns="a:datetime,b:int,c:long")
    df_eq(actual, [["2021-05-04", 2, 3]], "a:datetime,b:int,c:long")

    # provide avro schema
    schema = {
        'type': 'record',
        'name': 'Root',
        'fields': [
            {'name': 'a', 'type': 'string'},
            {'name': 'b', 'type': 'int'},
            {'name': 'c', 'type': 'long'},
        ],
    }
    save_df(df1, path1, schema=schema)
    actual = load_df(path1, columns="a:str,b:int,c:long")
    df_eq(actual, [['1', 2, 3]], "a:str,b:int,c:long")

    # provide wrong types in columns arg
    save_df(df2, path2, schema=schema)
    raises(
        FugueDataFrameOperationError,
        lambda: load_df(path2, columns="a:int,b:int,c:long"),
    )

    # load with process_record function
    actual = load_df(path2,
                     columns="a:str,b:int,c:long",
                     process_record=lambda s: {
                         'a': str.upper(s['a']),
                         'b': s['b'],
                         'c': s['c']
                     })
    df_eq(actual, [['HELLO', 2, 3]], "a:str,b:int,c:long")

    # provide wrong type in avro schema
    schema = {
        'type': 'record',
        'name': 'Root',
        'fields': [
            {'name': 'a', 'type': 'int'},
            {'name': 'b', 'type': 'int'},
            {'name': 'c', 'type': 'long'},
        ],
    }
    raises(TypeError, lambda: save_df(df2, path2, schema=schema))
Example #22
def test_dataframes():
    df1 = ArrayDataFrame([[0]], "a:int")
    df2 = ArrayDataFrame([[1]], "a:int")
    dfs = DataFrames(a=df1, b=df2)
    assert dfs[0] is df1
    assert dfs[1] is df2

    dfs = DataFrames([df1, df2], df1)
    assert not dfs.has_key
    assert dfs[0] is df1
    assert dfs[1] is df2
    assert dfs[2] is df1

    dfs2 = DataFrames(dfs, dfs, df2)
    assert not dfs2.has_key
    assert dfs2[0] is df1
    assert dfs2[1] is df2
    assert dfs2[2] is df1
    assert dfs2[3] is df1
    assert dfs2[4] is df2
    assert dfs2[5] is df1
    assert dfs2[6] is df2

    dfs = DataFrames([("a", df1), ("b", df2)])
    assert dfs.has_key
    assert dfs[0] is df1
    assert dfs[1] is df2
    assert dfs["a"] is df1
    assert dfs["b"] is df2

    with raises(ValueError):
        dfs["c"] = 1

    with raises(ValueError):
        dfs2 = DataFrames(1)

    with raises(ValueError):
        dfs2 = DataFrames(a=df1, b=2)

    with raises(InvalidOperationError):
        dfs2 = DataFrames(dict(a=df1), df2)

    with raises(InvalidOperationError):
        dfs2 = DataFrames(df2, dict(a=df1))

    with raises(InvalidOperationError):
        dfs2 = DataFrames(df1, a=df2)

    with raises(InvalidOperationError):
        dfs2 = DataFrames(DataFrames(df1, df2), x=df2)

    dfs2 = DataFrames(dfs)
    assert dfs2.has_key
    assert dfs2[0] is df1
    assert dfs2[1] is df2

    dfs1 = DataFrames(a=df1, b=df2)
    dfs2 = dfs1.convert(lambda x: PandasDataFrame(x.as_array(), x.schema))
    assert len(dfs1) == len(dfs2)
    assert dfs2.has_key
    assert isinstance(dfs2["a"], PandasDataFrame)
    assert isinstance(dfs2["b"], PandasDataFrame)

    dfs1 = DataFrames(df1, df2)
    dfs2 = dfs1.convert(lambda x: PandasDataFrame(x.as_array(), x.schema))
    assert len(dfs1) == len(dfs2)
    assert not dfs2.has_key
    assert isinstance(dfs2[0], PandasDataFrame)
    assert isinstance(dfs2[1], PandasDataFrame)
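
The two construction modes the test exercises, side by side (assuming the same imports as the test; mixing the modes raises, as the InvalidOperationError cases show):

d1 = ArrayDataFrame([[0]], "a:int")
d2 = ArrayDataFrame([[1]], "a:int")
positional = DataFrames(d1, d2)   # no keys: index access only
keyed = DataFrames(a=d1, b=d2)    # keyed: index and name access
assert not positional.has_key
assert keyed.has_key and keyed["a"] is d1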