def _udf(pdf: Any) -> pd.DataFrame:  # pragma: no cover
    if pdf.shape[0] == 0:
        return PandasDataFrame([], output_schema).as_pandas()
    if len(presort_keys) > 0:
        pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
    input_df = PandasDataFrame(
        pdf.reset_index(drop=True), input_schema, pandas_df_wrapper=True
    )
    if on_init_once is not None:
        on_init_once(0, input_df)
    cursor = partition_spec.get_cursor(input_schema, 0)
    cursor.set(input_df.peek_array(), 0, 0)
    output_df = map_func(cursor, input_df)
    return output_df.as_pandas()
def _udf(
    dfs: Iterable[pd.DataFrame],
) -> Iterable[pd.DataFrame]:  # pragma: no cover
    def get_dfs() -> Iterable[LocalDataFrame]:
        for df in dfs:
            if df.shape[0] > 0:
                yield PandasDataFrame(
                    df.reset_index(drop=True),
                    input_schema,
                    pandas_df_wrapper=True,
                )

    input_df = LocalDataFrameIterableDataFrame(get_dfs(), input_schema)
    if input_df.empty:
        # inside a generator, this return value is discarded by iteration,
        # so an empty input produces no output batches
        return PandasDataFrame([], output_schema).as_pandas()
    if on_init_once is not None:
        on_init_once(0, input_df)
    cursor = partition_spec.get_cursor(input_schema, 0)
    cursor.set(input_df.peek_array(), 0, 0)
    output_df = map_func(cursor, input_df)
    if isinstance(output_df, LocalDataFrameIterableDataFrame):
        for res in output_df.native:
            yield res.as_pandas()
    else:
        yield output_df.as_pandas()
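# New, hedged example (not from the source): the `_udf` above matches the shape
# of PySpark's mapInPandas (PySpark >= 3.0), which feeds an iterator of pandas
# DataFrame batches and consumes an iterator of pandas DataFrames. A minimal
# standalone sketch of that pattern, using a hypothetical `double` function:
from typing import Iterator

import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(1,), (2,)], "a long")

def double(dfs: Iterator[pd.DataFrame]) -> Iterator[pd.DataFrame]:
    for df in dfs:
        df["a"] = df["a"] * 2  # transform each pandas batch
        yield df

sdf.mapInPandas(double, schema="a long").show()  # rows: a=2, a=4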
def to_local_df(df: Any, schema: Any = None, metadata: Any = None) -> LocalDataFrame:
    """Convert a data structure to :class:`~fugue.dataframe.dataframe.LocalDataFrame`

    :param df: :class:`~fugue.dataframe.dataframe.DataFrame`, pandas DataFrame,
      or a list or iterable of arrays
    :param schema: |SchemaLikeObject|, defaults to None, it should not be set for
      :class:`~fugue.dataframe.dataframe.DataFrame` type
    :param metadata: dict-like object with string keys, defaults to None
    :raises ValueError: if ``df`` is :class:`~fugue.dataframe.dataframe.DataFrame`
      but you set ``schema`` or ``metadata``
    :raises TypeError: if ``df`` is not compatible
    :return: the dataframe itself if it's
      :class:`~fugue.dataframe.dataframe.LocalDataFrame` else a converted one

    :Examples:
    >>> a = to_local_df([[0,'a'],[1,'b']],"a:int,b:str")
    >>> assert to_local_df(a) is a
    >>> to_local_df(SparkDataFrame([[0,'a'],[1,'b']],"a:int,b:str"))
    """
    assert_arg_not_none(df, "df")
    if isinstance(df, DataFrame):
        aot(
            schema is None and metadata is None,
            ValueError("schema and metadata must be None when df is a DataFrame"),
        )
        return df.as_local()
    if isinstance(df, pd.DataFrame):
        return PandasDataFrame(df, schema, metadata)
    if isinstance(df, List):
        return ArrayDataFrame(df, schema, metadata)
    if isinstance(df, Iterable):
        return IterableDataFrame(df, schema, metadata)
    raise TypeError(f"{df} cannot convert to a LocalDataFrame")
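# New, hedged sketch of the dispatch above (assumes ArrayDataFrame,
# IterableDataFrame, PandasDataFrame, and pandas are imported as in this module;
# each branch maps one input type to one LocalDataFrame subclass):
assert isinstance(to_local_df([[0, "a"]], "a:int,b:str"), ArrayDataFrame)
assert isinstance(to_local_df(iter([[0, "a"]]), "a:int,b:str"), IterableDataFrame)
assert isinstance(to_local_df(pd.DataFrame([[0]], columns=["a"])), PandasDataFrame)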
def test_iterable_pandas_dataframe():
    p = _IterablePandasParam(None)
    pdf = pd.DataFrame([[0]], columns=["a"])
    df = PandasDataFrame(pdf)
    data = list(p.to_input_data(df, ctx=None))
    assert 1 == len(data)
    assert data[0] is pdf  # this is to guarantee no copy in any wrapping logic
    assert data[0].values.tolist() == [[0]]

    dfs = LocalDataFrameIterableDataFrame([df, df])
    data = list(p.to_input_data(dfs, ctx=None))
    assert 2 == len(data)
    assert data[0] is pdf
    assert data[1] is pdf

    def get_pdfs():
        yield pdf
        yield pdf

    # without schema change, there is no copy
    odf = p.to_output_df(get_pdfs(), df.schema, ctx=None)
    data = list(odf.native)
    assert 2 == len(data)
    assert data[0].native is pdf
    assert data[1].native is pdf

    # with schema change, there is a copy
    odf = p.to_output_df(get_pdfs(), "a:double", ctx=None)
    data = list(odf.native)
    assert 2 == len(data)
    assert data[0].native is not pdf
    assert data[1].native is not pdf
def _df(data, schema=None, metadata=None):
    session = SparkSession.builder.getOrCreate()
    if schema is not None:
        pdf = PandasDataFrame(data, to_schema(schema), metadata)
        return session.createDataFrame(pdf.native, to_spark_schema(schema))
    else:
        return session.createDataFrame(data)
def test_csv_io(tmpdir):
    fs = FileSystem()
    df1 = PandasDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    path = os.path.join(tmpdir, "a.csv")
    # without header
    save_df(df1, path)
    assert fs.readtext(path).startswith("1,2,3")
    raises(InvalidOperationError, lambda: load_df(path, header=False))
    actual = load_df(path, columns=["a", "b", "c"], header=False, infer_schema=True)
    assert [[1, 2, 3]] == actual.as_array()
    assert actual.schema == "a:long,b:long,c:long"
    actual = load_df(path, columns="a:double,b:str,c:str", header=False)
    assert [[1.0, "2", "3"]] == actual.as_array()
    assert actual.schema == "a:double,b:str,c:str"
    # with header
    save_df(df1, path, header=True)
    assert fs.readtext(path).startswith("a,b,c")
    actual = load_df(path, header=True)
    assert [["1", "2", "3"]] == actual.as_array()
    actual = load_df(path, header=True, infer_schema=True)
    assert [[1, 2, 3]] == actual.as_array()
    actual = load_df(path, columns=["b", "a"], header=True, infer_schema=True)
    assert [[2, 1]] == actual.as_array()
    actual = load_df(path, columns="b:str,a:double", header=True)
    assert [["2", 1.0]] == actual.as_array()
    raises(KeyError, lambda: load_df(path, columns="b:str,x:double", header=True))
    raises(
        NotImplementedError,
        lambda: load_df(path, columns="b:str,x:double", header=2),
    )
def test_to_df(self):
    e = self.engine
    a = e.to_df([[1, 2], [3, 4]], "a:int,b:int", dict(a=1))
    df_eq(a, [[1, 2], [3, 4]], "a:int,b:int", dict(a=1), throw=True)
    a = e.to_df(PandasDataFrame([[1, 2], [3, 4]], "a:int,b:int", dict(a=1)))
    df_eq(a, [[1, 2], [3, 4]], "a:int,b:int", dict(a=1), throw=True)
    assert a is e.to_df(a)
def test_serialize_df(tmpdir):
    def assert_eq(df, df_expected=None, raw=False):
        if df_expected is None:
            df_expected = df
        df_actual = deserialize_df(serialize_df(df))
        if raw:
            assert df_expected.native == df_actual.native
        else:
            df_eq(df_expected, df_actual, throw=True)

    fs = FileSystem()
    assert deserialize_df(serialize_df(None)) is None
    assert_eq(ArrayDataFrame([], "a:int,b:int"))
    assert_eq(ArrayDataFrame([[None, None]], "a:int,b:int"))
    assert_eq(ArrayDataFrame([[None, "abc"]], "a:int,b:str"))
    assert_eq(
        ArrayDataFrame([[None, [1, 2], dict(x=1)]], "a:int,b:[int],c:{x:int}"),
        raw=True,
    )
    assert_eq(
        IterableDataFrame([[None, [1, 2], dict(x=1)]], "a:int,b:[int],c:{x:int}"),
        ArrayDataFrame([[None, [1, 2], dict(x=1)]], "a:int,b:[int],c:{x:int}"),
        raw=True,
    )
    assert_eq(PandasDataFrame([[None, None]], "a:int,b:int"))
    assert_eq(PandasDataFrame([[None, "abc"]], "a:int,b:str"))

    raises(
        InvalidOperationError,
        lambda: serialize_df(ArrayDataFrame([], "a:int,b:int"), 0),
    )

    path = os.path.join(tmpdir, "1.pkl")
    df = ArrayDataFrame([[None, None]], "a:int,b:int")
    s = serialize_df(df, 0, path, fs)
    df_eq(df, deserialize_df(s, fs), throw=True)
    df_eq(df, deserialize_df(s), throw=True)
    s = serialize_df(df, 0, path)
    df_eq(df, deserialize_df(s), throw=True)
    raises(ValueError, lambda: deserialize_df('{"x":1}'))
def test_json(tmpdir):
    fs = FileSystem()
    df1 = PandasDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    path = os.path.join(tmpdir, "a.json")
    save_df(df1, path)
    actual = load_df(path)
    df_eq(actual, [[1, 2, 3]], "a:long,b:long,c:long")
    actual = load_df(path, columns=["b", "a"])
    df_eq(actual, [[2, "1"]], "b:int,a:str")
    actual = load_df(path, columns="b:str,a:int")
    df_eq(actual, [["2", 1]], "b:str,a:int")
    raises(KeyError, lambda: load_df(path, columns="bb:str,a:int"))
def test_pickle_df():
    def assert_eq(df, df_expected=None, raw=False):
        if df_expected is None:
            df_expected = df
        df_actual = unpickle_df(pickle_df(df))
        if raw:
            assert df_expected.native == df_actual.native
        else:
            df_eq(df_expected, df_actual, throw=True)

    assert_eq(ArrayDataFrame([], "a:int,b:int"))
    assert_eq(ArrayDataFrame([[None, None]], "a:int,b:int"))
    assert_eq(ArrayDataFrame([[None, "abc"]], "a:int,b:str"))
    assert_eq(
        ArrayDataFrame([[None, [1, 2], dict(x=1)]], "a:int,b:[int],c:{x:int}"),
        raw=True,
    )
    assert_eq(
        IterableDataFrame([[None, [1, 2], dict(x=1)]], "a:int,b:[int],c:{x:int}"),
        ArrayDataFrame([[None, [1, 2], dict(x=1)]], "a:int,b:[int],c:{x:int}"),
        raw=True,
    )
    assert_eq(PandasDataFrame([[None, None]], "a:int,b:int"))
    assert_eq(PandasDataFrame([[None, "abc"]], "a:int,b:str"))
def test_parquet_io(tmpdir):
    df1 = PandasDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    df2 = ArrayDataFrame([[[1, 2]]], "a:[int]")
    # {a:int} will become {a:long} because of a pyarrow issue
    df3 = ArrayDataFrame([[dict(a=1)]], "a:{a:long}")
    for df in [df1, df2, df3]:
        path = os.path.join(tmpdir, "a.parquet")
        save_df(df, path)
        actual = load_df(path)
        df_eq(df, actual, throw=True)

    save_df(df1, path)
    actual = load_df(path, columns=["b", "a"])
    df_eq(actual, [[2, "1"]], "b:int,a:str")
    actual = load_df(path, columns="b:str,a:int")
    df_eq(actual, [["2", 1]], "b:str,a:int")
    # can't specify wrong columns
    raises(Exception, lambda: load_df(path, columns="bb:str,a:int"))

    # load directory
    fs = FileSystem()
    folder = os.path.join(tmpdir, "folder")
    fs.makedirs(folder)
    f0 = os.path.join(folder, "_SUCCESS")
    f1 = os.path.join(folder, "1.parquet")
    f2 = os.path.join(folder, "3.parquet")
    fs.touch(f0)
    save_df(df1, f1)
    save_df(df1, f2)
    actual = load_df(folder, "parquet")
    df_eq(actual, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")

    # load multiple paths
    actual = load_df([f1, f2], "parquet")
    df_eq(actual, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")

    # overwrite = False
    raises(FileExistsError, lambda: save_df(df1, f1, mode="error"))
    # can't overwrite directory
    raises(
        IsADirectoryError,
        lambda: save_df(df1, folder, format_hint="parquet", mode="overwrite"),
    )
    # wrong mode
    raises(NotImplementedError, lambda: save_df(df1, f1, mode="dummy"))
def unpickle_df(stream: bytes) -> LocalBoundedDataFrame:
    """Unpickle a dataframe from a bytes array.

    :param stream: binary data
    :return: unpickled dataframe

    .. note::

        The data must be serialized by :func:`.pickle_df` to deserialize.
    """
    o = pickle.loads(stream)
    schema = o[0]
    if o[1] == "p":
        return PandasDataFrame(o[2], schema)
    if o[1] == "a":
        return ArrayDataFrame(o[2], schema)
    raise NotImplementedError(  # pragma: no cover
        f"{o[1]} is not supported for unpickle"
    )
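# New, hedged round-trip sketch for pickle_df/unpickle_df (assumes pickle_df
# lives in the same module, as the docstring above indicates). Per the code
# above, the pickled payload is a tuple of (schema, type tag, raw data):
df = ArrayDataFrame([[0, "x"]], "a:int,b:str")
blob = pickle_df(df)  # bytes
df2 = unpickle_df(blob)  # tag "a" -> reconstructed as ArrayDataFrame
assert df2.as_array() == [[0, "x"]]
assert df2.schema == "a:int,b:str"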
def df(
    self, data: Any = None, schema: Any = None, metadata: Any = None
) -> SparkDataFrame:
    session = SparkSession.builder.getOrCreate()
    if data is None:
        df = None
    else:
        if schema is not None:
            pdf = PandasDataFrame(data, to_schema(schema), metadata)
            df = session.createDataFrame(pdf.native, to_spark_schema(schema))
        else:
            try:
                df = session.createDataFrame(data)
            except Exception:
                raise FugueDataFrameInitError("schema error")
    return SparkDataFrame(df, schema, metadata)
def test_simple_methods():
    df = DaskDataFrame([], "a:str,b:int")
    assert df.empty
    assert 0 == df.count()
    assert not df.is_local

    df = DaskDataFrame([["a", 1], ["b", "2"]], "x:str,y:double")
    assert not df.empty
    assert 2 == df.count()
    assert ["a", 1.0] == df.peek_array()
    assert dict(x="a", y=1.0) == df.peek_dict()
    df_eq(
        PandasDataFrame(df.as_pandas()),
        [["a", 1.0], ["b", 2.0]],
        "x:str,y:double",
        throw=True,
    )
def test_to_local_df():
    df = ArrayDataFrame([[0, 1]], "a:int,b:int")
    pdf = PandasDataFrame(df.as_pandas(), "a:int,b:int")
    idf = IterableDataFrame([[0, 1]], "a:int,b:int")
    assert to_local_df(df) is df
    assert to_local_df(pdf) is pdf
    assert to_local_df(idf) is idf
    assert isinstance(to_local_df(df.native, "a:int,b:int"), ArrayDataFrame)
    assert isinstance(to_local_df(pdf.native, "a:int,b:int"), PandasDataFrame)
    assert isinstance(to_local_df(idf.native, "a:int,b:int"), IterableDataFrame)
    raises(TypeError, lambda: to_local_df(123))

    metadata = dict(a=1)
    assert to_local_df(df.native, df.schema, metadata).metadata == metadata

    raises(NoneArgumentError, lambda: to_local_df(None))
    raises(ValueError, lambda: to_local_df(df, "a:int,b:int", None))
def f35(e: pd.DataFrame, a: LocalDataFrame) -> Iterable[pd.DataFrame]:
    e = PandasDataFrame(e, "a:int").as_pandas()
    a = ArrayDataFrame(a, "a:int").as_pandas()
    return iter([e, a])
def f26(e: pd.DataFrame, a: LocalDataFrame) -> Iterable[Dict[str, Any]]:
    e = list(PandasDataFrame(e).as_array())
    e += list(a.as_array())
    return ArrayDataFrame(e, "a:int").as_dict_iterable()
def with_nat(cursor, data):
    df = data.as_pandas()
    df["nat"] = pd.NaT
    schema = data.schema + "nat:datetime"
    return PandasDataFrame(df, schema)
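# New, hedged sketch of the schema arithmetic used above (assumption: fugue
# schemas are triad Schema objects, so `+` appends fields given as a schema
# expression string, and Schema compares equal to such strings):
from triad import Schema

s = Schema("a:int,b:str")
assert s + "nat:datetime" == "a:int,b:str,nat:datetime"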
def test_avro_io(tmpdir):
    df1 = PandasDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    path = os.path.join(tmpdir, "a.avro")
    save_df(df1, path)
    actual = load_df(path)
    df_eq(actual, [["1", 2, 3]], "a:str,b:long,c:long")
    actual = load_df(path, columns=["a", "b"])
    df_eq(actual, [["1", 3]], "a:str,b:long")
    actual = load_df(path, columns="a:str,b:int,c:long")
    df_eq(actual, [["1", 2, 3]], "a:str,b:int,c:long")
    actual = load_df(path, columns=["b", "c"], infer_schema=True)
    df_eq(actual, [[2, 3]], "b:long,c:long")

    avro_schema = {
        "type": "record",
        "name": "Root",
        "fields": [
            {"name": "station", "type": "string"},
            {"name": "time", "type": "long"},
            {"name": "temp", "type": "int"},
        ],
    }
    # provide schema and columns -> throw error
    raises(
        Exception,
        lambda: load_df(path, columns="a:str,b:int,c:long", schema=avro_schema),
    )
    # provide schema while infer_schema is True -> throw error
    raises(
        Exception,
        lambda: load_df(path, columns=None, schema=avro_schema, infer_schema=True),
    )
def test_avro_io(tmpdir):
    df1 = PandasDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    df2 = PandasDataFrame([["hello", 2, 3]], "a:str,b:int,c:long")
    path1 = os.path.join(tmpdir, "df1.avro")
    path2 = os.path.join(tmpdir, "df2.avro")
    save_df(df1, path1)
    actual = load_df(path1)
    df_eq(actual, [["1", 2, 3]], "a:str,b:long,c:long")
    actual = load_df(path1, columns=["a", "b"])
    df_eq(actual, [["1", 3]], "a:str,b:long")
    actual = load_df(path1, columns="a:str,b:int,c:long")
    df_eq(actual, [["1", 2, 3]], "a:str,b:int,c:long")
    # TODO: raise an error when both are provided?
    actual = load_df(path1, columns="a:str,b:int,c:long", infer_schema=True)
    df_eq(actual, [["1", 2, 3]], "a:str,b:int,c:long")
    actual = load_df(path1, columns=["b", "c"], infer_schema=True)
    df_eq(actual, [[2, 3]], "b:long,c:long")

    # save in append mode
    path3 = os.path.join(tmpdir, "append.avro")
    save_df(df1, path3)
    save_df(df2, path3, append=True)
    actual = load_df(path3, columns="a:str,b:int,c:long")
    df_eq(actual, [["1", 2, 3], ["hello", 2, 3]], "a:str,b:int,c:long")

    # save with times_as_micros=False (i.e. milliseconds instead)
    df4 = PandasDataFrame([["2021-05-04", 2, 3]], "a:datetime,b:int,c:long")
    path4 = os.path.join(tmpdir, "df4.avro")
    save_df(df4, path4)
    actual = load_df(path4, columns="a:datetime,b:int,c:long")
    df_eq(actual, [["2021-05-04", 2, 3]], "a:datetime,b:int,c:long")
    save_df(df4, path4, times_as_micros=False)
    actual = load_df(path4, columns="a:datetime,b:int,c:long")
    df_eq(actual, [["2021-05-04", 2, 3]], "a:datetime,b:int,c:long")

    # provide an avro schema
    schema = {
        "type": "record",
        "name": "Root",
        "fields": [
            {"name": "a", "type": "string"},
            {"name": "b", "type": "int"},
            {"name": "c", "type": "long"},
        ],
    }
    save_df(df1, path1, schema=schema)
    actual = load_df(path1, columns="a:str,b:int,c:long")
    df_eq(actual, [["1", 2, 3]], "a:str,b:int,c:long")

    # provide wrong types in the columns arg
    save_df(df2, path2, schema=schema)
    raises(
        FugueDataFrameOperationError,
        lambda: load_df(path2, columns="a:int,b:int,c:long"),
    )

    # load with a process_record function
    actual = load_df(
        path2,
        columns="a:str,b:int,c:long",
        process_record=lambda s: {"a": str.upper(s["a"]), "b": s["b"], "c": s["c"]},
    )
    df_eq(actual, [["HELLO", 2, 3]], "a:str,b:int,c:long")

    # provide a wrong type in the avro schema
    schema = {
        "type": "record",
        "name": "Root",
        "fields": [
            {"name": "a", "type": "int"},
            {"name": "b", "type": "int"},
            {"name": "c", "type": "long"},
        ],
    }
    raises(TypeError, lambda: save_df(df2, path2, schema=schema))
def test_dataframes():
    df1 = ArrayDataFrame([[0]], "a:int")
    df2 = ArrayDataFrame([[1]], "a:int")
    dfs = DataFrames(a=df1, b=df2)
    assert dfs[0] is df1
    assert dfs[1] is df2

    dfs = DataFrames([df1, df2], df1)
    assert not dfs.has_key
    assert dfs[0] is df1
    assert dfs[1] is df2
    assert dfs[2] is df1

    dfs2 = DataFrames(dfs, dfs, df2)
    assert not dfs2.has_key
    assert dfs2[0] is df1
    assert dfs2[1] is df2
    assert dfs2[2] is df1
    assert dfs2[3] is df1
    assert dfs2[4] is df2
    assert dfs2[5] is df1
    assert dfs2[6] is df2

    dfs = DataFrames([("a", df1), ("b", df2)])
    assert dfs.has_key
    assert dfs[0] is df1
    assert dfs[1] is df2
    assert dfs["a"] is df1
    assert dfs["b"] is df2

    with raises(ValueError):
        dfs["c"] = 1
    with raises(ValueError):
        dfs2 = DataFrames(1)
    with raises(ValueError):
        dfs2 = DataFrames(a=df1, b=2)
    with raises(InvalidOperationError):
        dfs2 = DataFrames(dict(a=df1), df2)
    with raises(InvalidOperationError):
        dfs2 = DataFrames(df2, dict(a=df1))
    with raises(InvalidOperationError):
        dfs2 = DataFrames(df1, a=df2)
    with raises(InvalidOperationError):
        dfs2 = DataFrames(DataFrames(df1, df2), x=df2)

    dfs2 = DataFrames(dfs)
    assert dfs2.has_key
    assert dfs2[0] is df1
    assert dfs2[1] is df2

    dfs1 = DataFrames(a=df1, b=df2)
    dfs2 = dfs1.convert(lambda x: PandasDataFrame(x.as_array(), x.schema))
    assert len(dfs1) == len(dfs2)
    assert dfs2.has_key
    assert isinstance(dfs2["a"], PandasDataFrame)
    assert isinstance(dfs2["b"], PandasDataFrame)

    dfs1 = DataFrames(df1, df2)
    dfs2 = dfs1.convert(lambda x: PandasDataFrame(x.as_array(), x.schema))
    assert len(dfs1) == len(dfs2)
    assert not dfs2.has_key
    assert isinstance(dfs2[0], PandasDataFrame)
    assert isinstance(dfs2[1], PandasDataFrame)