def test_csv_io(tmpdir):
    # Round-trip a one-row dataframe through CSV, both headerless and with
    # a header row, checking column selection, casting and error cases.
    filesystem = FileSystem()
    src = PandasDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    csv_path = os.path.join(tmpdir, "a.csv")

    # ---- headerless CSV ----
    save_df(src, csv_path)
    assert filesystem.readtext(csv_path).startswith("1,2,3")
    # without a header, column names are mandatory on load
    raises(InvalidOperationError, lambda: load_df(csv_path, header=False))
    res = load_df(csv_path, columns=["a", "b", "c"], header=False, infer_schema=True)
    assert [[1, 2, 3]] == res.as_array()
    assert res.schema == "a:long,b:long,c:long"
    res = load_df(csv_path, columns="a:double,b:str,c:str", header=False)
    assert [[1.0, "2", "3"]] == res.as_array()
    assert res.schema == "a:double,b:str,c:str"

    # ---- CSV with header ----
    save_df(src, csv_path, header=True)
    assert filesystem.readtext(csv_path).startswith("a,b,c")
    res = load_df(csv_path, header=True)
    assert [["1", "2", "3"]] == res.as_array()
    res = load_df(csv_path, header=True, infer_schema=True)
    assert [[1, 2, 3]] == res.as_array()
    # selecting / reordering by name
    res = load_df(csv_path, columns=["b", "a"], header=True, infer_schema=True)
    assert [[2, 1]] == res.as_array()
    res = load_df(csv_path, columns="b:str,a:double", header=True)
    assert [["2", 1.0]] == res.as_array()
    # unknown column name is rejected
    raises(KeyError, lambda: load_df(csv_path, columns="b:str,x:double", header=True))
    # non-boolean header value is rejected
    raises(
        NotImplementedError,
        lambda: load_df(csv_path, columns="b:str,x:double", header=2),
    )
def test_json(tmpdir):
    """JSON round-trip: saved values load back with inferred long types, and
    columns can be selected by name or cast via a schema expression.

    Fix: removed the unused ``fs = FileSystem()`` local — it was never
    referenced in this test.
    """
    df1 = PandasDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    path = os.path.join(tmpdir, "a.json")
    save_df(df1, path)
    # plain load: JSON numbers (and numeric strings) come back as longs
    actual = load_df(path)
    df_eq(actual, [[1, 2, 3]], "a:long,b:long,c:long")
    # select + reorder by name, keeping original types
    actual = load_df(path, columns=["b", "a"])
    df_eq(actual, [[2, "1"]], "b:int,a:str")
    # select with explicit cast
    actual = load_df(path, columns="b:str,a:int")
    df_eq(actual, [["2", 1]], "b:str,a:int")
    # unknown column name is rejected
    raises(KeyError, lambda: load_df(path, columns="bb:str,a:int"))
def save_df(
    self,
    df: DataFrame,
    path: str,
    format_hint: Any = None,
    mode: str = "overwrite",
    partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC,
    force_single: bool = False,
    **kwargs: Any,
) -> None:
    """Persist ``df`` to ``path`` through the engine's file system.

    :param df: dataframe to save; converted to a local dataframe first
    :param path: output path
    :param format_hint: explicit file format, inferred from ``path`` if None
    :param mode: write mode, defaults to ``"overwrite"``
    :param partition_spec: not respected by this engine (warning logged)
    :param force_single: accepted for interface compatibility; not used here
        (presumably this engine always writes a single file — confirm)
    :param kwargs: forwarded to the underlying ``save_df`` utility
    """
    if not partition_spec.empty:
        self.log.warning(  # pragma: no cover
            f"partition_spec is not respected in {self}.save_df"
        )
    local_df = self.to_df(df).as_local()
    save_df(
        local_df, path, format_hint=format_hint, mode=mode, fs=self.fs, **kwargs
    )
def save_df(
    self,
    df: SparkDataFrame,
    uri: str,
    format_hint: Optional[str] = None,
    partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC,
    mode: str = "overwrite",
    force_single: bool = False,
    **kwargs: Any,
) -> None:
    """Persist a Spark dataframe to ``uri``.

    :param df: the Spark dataframe to write
    :param uri: destination URI
    :param format_hint: explicit file format, inferred from ``uri`` if None
    :param partition_spec: passed to the Spark writer (distributed path only)
    :param mode: Spark save mode, defaults to ``"overwrite"``
    :param force_single: when True, collect locally and write a single file
        instead of delegating to Spark's distributed writer
    :param kwargs: extra writer options / save arguments
    """
    if force_single:
        # collect to the driver and write one file via the local utility
        local_df = df.as_local()
        save_df(
            local_df, uri, format_hint=format_hint, mode=mode, fs=self._fs, **kwargs
        )
    else:
        # distributed write through Spark's own writer
        parsed = FileParser(uri, format_hint)
        writer = self._get_writer(df.native, partition_spec)
        writer.format(parsed.file_format).options(**kwargs).mode(mode)
        writer.save(uri)
def save_df(
    self,
    df: DataFrame,
    path: str,
    format_hint: Any = None,
    mode: str = "overwrite",
    partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC,
    force_single: bool = False,
    **kwargs: Any,
) -> None:
    """Persist ``df`` to ``path``, creating parent directories first.

    :param df: dataframe to save
    :param path: output path; its parent directory is created if missing
    :param format_hint: explicit file format, inferred from ``path`` if None
    :param mode: write mode, defaults to ``"overwrite"``
    :param partition_spec: not respected by this engine (warning logged)
    :param force_single: accepted for interface compatibility; not used here
    :param kwargs: forwarded to the underlying ``save_df`` utility
    """
    if not partition_spec.empty:
        self.log.warning(  # pragma: no cover
            "partition_spec is not respected in %s.save_df", self)
    # make sure the destination directory exists before writing
    self.fs.makedirs(os.path.dirname(path), recreate=True)
    converted = self.to_df(df)
    save_df(
        converted, path, format_hint=format_hint, mode=mode, fs=self.fs, **kwargs
    )
def test_parquet_io(tmpdir):
    # Round-trip several dataframe flavors through parquet, then exercise
    # folder loading, multi-path loading, glob patterns and write modes.
    df1 = PandasDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    df2 = ArrayDataFrame([[[1, 2]]], "a:[int]")
    # {a:int} will become {a:long} because pyarrow lib has issue
    df3 = ArrayDataFrame([[dict(a=1)]], "a:{a:long}")
    path = os.path.join(tmpdir, "a.parquet")
    for frame in [df1, df2, df3]:
        save_df(frame, path)
        res = load_df(path)
        df_eq(frame, res, throw=True)

    save_df(df1, path)
    # select + reorder by name, and with explicit casts
    res = load_df(path, columns=["b", "a"])
    df_eq(res, [[2, "1"]], "b:int,a:str")
    res = load_df(path, columns="b:str,a:int")
    df_eq(res, [["2", 1]], "b:str,a:int")
    # can't specify wrong columns
    raises(Exception, lambda: load_df(path, columns="bb:str,a:int"))

    # load directory: build two folders each holding two part files plus a
    # _SUCCESS marker that must be ignored on load
    fs = FileSystem()
    for name in ["folder.parquet", "folder"]:
        folder = os.path.join(tmpdir, name)
        fs.makedirs(folder)
        f0 = os.path.join(folder, "_SUCCESS")
        f1 = os.path.join(folder, "1.parquet")
        f2 = os.path.join(folder, "3.parquet")
        fs.touch(f0)
        save_df(df1, f1)
        save_df(df1, f2)

    res = load_df(folder, "parquet")
    df_eq(res, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")
    # load multiple paths
    res = load_df([f1, f2], "parquet")
    df_eq(res, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")
    # load folder
    res = load_df(folder, "parquet")
    df_eq(res, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")
    res = load_df(os.path.join(tmpdir, "folder.parquet"))
    df_eq(res, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")
    # load pattern
    res = load_df(os.path.join(tmpdir, "folder", "*.parquet"))
    df_eq(res, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")

    # overwrite folder with single file
    save_df(res, os.path.join(tmpdir, "folder.parquet"), mode="overwrite")
    res = load_df(os.path.join(tmpdir, "folder.parquet"))
    df_eq(res, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")

    # overwrite = False
    raises(FileExistsError, lambda: save_df(df1, f1, mode="error"))
    raises(
        FileExistsError,
        lambda: save_df(
            df1, os.path.join(tmpdir, "folder.parquet"), mode="error"),
    )
    # wrong mode
    raises(NotImplementedError, lambda: save_df(df1, f1, mode="dummy"))
def test_avro_io(tmpdir):
    """Avro round-trip plus schema/columns conflict errors.

    Fixes:
    * the expected value for ``columns=["a", "b"]`` was ``[["1", 3]]`` but
      column ``b`` of the saved row ``["1", 2, 3]`` is ``2``;
    * both ``raises`` lambdas called ``save_df(path, ...)`` without the
      dataframe, so they raised a TypeError for a missing positional
      argument rather than the intended schema/columns conflict.
    """
    df1 = PandasDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    path = os.path.join(tmpdir, "a.avro")
    save_df(df1, path)
    # avro ints come back widened to long unless a schema is given
    actual = load_df(path)
    df_eq(actual, [["1", 2, 3]], "a:str,b:long,c:long")
    actual = load_df(path, columns=["a", "b"])
    df_eq(actual, [["1", 2]], "a:str,b:long")
    actual = load_df(path, columns="a:str,b:int,c:long")
    df_eq(actual, [["1", 2, 3]], "a:str,b:int,c:long")
    actual = load_df(path, columns=["b", "c"], infer_schema=True)
    df_eq(actual, [[2, 3]], "b:long,c:long")

    avro_schema = {
        "type": "record",
        "name": "Root",
        "fields": [
            {"name": "station", "type": "string"},
            {"name": "time", "type": "long"},
            {"name": "temp", "type": "int"},
        ],
    }
    # provide schema and columns -> throw error
    raises(
        Exception,
        lambda: save_df(
            df1, path, columns="a:str,b:int,c:long", schema=avro_schema
        ),
    )
    # provide schema and infer_schema is True -> throw error
    raises(
        Exception,
        lambda: save_df(
            df1, path, columns=None, schema=avro_schema, infer_schema=True
        ),
    )
def test_avro_io(tmpdir):
    """Avro IO: round-trip, append mode, time precision, explicit avro
    schema, and error cases.

    Fixes:
    * the expected value for ``columns=["a", "b"]`` was ``[["1", 3]]`` but
      column ``b`` of the saved row is ``2``;
    * the append-mode check loaded ``path1`` instead of ``path3``, so the
      appended file was never actually verified;
    * the wrong-column-types check called ``load_df(df2, path2, ...)``,
      passing the dataframe where the path belongs.
    """
    df1 = PandasDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    df2 = PandasDataFrame([["hello", 2, 3]], "a:str,b:int,c:long")
    path1 = os.path.join(tmpdir, "df1.avro")
    path2 = os.path.join(tmpdir, "df2.avro")
    save_df(df1, path1)
    # avro ints come back widened to long unless columns are given
    actual = load_df(path1)
    df_eq(actual, [["1", 2, 3]], "a:str,b:long,c:long")
    actual = load_df(path1, columns=["a", "b"])
    df_eq(actual, [["1", 2]], "a:str,b:long")
    actual = load_df(path1, columns="a:str,b:int,c:long")
    df_eq(actual, [["1", 2, 3]], "a:str,b:int,c:long")
    actual = load_df(path1, columns="a:str,b:int,c:long", infer_schema=True)
    # TODO raise error when both provided?
    df_eq(actual, [["1", 2, 3]], "a:str,b:int,c:long")
    actual = load_df(path1, columns=["b", "c"], infer_schema=True)
    df_eq(actual, [[2, 3]], "b:long,c:long")

    # save in append mode: both rows must be present in the appended file
    path3 = os.path.join(tmpdir, "append.avro")
    save_df(df1, path3)
    save_df(df2, path3, append=True)
    actual = load_df(path3, columns="a:str,b:int,c:long")
    df_eq(actual, [['1', 2, 3], ['hello', 2, 3]], "a:str,b:int,c:long")

    # save times_as_micros=False (i.e milliseconds instead)
    df4 = PandasDataFrame([["2021-05-04", 2, 3]], "a:datetime,b:int,c:long")
    path4 = os.path.join(tmpdir, "df4.avro")
    save_df(df4, path4)
    actual = load_df(path4, columns="a:datetime,b:int,c:long")
    df_eq(actual, [["2021-05-04", 2, 3]], "a:datetime,b:int,c:long")
    save_df(df4, path4, times_as_micros=False)
    actual = load_df(path4, columns="a:datetime,b:int,c:long")
    df_eq(actual, [["2021-05-04", 2, 3]], "a:datetime,b:int,c:long")

    # provide avro schema
    schema = {
        'type': 'record',
        'name': 'Root',
        'fields': [
            {'name': 'a', 'type': 'string'},
            {'name': 'b', 'type': 'int'},
            {'name': 'c', 'type': 'long'},
        ],
    }
    save_df(df1, path1, schema=schema)
    actual = load_df(path1, columns="a:str,b:int,c:long")
    df_eq(actual, [['1', 2, 3]], "a:str,b:int,c:long")

    # provide wrong types in columns arg
    save_df(df2, path2, schema=schema)
    raises(
        FugueDataFrameOperationError,
        lambda: load_df(path2, columns="a:int,b:int,c:long"),
    )

    # load with process_record function
    actual = load_df(
        path2,
        columns="a:str,b:int,c:long",
        process_record=lambda s: {
            'a': str.upper(s['a']), 'b': s['b'], 'c': s['c']
        },
    )
    df_eq(actual, [['HELLO', 2, 3]], "a:str,b:int,c:long")

    # provide wrong type in avro schema
    schema = {
        'type': 'record',
        'name': 'Root',
        'fields': [
            {'name': 'a', 'type': 'int'},
            {'name': 'b', 'type': 'int'},
            {'name': 'c', 'type': 'long'},
        ],
    }
    raises(TypeError, lambda: save_df(df2, path2, schema=schema))