def test_parquet_io(tmpdir):
    """Round-trip dataframes through parquet with ``save_df`` / ``load_df``.

    Exercises, in order:

    * single-file round trips for flat, list-typed and struct-typed columns;
    * column selection (list of names) and selection with casting (schema
      expression) on load, plus rejection of an unknown column;
    * loading a directory of parquet files and loading a list of paths;
    * the failure paths for an existing target with ``mode="error"``,
      overwriting a directory, and an unknown save mode.

    NOTE(review): every ``df_eq`` call passes ``throw=True`` so a mismatch
    fails the test. Without it the return value was discarded here, which
    presumably made the comparison a no-op -- confirm against ``df_eq``.
    """
    # The same target file is reused for every round trip and afterwards.
    path = os.path.join(tmpdir, "a.parquet")

    df1 = PandasDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    df2 = ArrayDataFrame([[[1, 2]]], "a:[int]")
    # {a:int} will become {a:long} because pyarrow lib has issue
    df3 = ArrayDataFrame([[dict(a=1)]], "a:{a:long}")
    for df in [df1, df2, df3]:
        save_df(df, path)
        actual = load_df(path)
        df_eq(df, actual, throw=True)

    # column selection and casting on load
    save_df(df1, path)
    actual = load_df(path, columns=["b", "a"])
    df_eq(actual, [[2, "1"]], "b:int,a:str", throw=True)
    actual = load_df(path, columns="b:str,a:int")
    df_eq(actual, [["2", 1]], "b:str,a:int", throw=True)
    # can't specify wrong columns
    raises(Exception, lambda: load_df(path, columns="bb:str,a:int"))

    # load directory
    fs = FileSystem()
    folder = os.path.join(tmpdir, "folder")
    fs.makedirs(folder)
    f0 = os.path.join(folder, "_SUCCESS")
    f1 = os.path.join(folder, "1.parquet")
    f2 = os.path.join(folder, "3.parquet")
    # An empty marker file sits next to the data files in the folder.
    fs.touch(f0)
    save_df(df1, f1)
    save_df(df1, f2)
    actual = load_df(folder, "parquet")
    df_eq(
        actual,
        [["1", 2, 3], ["1", 2, 3]],
        "a:str,b:int,c:long",
        throw=True,
    )

    # load multiple paths
    actual = load_df([f1, f2], "parquet")
    df_eq(
        actual,
        [["1", 2, 3], ["1", 2, 3]],
        "a:str,b:int,c:long",
        throw=True,
    )

    # overwrite = False
    raises(FileExistsError, lambda: save_df(df1, f1, mode="error"))
    # can't overwrite directory
    raises(
        IsADirectoryError,
        lambda: save_df(df1, folder, format_hint="parquet", mode="overwrite"),
    )
    # wrong mode
    raises(NotImplementedError, lambda: save_df(df1, f1, mode="dummy"))
def test_parquet_io(tmpdir, spark_session):
    """Round-trip dataframes through parquet with ``SparkIO``.

    Exercises, in order:

    * single-file round trips for flat, list-typed and struct-typed columns;
    * column selection (list of names) and selection with casting (schema
      expression) on load, plus rejection of an unknown column;
    * loading a directory of parquet files and loading a list of paths,
      with and without a column/schema expression;
    * the failure paths for an existing target with ``mode="error"`` and
      for an unknown save mode.

    NOTE(review): every ``df_eq`` call passes ``throw=True`` so a mismatch
    fails the test. Without it the return value was discarded here, which
    presumably made the comparison a no-op -- confirm against ``df_eq``.
    """
    si = SparkIO(spark_session, FileSystem())
    # The same target path is reused for every round trip and afterwards.
    path = os.path.join(tmpdir, "a.parquet")

    df1 = _df([["1", 2, 3]], "a:str,b:int,c:long")
    df2 = _df([[[1, 2]]], "a:[int]")
    # {a:int} will become {a:long} because pyarrow lib has issue
    df3 = _df([[dict(a=1)]], "a:{a:long}")
    for df in [df1, df2, df3]:
        si.save_df(df, path)
        actual = si.load_df(path)
        df_eq(df, actual, throw=True)

    # column selection and casting on load
    si.save_df(df1, path)
    actual = si.load_df(path, columns=["b", "a"])
    df_eq(actual, [[2, "1"]], "b:int,a:str", throw=True)
    actual = si.load_df(path, columns="b:str,a:int")
    df_eq(actual, [["2", 1]], "b:str,a:int", throw=True)
    raises(Exception, lambda: si.load_df(path, columns="bb:str,a:int"))

    # load directory
    fs = FileSystem()
    folder = os.path.join(tmpdir, "folder")
    fs.makedirs(folder)
    f0 = os.path.join(folder, "_SUCCESS")
    f1 = os.path.join(folder, "1.parquet")
    f2 = os.path.join(folder, "3.parquet")
    # An empty marker file sits next to the data files in the folder.
    fs.touch(f0)
    si.save_df(df1, f1, force_single=True)
    si.save_df(df1, f2, force_single=True)
    # With force_single=True the target must be a plain file.
    assert fs.isfile(f1)
    actual = si.load_df(folder, "parquet")
    df_eq(
        actual,
        [["1", 2, 3], ["1", 2, 3]],
        "a:str,b:int,c:long",
        throw=True,
    )

    # load multiple paths
    actual = si.load_df([f1, f2], "parquet")
    df_eq(
        actual,
        [["1", 2, 3], ["1", 2, 3]],
        "a:str,b:int,c:long",
        throw=True,
    )
    actual = si.load_df([f1, f2], "parquet", columns="b:str,a:str")
    # Expected schema mirrors the requested columns expression; the previous
    # three-column schema could never match these two-column rows.
    df_eq(actual, [["2", "1"], ["2", "1"]], "b:str,a:str", throw=True)

    # overwrite = False
    raises(
        (FileExistsError, AnalysisException),
        lambda: si.save_df(df1, f1, mode="error"),
    )
    # wrong mode
    raises(Exception, lambda: si.save_df(df1, f1, mode="dummy"))