def test_orc_write(self):
    """Write a frame to ORC with partitioning and check it reads back intact.

    The pandas-on-Spark frame built from ``self.test_pdf`` is written to a
    temporary directory twice -- partitioned first by one column, then by
    two -- and after each write it is read back and compared with the
    frame that was written.
    """
    with self.temp_dir() as tmp:
        expected = ps.DataFrame(self.test_pdf)

        # First pass: a single partition column; second pass: two of them.
        for partition_cols in ("i32", ["i32", "bhello"]):
            expected.to_orc(tmp, mode="overwrite", partition_cols=partition_cols)

            # Once the data is written out, Spark rearranges partition
            # columns to appear first, so the frame read back is expected
            # NOT to be in the original column order yet.
            actual = ps.read_orc(tmp)
            same_order = (actual.columns == self.test_column_order).all()
            self.assertFalse(same_order)

            # Restore the original column order, then sort both sides by
            # "f" so the comparison sees rows in the same order.
            actual = actual[self.test_column_order]
            actual_pdf = actual.sort_values(by="f").to_spark().toPandas()
            expected_pdf = expected.sort_values(by="f").to_spark().toPandas()
            self.assert_eq(actual_pdf, expected_pdf)
def test_read_orc(self):
    """Check ``ps.read_orc`` with its ``columns`` and ``index_col`` arguments.

    ``self.test_pdf`` is written through Spark as ORC (explicit schema,
    coalesced to one partition) and read back with ``ps.read_orc`` using:

    * no extra arguments,
    * a ``columns`` selection,
    * a single and a multi-level ``index_col``,
    * ``index_col`` combined with ``columns``,
    * invalid ``columns`` values, which must raise ``ValueError``.
    """
    with self.temp_dir() as tmp:
        path = "{}/file1.orc".format(tmp)
        data = self.test_pdf

        # `spark.write.orc` creates a directory containing the ORC part
        # files; `ps.read_orc` is pointed at that directory directly.
        self.spark.createDataFrame(
            data, "i32 int, i64 long, f double, bhello string"
        ).coalesce(1).write.orc(path, mode="overwrite")

        # Plain read: compared against the data columns only, with the
        # original index dropped by reset_index().
        expected = data.reset_index()[data.columns]
        actual = ps.read_orc(path)
        self.assertPandasEqual(expected, actual.to_pandas())

        # columns
        columns = ["i32", "i64"]
        expected = data.reset_index()[columns]
        actual = ps.read_orc(path, columns=columns)
        self.assertPandasEqual(expected, actual.to_pandas())

        # index_col
        expected = data.set_index("i32")
        actual = ps.read_orc(path, index_col="i32")
        self.assert_eq(actual, expected)

        expected = data.set_index(["i32", "f"])
        actual = ps.read_orc(path, index_col=["i32", "f"])
        self.assert_eq(actual, expected)

        # index_col with columns
        expected = data.set_index("i32")[["i64", "bhello"]]
        actual = ps.read_orc(path, index_col=["i32"], columns=["i64", "bhello"])
        self.assert_eq(actual, expected)

        expected = data.set_index(["i32", "f"])[["bhello", "i64"]]
        actual = ps.read_orc(path, index_col=["i32", "f"], columns=["bhello", "i64"])
        self.assert_eq(actual, expected)

        # Invalid column selections. assertRaisesRegex is used so the error
        # text is really checked: the `msg` keyword of assertRaises only
        # customises the failure message and never inspects the exception.
        # NOTE(review): a bare string is presumably iterated character by
        # character, which is why 'i' is the name reported -- confirm.
        with self.assertRaisesRegex(ValueError, "Unknown column name 'i'"):
            ps.read_orc(path, columns="i32")

        with self.assertRaisesRegex(ValueError, "Unknown column name 'i34'"):
            ps.read_orc(path, columns=["i34", "i64"])
def test_read_orc(self):
    """Exercise ``ps.read_orc`` against an ORC dataset written by Spark.

    The fixture frame ``self.test_pdf`` is saved as ORC under a temporary
    directory, then read back with different ``columns`` / ``index_col``
    combinations and compared with the equivalent pandas selection.
    Unknown column names must raise ``ValueError`` with a matching message.
    """
    with self.temp_dir() as tmp:
        path = "{}/file1.orc".format(tmp)
        data = self.test_pdf
        schema = "i32 int, i64 long, f double, bhello string"
        sdf = self.spark.createDataFrame(data, schema)
        sdf.coalesce(1).write.orc(path, mode="overwrite")

        # No options: every data column, original index discarded.
        expected = data.reset_index()[data.columns]
        actual = ps.read_orc(path)
        self.assertPandasEqual(expected, actual.to_pandas())

        # columns
        columns = ["i32", "i64"]
        expected = data.reset_index()[columns]
        actual = ps.read_orc(path, columns=columns)
        self.assertPandasEqual(expected, actual.to_pandas())

        # index_col (single column, then multi-level)
        expected = data.set_index("i32")
        actual = ps.read_orc(path, index_col="i32")
        self.assert_eq(actual, expected)

        expected = data.set_index(["i32", "f"])
        actual = ps.read_orc(path, index_col=["i32", "f"])
        self.assert_eq(actual, expected)

        # index_col with columns
        expected = data.set_index("i32")[["i64", "bhello"]]
        actual = ps.read_orc(path, index_col=["i32"], columns=["i64", "bhello"])
        self.assert_eq(actual, expected)

        expected = data.set_index(["i32", "f"])[["bhello", "i64"]]
        actual = ps.read_orc(path, index_col=["i32", "f"], columns=["bhello", "i64"])
        self.assert_eq(actual, expected)

        # Invalid `columns` values and the error text each must produce.
        # The text is verified with assertRaisesRegex; passing it as `msg=`
        # to assertRaises would only set the failure message and leave the
        # exception text unchecked.
        # NOTE(review): the first case expects 'i', which suggests a bare
        # string is handled one character at a time -- confirm.
        invalid_cases = (
            ("i32", "Unknown column name 'i'"),
            (["i34", "i64"], "Unknown column name 'i34'"),
        )
        for bad_columns, expected_message in invalid_cases:
            with self.assertRaisesRegex(ValueError, expected_message):
                ps.read_orc(path, columns=bad_columns)