def test_parquet_write(self):
    with self.temp_dir() as tmp:
        pdf = self.test_pdf
        expected = ks.DataFrame(pdf)

        # Write out partitioned by one column
        expected.to_parquet(tmp, mode="overwrite", partition_cols="i32")
        # Reset column order, as once the data is written out, Spark rearranges partition
        # columns to appear first.
        actual = ks.read_parquet(tmp)
        self.assertFalse((actual.columns == self.test_column_order).all())
        actual = actual[self.test_column_order]
        self.assert_eq(
            actual.sort_values(by="f").to_spark().toPandas(),
            expected.sort_values(by="f").to_spark().toPandas(),
        )

        # Write out partitioned by two columns
        expected.to_parquet(tmp, mode="overwrite", partition_cols=["i32", "bhello"])
        # Reset column order, as once the data is written out, Spark rearranges partition
        # columns to appear first.
        actual = ks.read_parquet(tmp)
        self.assertFalse((actual.columns == self.test_column_order).all())
        actual = actual[self.test_column_order]
        self.assert_eq(
            actual.sort_values(by="f").to_spark().toPandas(),
            expected.sort_values(by="f").to_spark().toPandas(),
        )
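# Context for the test above, as a minimal standalone sketch (assumes a local
# Spark session and Koalas are available; the path and sample frame below are
# illustrative, not taken from the test suite):
import pandas as pd
import databricks.koalas as ks

kdf = ks.DataFrame(pd.DataFrame({"i32": [1, 2], "f": [0.1, 0.2], "bhello": ["x", "y"]}))
kdf.to_parquet("/tmp/ks_parquet_demo", mode="overwrite", partition_cols="i32")
# Spark rearranges partition columns to appear first on disk, so reselect the
# original column order after reading back.
restored = ks.read_parquet("/tmp/ks_parquet_demo")[["i32", "f", "bhello"]]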
def test_parquet_read_with_pandas_metadata(self):
    with self.temp_dir() as tmp:
        expected1 = self.test_pdf

        path1 = "{}/file1.parquet".format(tmp)
        expected1.to_parquet(path1)

        self.assert_eq(ks.read_parquet(path1, pandas_metadata=True), expected1)

        expected2 = expected1.reset_index()

        path2 = "{}/file2.parquet".format(tmp)
        expected2.to_parquet(path2)

        self.assert_eq(ks.read_parquet(path2, pandas_metadata=True), expected2)

        expected3 = expected2.set_index("index", append=True)

        path3 = "{}/file3.parquet".format(tmp)
        expected3.to_parquet(path3)

        self.assert_eq(ks.read_parquet(path3, pandas_metadata=True), expected3)
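# What pandas_metadata=True does, sketched minimally: pandas' to_parquet()
# stores the index in the file's pandas metadata, and the flag tells Koalas'
# read_parquet() to respect it rather than synthesize a default index. The
# path and frame here are illustrative:
import pandas as pd
import databricks.koalas as ks

pdf = pd.DataFrame({"a": [1, 2, 3]}, index=pd.Index([10, 20, 30], name="idx"))
pdf.to_parquet("/tmp/with_metadata.parquet")
kdf = ks.read_parquet("/tmp/with_metadata.parquet", pandas_metadata=True)
# kdf now carries the named index "idx", matching the original pandas frame.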
def test_parquet_read(self):
    with self.temp_dir() as tmp:
        data = self.test_pdf
        self.spark.createDataFrame(data, 'i32 int, i64 long, f double, bhello string') \
            .coalesce(1).write.parquet(tmp, mode='overwrite')

        def check(columns, expected):
            if LooseVersion("0.21.1") <= LooseVersion(pd.__version__):
                expected = pd.read_parquet(tmp, columns=columns)
            actual = ks.read_parquet(tmp, columns=columns)
            self.assertPandasEqual(expected, actual.toPandas())

        check(None, data)
        check(['i32', 'i64'], data[['i32', 'i64']])
        check(['i64', 'i32'], data[['i64', 'i32']])
        check(('i32', 'i64'), data[['i32', 'i64']])
        check(['a', 'b', 'i32', 'i64'], data[['i32', 'i64']])
        check([], pd.DataFrame([]))
        check(['a'], pd.DataFrame([]))
        check('i32', pd.DataFrame([]))
        check('float', data[['f']])

        # check with pyspark patch.
        if LooseVersion("0.21.1") <= LooseVersion(pd.__version__):
            expected = pd.read_parquet(tmp)
        else:
            expected = data
        actual = ks.read_parquet(tmp)
        self.assertPandasEqual(expected, actual.toPandas())
def test_local(self):
    with self.temp_dir() as tmp:
        data = pd.DataFrame({
            'i32': np.arange(1000, dtype=np.int32),
            'i64': np.arange(1000, dtype=np.int64),
            'f': np.arange(1000, dtype=np.float64),
            'bhello': np.random.choice(['hello', 'yo', 'people'], size=1000).astype("O")})
        data = data[['i32', 'i64', 'f', 'bhello']]
        self.spark.createDataFrame(data, 'i32 int, i64 long, f double, bhello string') \
            .coalesce(1).write.parquet(tmp, mode='overwrite')

        def check(columns, expected):
            if LooseVersion("0.21.1") <= LooseVersion(pd.__version__):
                expected = pd.read_parquet(tmp, columns=columns)
            actual = koalas.read_parquet(tmp, columns=columns)
            self.assertPandasEqual(expected, actual.toPandas())

        check(None, data)
        check(['i32', 'i64'], data[['i32', 'i64']])
        check(['i64', 'i32'], data[['i64', 'i32']])
        check(('i32', 'i64'), data[['i32', 'i64']])
        check(['a', 'b', 'i32', 'i64'], data[['i32', 'i64']])
        check([], pd.DataFrame([]))
        check(['a'], pd.DataFrame([]))
        check('i32', pd.DataFrame([]))
        check('float', data[['f']])

        # check with pyspark patch.
        if LooseVersion("0.21.1") <= LooseVersion(pd.__version__):
            expected = pd.read_parquet(tmp)
        else:
            expected = data
        actual = koalas.read_parquet(tmp)
        self.assertPandasEqual(expected, actual.toPandas())
def test_parquet_read(self):
    with self.temp_dir() as tmp:
        data = self.test_pdf
        self.spark.createDataFrame(
            data, "i32 int, i64 long, f double, bhello string"
        ).coalesce(1).write.parquet(tmp, mode="overwrite")

        def check(columns, expected):
            if LooseVersion("0.21.1") <= LooseVersion(pd.__version__):
                expected = pd.read_parquet(tmp, columns=columns)
            actual = ks.read_parquet(tmp, columns=columns)
            self.assertPandasEqual(expected, actual.to_pandas())

        check(None, data)
        check(["i32", "i64"], data[["i32", "i64"]])
        check(["i64", "i32"], data[["i64", "i32"]])
        if LooseVersion(pa.__version__) < LooseVersion("1.0.0"):
            # TODO: `pd.read_parquet()` changed the behavior due to PyArrow 1.0.0.
            # We might want to adjust the behavior. Let's see how pandas handles it.
            check(("i32", "i64"), data[["i32", "i64"]])
        check(["a", "b", "i32", "i64"], data[["i32", "i64"]])
        check([], pd.DataFrame([]))
        check(["a"], pd.DataFrame([]))
        check("i32", pd.DataFrame([]))
        check("float", data[["f"]])

        # check with pyspark patch.
        if LooseVersion("0.21.1") <= LooseVersion(pd.__version__):
            expected = pd.read_parquet(tmp)
        else:
            expected = data
        actual = ks.read_parquet(tmp)
        self.assertPandasEqual(expected, actual.to_pandas())

        # When index columns are known
        pdf = self.test_pdf
        expected = ks.DataFrame(pdf)

        expected_idx = expected.set_index("bhello")[["f", "i32", "i64"]]
        actual_idx = ks.read_parquet(tmp, index_col="bhello")[["f", "i32", "i64"]]
        self.assert_eq(
            actual_idx.sort_values(by="f").to_spark().toPandas(),
            expected_idx.sort_values(by="f").to_spark().toPandas(),
        )
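# The index_col path exercised at the end of the test above, as a standalone
# sketch (illustrative path, reusing the demo directory from the earlier
# sketch; assumes a Koalas version whose read_parquet accepts index_col):
import databricks.koalas as ks

kdf = ks.read_parquet("/tmp/ks_parquet_demo", index_col="bhello")
# "bhello" becomes the DataFrame index instead of a default sequential index.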
def test_parquet_write(self):
    with self.temp_dir() as tmp:
        pdf = self.test_pdf
        expected = ks.DataFrame(pdf)

        # Write out partitioned by one column
        expected.to_parquet(tmp, mode='overwrite', partition_cols='i32')
        # Reset column order, as once the data is written out, Spark rearranges partition
        # columns to appear first.
        actual = ks.read_parquet(tmp)[self.test_column_order]
        self.assert_eq(actual.sort_values(by='f'), expected.sort_values(by='f'))

        # Write out partitioned by two columns
        expected.to_parquet(tmp, mode='overwrite', partition_cols=['i32', 'bhello'])
        # Reset column order, as once the data is written out, Spark rearranges partition
        # columns to appear first.
        actual = ks.read_parquet(tmp)[self.test_column_order]
        self.assert_eq(actual.sort_values(by='f'), expected.sort_values(by='f'))
def check(columns, expected):
    if LooseVersion("0.21.1") <= LooseVersion(pd.__version__):
        expected = pd.read_parquet(tmp, columns=columns)
    actual = ks.read_parquet(tmp, columns=columns)
    self.assertPandasEqual(expected, actual.toPandas())
def check(columns, expected):
    if LooseVersion("0.21.1") <= LooseVersion(pd.__version__):
        expected = pd.read_parquet(tmp, columns=columns)
    actual = koalas.read_parquet(tmp, columns=columns)
    self.assertPandasEqual(expected, actual.toPandas())
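# Why both check() variants gate on the pandas version: the tests only call
# pd.read_parquet() when pandas is at least 0.21.1, and otherwise fall back to
# the precomputed `expected` frame. A minimal sketch of the same gating
# pattern; read_parquet_compat is a hypothetical helper name, not part of the
# test suite:
from distutils.version import LooseVersion

import pandas as pd


def read_parquet_compat(path, fallback):
    # Use pd.read_parquet() on pandas >= 0.21.1; older versions get the fallback.
    if LooseVersion("0.21.1") <= LooseVersion(pd.__version__):
        return pd.read_parquet(path)
    return fallback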
# COMMAND ----------

# Pandas
import pandas as pd

pdDF = pd.read_parquet(
    "/dbfs/databricks-datasets/learning-spark-v2/sf-airbnb/sf-airbnb-clean.parquet"
)
pdDF.head()

# COMMAND ----------

# Koalas
import databricks.koalas as ks

kdf = ks.read_parquet(
    "/databricks-datasets/learning-spark-v2/sf-airbnb/sf-airbnb-clean.parquet")
kdf.head()

# COMMAND ----------

# MAGIC %md
# MAGIC ### Converting a Koalas DataFrame to/from a Spark DataFrame

# COMMAND ----------

# Creating a Koalas DataFrame from a PySpark DataFrame
kdf = ks.DataFrame(df)

# COMMAND ----------

# Alternative way of creating a Koalas DataFrame from a PySpark DataFrame
kdf = df.to_koalas()
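# COMMAND ----------

# The reverse direction: a Koalas DataFrame converts back to a PySpark
# DataFrame with to_spark()
sdf = kdf.to_spark()
sdf.printSchema()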