def test_read_csv_with_comment(self):
    """Comment lines are skipped like pandas; invalid `comment` values raise."""
    with self.csv_file(self.csv_text_with_comments) as fn:
        expected = pd.read_csv(fn, comment="#")
        actual = ps.read_csv(fn, comment="#")
        self.assert_eq(expected, actual, almost=True)

        # Multi-char / empty comment strings are rejected lazily (on .show()).
        for bad in ("", "##"):
            self.assertRaisesRegex(
                ValueError,
                "Only length-1 comment characters supported",
                lambda bad=bad: ps.read_csv(fn, comment=bad).show(),
            )
        # Non-string comment values are rejected eagerly.
        for bad in (1, [1]):
            self.assertRaisesRegex(
                ValueError,
                "Only length-1 comment characters supported",
                lambda bad=bad: ps.read_csv(fn, comment=bad),
            )
def test_read_csv_with_dtype(self):
    """`dtype` (scalar or per-column mapping) behaves like pandas."""
    with self.csv_file(self.csv_text) as fn:
        # Default inference, everything as str, and a per-column override.
        self.assert_eq(ps.read_csv(fn), pd.read_csv(fn), almost=True)
        self.assert_eq(ps.read_csv(fn, dtype=str), pd.read_csv(fn, dtype=str))
        dtypes = {"amount": "int64"}
        self.assert_eq(
            ps.read_csv(fn, dtype=dtypes),
            pd.read_csv(fn, dtype=dtypes),
        )
def test_read_csv_with_escapechar(self):
    """`escapechar` matches pandas; the Spark `escape` option takes precedence."""
    with self.csv_file(self.e_escapeted_csv_text) as fn:
        expected = pd.read_csv(fn, escapechar="E")
        actual = ps.read_csv(fn, escapechar="E")
        self.assert_eq(actual, expected, almost=True)

        # When both are given, `escape` wins over `escapechar`, so the
        # result still matches pandas with escapechar="E".
        actual = ps.read_csv(fn, escapechar="ABC", escape="E")
        self.assert_eq(actual, expected, almost=True)
def test_read_csv_with_squeeze(self):
    """`squeeze=True` returns a Series/DataFrame identically to pandas."""
    with self.csv_file(self.csv_text) as fn:
        # One column (-> Series), two columns, and two columns with an index.
        option_sets = [
            {"usecols": ["name"]},
            {"usecols": ["name", "amount"]},
            {"usecols": ["name", "amount"], "index_col": ["name"]},
        ]
        for options in option_sets:
            expected = pd.read_csv(fn, squeeze=True, **options)
            actual = ps.read_csv(fn, squeeze=True, **options)
            self.assert_eq(expected, actual, almost=True)
def check(header="infer", names=None, usecols=None, index_col=None):
    """Assert ps.read_csv matches pd.read_csv for the given reader options.

    `fn` and `self` are free variables from the enclosing test scope.
    """
    options = dict(header=header, names=names, usecols=usecols, index_col=index_col)
    reference = pd.read_csv(fn, **options)
    result = ps.read_csv(fn, **options)
    self.assert_eq(reference, result, almost=True)
def test_read_csv_with_quotechar(self):
    """A non-default `quotechar` behaves like pandas."""
    with self.csv_file(self.q_quoted_csv_text) as fn:
        expected = pd.read_csv(fn, quotechar="Q")
        actual = ps.read_csv(fn, quotechar="Q")
        self.assert_eq(actual, expected, almost=True)
def test_read_csv_with_parse_dates(self):
    """`parse_dates` is unsupported and must raise a ValueError."""
    with self.assertRaisesRegex(ValueError, "parse_dates"):
        ps.read_csv("path", parse_dates=True)
def test_read_csv_with_mangle_dupe_cols(self):
    """`mangle_dupe_cols=False` is unsupported and must raise a ValueError."""
    with self.assertRaisesRegex(ValueError, "mangle_dupe_cols"):
        ps.read_csv("path", mangle_dupe_cols=False)
def test_read_csv_with_sep(self):
    """Tab-separated input read with `sep` matches pandas."""
    with self.csv_file(self.tab_delimited_csv_text) as fn:
        reference = pd.read_csv(fn, sep="\t")
        result = ps.read_csv(fn, sep="\t")
        self.assert_eq(reference, result, almost=True)
def test_read_csv_with_limit(self):
    """`nrows` limits the rows read, matching pandas (comments excluded)."""
    with self.csv_file(self.csv_text_with_comments) as fn:
        reference = pd.read_csv(fn, comment="#", nrows=2)
        result = ps.read_csv(fn, comment="#", nrows=2)
        self.assert_eq(reference, result, almost=True)
def test_read_with_spark_schema(self):
    """A Spark DDL schema string in `names` yields the same frame as a
    plain pandas column-name list."""
    with self.csv_file(self.csv_text_2) as fn:
        # Spark-style "col type, ..." schema on the ps side only.
        result = ps.read_csv(fn, names="A string, B string, C long, D long, E long")
        reference = pd.read_csv(fn, names=["A", "B", "C", "D", "E"])
        self.assert_eq(reference, result)
def test_read_csv(self):
    """Exhaustive header/names/usecols/index_col matrix against pandas,
    plus the documented error cases."""
    with self.csv_file(self.csv_text) as fn:

        def check(header="infer", names=None, usecols=None, index_col=None):
            # Same options to both readers; pandas is the reference behavior.
            options = dict(header=header, names=names, usecols=usecols, index_col=index_col)
            reference = pd.read_csv(fn, **options)
            result = ps.read_csv(fn, **options)
            self.assert_eq(reference, result, almost=True)

        # Header variants.
        check()
        check(header=0)
        check(header=None)
        # Explicit names (strings, tuples, ints), with and without a header row.
        check(names=["n", "a"])
        check(names=[("x", "n"), ("y", "a")])
        check(names=[10, 20])
        check(header=0, names=["n", "a"])
        # usecols by position, by name, empty, and with duplicates.
        check(usecols=[1])
        check(usecols=[1, 0])
        check(usecols=["amount"])
        check(usecols=["amount", "name"])
        check(usecols=[])
        check(usecols=[1, 1])
        check(usecols=["amount", "amount"])
        check(header=None, usecols=[1])
        check(names=["n", "a"], usecols=["a"])
        check(header=None, names=["n", "a"], usecols=["a"])
        # index_col variants.
        check(index_col=["amount"])
        check(header=None, index_col=[1])
        check(names=["n", "a"], index_col=["a"])

        # check with pyspark patch.
        self.assert_eq(pd.read_csv(fn), ps.read_csv(fn), almost=True)

        # Duplicate names are rejected.
        self.assertRaisesRegex(
            ValueError, "non-unique", lambda: ps.read_csv(fn, names=["n", "n"])
        )
        # Name count must match the column count (3 columns in the file).
        self.assertRaisesRegex(
            ValueError,
            "does not match the number.*3",
            lambda: ps.read_csv(fn, names=["n", "a", "b"]),
        )
        self.assertRaisesRegex(
            ValueError,
            "does not match the number.*3",
            lambda: ps.read_csv(fn, header=0, names=["n", "a", "b"]),
        )
        # usecols must resolve against the actual columns.
        self.assertRaisesRegex(
            ValueError, "Usecols do not match.*3", lambda: ps.read_csv(fn, usecols=[1, 3])
        )
        self.assertRaisesRegex(
            ValueError,
            "Usecols do not match.*col",
            lambda: ps.read_csv(fn, usecols=["amount", "col"]),
        )
        # header must be an int or None, not a string.
        self.assertRaisesRegex(
            ValueError, "Unknown header argument 1", lambda: ps.read_csv(fn, header="1")
        )
        # Mixed str/int usecols are rejected.
        expected_error_message = (
            "'usecols' must either be list-like of all strings, "
            "all unicode, all integers or a callable."
        )
        self.assertRaisesRegex(
            ValueError, expected_error_message, lambda: ps.read_csv(fn, usecols=[1, "amount"])
        )

        # check with index_col
        self.assert_eq(
            pd.read_csv(fn).set_index("name"),
            ps.read_csv(fn, index_col="name"),
            almost=True,
        )
# Databricks notebook source # MAGIC %md # MAGIC ## Import Data # MAGIC [More info around pyspark.pandas](https://databricks.com/blog/2021/10/04/pandas-api-on-upcoming-apache-spark-3-2.html) # COMMAND ---------- import pyspark.pandas as pd #import databricks.koalas as pd # for spark less than 3.2 data_file = "/mnt/training/airbnb-sf-listings.csv" airbnb_sf_listings = pd.read_csv(data_file, quotechar='"', escapechar='"') display(airbnb_sf_listings) # COMMAND ---------- # MAGIC %md # MAGIC ## Explore the data # COMMAND ---------- airbnb_sf_listings.describe() # COMMAND ---------- airbnb_sf_listings['price'].plot.hist(100) # COMMAND ---------- airbnb_spark_df = airbnb_sf_listings.to_spark( ) #conversion from dataframe to spark
def test_read_csv_with_encoding(self):
    # SPARK-37181: Read csv supporting latin-1 encoding.
    with self.csv_file(self.csv_text) as fn:
        reference = pd.read_csv(fn, encoding="latin-1")
        result = ps.read_csv(fn, encoding="latin-1")
        self.assert_eq(reference, result, almost=True)