Example #1
    def test_read_csv_with_comment(self):
        with self.csv_file(self.csv_text_with_comments) as fn:
            expected = pd.read_csv(fn, comment="#")
            actual = ps.read_csv(fn, comment="#")
            self.assert_eq(expected, actual, almost=True)

            self.assertRaisesRegex(
                ValueError,
                "Only length-1 comment characters supported",
                lambda: ps.read_csv(fn, comment="").show(),
            )
            self.assertRaisesRegex(
                ValueError,
                "Only length-1 comment characters supported",
                lambda: ps.read_csv(fn, comment="##").show(),
            )
            self.assertRaisesRegex(
                ValueError,
                "Only length-1 comment characters supported",
                lambda: ps.read_csv(fn, comment=1),
            )
            self.assertRaisesRegex(
                ValueError,
                "Only length-1 comment characters supported",
                lambda: ps.read_csv(fn, comment=[1]),
            )
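
In short, only a length-1 string is accepted for comment. A minimal standalone sketch, assuming a running Spark session and a hypothetical comments.csv whose comment lines start with "#":

import pyspark.pandas as ps

# Lines beginning with "#" are dropped before parsing.
psdf = ps.read_csv("comments.csv", comment="#")

# Any other value ("", "##", 1, [1]) raises
# ValueError: Only length-1 comment characters supported.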
Example #2
    def test_read_csv_with_dtype(self):
        with self.csv_file(self.csv_text) as fn:
            self.assert_eq(ps.read_csv(fn), pd.read_csv(fn), almost=True)
            self.assert_eq(ps.read_csv(fn, dtype=str), pd.read_csv(fn, dtype=str))
            self.assert_eq(
                ps.read_csv(fn, dtype={"amount": "int64"}),
                pd.read_csv(fn, dtype={"amount": "int64"}),
            )
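
The same dtype forms in isolation; a minimal sketch (assumes a running Spark session and a hypothetical sales.csv with an "amount" column):

import pyspark.pandas as ps

everything_str = ps.read_csv("sales.csv", dtype=str)                # every column read as string
amount_int64 = ps.read_csv("sales.csv", dtype={"amount": "int64"})  # override a single column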
Example #3
    def test_read_csv_with_escapechar(self):
        with self.csv_file(self.e_escapeted_csv_text) as fn:
            self.assert_eq(
                ps.read_csv(fn, escapechar="E"), pd.read_csv(fn, escapechar="E"), almost=True
            )

            self.assert_eq(
                ps.read_csv(fn, escapechar="ABC", escape="E"),
                pd.read_csv(fn, escapechar="E"),
                almost=True,
            )
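
The second assertion is the notable one: a Spark-side "escape" option passed through **options takes precedence over the pandas-style escapechar, so the result still matches pandas with escapechar="E" even though escapechar="ABC" was supplied.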
Example #4
    def test_read_csv_with_squeeze(self):
        with self.csv_file(self.csv_text) as fn:
            expected = pd.read_csv(fn, squeeze=True, usecols=["name"])
            actual = ps.read_csv(fn, squeeze=True, usecols=["name"])
            self.assert_eq(expected, actual, almost=True)

            expected = pd.read_csv(fn, squeeze=True, usecols=["name", "amount"])
            actual = ps.read_csv(fn, squeeze=True, usecols=["name", "amount"])
            self.assert_eq(expected, actual, almost=True)

            expected = pd.read_csv(fn, squeeze=True, usecols=["name", "amount"], index_col=["name"])
            actual = ps.read_csv(fn, squeeze=True, usecols=["name", "amount"], index_col=["name"])
            self.assert_eq(expected, actual, almost=True)
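
In the first and third cases only one data column remains (index_col consumes "name" in the third), so squeeze=True yields a Series; with two data columns the result stays a DataFrame, matching pandas. Note that the squeeze parameter was deprecated in pandas 1.4 and removed in 2.0, where calling .squeeze("columns") on the result is the replacement.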
Example #5
    def check(header="infer", names=None, usecols=None, index_col=None):
        expected = pd.read_csv(
            fn, header=header, names=names, usecols=usecols, index_col=index_col
        )
        actual = ps.read_csv(
            fn, header=header, names=names, usecols=usecols, index_col=index_col
        )
        self.assert_eq(expected, actual, almost=True)
Example #6
    def test_read_csv_with_quotechar(self):
        with self.csv_file(self.q_quoted_csv_text) as fn:
            self.assert_eq(
                ps.read_csv(fn, quotechar="Q"), pd.read_csv(fn, quotechar="Q"), almost=True
            )
Example #7
    def test_read_csv_with_parse_dates(self):
        self.assertRaisesRegex(
            ValueError, "parse_dates", lambda: ps.read_csv("path", parse_dates=True)
        )
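
parse_dates is simply not implemented, hence the ValueError on any non-default value. A hedged workaround sketch is to convert after reading (events.csv and its "date" column are hypothetical):

import pyspark.pandas as ps

psdf = ps.read_csv("events.csv")
psdf["date"] = ps.to_datetime(psdf["date"])  # parse dates once the data is loaded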
Example #8
    def test_read_csv_with_mangle_dupe_cols(self):
        self.assertRaisesRegex(
            ValueError, "mangle_dupe_cols", lambda: ps.read_csv("path", mangle_dupe_cols=False)
        )
Example #9
    def test_read_csv_with_sep(self):
        with self.csv_file(self.tab_delimited_csv_text) as fn:
            expected = pd.read_csv(fn, sep="\t")
            actual = ps.read_csv(fn, sep="\t")
            self.assert_eq(expected, actual, almost=True)
Example #10
    def test_read_csv_with_limit(self):
        with self.csv_file(self.csv_text_with_comments) as fn:
            expected = pd.read_csv(fn, comment="#", nrows=2)
            actual = ps.read_csv(fn, comment="#", nrows=2)
            self.assert_eq(expected, actual, almost=True)
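
As the test name hints, nrows is presumably pushed down as a Spark-side limit; the assertion confirms it composes with comment the way pandas does, i.e. comment lines are dropped before the two-row cap is applied.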
Example #11
    def test_read_with_spark_schema(self):
        with self.csv_file(self.csv_text_2) as fn:
            actual = ps.read_csv(fn, names="A string, B string, C long, D long, E long")
            expected = pd.read_csv(fn, names=["A", "B", "C", "D", "E"])
            self.assert_eq(expected, actual)
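
The string form of names is a pyspark.pandas extension: a Spark DDL schema that fixes column names and Spark types in one argument, while the list form only names the columns and leaves types to inference. A sketch of the two spellings side by side (data.csv is hypothetical):

import pyspark.pandas as ps

# Spark DDL string: names and types together
typed = ps.read_csv("data.csv", names="A string, B string, C long, D long, E long")

# Plain list: names only, types inferred
named = ps.read_csv("data.csv", names=["A", "B", "C", "D", "E"])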
Example #12
    def test_read_csv(self):
        with self.csv_file(self.csv_text) as fn:

            def check(header="infer", names=None, usecols=None, index_col=None):
                expected = pd.read_csv(
                    fn, header=header, names=names, usecols=usecols, index_col=index_col
                )
                actual = ps.read_csv(
                    fn, header=header, names=names, usecols=usecols, index_col=index_col
                )
                self.assert_eq(expected, actual, almost=True)

            check()
            check(header=0)
            check(header=None)
            check(names=["n", "a"])
            check(names=[("x", "n"), ("y", "a")])
            check(names=[10, 20])
            check(header=0, names=["n", "a"])
            check(usecols=[1])
            check(usecols=[1, 0])
            check(usecols=["amount"])
            check(usecols=["amount", "name"])
            check(usecols=[])
            check(usecols=[1, 1])
            check(usecols=["amount", "amount"])
            check(header=None, usecols=[1])
            check(names=["n", "a"], usecols=["a"])
            check(header=None, names=["n", "a"], usecols=["a"])
            check(index_col=["amount"])
            check(header=None, index_col=[1])
            check(names=["n", "a"], index_col=["a"])

            # check with pyspark patch.
            expected = pd.read_csv(fn)
            actual = ps.read_csv(fn)
            self.assert_eq(expected, actual, almost=True)

            self.assertRaisesRegex(
                ValueError, "non-unique", lambda: ps.read_csv(fn, names=["n", "n"])
            )
            self.assertRaisesRegex(
                ValueError,
                "does not match the number.*3",
                lambda: ps.read_csv(fn, names=["n", "a", "b"]),
            )
            self.assertRaisesRegex(
                ValueError,
                "does not match the number.*3",
                lambda: ps.read_csv(fn, header=0, names=["n", "a", "b"]),
            )
            self.assertRaisesRegex(
                ValueError, "Usecols do not match.*3", lambda: ps.read_csv(fn, usecols=[1, 3])
            )
            self.assertRaisesRegex(
                ValueError,
                "Usecols do not match.*col",
                lambda: ps.read_csv(fn, usecols=["amount", "col"]),
            )
            self.assertRaisesRegex(
                ValueError, "Unknown header argument 1", lambda: ps.read_csv(fn, header="1")
            )
            expected_error_message = (
                "'usecols' must either be list-like of all strings, "
                "all unicode, all integers or a callable."
            )
            self.assertRaisesRegex(
                ValueError, expected_error_message, lambda: ps.read_csv(fn, usecols=[1, "amount"])
            )

            # check with index_col
            expected = pd.read_csv(fn).set_index("name")
            actual = ps.read_csv(fn, index_col="name")
            self.assert_eq(expected, actual, almost=True)
Example #13
# Databricks notebook source
# MAGIC %md
# MAGIC ## Import Data
# MAGIC [More info around pyspark.pandas](https://databricks.com/blog/2021/10/04/pandas-api-on-upcoming-apache-spark-3-2.html)

# COMMAND ----------

import pyspark.pandas as pd  # note: aliased as pd here; ps is the more common alias
# import databricks.koalas as pd  # for Spark versions below 3.2

data_file = "/mnt/training/airbnb-sf-listings.csv"
airbnb_sf_listings = pd.read_csv(data_file, quotechar='"', escapechar='"')
display(airbnb_sf_listings)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Explore the data

# COMMAND ----------

airbnb_sf_listings.describe()

# COMMAND ----------

airbnb_sf_listings['price'].plot.hist(bins=100)

# COMMAND ----------

airbnb_spark_df = airbnb_sf_listings.to_spark()  # convert the pandas-on-Spark DataFrame to a Spark DataFrame
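
# COMMAND ----------

# MAGIC %md
# MAGIC A hedged round-trip sketch: `pandas_api()` converts a Spark DataFrame back to pandas-on-Spark
# MAGIC (available since Spark 3.3; Spark 3.2 shipped it as `to_pandas_on_spark()`).

# COMMAND ----------

airbnb_psdf = airbnb_spark_df.pandas_api()  # Spark DataFrame -> pandas-on-Spark DataFrame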
Example #14
    def test_read_csv_with_encoding(self):
        # SPARK-37181: Read csv supporting latin-1 encoding.
        with self.csv_file(self.csv_text) as fn:
            expected = pd.read_csv(fn, encoding="latin-1")
            actual = ps.read_csv(fn, encoding="latin-1")
            self.assert_eq(expected, actual, almost=True)