Example 1
    def test_to_csv_with_path(self):
        pdf = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
        kdf = ps.DataFrame(pdf)

        tmp_dir = "{}/tmp1".format(self.tmp_dir)

        kdf.to_csv(tmp_dir, num_files=1)
        self._check_output(tmp_dir, pdf.to_csv(index=False))

        tmp_dir = "{}/tmp2".format(self.tmp_dir)

        self.assertRaises(KeyError, lambda: kdf.to_csv(tmp_dir, columns=["c"], num_files=1))

        # non-string names
        pdf = pd.DataFrame({10: [1, 2, 3], 20: ["a", "b", "c"]})
        kdf = ps.DataFrame(pdf)

        tmp_dir = "{}/tmp3".format(self.tmp_dir)

        kdf.to_csv(tmp_dir, num_files=1)
        self._check_output(tmp_dir, pdf.to_csv(index=False))

        tmp_dir = "{}/tmp4".format(self.tmp_dir)

        kdf.to_csv(tmp_dir, columns=[10], num_files=1)
        self._check_output(tmp_dir, pdf.to_csv(columns=[10], index=False))

        tmp_dir = "{}/tmp5".format(self.tmp_dir)

        self.assertRaises(TypeError, lambda: kdf.to_csv(tmp_dir, columns=10, num_files=1))
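The CSV path tests on this page call a _check_output helper and use a tmp_dir fixture that are not shown here. Below is a minimal sketch of what such a helper presumably does: read back the part file Spark wrote into the target directory and compare it with the expected CSV text. The class name and exact layout are assumptions, not the actual test base class.

import os
import unittest

class CsvOutputCheckMixin(unittest.TestCase):
    # Hypothetical stand-in for the helper the tests above call.
    def _check_output(self, tmp_dir, expected):
        # Spark writes the CSV as one or more "part-*" files inside tmp_dir;
        # with num_files=1 there is exactly one data file to compare.
        output_paths = [p for p in os.listdir(tmp_dir) if p.startswith("part-")]
        assert len(output_paths) > 0
        with open(os.path.join(tmp_dir, output_paths[0])) as f:
            self.assertEqual(f.read(), expected)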
Example 2
    def test_coalesce(self):
        num_partitions = 10
        psdf = ps.DataFrame({"age": [5, 5, 2, 2], "name": ["Bob", "Bob", "Alice", "Alice"]})
        psdf = psdf.spark.repartition(num_partitions)

        num_partitions -= 1
        new_psdf = psdf.spark.coalesce(num_partitions)
        self.assertEqual(new_psdf.to_spark().rdd.getNumPartitions(), num_partitions)
        self.assert_eq(psdf.sort_index(), new_psdf.sort_index())

        # Preserves Index
        psdf = psdf.set_index("age")
        num_partitions -= 1
        new_psdf = psdf.spark.coalesce(num_partitions)
        self.assertEqual(new_psdf.to_spark().rdd.getNumPartitions(), num_partitions)
        self.assert_eq(psdf.sort_index(), new_psdf.sort_index())

        # Reflects internal changes
        psdf = psdf.reset_index()
        psdf = psdf.set_index("name")
        psdf2 = psdf + 1
        num_partitions -= 1
        self.assert_eq(psdf2.sort_index(), (psdf + 1).spark.coalesce(num_partitions).sort_index())

        # Preserves MultiIndex
        psdf = ps.DataFrame({"a": ["a", "b", "c"]}, index=[[1, 2, 3], [4, 5, 6]])
        num_partitions -= 1
        psdf = psdf.spark.repartition(num_partitions)

        num_partitions -= 1
        new_psdf = psdf.spark.coalesce(num_partitions)
        self.assertEqual(new_psdf.to_spark().rdd.getNumPartitions(), num_partitions)
        self.assert_eq(psdf.sort_index(), new_psdf.sort_index())
Example 3
    def test_expanding_count(self):
        # The behaviour of Expanding.count differs between pandas>=1.0.0 and lower versions;
        # we follow the behaviour of the latest version of pandas.
        if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"):
            self._test_expanding_func("count")
        else:
            # Series
            idx = np.random.rand(3)
            psser = ps.Series([1, 2, 3], index=idx, name="a")
            expected_result = pd.Series([None, 2.0, 3.0], index=idx, name="a")
            self.assert_eq(psser.expanding(2).count().sort_index(), expected_result.sort_index())
            self.assert_eq(psser.expanding(2).count().sum(), expected_result.sum())

            # MultiIndex
            midx = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")])
            psser = ps.Series([1, 2, 3], index=midx, name="a")
            expected_result = pd.Series([None, 2.0, 3.0], index=midx, name="a")
            self.assert_eq(psser.expanding(2).count().sort_index(), expected_result.sort_index())

            # DataFrame
            psdf = ps.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]})
            expected_result = pd.DataFrame({"a": [None, 2.0, 3.0, 4.0], "b": [None, 2.0, 3.0, 4.0]})
            self.assert_eq(psdf.expanding(2).count().sort_index(), expected_result.sort_index())
            self.assert_eq(psdf.expanding(2).count().sum(), expected_result.sum())

            # MultiIndex columns
            idx = np.random.rand(4)
            psdf = ps.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]}, index=idx)
            psdf.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
            expected_result = pd.DataFrame(
                {("a", "x"): [None, 2.0, 3.0, 4.0], ("a", "y"): [None, 2.0, 3.0, 4.0]},
                index=idx,
            )
            self.assert_eq(psdf.expanding(2).count().sort_index(), expected_result.sort_index())
Example 4
    def test_to_csv(self):
        pdf = pd.DataFrame({"aa": [1, 2, 3], "bb": [4, 5, 6]}, index=[0, 1, 3])
        kdf = ps.DataFrame(pdf)

        self.assert_eq(kdf.to_csv(), pdf.to_csv(index=False))
        self.assert_eq(kdf.to_csv(columns=["aa"]), pdf.to_csv(columns=["aa"], index=False))
        self.assert_eq(kdf.aa.to_csv(), pdf.aa.to_csv(index=False, header=True))

        pdf = pd.DataFrame({"a": [1, np.nan, 3], "b": ["one", "two", None]}, index=[0, 1, 3])
        kdf = ps.from_pandas(pdf)

        self.assert_eq(kdf.to_csv(na_rep="null"), pdf.to_csv(na_rep="null", index=False))
        self.assert_eq(
            kdf.a.to_csv(na_rep="null"), pdf.a.to_csv(na_rep="null", index=False, header=True)
        )

        self.assertRaises(KeyError, lambda: kdf.to_csv(columns=["ab"]))

        pdf = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]}, index=[0, 1, 3])
        kdf = ps.from_pandas(pdf)

        self.assert_eq(kdf.to_csv(), pdf.to_csv(index=False))
        self.assert_eq(kdf.to_csv(header=False), pdf.to_csv(header=False, index=False))
        self.assert_eq(kdf.to_csv(), pdf.to_csv(index=False))

        # non-string names
        pdf = pd.DataFrame({10: [1, 2, 3], 20: [4, 5, 6]}, index=[0, 1, 3])
        kdf = ps.DataFrame(pdf)

        self.assert_eq(kdf.to_csv(), pdf.to_csv(index=False))
        self.assert_eq(kdf.to_csv(columns=[10]), pdf.to_csv(columns=[10], index=False))

        self.assertRaises(TypeError, lambda: kdf.to_csv(columns=10))
Example 5
    def test_sql_with_index_col(self):
        import pandas as pd

        # Index
        psdf = ps.DataFrame(
            {"A": [1, 2, 3], "B": [4, 5, 6]},
            index=pd.Index(["a", "b", "c"], name="index"),
        )
        psdf_reset_index = psdf.reset_index()
        actual = ps.sql("select * from {psdf_reset_index} where A > 1",
                        index_col="index")
        expected = psdf.iloc[[1, 2]]
        self.assert_eq(actual, expected)

        # MultiIndex
        psdf = ps.DataFrame(
            {
                "A": [1, 2, 3],
                "B": [4, 5, 6]
            },
            index=pd.MultiIndex.from_tuples([("a", "b"), ("c", "d"),
                                             ("e", "f")],
                                            names=["index1", "index2"]),
        )
        psdf_reset_index = psdf.reset_index()
        actual = ps.sql("select * from {psdf_reset_index} where A > 1",
                        index_col=["index1", "index2"])
        expected = psdf.iloc[[1, 2]]
        self.assert_eq(actual, expected)
Example 6
 def test_sql_with_python_objects(self):
     self.assert_eq(ps.sql("SELECT {col} as a FROM range(1)", col="lit"),
                    ps.DataFrame({"a": ["lit"]}))
     self.assert_eq(
         ps.sql("SELECT id FROM range(10) WHERE id IN {pred}",
                col="lit",
                pred=(1, 2, 3)),
         ps.DataFrame({"id": [1, 2, 3]}),
     )
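Based on the _convert_var helper shown in Example 14 below, the second call above is presumably rendered by plain string substitution before execution; a minimal illustration of that assumption:

query = "SELECT id FROM range(10) WHERE id IN {pred}"
# Strings become quoted literals and tuples become parenthesized value lists
# (see _convert_var in Example 14), so the statement sent to Spark is roughly:
rendered = query.format(pred="(1, 2, 3)")
print(rendered)  # SELECT id FROM range(10) WHERE id IN (1, 2, 3)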
Example 7
    def test_np_unsupported_frame(self):
        psdf = self.psdf
        with self.assertRaisesRegex(NotImplementedError,
                                    "on-Spark.*not.*support.*sqrt.*"):
            np.sqrt(psdf, psdf)

        psdf1 = ps.DataFrame({"A": [1, 2, 3]})
        psdf2 = ps.DataFrame({("A", "B"): [4, 5, 6]})
        with self.assertRaisesRegex(
                ValueError, "cannot join with no overlapping index names"):
            np.left_shift(psdf1, psdf2)
Example 8
    def test_sql_with_pandas_on_spark_objects(self):
        psdf = ps.DataFrame({"a": [1, 2, 3, 4]})

        self.assert_eq(ps.sql("SELECT {col} FROM {tbl}", col=psdf.a, tbl=psdf),
                       psdf)
        self.assert_eq(ps.sql("SELECT {tbl.a} FROM {tbl}", tbl=psdf), psdf)

        psdf = ps.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        self.assert_eq(
            ps.sql("SELECT {col}, {col2} FROM {tbl}",
                   col=psdf.A,
                   col2=psdf.B,
                   tbl=psdf), psdf)
        self.assert_eq(ps.sql("SELECT {tbl.A}, {tbl.B} FROM {tbl}", tbl=psdf),
                       psdf)
Example 9
    def test_spark_io(self):
        with self.temp_dir() as tmp:
            pdf = self.test_pdf
            expected = ps.DataFrame(pdf)

            # Write out partitioned by one column
            expected.to_spark_io(tmp,
                                 format="json",
                                 mode="overwrite",
                                 partition_cols="i32")
            # Reset column order, as once the data is written out, Spark rearranges partition
            # columns to appear first.
            actual = ps.read_spark_io(tmp, format="json")
            self.assertFalse((actual.columns == self.test_column_order).all())
            actual = actual[self.test_column_order]
            self.assert_eq(
                actual.sort_values(by="f").to_spark().toPandas(),
                expected.sort_values(by="f").to_spark().toPandas(),
            )

            # Write out partitioned by two columns
            expected.to_spark_io(tmp,
                                 format="json",
                                 mode="overwrite",
                                 partition_cols=["i32", "bhello"])
            # Reset column order, as once the data is written out, Spark rearranges partition
            # columns to appear first.
            actual = ps.read_spark_io(path=tmp, format="json")
            self.assertFalse((actual.columns == self.test_column_order).all())
            actual = actual[self.test_column_order]
            self.assert_eq(
                actual.sort_values(by="f").to_spark().toPandas(),
                expected.sort_values(by="f").to_spark().toPandas(),
            )

            # When index columns are known
            pdf = self.test_pdf
            expected = ps.DataFrame(pdf)
            col_order = ["f", "i32", "i64"]

            expected_idx = expected.set_index("bhello")[col_order]
            actual_idx = ps.read_spark_io(tmp,
                                          format="json",
                                          index_col="bhello")[col_order]
            self.assert_eq(
                actual_idx.sort_values(by="f").to_spark().toPandas(),
                expected_idx.sort_values(by="f").to_spark().toPandas(),
            )
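The Spark IO tests on this page (test_spark_io, test_parquet_read, test_orc_write, test_table, test_parquet_write) share a test_pdf fixture, a test_column_order list and a temp_dir context manager that are not included here. A rough, self-contained sketch of what those fixtures could look like follows; the column names and types are taken from the schema string used in test_parquet_read ("i32 int, i64 long, f double, bhello string"), everything else is assumed.

import contextlib
import shutil
import tempfile

import numpy as np
import pandas as pd

test_column_order = ["i32", "i64", "f", "bhello"]

def make_test_pdf():
    # Small frame with the columns the tests partition by ("i32", "bhello")
    # and sort by ("f"); sizes and values are arbitrary.
    return pd.DataFrame(
        {
            "i32": np.arange(20, dtype=np.int32) % 3,
            "i64": np.arange(20, dtype=np.int64) % 5,
            "f": np.arange(20, dtype=np.float64),
            "bhello": np.random.choice(["hello", "yo", "people"], size=20),
        },
        columns=test_column_order,
    )

@contextlib.contextmanager
def temp_dir():
    # Yield a scratch directory and remove it afterwards.
    path = tempfile.mkdtemp()
    try:
        yield path
    finally:
        shutil.rmtree(path, ignore_errors=True)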
Example 10
    def test_compute_hist_multi_columns(self):
        expected_bins = np.linspace(1, 50, 11)
        kdf = ps.DataFrame(
            {
                "a": [1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 50],
                "b": [50, 50, 30, 30, 30, 24, 10, 5, 4, 3, 1],
            }
        )

        bins = HistogramPlotBase.get_bins(kdf.to_spark(), 10)
        self.assert_eq(pd.Series(expected_bins), pd.Series(bins))

        expected_histograms = [
            np.array([5, 4, 1, 0, 0, 0, 0, 0, 0, 1]),
            np.array([4, 1, 0, 0, 1, 3, 0, 0, 0, 2]),
        ]
        histograms = HistogramPlotBase.compute_hist(kdf, bins)
        expected_names = ["a", "b"]

        for histogram, expected_histogram, expected_name in zip(
            histograms, expected_histograms, expected_names
        ):
            self.assert_eq(
                pd.Series(expected_histogram, name=expected_name), histogram, almost=True
            )
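As a quick sanity check of the expected values above (not part of the test), NumPy's histogram with the same data and bin edges yields the same counts:

import numpy as np

a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 50]
b = [50, 50, 30, 30, 30, 24, 10, 5, 4, 3, 1]
bins = np.linspace(1, 50, 11)
print(np.histogram(a, bins=bins)[0])  # [5 4 1 0 0 0 0 0 0 1]
print(np.histogram(b, bins=bins)[0])  # [4 1 0 0 1 3 0 0 0 2]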
Example 11
    def test_timestamp_subtraction(self):
        pdf = self.pdf1
        kdf = ps.from_pandas(pdf)

        # These fail on certain OSes, presumably due to different
        # timezone behaviours inherited from the C library.

        actual = (kdf["end_date"] - kdf["start_date"] - 1).to_pandas()
        expected = (pdf["end_date"] - pdf["start_date"]) // np.timedelta64(1, "s") - 1
        # self.assert_eq(actual, expected)

        actual = (kdf["end_date"] - pd.Timestamp("2012-1-1 12:45:31") - 1).to_pandas()
        expected = (pdf["end_date"] - pd.Timestamp("2012-1-1 12:45:31")) // np.timedelta64(
            1, "s"
        ) - 1
        # self.assert_eq(actual, expected)

        actual = (pd.Timestamp("2013-3-11 21:45:00") - kdf["start_date"] - 1).to_pandas()
        expected = (pd.Timestamp("2013-3-11 21:45:00") - pdf["start_date"]) // np.timedelta64(
            1, "s"
        ) - 1
        # self.assert_eq(actual, expected)

        kdf = ps.DataFrame(
            {"a": pd.date_range("2016-12-31", "2017-01-08", freq="D"), "b": pd.Series(range(9))}
        )
        expected_error_message = "datetime subtraction can only be applied to datetime series."
        with self.assertRaisesRegex(TypeError, expected_error_message):
            kdf["a"] - kdf["b"]
        with self.assertRaisesRegex(TypeError, expected_error_message):
            kdf["a"] - 1
        with self.assertRaisesRegex(TypeError, expected_error_message):
            1 - kdf["a"]
Example 12
    def test_to_csv_with_partition_cols(self):
        pdf = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
        kdf = ps.DataFrame(pdf)

        kdf.to_csv(self.tmp_dir, partition_cols="b", num_files=1)

        partition_paths = [
            path for path in os.listdir(self.tmp_dir) if path.startswith("b=")
        ]
        assert len(partition_paths) > 0
        for partition_path in partition_paths:
            column, value = partition_path.split("=")
            expected = pdf[pdf[column] == value].drop(
                "b", axis=1).to_csv(index=False)

            output_paths = [
                path for path in os.listdir("%s/%s" %
                                            (self.tmp_dir, partition_path))
                if path.startswith("part-")
            ]
            assert len(output_paths) > 0
            output_path = "%s/%s/%s" % (self.tmp_dir, partition_path,
                                        output_paths[0])
            with open(output_path) as f:
                self.assertEqual(f.read(), expected)
Example 13
    def test_parquet_read(self):
        with self.temp_dir() as tmp:
            data = self.test_pdf
            self.spark.createDataFrame(
                data, "i32 int, i64 long, f double, bhello string"
            ).coalesce(1).write.parquet(tmp, mode="overwrite")

            def check(columns):
                expected = pd.read_parquet(tmp, columns=columns)
                actual = ps.read_parquet(tmp, columns=columns)
                self.assertPandasEqual(expected, actual.to_pandas())

            check(None)
            check(["i32", "i64"])
            check(["i64", "i32"])

            # check with pyspark patch.
            expected = pd.read_parquet(tmp)
            actual = ps.read_parquet(tmp)
            self.assertPandasEqual(expected, actual.to_pandas())

            # When index columns are known
            pdf = self.test_pdf
            expected = ps.DataFrame(pdf)

            expected_idx = expected.set_index("bhello")[["f", "i32", "i64"]]
            actual_idx = ps.read_parquet(
                tmp, index_col="bhello")[["f", "i32", "i64"]]
            self.assert_eq(
                actual_idx.sort_values(by="f").to_spark().toPandas(),
                expected_idx.sort_values(by="f").to_spark().toPandas(),
            )
Example 14
 def _convert_var(self, var: Any) -> Any:
     """
     Converts a python object into a string that is legal SQL.
     """
     if isinstance(var, (int, float)):
         return str(var)
     if isinstance(var, Series):
         return self._convert_var(var.to_dataframe())
     if isinstance(var, pd.DataFrame):
         return self._convert_var(ps.DataFrame(var))
     if isinstance(var, DataFrame):
         df_id = "pandas_on_spark_" + str(id(var))
         if df_id not in self._temp_views:
             sdf = var._to_spark()
             sdf.createOrReplaceTempView(df_id)
             self._temp_views[df_id] = sdf
         return df_id
     if isinstance(var, str):
         return '"' + escape_sql_string(var) + '"'
     if isinstance(var, list):
         return "(" + ", ".join([self._convert_var(v) for v in var]) + ")"
     if isinstance(var, (tuple, range)):
         return self._convert_var(list(var))
     raise ValueError("Unsupported variable type {}: {}".format(
         type(var).__name__, str(var)))
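A short illustration of the conversions this helper performs, following the branches above. The proc instance and psdf frame are assumed names for illustration; the DataFrame suffix is whatever id() returns.

# proc is assumed to be the SQL processor instance that owns _convert_var;
# psdf is any pandas-on-Spark DataFrame.
#
#   proc._convert_var(3)          -> "3"
#   proc._convert_var(2.5)        -> "2.5"
#   proc._convert_var("abc")      -> '"abc"'          (escaped via escape_sql_string)
#   proc._convert_var([1, 2, 3])  -> "(1, 2, 3)"
#   proc._convert_var((1, 2, 3))  -> "(1, 2, 3)"      (tuples/ranges reuse the list branch)
#   proc._convert_var(psdf)       -> "pandas_on_spark_<id>", registered as a temp view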
Example 15
 def test_checkpoint(self):
     with self.temp_dir() as tmp:
         self.spark.sparkContext.setCheckpointDir(tmp)
         psdf = ps.DataFrame({"a": ["a", "b", "c"]})
         new_psdf = psdf.spark.checkpoint()
         self.assertIsNotNone(os.listdir(tmp))
         self.assert_eq(psdf, new_psdf)
Example 16
    def test_orc_write(self):
        with self.temp_dir() as tmp:
            pdf = self.test_pdf
            expected = ps.DataFrame(pdf)

            # Write out partitioned by one column
            expected.to_orc(tmp, mode="overwrite", partition_cols="i32")
            # Reset column order, as once the data is written out, Spark rearranges partition
            # columns to appear first.
            actual = ps.read_orc(tmp)
            self.assertFalse((actual.columns == self.test_column_order).all())
            actual = actual[self.test_column_order]
            self.assert_eq(
                actual.sort_values(by="f").to_spark().toPandas(),
                expected.sort_values(by="f").to_spark().toPandas(),
            )

            # Write out partitioned by two columns
            expected.to_orc(tmp,
                            mode="overwrite",
                            partition_cols=["i32", "bhello"])
            # Reset column order, as once the data is written out, Spark rearranges partition
            # columns to appear first.
            actual = ps.read_orc(tmp)
            self.assertFalse((actual.columns == self.test_column_order).all())
            actual = actual[self.test_column_order]
            self.assert_eq(
                actual.sort_values(by="f").to_spark().toPandas(),
                expected.sort_values(by="f").to_spark().toPandas(),
            )
Example 17
    def test_get_index_map(self):
        psdf = ps.DataFrame({
            "year": [2015, 2016],
            "month": [2, 3],
            "day": [4, 5]
        })
        sdf = psdf.to_spark()
        self.assertEqual(_get_index_map(sdf), (None, None))

        def check(actual, expected):
            actual_scols, actual_labels = actual
            expected_column_names, expected_labels = expected
            self.assertEqual(len(actual_scols), len(expected_column_names))
            for actual_scol, expected_column_name in zip(
                    actual_scols, expected_column_names):
                expected_scol = sdf[expected_column_name]
                self.assertTrue(spark_column_equals(actual_scol,
                                                    expected_scol))
            self.assertEqual(actual_labels, expected_labels)

        check(_get_index_map(sdf, "year"), (["year"], [("year", )]))
        check(_get_index_map(sdf, ["year", "month"]),
              (["year", "month"], [("year", ), ("month", )]))

        self.assertRaises(KeyError,
                          lambda: _get_index_map(sdf, ["year", "hour"]))
Example 18
    def test_date_subtraction(self):
        pdf = self.pdf1
        kdf = ps.from_pandas(pdf)

        self.assert_eq(
            kdf["end_date"].dt.date - kdf["start_date"].dt.date,
            (pdf["end_date"].dt.date - pdf["start_date"].dt.date).dt.days,
        )

        self.assert_eq(
            kdf["end_date"].dt.date - datetime.date(2012, 1, 1),
            (pdf["end_date"].dt.date - datetime.date(2012, 1, 1)).dt.days,
        )

        self.assert_eq(
            datetime.date(2013, 3, 11) - kdf["start_date"].dt.date,
            (datetime.date(2013, 3, 11) - pdf["start_date"].dt.date).dt.days,
        )

        kdf = ps.DataFrame(
            {"a": pd.date_range("2016-12-31", "2017-01-08", freq="D"), "b": pd.Series(range(9))}
        )
        expected_error_message = "date subtraction can only be applied to date series."
        with self.assertRaisesRegex(TypeError, expected_error_message):
            kdf["a"].dt.date - kdf["b"]
        with self.assertRaisesRegex(TypeError, expected_error_message):
            kdf["a"].dt.date - 1
        with self.assertRaisesRegex(TypeError, expected_error_message):
            1 - kdf["a"].dt.date
Example 19
    def test_kde_plot(self):
        psdf = ps.DataFrame({
            "a": [1, 2, 3, 4, 5],
            "b": [1, 3, 5, 7, 9],
            "c": [2, 4, 6, 8, 10]
        })

        pdf = pd.DataFrame({
            "Density": [
                0.03515491,
                0.06834979,
                0.00663503,
                0.02372059,
                0.06834979,
                0.01806934,
                0.01806934,
                0.06834979,
                0.02372059,
            ],
            "names": ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
            "index": [-3.5, 5.5, 14.5, -3.5, 5.5, 14.5, -3.5, 5.5, 14.5],
        })

        actual = psdf.plot.kde(bw_method=5, ind=3)

        expected = express.line(pdf, x="index", y="Density", color="names")
        expected["layout"]["xaxis"]["title"] = None

        self.assertEqual(pprint.pformat(actual.to_dict()),
                         pprint.pformat(expected.to_dict()))
Example 20
    def test_table(self):
        with self.table("test_table"):
            pdf = self.test_pdf
            expected = ps.DataFrame(pdf)

            # Write out partitioned by one column
            expected.spark.to_table("test_table",
                                    mode="overwrite",
                                    partition_cols="i32")
            # Reset column order, as once the data is written out, Spark rearranges partition
            # columns to appear first.
            actual = ps.read_table("test_table")
            self.assertFalse((actual.columns == self.test_column_order).all())
            actual = actual[self.test_column_order]
            self.assert_eq(
                actual.sort_values(by="f").to_spark().toPandas(),
                expected.sort_values(by="f").to_spark().toPandas(),
            )

            # Write out partitioned by two columns
            expected.to_table("test_table",
                              mode="overwrite",
                              partition_cols=["i32", "bhello"])
            # Reset column order, as once the data is written out, Spark rearranges partition
            # columns to appear first.
            actual = ps.read_table("test_table")
            self.assertFalse((actual.columns == self.test_column_order).all())
            actual = actual[self.test_column_order]
            self.assert_eq(
                actual.sort_values(by="f").to_spark().toPandas(),
                expected.sort_values(by="f").to_spark().toPandas(),
            )

            # When index columns are known
            expected_idx = expected.set_index("bhello")[["f", "i32", "i64"]]
            actual_idx = ps.read_table("test_table",
                                       index_col="bhello")[["f", "i32", "i64"]]
            self.assert_eq(
                actual_idx.sort_values(by="f").to_spark().toPandas(),
                expected_idx.sort_values(by="f").to_spark().toPandas(),
            )

            expected_idx = expected.set_index(["bhello"])[["f", "i32", "i64"]]
            actual_idx = ps.read_table("test_table", index_col=["bhello"])[["f", "i32", "i64"]]
            self.assert_eq(
                actual_idx.sort_values(by="f").to_spark().toPandas(),
                expected_idx.sort_values(by="f").to_spark().toPandas(),
            )

            expected_idx = expected.set_index(["i32", "bhello"])[["f", "i64"]]
            actual_idx = ps.read_table("test_table",
                                       index_col=["i32",
                                                  "bhello"])[["f", "i64"]]
            self.assert_eq(
                actual_idx.sort_values(by="f").to_spark().toPandas(),
                expected_idx.sort_values(by="f").to_spark().toPandas(),
            )
Example 21
    def test_to_csv_with_path_and_basic_options(self):
        pdf = pd.DataFrame({"aa": [1, 2, 3], "bb": ["a", "b", "c"]})
        kdf = ps.DataFrame(pdf)

        kdf.to_csv(self.tmp_dir, num_files=1, sep="|", header=False, columns=["aa"])
        expected = pdf.to_csv(index=False, sep="|", header=False, columns=["aa"])

        self._check_output(self.tmp_dir, expected)
Example 22
    def test_to_csv_with_path_and_pyspark_options(self):
        pdf = pd.DataFrame({"a": [1, 2, 3, None], "b": ["a", "b", "c", None]})
        kdf = ps.DataFrame(pdf)

        # "nullValue" is passed through to Spark's CSV writer options; it plays
        # the role of pandas' na_rep in the expected output below.
        kdf.to_csv(self.tmp_dir, nullValue="null", num_files=1)
        expected = pdf.to_csv(index=False, na_rep="null")

        self._check_output(self.tmp_dir, expected)
Example 23
    def test_missing(self):
        kdf = ps.DataFrame(np.random.rand(2500, 4), columns=["a", "b", "c", "d"])

        unsupported_functions = ["box", "hexbin"]

        for name in unsupported_functions:
            with self.assertRaisesRegex(
                PandasNotImplementedError, "method.*DataFrame.*{}.*not implemented".format(name)
            ):
                getattr(kdf.plot, name)()
Example 24
    def test_to_json_with_path(self):
        pdf = pd.DataFrame({"a": [1], "b": ["a"]})
        psdf = ps.DataFrame(pdf)

        psdf.to_json(self.tmp_dir, num_files=1)
        expected = pdf.to_json(orient="records")

        output_paths = [path for path in os.listdir(self.tmp_dir) if path.startswith("part-")]
        assert len(output_paths) > 0
        output_path = "%s/%s" % (self.tmp_dir, output_paths[0])
        self.assertEqual("[%s]" % open(output_path).read().strip(), expected)
Example 25
    def test_compute_hist_single_column(self):
        kdf = ps.DataFrame(
            {"a": [1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 50]}, index=[0, 1, 3, 5, 6, 8, 9, 9, 9, 10, 10]
        )

        expected_bins = np.linspace(1, 50, 11)
        bins = HistogramPlotBase.get_bins(kdf[["a"]].to_spark(), 10)

        expected_histogram = np.array([5, 4, 1, 0, 0, 0, 0, 0, 0, 1])
        histogram = HistogramPlotBase.compute_hist(kdf[["a"]], bins)[0]
        self.assert_eq(pd.Series(expected_bins), pd.Series(bins))
        self.assert_eq(pd.Series(expected_histogram, name="a"), histogram, almost=True)
Example 26
    def test_to_csv_with_path_and_basic_options_multiindex_columns(self):
        pdf = pd.DataFrame({("x", "a"): [1, 2, 3], ("y", "b"): ["a", "b", "c"]})
        kdf = ps.DataFrame(pdf)

        with self.assertRaises(ValueError):
            kdf.to_csv(self.tmp_dir, num_files=1, sep="|", columns=[("x", "a")])

        kdf.to_csv(self.tmp_dir, num_files=1, sep="|", header=["a"], columns=[("x", "a")])
        pdf.columns = ["a", "b"]
        expected = pdf.to_csv(index=False, sep="|", columns=["a"])

        self._check_output(self.tmp_dir, expected)
Example 27
    def test_repartition(self):
        kdf = ps.DataFrame({
            "age": [5, 5, 2, 2],
            "name": ["Bob", "Bob", "Alice", "Alice"]
        })
        num_partitions = kdf.to_spark().rdd.getNumPartitions() + 1

        num_partitions += 1
        new_kdf = kdf.spark.repartition(num_partitions)
        self.assertEqual(new_kdf.to_spark().rdd.getNumPartitions(),
                         num_partitions)
        self.assert_eq(kdf.sort_index(), new_kdf.sort_index())

        # Preserves Index
        kdf = kdf.set_index("age")
        num_partitions += 1
        new_kdf = kdf.spark.repartition(num_partitions)
        self.assertEqual(new_kdf.to_spark().rdd.getNumPartitions(),
                         num_partitions)
        self.assert_eq(kdf.sort_index(), new_kdf.sort_index())

        # Reflects internal changes
        kdf = kdf.reset_index()
        kdf = kdf.set_index("name")
        kdf2 = kdf + 1
        num_partitions += 1
        self.assert_eq(
            kdf2.sort_index(), (kdf + 1).spark.repartition(num_partitions).sort_index()
        )

        # Preserves MultiIndex
        kdf = ps.DataFrame({"a": ["a", "b", "c"]},
                           index=[[1, 2, 3], [4, 5, 6]])
        num_partitions = kdf.to_spark().rdd.getNumPartitions() + 1
        new_kdf = kdf.spark.repartition(num_partitions)
        self.assertEqual(new_kdf.to_spark().rdd.getNumPartitions(),
                         num_partitions)
        self.assert_eq(kdf.sort_index(), new_kdf.sort_index())
Example 28
    def test_repr_float_index(self):
        kdf = ps.DataFrame(
            {"a": np.random.rand(ReprTest.max_display_count)},
            index=np.random.rand(ReprTest.max_display_count),
        )
        self.assertTrue("Showing only the first" not in repr(kdf))
        self.assert_eq(repr(kdf), repr(kdf.to_pandas()))
        self.assertTrue("Showing only the first" not in repr(kdf.a))
        self.assert_eq(repr(kdf.a), repr(kdf.a.to_pandas()))
        self.assertTrue("Showing only the first" not in repr(kdf.index))
        self.assert_eq(repr(kdf.index), repr(kdf.index.to_pandas()))

        self.assertTrue("Showing only the first" not in kdf._repr_html_())
        self.assertEqual(kdf._repr_html_(), kdf.to_pandas()._repr_html_())

        kdf = ps.DataFrame(
            {"a": np.random.rand(ReprTest.max_display_count + 1)},
            index=np.random.rand(ReprTest.max_display_count + 1),
        )
        self.assertTrue("Showing only the first" in repr(kdf))
        self.assertTrue("Showing only the first" in repr(kdf.a))
        self.assertTrue("Showing only the first" in repr(kdf.index))
        self.assertTrue("Showing only the first" in kdf._repr_html_())
Example 29
    def test_parquet_write(self):
        with self.temp_dir() as tmp:
            pdf = self.test_pdf
            expected = ps.DataFrame(pdf)

            # Write out partitioned by one column
            expected.to_parquet(tmp, mode="overwrite", partition_cols="i32")
            # Reset column order, as once the data is written out, Spark rearranges partition
            # columns to appear first.
            actual = ps.read_parquet(tmp)
            self.assertFalse((actual.columns == self.test_column_order).all())
            actual = actual[self.test_column_order]
            self.assert_eq(
                actual.sort_values(by="f").to_spark().toPandas(),
                expected.sort_values(by="f").to_spark().toPandas(),
            )

            # Write out partitioned by two columns
            expected.to_parquet(tmp, mode="overwrite", partition_cols=["i32", "bhello"])
            # Reset column order, as once the data is written out, Spark rearranges partition
            # columns to appear first.
            actual = ps.read_parquet(tmp)
            self.assertFalse((actual.columns == self.test_column_order).all())
            actual = actual[self.test_column_order]
            self.assert_eq(
                actual.sort_values(by="f").to_spark().toPandas(),
                expected.sort_values(by="f").to_spark().toPandas(),
            )

            # Set `compression` with string
            expected.to_parquet(tmp, mode="overwrite", partition_cols="i32", compression="none")
            actual = ps.read_parquet(tmp)
            self.assertFalse((actual.columns == self.test_column_order).all())
            actual = actual[self.test_column_order]
            self.assert_eq(
                actual.sort_values(by="f").to_spark().toPandas(),
                expected.sort_values(by="f").to_spark().toPandas(),
            )

            # Test `options` parameter
            expected.to_parquet(
                tmp, mode="overwrite", partition_cols="i32", options={"compression": "none"}
            )
            actual = ps.read_parquet(tmp)
            self.assertFalse((actual.columns == self.test_column_order).all())
            actual = actual[self.test_column_order]
            self.assert_eq(
                actual.sort_values(by="f").to_spark().toPandas(),
                expected.sort_values(by="f").to_spark().toPandas(),
            )
Example 30
 def test_timestamp_subtraction_errors(self):
     psdf = ps.DataFrame(
         {"a": pd.date_range("2016-12-31", "2017-01-08", freq="D"), "b": pd.Series(range(9))}
     )
     expected_error_message = "Datetime subtraction can only be applied to datetime series."
     with self.assertRaisesRegex(TypeError, expected_error_message):
         psdf["a"] - psdf["b"]
     with self.assertRaisesRegex(TypeError, expected_error_message):
         psdf["a"] - 1
     with self.assertRaisesRegex(TypeError, expected_error_message):
         1 - psdf["a"]