Beispiel #1
0
    def test_get_dummies_date_datetime(self):
        """Compare ``pyspark.get_dummies`` with ``pd.get_dummies`` on date/datetime data.

        The comparison is run on the whole frame first, then on the ``d``
        (``datetime.date``) column and the ``dt`` (``datetime.datetime``)
        column individually.
        """

        def assert_same_dummies(pandas_obj, spark_obj):
            # pandas provides the expected encoding; the Spark-side result is
            # brought back to pandas before the comparison.
            expected = pd.get_dummies(pandas_obj)
            actual = pyspark.get_dummies(spark_obj)
            self.assertPandasAlmostEqual(actual.toPandas(), expected)

        dates = [
            datetime.date(2019, 1, 1),
            datetime.date(2019, 1, 2),
            datetime.date(2019, 1, 1),
        ]
        timestamps = [
            datetime.datetime(2019, 1, 1, 0, 0, 0),
            datetime.datetime(2019, 1, 1, 0, 0, 1),
            datetime.datetime(2019, 1, 1, 0, 0, 0),
        ]
        pdf = pd.DataFrame({'d': dates, 'dt': timestamps})
        sdf = self.spark.from_pandas(pdf)

        assert_same_dummies(pdf, sdf)
        assert_same_dummies(pdf.d, sdf.d)
        assert_same_dummies(pdf.dt, sdf.dt)
Beispiel #2
0
    def test_get_dummies_kwargs(self):
        """Check keyword arguments of ``pyspark.get_dummies`` against ``pd.get_dummies``.

        Covers ``prefix``/``prefix_sep`` and ``drop_first`` on an integer
        Series, then the default call and ``dummy_na=True`` on a Series that
        contains NaN values.
        """

        def check(series, **kwargs):
            # The pandas expectation is computed first; the Series is then
            # converted afresh for every case, with the same keyword
            # arguments forwarded to both implementations.
            expected = pd.get_dummies(series, **kwargs)
            converted = self.spark.from_pandas(series)
            actual = pyspark.get_dummies(converted, **kwargs)
            self.assertPandasAlmostEqual(actual.toPandas(), expected)

        # Disabled categorical variant of this input:
        # pd.Series([1, 1, 1, 2, 2, 1, 3, 4], dtype='category')
        ints = pd.Series([1, 1, 1, 2, 2, 1, 3, 4])
        check(ints, prefix='X', prefix_sep='-')
        check(ints, drop_first=True)

        # Input with missing values.
        # Disabled categorical variant of this input:
        # pd.Series([1, 1, 1, 2, np.nan, 3, np.nan, 5], dtype='category')
        with_nan = pd.Series([1, 1, 1, 2, np.nan, 3, np.nan, 5])
        check(with_nan)

        # Same input, asking for an explicit NaN indicator column.
        check(with_nan, dummy_na=True)
Beispiel #3
0
    def test_get_dummies_object(self):
        """Compare ``pyspark.get_dummies`` with ``pd.get_dummies`` on string columns.

        The frame has one integer column (``a``) and two string columns
        (``b`` and ``c``). The comparison is run with an explicit ``columns``
        selection, with the defaults, and on the ``b`` column by itself.
        """

        def check(pandas_obj, spark_obj, **kwargs):
            expected = pd.get_dummies(pandas_obj, **kwargs)
            actual = pyspark.get_dummies(spark_obj, **kwargs)
            self.assertPandasAlmostEqual(actual.toPandas(), expected)

        letters = list('abcdabcd')
        pdf = pd.DataFrame({
            # Disabled categorical variants of 'a' and 'c':
            # 'a': pd.Categorical([1, 2, 3, 4, 4, 3, 2, 1]),
            # 'c': pd.Categorical(list('abcdabcd')),
            'a': [1, 2, 3, 4, 4, 3, 2, 1],
            'b': letters,
            'c': list('abcdabcd'),
        })
        sdf = self.spark.from_pandas(pdf)

        # Encode only 'a' and 'c'; 'b' is not listed in `columns`.
        check(pdf, sdf, columns=['a', 'c'])

        # Default column selection.
        check(pdf, sdf)

        # A single string column passed as a Series.
        check(pdf.b, sdf.b)

        # Encode only 'b'.
        check(pdf, sdf, columns=['b'])
Beispiel #4
0
    def test_get_dummies_decimal(self):
        """Compare ``pyspark.get_dummies`` with ``pd.get_dummies`` on ``Decimal`` values.

        Both the one-column frame and the ``d`` column on its own are checked.
        """
        values = [Decimal(1.0), Decimal(2.0), Decimal(1)]
        pdf = pd.DataFrame({'d': values})
        sdf = self.spark.from_pandas(pdf)

        # Whole frame first, then the column as a Series.
        for pandas_obj, spark_obj in ((pdf, sdf), (pdf.d, sdf.d)):
            expected = pd.get_dummies(pandas_obj)
            actual = pyspark.get_dummies(spark_obj)
            self.assertPandasAlmostEqual(actual.toPandas(), expected)
Beispiel #5
0
    def test_get_dummies_boolean(self):
        """Compare ``pyspark.get_dummies`` with ``pd.get_dummies`` on boolean values.

        Both the one-column frame and the ``b`` column on its own are checked.
        """
        flags = [True, False, True]
        pdf = pd.DataFrame({'b': flags})
        sdf = self.spark.from_pandas(pdf)

        # Frame-level comparison.
        expected_frame = pd.get_dummies(pdf)
        actual_frame = pyspark.get_dummies(sdf).toPandas()
        self.assertPandasAlmostEqual(actual_frame, expected_frame)

        # Series-level comparison on the single column.
        expected_column = pd.get_dummies(pdf.b)
        actual_column = pyspark.get_dummies(sdf.b).toPandas()
        self.assertPandasAlmostEqual(actual_column, expected_column)
Beispiel #6
0
    def test_get_dummies_dtype(self):
        """Check that ``pyspark.get_dummies`` honours ``dtype='float64'``.

        The frame has a string column ``A`` and an integer column ``B``.
        The expected result comes from pandas: directly via the ``dtype``
        keyword when the installed pandas is at least 0.23.0, otherwise by
        casting the generated ``A_a`` / ``A_b`` columns to ``float64``.
        """
        df = pd.DataFrame({
            # Disabled categorical variant of 'A':
            # "A": pd.Categorical(['a', 'b', 'a'], categories=['a', 'b', 'c']),
            "A": ['a', 'b', 'a'],
            "B": [0, 0, 1],
        })
        ddf = self.spark.from_pandas(df)

        if LooseVersion("0.23.0") <= LooseVersion(pd.__version__):
            exp = pd.get_dummies(df, dtype='float64')
        else:
            # Older pandas: build the same expectation without the `dtype`
            # keyword by casting only the dummy columns.
            exp = pd.get_dummies(df)
            exp = exp.astype({'A_a': 'float64', 'A_b': 'float64'})
        res = pyspark.get_dummies(ddf, dtype='float64')
        # Actual result first, expected second, matching the argument order
        # used by the sibling get_dummies tests.
        self.assertPandasAlmostEqual(res.toPandas(), exp)
Beispiel #7
0
    def test_get_dummies_prefix(self):
        """Check the ``prefix`` argument of ``pyspark.get_dummies``.

        Valid prefixes are compared against ``pd.get_dummies`` for a frame
        and for a Series; invalid prefixes on a frame must raise
        ``ValueError`` with the expected message.
        """

        def check(pandas_obj, spark_obj, **kwargs):
            expected = pd.get_dummies(pandas_obj, **kwargs)
            actual = pyspark.get_dummies(spark_obj, **kwargs)
            self.assertPandasAlmostEqual(actual.toPandas(), expected)

        pdf = pd.DataFrame({
            "A": ['a', 'b', 'a'],
            "B": ['b', 'a', 'c'],
            "D": [0, 0, 1],
        })
        sdf = self.spark.from_pandas(pdf)

        # One prefix per encoded column.
        check(pdf, sdf, prefix=['foo', 'bar'])
        check(pdf, sdf, prefix=['foo'], columns=['B'])

        # Each entry pairs the expected error-message pattern with the
        # keyword arguments that should trigger it.
        invalid_calls = [
            ("string types", {'prefix': 'foo'}),
            ("Length of 'prefix' \\(1\\) .* \\(2\\)", {'prefix': ['foo']}),
            ("Length of 'prefix' \\(2\\) .* \\(1\\)",
             {'prefix': ['foo', 'bar'], 'columns': ['B']}),
        ]
        for pattern, kwargs in invalid_calls:
            with self.assertRaisesRegex(ValueError, pattern):
                pyspark.get_dummies(sdf, **kwargs)

        named = pd.Series([1, 1, 1, 2, 2, 1, 3, 4], name='A')
        spark_named = self.spark.from_pandas(named)

        # A plain string prefix is accepted for a Series.
        check(named, spark_named, prefix='foo')

        # `columns` is ignored for Series input.
        check(named, spark_named, prefix=['foo'], columns=['B'])
Beispiel #8
0
    def test_get_dummies(self):
        """Compare ``pyspark.get_dummies`` with ``pd.get_dummies`` using default arguments.

        Exercised on an integer Series and on a frame with one integer and
        one string column.
        """
        # Disabled categorical variants of the Series input:
        # pd.Series([1, 1, 1, 2, 2, 1, 3, 4], dtype='category'),
        # pd.Series(pd.Categorical([1, 1, 1, 2, 2, 1, 3, 4], categories=[4, 3, 2, 1])),
        int_series = pd.Series([1, 1, 1, 2, 2, 1, 3, 4])

        # Disabled categorical variant of column 'b':
        # 'b': pd.Categorical(list('abcdabcd')),
        mixed_frame = pd.DataFrame({
            'a': [1, 2, 3, 4, 4, 3, 2, 1],
            'b': list('abcdabcd'),
        })

        for pandas_obj in (int_series, mixed_frame):
            expected = pd.get_dummies(pandas_obj)

            spark_obj = self.spark.from_pandas(pandas_obj)
            actual = pyspark.get_dummies(spark_obj)
            self.assertPandasAlmostEqual(actual.toPandas(), expected)