Example #1
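    # Verifies Series.astype on integral extension dtypes: extension-dtype casts
    # (string dtypes gated on pandas >= 1.1.0 due to pandas#31204), float casts,
    # and the "compute.eager_check" option for casting missing values to bool/int.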
    def test_astype(self):
        for pser, psser in self.intergral_extension_pser_psser_pairs:
            for dtype in self.extension_dtypes:
                if dtype in self.string_extension_dtype:
                    if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
                        # Limit pandas version due to
                        # https://github.com/pandas-dev/pandas/issues/31204
                        self.check_extension(pser.astype(dtype),
                                             psser.astype(dtype))
                else:
                    self.check_extension(pser.astype(dtype),
                                         psser.astype(dtype))
        for pser, psser in self.intergral_extension_pser_psser_pairs:
            self.assert_eq(pser.astype(float), psser.astype(float))
            self.assert_eq(pser.astype(np.float32), psser.astype(np.float32))
            with ps.option_context("compute.eager_check", True):
                self.assertRaisesRegex(
                    ValueError,
                    "Cannot convert integrals with missing values to bool",
                    lambda: psser.astype(bool),
                )
                self.assertRaisesRegex(
                    ValueError,
                    "Cannot convert integrals with missing values to integer",
                    lambda: psser.astype(int),
                )
                self.assertRaisesRegex(
                    ValueError,
                    "Cannot convert integrals with missing values to integer",
                    lambda: psser.astype(np.int32),
                )
            with ps.option_context("compute.eager_check", False):
                psser.astype(bool)
                psser.astype(int)
                psser.astype(np.int32)
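
A minimal sketch, for context, of how the integral extension pser/psser pairs above might be constructed (hypothetical; the actual fixture is defined in the shared test base class):

    import pandas as pd
    import pyspark.pandas as ps

    # Hypothetical fixture: pandas Series with nullable integer extension
    # dtypes, each paired with its pandas-on-Spark counterpart.
    integral_psers = [
        pd.Series([1, 2, 3, None], dtype=t)
        for t in ["Int8", "Int16", "Int32", "Int64"]
    ]
    pairs = [(pser, ps.from_pandas(pser)) for pser in integral_psers]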
Example #2
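    # Same astype coverage for fractional extension dtypes: extension and float
    # casts, plus "compute.eager_check" behavior for casts with missing values.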
    def test_astype(self):
        for pser, psser in self.fractional_extension_pser_psser_pairs:
            for dtype in self.extension_dtypes:
                self.check_extension(pser.astype(dtype), psser.astype(dtype))
        for pser, psser in self.fractional_extension_pser_psser_pairs:
            self.assert_eq(pser.astype(float), psser.astype(float))
            self.assert_eq(pser.astype(np.float32), psser.astype(np.float32))
            with ps.option_context("compute.eager_check", True):
                self.assertRaisesRegex(
                    ValueError,
                    "Cannot convert fractions with missing values to bool",
                    lambda: psser.astype(bool),
                )
                self.assertRaisesRegex(
                    ValueError,
                    "Cannot convert fractions with missing values to integer",
                    lambda: psser.astype(int),
                )
                self.assertRaisesRegex(
                    ValueError,
                    "Cannot convert fractions with missing values to integer",
                    lambda: psser.astype(np.int32),
                )
            with ps.option_context("compute.eager_check", False):
                psser.astype(bool)
                psser.astype(int)
                psser.astype(np.int32)
Example #3
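    # Exercises Series.koalas.transform_batch with "compute.shortcut_limit" set
    # to 0, using str and CategoricalDtype return-type annotations.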
    def test_series_transform_batch_without_shortcut(self):
        with ps.option_context("compute.shortcut_limit", 0):
            self.test_series_transform_batch()

        pdf, kdf = self.df_pair

        def to_str(pser) -> ps.Series[str]:
            return pser.astype(str)

        self.assert_eq(
            kdf.a.koalas.transform_batch(to_str).sort_index(),
            to_str(pdf.a).sort_index())

        pdf = pd.DataFrame({
            "a": ["a", "b", "c", "a", "b", "c"],
            "b": ["b", "a", "c", "c", "b", "a"]
        })
        kdf = ps.from_pandas(pdf)

        dtype = CategoricalDtype(categories=["a", "b", "c", "d"])

        def to_category(pser) -> ps.Series[dtype]:
            return pser.astype(dtype)

        self.assert_eq(
            kdf.a.koalas.transform_batch(to_category).sort_index(),
            to_category(pdf.a).sort_index())
Example #4
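    # Exercises DataFrame.koalas.apply_batch without the shortcut path, with
    # per-column str and CategoricalDtype return-type annotations.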
    def test_frame_apply_batch_without_shortcut(self):
        with ps.option_context("compute.shortcut_limit", 0):
            self.test_frame_apply_batch()

        pdf, kdf = self.df_pair

        def to_str(pdf) -> 'ps.DataFrame["a":str, "b":str]':  # noqa: F405
            return pdf.astype(str)

        self.assert_eq(
            kdf.koalas.apply_batch(to_str).sort_values(
                ["a", "b"]).reset_index(drop=True),
            to_str(pdf).sort_values(["a", "b"]).reset_index(drop=True),
        )

        pdf = pd.DataFrame({
            "a": ["a", "b", "c", "a", "b", "c"],
            "b": ["b", "a", "c", "c", "b", "a"]
        })
        kdf = ps.from_pandas(pdf)

        dtype = CategoricalDtype(categories=["a", "b", "c", "d"])
        ret = ps.DataFrame["a":dtype, "b":dtype]

        def to_category(pdf) -> ret:
            return pdf.astype(dtype)

        self.assert_eq(
            kdf.koalas.apply_batch(to_category).sort_values(
                ["a", "b"]).reset_index(drop=True),
            to_category(pdf).sort_values(["a", "b"]).reset_index(drop=True),
        )
Example #5
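    # Exercises GroupBy.transform without the shortcut path, including a
    # categorical astype with a workaround for pandas < 1.2.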
    def test_groupby_transform_without_shortcut(self):
        with ps.option_context("compute.shortcut_limit", 0):
            self.test_groupby_transform()

        pdf, psdf = self.df_pair

        def identity(x) -> ps.Series[psdf.b.dtype]:  # type: ignore
            return x

        self.assert_eq(
            psdf.groupby("a").transform(identity).sort_values("b").reset_index(drop=True),
            pdf.groupby("a").transform(identity).sort_values("b").reset_index(drop=True),
        )

        dtype = CategoricalDtype(categories=["a", "b", "c", "d"])

        def astype(x) -> ps.Series[dtype]:
            return x.astype(dtype)

        if LooseVersion(pd.__version__) >= LooseVersion("1.2"):
            self.assert_eq(
                psdf.groupby("a").transform(astype).sort_values("b").reset_index(drop=True),
                pdf.groupby("a").transform(astype).sort_values("b").reset_index(drop=True),
            )
        else:
            expected = pdf.groupby("a").transform(astype)
            expected["b"] = dtype.categories.take(expected["b"].cat.codes).astype(dtype)
            self.assert_eq(
                psdf.groupby("a").transform(astype).sort_values("b").reset_index(drop=True),
                expected.sort_values("b").reset_index(drop=True),
            )
Example #6
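    # Exercises DataFrame.transform without the shortcut path on categorical
    # data: extracting category codes and casting to a CategoricalDtype.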
    def test_frame_transform_without_shortcut(self):
        with ps.option_context("compute.shortcut_limit", 0):
            self.test_frame_transform()

        pdf, kdf = self.df_pair

        def codes(pser) -> ps.Series[np.int8]:
            return pser.cat.codes

        self.assert_eq(kdf.transform(codes), pdf.transform(codes))

        pdf = pd.DataFrame({
            "a": ["a", "b", "c", "a", "b", "c"],
            "b": ["b", "a", "c", "c", "b", "a"]
        })
        kdf = ps.from_pandas(pdf)

        dtype = CategoricalDtype(categories=["a", "b", "c", "d"])

        def to_category(pser) -> ps.Series[dtype]:
            return pser.astype(dtype)

        self.assert_eq(
            kdf.transform(to_category).sort_index(),
            pdf.transform(to_category).sort_index())
Example #7
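    # Checks that the "plotting.backend" option selects the plotly backend
    # module for PandasOnSparkPlotAccessor.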
    def test_plot_backends(self):
        plot_backend = "plotly"

        with ps.option_context("plotting.backend", plot_backend):
            self.assertEqual(ps.options.plotting.backend, plot_backend)

            module = PandasOnSparkPlotAccessor._get_plot_backend(plot_backend)
            self.assertEqual(module.__name__, "pyspark.pandas.plot.plotly")
Example #8
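    # Checks that a nonexistent plotting backend can be set as an option but
    # raises ValueError when the backend module is resolved.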
    def test_plot_backends_incorrect(self):
        fake_plot_backend = "none_plotting_module"

        with ps.option_context("plotting.backend", fake_plot_backend):
            self.assertEqual(ps.options.plotting.backend, fake_plot_backend)

            with self.assertRaises(ValueError):
                PandasOnSparkPlotAccessor._get_plot_backend(fake_plot_backend)
Example #9
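    # Checks that "compute.eager_check" controls whether astype(int) raises
    # ValueError for float (and, on pandas >= 1.3, decimal) Series containing NaN.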
    def test_astype_eager_check(self):
        psser = self.psdf["float_nan"]
        with ps.option_context("compute.eager_check",
                               True), self.assertRaisesRegex(
                                   ValueError, "Cannot convert"):
            psser.astype(int)
        with ps.option_context("compute.eager_check", False):
            psser.astype(int)

        # Skip the decimal_nan test before pandas 1.3.0; it is not yet supported by pandas-on-Spark.
        if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
            psser = self.psdf["decimal_nan"]
            with ps.option_context("compute.eager_check",
                                   True), self.assertRaisesRegex(
                                       ValueError, "Cannot convert"):
                psser.astype(int)
            with ps.option_context("compute.eager_check", False):
                psser.astype(int)
Example #10
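    # Implementation of DatetimeIndex.indexer_between_time: computed via
    # apply_batch over a frame with an attached distributed-sequence id column.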
    def indexer_between_time(
        self,
        start_time: Union[datetime.time, str],
        end_time: Union[datetime.time, str],
        include_start: bool = True,
        include_end: bool = True,
    ) -> Index:
        """
        Return index locations of values between particular times of day
        (example: 9:00-9:30AM).

        Parameters
        ----------
        start_time, end_time : datetime.time, str
            Time passed either as object (datetime.time) or as string in
            appropriate format ("%H:%M", "%H%M", "%I:%M%p", "%I%M%p",
            "%H:%M:%S", "%H%M%S", "%I:%M:%S%p","%I%M%S%p").
        include_start : bool, default True
        include_end : bool, default True

        Returns
        -------
        values_between_time : Index of integers

        Examples
        --------
        >>> psidx = ps.date_range("2000-01-01", periods=3, freq="T")
        >>> psidx  # doctest: +NORMALIZE_WHITESPACE
        DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 00:01:00',
                       '2000-01-01 00:02:00'],
                      dtype='datetime64[ns]', freq=None)

        >>> psidx.indexer_between_time("00:01", "00:02").sort_values()
        Int64Index([1, 2], dtype='int64')

        >>> psidx.indexer_between_time("00:01", "00:02", include_end=False)
        Int64Index([1], dtype='int64')

        >>> psidx.indexer_between_time("00:01", "00:02", include_start=False)
        Int64Index([2], dtype='int64')
        """
        @no_type_check
        def pandas_between_time(pdf) -> ps.DataFrame[int]:
            return pdf.between_time(start_time, end_time, include_start,
                                    include_end)

        psdf = self.to_frame()[[]]
        id_column_name = verify_temp_column_name(psdf, "__id_column__")
        psdf = psdf.pandas_on_spark.attach_id_column("distributed-sequence",
                                                     id_column_name)
        with ps.option_context("compute.default_index_type", "distributed"):
            # The attached index in the statement below will be dropped soon,
            # so we enforce "distributed" default index type
            psdf = psdf.pandas_on_spark.apply_batch(pandas_between_time)
        return ps.Index(first_series(psdf).rename(self.name))
Example #11
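    # Exercises Series.apply without the shortcut path, using a return-type
    # annotation derived from the Series dtype.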
    def test_series_apply_without_shortcut(self):
        with ps.option_context("compute.shortcut_limit", 0):
            self.test_series_apply()

        pdf, psdf = self.df_pair
        ret = psdf.a.dtype

        def identity(pser) -> ret:
            return pser

        self.assert_eq(psdf.a.apply(identity).sort_index(), pdf.a.apply(identity).sort_index())
Example #12
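    # Exercises GroupBy.apply without the shortcut path, with a DataFrame
    # return-type annotation built from zip(psdf.columns, psdf.dtypes).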
    def test_groupby_apply_without_shortcut(self):
        with ps.option_context("compute.shortcut_limit", 0):
            self.test_groupby_apply()

        pdf, psdf = self.df_pair

        def identity(df) -> ps.DataFrame[zip(psdf.columns, psdf.dtypes)]:
            return df

        self.assert_eq(
            psdf.groupby("a").apply(identity).sort_values(["a", "b"]).reset_index(drop=True),
            pdf.groupby("a").apply(identity).sort_values(["a", "b"]).reset_index(drop=True),
        )
Example #13
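    # Implementation of DatetimeIndex.indexer_at_time: computed via apply_batch
    # with an attached distributed-sequence id column ('asof' is unsupported).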
    def indexer_at_time(self,
                        time: Union[datetime.time, str],
                        asof: bool = False) -> Index:
        """
        Return index locations of values at particular time of day
        (example: 9:30AM).

        Parameters
        ----------
        time : datetime.time or str
            Time passed in either as object (datetime.time) or as string in
            appropriate format ("%H:%M", "%H%M", "%I:%M%p", "%I%M%p",
            "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", "%I%M%S%p").

        Returns
        -------
        values_at_time : Index of integers

        Examples
        --------
        >>> psidx = ps.date_range("2000-01-01", periods=3, freq="T")
        >>> psidx  # doctest: +NORMALIZE_WHITESPACE
        DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 00:01:00',
                       '2000-01-01 00:02:00'],
                      dtype='datetime64[ns]', freq=None)

        >>> psidx.indexer_at_time("00:00")
        Int64Index([0], dtype='int64')

        >>> psidx.indexer_at_time("00:01")
        Int64Index([1], dtype='int64')
        """
        if asof:
            raise NotImplementedError("'asof' argument is not supported")

        @no_type_check
        def pandas_at_time(pdf) -> ps.DataFrame[int]:
            return pdf.at_time(time, asof)

        psdf = self.to_frame()[[]]
        id_column_name = verify_temp_column_name(psdf, "__id_column__")
        psdf = psdf.pandas_on_spark.attach_id_column("distributed-sequence",
                                                     id_column_name)
        with ps.option_context("compute.default_index_type", "distributed"):
            # The attached index in the statement below will be dropped soon,
            # so we enforce "distributed" default index type
            psdf = psdf.pandas_on_spark.apply_batch(pandas_at_time)
        return ps.Index(first_series(psdf).rename(self.name))
Example #14
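    # Exercises DataFrame.koalas.transform_batch without the shortcut path,
    # returning DataFrames and Series via str, np.int8, and CategoricalDtype
    # annotations.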
    def test_frame_transform_batch_without_shortcut(self):
        with ps.option_context("compute.shortcut_limit", 0):
            self.test_frame_transform_batch()

        pdf, kdf = self.df_pair

        def to_str(pdf) -> 'ps.DataFrame["a":str, "b":str]':  # noqa: F405
            return pdf.astype(str)

        self.assert_eq(
            kdf.koalas.transform_batch(to_str).sort_index(),
            to_str(pdf).sort_index(),
        )

        def to_codes(pdf) -> ps.Series[np.int8]:
            return pdf.b.cat.codes

        self.assert_eq(
            kdf.koalas.transform_batch(to_codes).sort_index(),
            to_codes(pdf).sort_index(),
        )

        pdf = pd.DataFrame({
            "a": ["a", "b", "c", "a", "b", "c"],
            "b": ["b", "a", "c", "c", "b", "a"]
        })
        kdf = ps.from_pandas(pdf)

        dtype = CategoricalDtype(categories=["a", "b", "c", "d"])
        ret = ps.DataFrame["a":dtype, "b":dtype]

        def to_category(pdf) -> ret:
            return pdf.astype(dtype)

        self.assert_eq(
            kdf.koalas.transform_batch(to_category).sort_index(),
            to_category(pdf).sort_index(),
        )

        def to_category(pdf) -> ps.Series[dtype]:
            return pdf.b.astype(dtype)

        self.assert_eq(
            kdf.koalas.transform_batch(to_category).sort_index(),
            to_category(pdf).rename().sort_index(),
        )
Example #15
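    # Exercises DataFrame.apply without the shortcut path, with a
    # CategoricalDtype return-type annotation.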
    def test_frame_apply_without_shortcut(self):
        with ps.option_context("compute.shortcut_limit", 0):
            self.test_frame_apply()

        pdf = pd.DataFrame(
            {"a": ["a", "b", "c", "a", "b", "c"], "b": ["b", "a", "c", "c", "b", "a"]}
        )
        psdf = ps.from_pandas(pdf)

        dtype = CategoricalDtype(categories=["a", "b", "c"])

        def categorize(ser) -> ps.Series[dtype]:
            return ser.astype(dtype)

        self.assert_eq(
            psdf.apply(categorize).sort_values(["a", "b"]).reset_index(drop=True),
            pdf.apply(categorize).sort_values(["a", "b"]).reset_index(drop=True),
        )
Example #16
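    # Tests the ^ (XOR) operator on boolean Series against scalars and another
    # Series, including operands from different frames via
    # "compute.ops_on_diff_frames".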
    def test_xor(self):
        pdf, psdf = self.bool_pdf, self.bool_psdf
        pser, other_pser = pdf["this"], pdf["that"]
        psser, other_psser = psdf["this"], psdf["that"]

        self.assert_eq(pser ^ other_pser, psser ^ other_psser)
        self.assert_eq(pser ^ True, psser ^ True)
        self.assert_eq(pser ^ False, psser ^ False)
        self.assert_eq(pser ^ 2, psser ^ 2)
        self.assert_eq(pser ^ 99, psser ^ 99)

        with self.assertRaisesRegex(TypeError, "XOR can not be applied to given types."):
            psser ^ "a"

        with option_context("compute.ops_on_diff_frames", True):
            pser, other_pser = self.pdf["bool"], self.integral_pdf["this"]
            psser, other_psser = self.psdf["bool"], self.integral_psdf["this"]

            self.assert_eq(pser ^ other_pser, psser ^ other_psser)
Example #17
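    # Checks that the "distributed-sequence" default index reproduces a
    # contiguous 0..n-1 index on a 1000-row Spark DataFrame.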
    def test_default_index_distributed_sequence(self):
        with ps.option_context("compute.default_index_type",
                               "distributed-sequence"):
            sdf = self.spark.range(1000)
            self.assert_eq(ps.DataFrame(sdf),
                           pd.DataFrame({"id": list(range(1000))}))
Example #18
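    # Checks that the "distributed" default index assigns a unique index value
    # to every row (uniqueness only, no ordering guarantee).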
    def test_default_index_distributed(self):
        with ps.option_context("compute.default_index_type", "distributed"):
            sdf = self.spark.range(1000)
            pdf = ps.DataFrame(sdf).to_pandas()
            self.assertEqual(len(set(pdf.index)), len(pdf))
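
A minimal sketch contrasting the three default index types (assumes a running Spark session; "sequence" is pandas-on-Spark's default):

    import pyspark.pandas as ps

    # "sequence" and "distributed-sequence" both produce a contiguous 0..n-1
    # index (the former on a single node, the latter computed in parallel),
    # while "distributed" only guarantees unique, non-deterministic values.
    for index_type in ["sequence", "distributed-sequence", "distributed"]:
        with ps.option_context("compute.default_index_type", index_type):
            psdf = ps.range(5)
            print(index_type, list(psdf.index.to_pandas()))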
Example #19
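    # Implementation of MultiIndex.drop_duplicates: delegates to DataFrame
    # drop_duplicates under a "distributed" default index.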
    def drop_duplicates(self,
                        keep: Union[bool, str] = "first") -> "MultiIndex":
        """
        Return MultiIndex with duplicate values removed.

        Parameters
        ----------
        keep : {'first', 'last', ``False``}, default 'first'
            Method to handle dropping duplicates:
            - 'first' : Drop duplicates except for the first occurrence.
            - 'last' : Drop duplicates except for the last occurrence.
            - ``False`` : Drop all duplicates.

        Returns
        -------
        deduplicated : MultiIndex

        See Also
        --------
        Series.drop_duplicates : Equivalent method on Series.
        DataFrame.drop_duplicates : Equivalent method on DataFrame.

        Examples
        --------
        Generate a MultiIndex with duplicate values.

        >>> arrays = [[1, 2, 3, 1, 2], ["red", "blue", "black", "red", "blue"]]
        >>> midx = ps.MultiIndex.from_arrays(arrays, names=("number", "color"))
        >>> midx
        MultiIndex([(1,   'red'),
                    (2,  'blue'),
                    (3, 'black'),
                    (1,   'red'),
                    (2,  'blue')],
                   names=['number', 'color'])

        >>> midx.drop_duplicates()
        MultiIndex([(1,   'red'),
                    (2,  'blue'),
                    (3, 'black')],
                   names=['number', 'color'])

        >>> midx.drop_duplicates(keep='first')
        MultiIndex([(1,   'red'),
                    (2,  'blue'),
                    (3, 'black')],
                   names=['number', 'color'])

        >>> midx.drop_duplicates(keep='last')
        MultiIndex([(3, 'black'),
                    (1,   'red'),
                    (2,  'blue')],
                   names=['number', 'color'])

        >>> midx.drop_duplicates(keep=False)
        MultiIndex([(3, 'black')],
                   names=['number', 'color'])
        """
        with ps.option_context("compute.default_index_type", "distributed"):
            # The attached index caused by `reset_index` below is used for sorting only,
            # and it will be dropped soon,
            # so we enforce "distributed" default index type
            psdf = self.to_frame().reset_index(drop=True)
        return ps.MultiIndex.from_frame(
            psdf.drop_duplicates(keep=keep).sort_index())