def test_astype(self):
    for pser, psser in self.intergral_extension_pser_psser_pairs:
        for dtype in self.extension_dtypes:
            if dtype in self.string_extension_dtype:
                if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
                    # Limit pandas version due to
                    # https://github.com/pandas-dev/pandas/issues/31204
                    self.check_extension(pser.astype(dtype), psser.astype(dtype))
            else:
                self.check_extension(pser.astype(dtype), psser.astype(dtype))

    for pser, psser in self.intergral_extension_pser_psser_pairs:
        self.assert_eq(pser.astype(float), psser.astype(float))
        self.assert_eq(pser.astype(np.float32), psser.astype(np.float32))

        with ps.option_context("compute.eager_check", True):
            self.assertRaisesRegex(
                ValueError,
                "Cannot convert integrals with missing values to bool",
                lambda: psser.astype(bool),
            )
            self.assertRaisesRegex(
                ValueError,
                "Cannot convert integrals with missing values to integer",
                lambda: psser.astype(int),
            )
            self.assertRaisesRegex(
                ValueError,
                "Cannot convert integrals with missing values to integer",
                lambda: psser.astype(np.int32),
            )
        with ps.option_context("compute.eager_check", False):
            psser.astype(bool)
            psser.astype(int)
            psser.astype(np.int32)

def test_astype(self):
    for pser, psser in self.fractional_extension_pser_psser_pairs:
        for dtype in self.extension_dtypes:
            self.check_extension(pser.astype(dtype), psser.astype(dtype))

    for pser, psser in self.fractional_extension_pser_psser_pairs:
        self.assert_eq(pser.astype(float), psser.astype(float))
        self.assert_eq(pser.astype(np.float32), psser.astype(np.float32))

        with ps.option_context("compute.eager_check", True):
            self.assertRaisesRegex(
                ValueError,
                "Cannot convert fractions with missing values to bool",
                lambda: psser.astype(bool),
            )
            self.assertRaisesRegex(
                ValueError,
                "Cannot convert fractions with missing values to integer",
                lambda: psser.astype(int),
            )
            self.assertRaisesRegex(
                ValueError,
                "Cannot convert fractions with missing values to integer",
                lambda: psser.astype(np.int32),
            )
        with ps.option_context("compute.eager_check", False):
            psser.astype(bool)
            psser.astype(int)
            psser.astype(np.int32)

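# Illustrative sketch (not from the original suite): the "extension" dtypes
# exercised by the two tests above are pandas' nullable types such as
# "Int64", "Float64", "boolean", and "string", which carry missing values as
# <NA> instead of falling back to object dtype or float NaN. The helper below
# is hypothetical and assumes a running Spark session backing pyspark.pandas.
def _sketch_extension_astype():
    import pandas as pd
    import pyspark.pandas as ps

    pser = pd.Series([1, None, 3], dtype="Int64")  # nullable integer
    psser = ps.from_pandas(pser)
    # Casting between extension dtypes keeps <NA> intact on both sides.
    return pser.astype("Float64"), psser.astype("Float64")
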
def test_series_transform_batch_without_shortcut(self):
    with ps.option_context("compute.shortcut_limit", 0):
        self.test_series_transform_batch()

    pdf, kdf = self.df_pair

    def to_str(pser) -> ps.Series[str]:
        return pser.astype(str)

    self.assert_eq(
        kdf.a.koalas.transform_batch(to_str).sort_index(), to_str(pdf.a).sort_index()
    )

    pdf = pd.DataFrame(
        {"a": ["a", "b", "c", "a", "b", "c"], "b": ["b", "a", "c", "c", "b", "a"]}
    )
    kdf = ps.from_pandas(pdf)

    dtype = CategoricalDtype(categories=["a", "b", "c", "d"])

    def to_category(pser) -> ps.Series[dtype]:
        return pser.astype(dtype)

    self.assert_eq(
        kdf.a.koalas.transform_batch(to_category).sort_index(),
        to_category(pdf.a).sort_index(),
    )

def test_frame_apply_batch_without_shortcut(self):
    with ps.option_context("compute.shortcut_limit", 0):
        self.test_frame_apply_batch()

    pdf, kdf = self.df_pair

    def to_str(pdf) -> 'ps.DataFrame["a":str, "b":str]':  # noqa: F405
        return pdf.astype(str)

    self.assert_eq(
        kdf.koalas.apply_batch(to_str).sort_values(["a", "b"]).reset_index(drop=True),
        to_str(pdf).sort_values(["a", "b"]).reset_index(drop=True),
    )

    pdf = pd.DataFrame(
        {"a": ["a", "b", "c", "a", "b", "c"], "b": ["b", "a", "c", "c", "b", "a"]}
    )
    kdf = ps.from_pandas(pdf)

    dtype = CategoricalDtype(categories=["a", "b", "c", "d"])
    ret = ps.DataFrame["a":dtype, "b":dtype]

    def to_category(pdf) -> ret:
        return pdf.astype(dtype)

    self.assert_eq(
        kdf.koalas.apply_batch(to_category)
        .sort_values(["a", "b"])
        .reset_index(drop=True),
        to_category(pdf).sort_values(["a", "b"]).reset_index(drop=True),
    )

def test_groupby_transform_without_shortcut(self):
    with ps.option_context("compute.shortcut_limit", 0):
        self.test_groupby_transform()

    pdf, psdf = self.df_pair

    def identity(x) -> ps.Series[psdf.b.dtype]:  # type: ignore
        return x

    self.assert_eq(
        psdf.groupby("a").transform(identity).sort_values("b").reset_index(drop=True),
        pdf.groupby("a").transform(identity).sort_values("b").reset_index(drop=True),
    )

    dtype = CategoricalDtype(categories=["a", "b", "c", "d"])

    def astype(x) -> ps.Series[dtype]:
        return x.astype(dtype)

    if LooseVersion(pd.__version__) >= LooseVersion("1.2"):
        self.assert_eq(
            psdf.groupby("a").transform(astype).sort_values("b").reset_index(drop=True),
            pdf.groupby("a").transform(astype).sort_values("b").reset_index(drop=True),
        )
    else:
        expected = pdf.groupby("a").transform(astype)
        expected["b"] = dtype.categories.take(expected["b"].cat.codes).astype(dtype)
        self.assert_eq(
            psdf.groupby("a").transform(astype).sort_values("b").reset_index(drop=True),
            expected.sort_values("b").reset_index(drop=True),
        )

def test_frame_transform_without_shortcut(self):
    with ps.option_context("compute.shortcut_limit", 0):
        self.test_frame_transform()

    pdf, kdf = self.df_pair

    def codes(pser) -> ps.Series[np.int8]:
        return pser.cat.codes

    self.assert_eq(kdf.transform(codes), pdf.transform(codes))

    pdf = pd.DataFrame(
        {"a": ["a", "b", "c", "a", "b", "c"], "b": ["b", "a", "c", "c", "b", "a"]}
    )
    kdf = ps.from_pandas(pdf)

    dtype = CategoricalDtype(categories=["a", "b", "c", "d"])

    def to_category(pser) -> ps.Series[dtype]:
        return pser.astype(dtype)

    self.assert_eq(
        kdf.transform(to_category).sort_index(), pdf.transform(to_category).sort_index()
    )

def test_plot_backends(self):
    plot_backend = "plotly"

    with ps.option_context("plotting.backend", plot_backend):
        self.assertEqual(ps.options.plotting.backend, plot_backend)

        module = PandasOnSparkPlotAccessor._get_plot_backend(plot_backend)
        self.assertEqual(module.__name__, "pyspark.pandas.plot.plotly")

def test_plot_backends_incorrect(self):
    fake_plot_backend = "none_plotting_module"

    with ps.option_context("plotting.backend", fake_plot_backend):
        self.assertEqual(ps.options.plotting.backend, fake_plot_backend)

        with self.assertRaises(ValueError):
            PandasOnSparkPlotAccessor._get_plot_backend(fake_plot_backend)

def test_astype_eager_check(self):
    psser = self.psdf["float_nan"]
    with ps.option_context("compute.eager_check", True), self.assertRaisesRegex(
        ValueError, "Cannot convert"
    ):
        psser.astype(int)
    with ps.option_context("compute.eager_check", False):
        psser.astype(int)

    # Skip the decimal_nan check before pandas 1.3.0;
    # it is not supported by pandas API on Spark yet.
    if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
        psser = self.psdf["decimal_nan"]
        with ps.option_context("compute.eager_check", True), self.assertRaisesRegex(
            ValueError, "Cannot convert"
        ):
            psser.astype(int)
        with ps.option_context("compute.eager_check", False):
            psser.astype(int)

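# Illustrative sketch (not from the original suite): what "compute.eager_check"
# changes. With the option on, astype validates the data up front and fails
# fast; with it off, the cast is planned without scanning for missing values.
# The Series below is hypothetical; assumes a running Spark session.
def _sketch_eager_check():
    import numpy as np
    import pyspark.pandas as ps

    psser = ps.Series([1.0, 2.0, np.nan])
    with ps.option_context("compute.eager_check", True):
        try:
            psser.astype(int)  # eager validation: NaN -> int raises ValueError
        except ValueError as e:
            print(e)  # message like "Cannot convert ... with missing values to integer"
    with ps.option_context("compute.eager_check", False):
        return psser.astype(int)  # no eager scan; conversion proceeds as-is
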
def indexer_between_time(
    self,
    start_time: Union[datetime.time, str],
    end_time: Union[datetime.time, str],
    include_start: bool = True,
    include_end: bool = True,
) -> Index:
    """
    Return index locations of values between particular times of day
    (example: 9:00-9:30AM).

    Parameters
    ----------
    start_time, end_time : datetime.time, str
        Time passed either as object (datetime.time) or as string in
        appropriate format ("%H:%M", "%H%M", "%I:%M%p", "%I%M%p",
        "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", "%I%M%S%p").
    include_start : bool, default True
    include_end : bool, default True

    Returns
    -------
    values_between_time : Index of integers

    Examples
    --------
    >>> psidx = ps.date_range("2000-01-01", periods=3, freq="T")
    >>> psidx  # doctest: +NORMALIZE_WHITESPACE
    DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 00:01:00',
                   '2000-01-01 00:02:00'],
                  dtype='datetime64[ns]', freq=None)

    >>> psidx.indexer_between_time("00:01", "00:02").sort_values()
    Int64Index([1, 2], dtype='int64')

    >>> psidx.indexer_between_time("00:01", "00:02", include_end=False)
    Int64Index([1], dtype='int64')

    >>> psidx.indexer_between_time("00:01", "00:02", include_start=False)
    Int64Index([2], dtype='int64')
    """

    @no_type_check
    def pandas_between_time(pdf) -> ps.DataFrame[int]:
        return pdf.between_time(start_time, end_time, include_start, include_end)

    psdf = self.to_frame()[[]]
    id_column_name = verify_temp_column_name(psdf, "__id_column__")
    psdf = psdf.pandas_on_spark.attach_id_column("distributed-sequence", id_column_name)
    with ps.option_context("compute.default_index_type", "distributed"):
        # The attached index in the statement below will be dropped soon,
        # so we enforce the "distributed" default index type.
        psdf = psdf.pandas_on_spark.apply_batch(pandas_between_time)

    return ps.Index(first_series(psdf).rename(self.name))

def test_series_apply_without_shortcut(self):
    with ps.option_context("compute.shortcut_limit", 0):
        self.test_series_apply()

    pdf, psdf = self.df_pair
    ret = psdf.a.dtype

    def identity(pser) -> ret:
        return pser

    self.assert_eq(
        psdf.a.apply(identity).sort_index(), pdf.a.apply(identity).sort_index()
    )

def test_groupby_apply_without_shortcut(self):
    with ps.option_context("compute.shortcut_limit", 0):
        self.test_groupby_apply()

    pdf, psdf = self.df_pair

    def identity(df) -> ps.DataFrame[zip(psdf.columns, psdf.dtypes)]:
        return df

    self.assert_eq(
        psdf.groupby("a").apply(identity).sort_values(["a", "b"]).reset_index(drop=True),
        pdf.groupby("a").apply(identity).sort_values(["a", "b"]).reset_index(drop=True),
    )

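# Illustrative sketch (not from the original suite):
# ps.DataFrame[zip(psdf.columns, psdf.dtypes)] above builds the return-type
# annotation from the frame's own schema, so `identity` is declared to return
# exactly the columns and dtypes it receives and no sampling-based schema
# inference is needed. A minimal, hypothetical standalone version, assuming a
# running Spark session:
def _sketch_typed_groupby_apply():
    import pyspark.pandas as ps

    psdf = ps.DataFrame({"a": [1, 1, 2], "b": [0.1, 0.2, 0.3]})

    def passthrough(df) -> ps.DataFrame[zip(psdf.columns, psdf.dtypes)]:
        return df

    # Each group passes through unchanged; the annotation fixes the output schema.
    return psdf.groupby("a").apply(passthrough)
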
def indexer_at_time(self, time: Union[datetime.time, str], asof: bool = False) -> Index:
    """
    Return index locations of values at particular time of day
    (example: 9:30AM).

    Parameters
    ----------
    time : datetime.time or str
        Time passed in either as object (datetime.time) or as string in
        appropriate format ("%H:%M", "%H%M", "%I:%M%p", "%I%M%p",
        "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", "%I%M%S%p").

    Returns
    -------
    values_at_time : Index of integers

    Examples
    --------
    >>> psidx = ps.date_range("2000-01-01", periods=3, freq="T")
    >>> psidx  # doctest: +NORMALIZE_WHITESPACE
    DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 00:01:00',
                   '2000-01-01 00:02:00'],
                  dtype='datetime64[ns]', freq=None)

    >>> psidx.indexer_at_time("00:00")
    Int64Index([0], dtype='int64')

    >>> psidx.indexer_at_time("00:01")
    Int64Index([1], dtype='int64')
    """
    if asof:
        raise NotImplementedError("'asof' argument is not supported")

    @no_type_check
    def pandas_at_time(pdf) -> ps.DataFrame[int]:
        return pdf.at_time(time, asof)

    psdf = self.to_frame()[[]]
    id_column_name = verify_temp_column_name(psdf, "__id_column__")
    psdf = psdf.pandas_on_spark.attach_id_column("distributed-sequence", id_column_name)
    with ps.option_context("compute.default_index_type", "distributed"):
        # The attached index in the statement below will be dropped soon,
        # so we enforce the "distributed" default index type.
        psdf = psdf.pandas_on_spark.apply_batch(pandas_at_time)

    return ps.Index(first_series(psdf).rename(self.name))

def test_frame_transform_batch_without_shortcut(self):
    with ps.option_context("compute.shortcut_limit", 0):
        self.test_frame_transform_batch()

    pdf, kdf = self.df_pair

    def to_str(pdf) -> 'ps.DataFrame["a":str, "b":str]':  # noqa: F405
        return pdf.astype(str)

    self.assert_eq(
        kdf.koalas.transform_batch(to_str).sort_index(),
        to_str(pdf).sort_index(),
    )

    def to_codes(pdf) -> ps.Series[np.int8]:
        return pdf.b.cat.codes

    self.assert_eq(
        kdf.koalas.transform_batch(to_codes).sort_index(),
        to_codes(pdf).sort_index(),
    )

    pdf = pd.DataFrame(
        {"a": ["a", "b", "c", "a", "b", "c"], "b": ["b", "a", "c", "c", "b", "a"]}
    )
    kdf = ps.from_pandas(pdf)

    dtype = CategoricalDtype(categories=["a", "b", "c", "d"])
    ret = ps.DataFrame["a":dtype, "b":dtype]

    def to_category(pdf) -> ret:
        return pdf.astype(dtype)

    self.assert_eq(
        kdf.koalas.transform_batch(to_category).sort_index(),
        to_category(pdf).sort_index(),
    )

    # Redefine to_category to return a Series instead of a DataFrame.
    def to_category(pdf) -> ps.Series[dtype]:
        return pdf.b.astype(dtype)

    self.assert_eq(
        kdf.koalas.transform_batch(to_category).sort_index(),
        to_category(pdf).rename().sort_index(),
    )

def test_frame_apply_without_shortcut(self):
    with ps.option_context("compute.shortcut_limit", 0):
        self.test_frame_apply()

    pdf = pd.DataFrame(
        {"a": ["a", "b", "c", "a", "b", "c"], "b": ["b", "a", "c", "c", "b", "a"]}
    )
    psdf = ps.from_pandas(pdf)

    dtype = CategoricalDtype(categories=["a", "b", "c"])

    def categorize(ser) -> ps.Series[dtype]:
        return ser.astype(dtype)

    self.assert_eq(
        psdf.apply(categorize).sort_values(["a", "b"]).reset_index(drop=True),
        pdf.apply(categorize).sort_values(["a", "b"]).reset_index(drop=True),
    )

def test_xor(self):
    pdf, psdf = self.bool_pdf, self.bool_psdf
    pser, other_pser = pdf["this"], pdf["that"]
    psser, other_psser = psdf["this"], psdf["that"]

    self.assert_eq(pser ^ other_pser, psser ^ other_psser)
    self.assert_eq(pser ^ True, psser ^ True)
    self.assert_eq(pser ^ False, psser ^ False)
    self.assert_eq(pser ^ 2, psser ^ 2)
    self.assert_eq(pser ^ 99, psser ^ 99)

    with self.assertRaisesRegex(TypeError, "XOR can not be applied to given types."):
        psser ^ "a"

    with option_context("compute.ops_on_diff_frames", True):
        pser, other_pser = self.pdf["bool"], self.integral_pdf["this"]
        psser, other_psser = self.psdf["bool"], self.integral_psdf["this"]
        self.assert_eq(pser ^ other_pser, psser ^ other_psser)

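# Illustrative sketch (not from the original suite): operating on Series that
# belong to two different DataFrames is disallowed by default and is unlocked
# by "compute.ops_on_diff_frames", as in the last block above. The frames
# below are hypothetical; assumes a running Spark session.
def _sketch_xor_on_diff_frames():
    import pyspark.pandas as ps

    left = ps.DataFrame({"this": [True, False, True]})
    right = ps.DataFrame({"that": [1, 0, 0]})
    with ps.option_context("compute.ops_on_diff_frames", True):
        return left["this"] ^ right["that"]  # allowed only inside the context
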
def test_default_index_distributed_sequence(self):
    with ps.option_context("compute.default_index_type", "distributed-sequence"):
        sdf = self.spark.range(1000)
        self.assert_eq(ps.DataFrame(sdf), pd.DataFrame({"id": list(range(1000))}))

def test_default_index_distributed(self):
    with ps.option_context("compute.default_index_type", "distributed"):
        sdf = self.spark.range(1000)
        pdf = ps.DataFrame(sdf).to_pandas()
        self.assertEqual(len(set(pdf.index)), len(pdf))

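# Illustrative sketch (not from the original suite): why the two tests above
# assert different things. "distributed-sequence" attaches a globally
# sequential index (0..n-1), so it can be compared against range(1000);
# "distributed" only guarantees unique values, which may have gaps, so only
# uniqueness is checked. Assumes an active SparkSession named `spark`.
def _sketch_default_index_types(spark):
    import pyspark.pandas as ps

    sdf = spark.range(5)  # a Spark DataFrame has no index to inherit
    with ps.option_context("compute.default_index_type", "distributed-sequence"):
        seq_index = ps.DataFrame(sdf).index  # exactly 0, 1, 2, 3, 4
    with ps.option_context("compute.default_index_type", "distributed"):
        dist_index = ps.DataFrame(sdf).index  # unique, but gaps are possible
    return seq_index, dist_index
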
def drop_duplicates(self, keep: Union[bool, str] = "first") -> "MultiIndex":
    """
    Return MultiIndex with duplicate values removed.

    Parameters
    ----------
    keep : {'first', 'last', ``False``}, default 'first'
        Method to handle dropping duplicates:

        - 'first' : Drop duplicates except for the first occurrence.
        - 'last' : Drop duplicates except for the last occurrence.
        - ``False`` : Drop all duplicates.

    Returns
    -------
    deduplicated : MultiIndex

    See Also
    --------
    Series.drop_duplicates : Equivalent method on Series.
    DataFrame.drop_duplicates : Equivalent method on DataFrame.

    Examples
    --------
    Generate a MultiIndex with duplicate values.

    >>> arrays = [[1, 2, 3, 1, 2], ["red", "blue", "black", "red", "blue"]]
    >>> midx = ps.MultiIndex.from_arrays(arrays, names=("number", "color"))
    >>> midx
    MultiIndex([(1,   'red'),
                (2,  'blue'),
                (3, 'black'),
                (1,   'red'),
                (2,  'blue')],
               names=['number', 'color'])

    >>> midx.drop_duplicates()
    MultiIndex([(1,   'red'),
                (2,  'blue'),
                (3, 'black')],
               names=['number', 'color'])

    >>> midx.drop_duplicates(keep='first')
    MultiIndex([(1,   'red'),
                (2,  'blue'),
                (3, 'black')],
               names=['number', 'color'])

    >>> midx.drop_duplicates(keep='last')
    MultiIndex([(3, 'black'),
                (1,   'red'),
                (2,  'blue')],
               names=['number', 'color'])

    >>> midx.drop_duplicates(keep=False)
    MultiIndex([(3, 'black')],
               names=['number', 'color'])
    """
    with ps.option_context("compute.default_index_type", "distributed"):
        # The attached index caused by `reset_index` below is used for
        # sorting only, and it will be dropped soon, so we enforce the
        # "distributed" default index type.
        psdf = self.to_frame().reset_index(drop=True)

    return ps.MultiIndex.from_frame(psdf.drop_duplicates(keep=keep).sort_index())