def complex_psdf(self):
    """Return a four-column pandas-on-Spark frame built from nested values.

    The columns are, in order: ``this_array`` (taken from ``self.psser``),
    ``that_array`` (a one-row series holding the list ``[2, 3, 4]``),
    ``this_struct`` and ``that_struct`` (one-row series derived from an
    index holding a single tuple each).
    """

    def tuple_series(pair):
        # An Index over one tuple, turned into a series whose own index is
        # discarded in favour of a fresh default one.
        return ps.Index([pair]).to_series().reset_index(drop=True)

    columns = {}
    columns["this_array"] = self.psser
    columns["that_array"] = ps.Series([[2, 3, 4]])
    columns["this_struct"] = tuple_series(("x", 1))
    columns["that_struct"] = tuple_series(("a", 2))
    return ps.concat(columns, axis=1)
def test_arithmetic_op_exceptions(self):
    """Check that unsupported arithmetic on a datetime series raises.

    Addition, multiplication, true division, floor division and modulo
    must raise ``TypeError`` with an operation-specific message, in both
    operand orders, against an int, a float, the series itself, an index
    built from ``self.pd_start_date`` and its ``to_pydatetime()`` result.
    Subtraction with an int or float must raise ``TypeError`` as well, and
    subtracting the series from the ``to_pydatetime()`` result must raise
    ``NotImplementedError``.
    """
    kser = self.ks_start_date
    py_datetime = self.pd_start_date.dt.to_pydatetime()
    datetime_index = ps.Index(self.pd_start_date)

    # (expected message, binary operation) in the order they are checked.
    rejected_operations = (
        ("Addition can not be applied to datetimes.", lambda lhs, rhs: lhs + rhs),
        ("Multiplication can not be applied to datetimes.", lambda lhs, rhs: lhs * rhs),
        ("True division can not be applied to datetimes.", lambda lhs, rhs: lhs / rhs),
        ("Floor division can not be applied to datetimes.", lambda lhs, rhs: lhs // rhs),
        ("Modulo can not be applied to datetimes.", lambda lhs, rhs: lhs % rhs),
    )

    for operand in [1, 0.1, kser, datetime_index, py_datetime]:
        for message, operation in rejected_operations:
            # Series on the left, then series on the right.
            self.assertRaisesRegex(TypeError, message, lambda: operation(kser, operand))
            self.assertRaisesRegex(TypeError, message, lambda: operation(operand, kser))

    subtraction_message = "datetime subtraction can only be applied to datetime series."
    for scalar in (1, 0.1):
        self.assertRaisesRegex(TypeError, subtraction_message, lambda: kser - scalar)
        self.assertRaisesRegex(TypeError, subtraction_message, lambda: scalar - kser)

    # NOTE(review): this reuses the loop variable after the loop has ended,
    # so it repeats the series-minus-0.1 check from the last iteration.
    self.assertRaisesRegex(TypeError, subtraction_message, lambda: kser - scalar)
    self.assertRaises(NotImplementedError, lambda: py_datetime - kser)
def indexer_between_time(
    self,
    start_time: Union[datetime.time, str],
    end_time: Union[datetime.time, str],
    include_start: bool = True,
    include_end: bool = True,
) -> Index:
    """
    Return the integer positions of the entries whose time of day lies
    between two given times (example: 9:00-9:30AM).

    Parameters
    ----------
    start_time, end_time : datetime.time, str
        Time passed either as object (datetime.time) or as string in
        appropriate format ("%H:%M", "%H%M", "%I:%M%p", "%I%M%p",
        "%H:%M:%S", "%H%M%S", "%I:%M:%S%p","%I%M%S%p").
    include_start : bool, default True
        Whether an entry exactly at ``start_time`` is part of the result.
    include_end : bool, default True
        Whether an entry exactly at ``end_time`` is part of the result.

    Returns
    -------
    values_between_time : Index of integers

    Examples
    --------
    >>> psidx = ps.date_range("2000-01-01", periods=3, freq="T")
    >>> psidx  # doctest: +NORMALIZE_WHITESPACE
    DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 00:01:00',
                   '2000-01-01 00:02:00'],
                  dtype='datetime64[ns]', freq=None)

    >>> psidx.indexer_between_time("00:01", "00:02").sort_values()
    Int64Index([1, 2], dtype='int64')

    >>> psidx.indexer_between_time("00:01", "00:02", include_end=False)
    Int64Index([1], dtype='int64')

    >>> psidx.indexer_between_time("00:01", "00:02", include_start=False)
    Int64Index([2], dtype='int64')
    """

    @no_type_check
    def pandas_between_time(batch) -> ps.DataFrame[int]:
        # Applied to each pandas batch; the time filtering itself is
        # delegated to pandas.
        return batch.between_time(start_time, end_time, include_start, include_end)

    # Frame that keeps this index but selects no columns.
    index_only = self.to_frame()[[]]
    position_column = verify_temp_column_name(index_only, "__id_column__")
    with_positions = index_only.pandas_on_spark.attach_id_column(
        "distributed-sequence", position_column
    )

    with ps.option_context("compute.default_index_type", "distributed"):
        # The index attached by the batch call below is dropped right
        # afterwards, so the "distributed" default index type is enforced.
        filtered = with_positions.pandas_on_spark.apply_batch(pandas_between_time)

    positions = first_series(filtered).rename(self.name)
    return ps.Index(positions)
def indexer_at_time(self, time: Union[datetime.time, str], asof: bool = False) -> Index:
    """
    Return the integer positions of the entries that fall exactly at a
    given time of day (example: 9:30AM).

    Parameters
    ----------
    time : datetime.time or str
        Time passed in either as object (datetime.time) or as string in
        appropriate format ("%H:%M", "%H%M", "%I:%M%p", "%I%M%p",
        "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", "%I%M%S%p").
    asof : bool, default False
        Not supported; any truthy value raises ``NotImplementedError``.

    Returns
    -------
    values_at_time : Index of integers

    Raises
    ------
    NotImplementedError
        If ``asof`` is truthy.

    Examples
    --------
    >>> psidx = ps.date_range("2000-01-01", periods=3, freq="T")
    >>> psidx  # doctest: +NORMALIZE_WHITESPACE
    DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 00:01:00',
                   '2000-01-01 00:02:00'],
                  dtype='datetime64[ns]', freq=None)

    >>> psidx.indexer_at_time("00:00")
    Int64Index([0], dtype='int64')

    >>> psidx.indexer_at_time("00:01")
    Int64Index([1], dtype='int64')
    """
    if asof:
        raise NotImplementedError("'asof' argument is not supported")

    @no_type_check
    def pandas_at_time(batch) -> ps.DataFrame[int]:
        # Applied to each pandas batch; the time matching itself is
        # delegated to pandas.
        return batch.at_time(time, asof)

    # Frame that keeps this index but selects no columns.
    index_only = self.to_frame()[[]]
    position_column = verify_temp_column_name(index_only, "__id_column__")
    with_positions = index_only.pandas_on_spark.attach_id_column(
        "distributed-sequence", position_column
    )

    with ps.option_context("compute.default_index_type", "distributed"):
        # The index attached by the batch call below is dropped right
        # afterwards, so the "distributed" default index type is enforced.
        matched = with_positions.pandas_on_spark.apply_batch(pandas_at_time)

    positions = first_series(matched).rename(self.name)
    return ps.Index(positions)
def test_categorical_index(self):
    """Compare pandas-on-Spark categorical indexes with their pandas twins.

    Four index pairs are checked: one built with ``CategoricalIndex``, one
    built with ``Index(..., dtype="category")``, one obtained by setting a
    categorical column as the index, and one taken as level 0 of a
    two-column index. Each pair is compared on the index itself, its
    categories, its codes and its ``ordered`` flag.
    """

    def check_against_pandas(expected, actual):
        self.assert_eq(actual, expected)
        self.assert_eq(actual.categories, expected.categories)
        # The pandas codes are wrapped in an Index before comparison.
        self.assert_eq(actual.codes, pd.Index(expected.codes))
        self.assert_eq(actual.ordered, expected.ordered)

    # Built through the dedicated CategoricalIndex constructor.
    check_against_pandas(
        pd.CategoricalIndex([1, 2, 3]),
        ps.CategoricalIndex([1, 2, 3]),
    )

    # Built through the generic Index constructor with a category dtype.
    check_against_pandas(
        pd.Index([1, 2, 3], dtype="category"),
        ps.Index([1, 2, 3], dtype="category"),
    )

    frame_index = pd.Categorical(
        [10, 20, 30, 20, 30, 10], categories=[30, 10, 20], ordered=True
    )
    frame_data = {
        "a": pd.Categorical([1, 2, 3, 1, 2, 3]),
        "b": pd.Categorical(["a", "b", "c", "a", "b", "c"], categories=["c", "b", "a"]),
    }
    pdf = pd.DataFrame(frame_data, index=frame_index)
    psdf = ps.from_pandas(pdf)

    # A categorical column promoted to the index.
    check_against_pandas(
        pdf.set_index("b").index,
        psdf.set_index("b").index,
    )

    # First level of an index made from both categorical columns.
    check_against_pandas(
        pdf.set_index(["a", "b"]).index.get_level_values(0),
        psdf.set_index(["a", "b"]).index.get_level_values(0),
    )
def struct_psser(self):
    """Return a one-row pandas-on-Spark series holding the tuple ``("x", 1)``.

    The series is derived from an index over that single tuple, and its
    own index is replaced by a fresh default one.
    """
    tuple_index = ps.Index([("x", 1)])
    as_series = tuple_index.to_series()
    return as_series.reset_index(drop=True)