pd.Series(np.add(ser, Dummy(1)))) @pytest.mark.parametrize( "values", [ pd.array([1, 3, 2], dtype=np.int64), pd.array([1, 3, 2], dtype="Int64"), pd.array([1, 3, 2], dtype="Float32"), pd.array([1, 10, 2], dtype="Sparse[int]"), pd.to_datetime(["2000", "2010", "2001"]), pd.to_datetime(["2000", "2010", "2001"]).tz_localize("CET"), pd.to_datetime(["2000", "2010", "2001"]).to_period(freq="D"), pd.to_timedelta(["1 Day", "3 Days", "2 Days"]), pd.IntervalIndex( [pd.Interval(0, 1), pd.Interval(2, 3), pd.Interval(1, 2)]), ], ids=lambda x: str(x.dtype), ) @pytest.mark.parametrize("box", [pd.array, pd.Index, pd.Series, pd.DataFrame]) def test_reduce(values, box, request): # TODO: cases with NAs same_type = True if box is pd.Index: if values.dtype.kind in ["i", "f"]: # ATM Index casts to object, so we get python ints/floats same_type = False
class TestDataFrameDataTypes:
    """Checks on the dtypes reported for DataFrames built in various ways."""

    def test_empty_frame_dtypes(self):
        """``dtypes`` of frames lacking rows and/or columns, and of an empty slice."""
        empty_df = DataFrame()
        tm.assert_series_equal(empty_df.dtypes, Series(dtype=object))

        nocols_df = DataFrame(index=[1, 2, 3])
        tm.assert_series_equal(nocols_df.dtypes, Series(dtype=object))

        norows_df = DataFrame(columns=list("abc"))
        tm.assert_series_equal(norows_df.dtypes, Series(object, index=list("abc")))

        norows_int_df = DataFrame(columns=list("abc")).astype(np.int32)
        tm.assert_series_equal(
            norows_int_df.dtypes, Series(np.dtype("int32"), index=list("abc"))
        )

        df = DataFrame({"a": 1, "b": True, "c": 1.0}, index=[1, 2, 3])
        ex_dtypes = Series({"a": np.int64, "b": np.bool_, "c": np.float64})
        tm.assert_series_equal(df.dtypes, ex_dtypes)

        # same but for empty slice of df
        tm.assert_series_equal(df[:0].dtypes, ex_dtypes)

    def test_datetime_with_tz_dtypes(self):
        """Naive and tz-aware datetime columns keep their dtypes after NaT assignment."""
        tzframe = DataFrame(
            {
                "A": date_range("20130101", periods=3),
                "B": date_range("20130101", periods=3, tz="US/Eastern"),
                "C": date_range("20130101", periods=3, tz="CET"),
            }
        )
        tzframe.iloc[1, 1] = pd.NaT
        tzframe.iloc[1, 2] = pd.NaT
        result = tzframe.dtypes.sort_index()
        expected = Series(
            [
                np.dtype("datetime64[ns]"),
                DatetimeTZDtype("ns", "US/Eastern"),
                DatetimeTZDtype("ns", "CET"),
            ],
            ["A", "B", "C"],
        )

        tm.assert_series_equal(result, expected)

    def test_dtypes_are_correct_after_column_slice(self):
        """Slicing columns with ``iloc`` must not disturb the frame's dtypes (GH6525)."""
        # np.float64 is used instead of the np.float_ alias, which NumPy 2.0 removed.
        df = DataFrame(index=range(5), columns=list("abc"), dtype=np.float64)
        all_float = Series({"a": np.float64, "b": np.float64, "c": np.float64})

        tm.assert_series_equal(df.dtypes, all_float)
        tm.assert_series_equal(df.iloc[:, 2:].dtypes, Series({"c": np.float64}))
        # Re-check the full frame: taking the slice above must not have changed it.
        tm.assert_series_equal(df.dtypes, all_float)

    def test_dtypes_gh8722(self, float_string_frame):
        """``dtypes`` matches each column's own dtype, also under ``use_inf_as_na``."""
        # float_string_frame is a fixture supplied outside this class.
        float_string_frame["bool"] = float_string_frame["A"] > 0
        result = float_string_frame.dtypes
        expected = Series(
            {k: v.dtype for k, v in float_string_frame.items()}, index=result.index
        )
        tm.assert_series_equal(result, expected)

        # compat, GH 8722
        with option_context("use_inf_as_na", True):
            df = DataFrame([[1]])
            result = df.dtypes
        tm.assert_series_equal(result, Series({0: np.dtype("int64")}))

    def test_singlerow_slice_categoricaldtype_gives_series(self):
        """``iloc[0]`` on a single categorical column returns a categorical Series (GH29521)."""
        df = DataFrame({"x": pd.Categorical("a b c d e".split())})
        result = df.iloc[0]
        raw_cat = pd.Categorical(["a"], categories=["a", "b", "c", "d", "e"])
        expected = Series(raw_cat, index=["x"], name=0, dtype="category")

        tm.assert_series_equal(result, expected)

    def test_timedeltas(self):
        """dtypes of datetime, timedelta, their sum, and an added int column."""
        df = DataFrame(
            {
                "A": Series(date_range("2012-1-1", periods=3, freq="D")),
                "B": Series([timedelta(days=i) for i in range(3)]),
            }
        )
        result = df.dtypes
        expected = Series(
            [np.dtype("datetime64[ns]"), np.dtype("timedelta64[ns]")],
            index=list("AB"),
        )
        tm.assert_series_equal(result, expected)

        # datetime + timedelta stays datetime
        df["C"] = df["A"] + df["B"]
        result = df.dtypes
        expected = Series(
            [
                np.dtype("datetime64[ns]"),
                np.dtype("timedelta64[ns]"),
                np.dtype("datetime64[ns]"),
            ],
            index=list("ABC"),
        )
        tm.assert_series_equal(result, expected)

        # mixed int types
        df["D"] = 1
        result = df.dtypes
        expected = Series(
            [
                np.dtype("datetime64[ns]"),
                np.dtype("timedelta64[ns]"),
                np.dtype("datetime64[ns]"),
                np.dtype("int64"),
            ],
            index=list("ABCD"),
        )
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "input_vals",
        [
            ([1, 2]),
            (["1", "2"]),
            (list(pd.date_range("1/1/2011", periods=2, freq="H"))),
            (list(pd.date_range("1/1/2011", periods=2, freq="H", tz="US/Eastern"))),
            ([pd.Interval(left=0, right=5)]),
        ],
    )
    def test_constructor_list_str(self, input_vals, string_dtype):
        """Constructing with a string dtype equals constructing then ``astype``."""
        # GH 16605
        # Ensure that data elements are converted to strings when
        # dtype is str, 'str', or 'U'
        # string_dtype is a fixture supplied outside this class.
        result = DataFrame({"A": input_vals}, dtype=string_dtype)
        expected = DataFrame({"A": input_vals}).astype({"A": string_dtype})
        tm.assert_frame_equal(result, expected)

    def test_constructor_list_str_na(self, string_dtype):
        """Floats become strings under a string dtype while ``None`` is kept."""
        result = DataFrame({"A": [1.0, 2.0, None]}, dtype=string_dtype)
        expected = DataFrame({"A": ["1.0", "2.0", None]}, dtype=object)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "data, expected",
        [
            # empty
            (DataFrame(), True),
            # multi-same
            (DataFrame({"A": [1, 2], "B": [1, 2]}), True),
            # multi-object
            (
                DataFrame(
                    {
                        "A": np.array([1, 2], dtype=object),
                        "B": np.array(["a", "b"], dtype=object),
                    }
                ),
                True,
            ),
            # multi-extension
            (
                DataFrame(
                    {"A": pd.Categorical(["a", "b"]), "B": pd.Categorical(["a", "b"])}
                ),
                True,
            ),
            # differ types
            (DataFrame({"A": [1, 2], "B": [1.0, 2.0]}), False),
            # differ sizes
            (
                DataFrame(
                    {
                        "A": np.array([1, 2], dtype=np.int32),
                        "B": np.array([1, 2], dtype=np.int64),
                    }
                ),
                False,
            ),
            # multi-extension differ
            (
                DataFrame(
                    {"A": pd.Categorical(["a", "b"]), "B": pd.Categorical(["b", "c"])}
                ),
                False,
            ),
        ],
    )
    def test_is_homogeneous_type(self, data, expected):
        """``_is_homogeneous_type`` is the exact bool expected for each frame."""
        assert data._is_homogeneous_type is expected

    def test_asarray_homogenous(self):
        """``np.asarray`` on an all-categorical frame yields an object array."""
        df = DataFrame({"A": pd.Categorical([1, 2]), "B": pd.Categorical([1, 2])})
        result = np.asarray(df)
        # may change from object in the future
        expected = np.array([[1, 1], [2, 2]], dtype="object")
        tm.assert_numpy_array_equal(result, expected)

    def test_str_to_small_float_conversion_type(self):
        """Assigning floats over a column of tiny-number strings gives float dtype."""
        # GH 20388
        np.random.seed(13)
        col_data = [str(np.random.random() * 1e-12) for _ in range(5)]
        result = DataFrame(col_data, columns=["A"])
        expected = DataFrame(col_data, columns=["A"], dtype=object)
        tm.assert_frame_equal(result, expected)

        # change the dtype of the elements from object to float one by one
        result.loc[result.index, "A"] = [float(x) for x in col_data]
        expected = DataFrame(col_data, columns=["A"], dtype=float)
        tm.assert_frame_equal(result, expected)
def test_survival_table_from_events_will_collapse_to_desired_bins():
    """Collapsing with explicit ``intervals`` indexes the table by those bins."""
    durations = np.array([1, 3, 4, 5])
    observed = np.array([True, True, True, True])

    # Expected index: right-closed intervals (0, 4] and (4, 8].
    expected_index = [
        pd.Interval(0, 4, closed='right'),
        pd.Interval(4, 8, closed='right'),
    ]

    collapsed = utils.survival_table_from_events(
        durations, observed, collapse=True, intervals=[0, 4, 8]
    )

    assert collapsed.index.tolist() == expected_index
def _plot_donation_histogram(amounts, *, bins, markers, right=None, log=False):
    """Draw and show one histogram of donation amounts with a log y-axis.

    Parameters
    ----------
    amounts : values handed to ``plt.hist``.
    bins : number of histogram bins.
    markers : iterable of ``(amount, color)`` pairs; each is drawn as a dashed
        vertical line labelled ``$<amount>``.
    right : upper x-limit passed to ``plt.xlim`` (``None`` passes no upper limit).
    log : forwarded to ``plt.hist`` as its ``log`` argument.
    """
    plt.hist(amounts, bins=bins, log=log)
    plt.yscale('log')
    plt.xlim(left=0, right=right)
    plt.xlabel("Amount donated in USD", fontsize=30)
    plt.ylabel("Frequency", fontsize=30)
    plt.yticks(fontsize=20)
    plt.xticks(fontsize=20)
    for amount, color in markers:
        plt.axvline(amount, color=color, linestyle='dashed',
                    label="${}".format(amount), alpha=0.5)
    plt.legend(prop={'size': 20})
    plt.show()


def catagorize_donation_amounts(donation_df):
    """Plot donation-size histograms and binned cumulative sums, then correlate.

    ``donation_df`` must provide ``'date'`` and ``'amount'`` columns. The
    row-to-row difference of ``'amount'`` is treated as the donated amount
    (presumably ``'amount'`` is a running total -- confirm against the caller).

    Shows three histograms (full range, 0-50000, 0-300) and a step plot of the
    cumulative sum per donation bin, then passes the top-bin rows, the grouped
    data and the bin edges to ``correlate_binned_data``.
    """
    # Drop the first 11 rows, which are not sampled per 10s.
    SKIP = 11
    # Donation category edges.
    bins = [0, 100, 5000, 50000, 9999999999]

    data_dates = donation_df['date'].iloc[SKIP:]
    # Difference only the 'amount' column: diffing the whole frame also diffs
    # 'date', which is wasted work and fails outright for non-numeric dates.
    donation_data_delta = donation_df['amount'].diff(periods=1).iloc[SKIP:]

    merged = pd.concat([data_dates, donation_data_delta],
                       axis=1,
                       keys=['date', 'donated_amount'])

    # Full range
    _plot_donation_histogram(np.abs(merged['donated_amount']),
                             bins=500,
                             markers=[(50000, 'k')],
                             log=True)

    # Medium - high: histogram of donation amounts up to 50000
    _plot_donation_histogram(np.abs(merged['donated_amount']),
                             bins=10000,
                             markers=[(5000, 'k'), (100, 'r')],
                             right=50000,
                             log=True)

    # Low bound: donations below 300 (signed values, no abs here)
    merged = merged[merged['donated_amount'] < 300]
    _plot_donation_histogram(merged['donated_amount'],
                             bins=2000,
                             markers=[(100, 'r')],
                             right=300)

    # Binned donations: rebuild the unfiltered table first.
    merged = pd.concat([data_dates, donation_data_delta],
                       axis=1,
                       keys=['date', 'donated_amount'])

    # Bin the donations
    merged['bin'] = pd.cut(x=merged['donated_amount'], bins=bins)

    # Cumulative sum within each bin
    merged['cumsum'] = merged.groupby('bin')['donated_amount'].cumsum()

    # Group rows per category; the grouped object is also handed to
    # correlate_binned_data below.
    # NOTE(review): grouping by a one-element list changes how get_group keys
    # are interpreted in recent pandas versions -- confirm before upgrading.
    binned = merged.groupby(['bin'])

    # Rows in the highest interval, derived from the last two bin edges so it
    # cannot drift out of sync with ``bins``.
    TOP_DONORS_INTERVAL = pd.Interval(left=bins[-2], right=bins[-1])
    top_donor_data = binned.get_group(TOP_DONORS_INTERVAL)

    ax = sns.lineplot(x="date",
                      y="cumsum",
                      hue="bin",
                      data=merged,
                      drawstyle="steps-pre")
    ax.set(xlabel='Date', ylabel='Binned Cumulative Sum')
    ax.xaxis.get_label().set_fontsize(30)
    ax.yaxis.get_label().set_fontsize(30)
    ax.tick_params(labelsize=17)
    # NOTE(review): these labels are hard-coded and their order does not follow
    # the ascending bin order -- verify they match the plotted lines.
    plt.legend(loc='upper left',
               labels=[
                   'Biggest donors', 'Medium donors', 'Large donors',
                   'Smallest donors'
               ],
               prop={'size': 20})
    plt.ylabel("USD donated", fontsize=30)
    plt.show()

    correlate_binned_data(top_donor_data, binned, bins)
def test_is_all_dates(self): # GH 23576 year_2017 = pd.Interval(Timestamp("2017-01-01 00:00:00"), Timestamp("2018-01-01 00:00:00")) year_2017_index = pd.IntervalIndex([year_2017]) assert not year_2017_index._is_all_dates
class TestSeriesConvertDtypes:
    """Exhaustive check of ``Series.convert_dtypes`` over its four boolean flags."""

    # Each answerdict key is a 4-tuple of option tuples, one per positional
    # argument (infer_objects, convert_string, convert_integer,
    # convert_boolean). The key stands for every combination drawn from those
    # tuples, so all 16 flag combinations are covered while combinations that
    # share an answer are listed only once.
    _ANY = (True, False)  # the flag's value does not affect the answer
    _YES = (True,)  # answer applies only when the flag is True
    _NO = (False,)  # answer applies only when the flag is False

    @pytest.mark.parametrize(
        "data, maindtype, answerdict",
        [
            (
                [1, 2, 3],
                np.dtype("int32"),
                {
                    (_ANY, _ANY, _YES, _ANY): "Int32",
                    (_ANY, _ANY, _NO, _ANY): np.dtype("int32"),
                },
            ),
            (
                [1, 2, 3],
                np.dtype("int64"),
                {
                    (_ANY, _ANY, _YES, _ANY): "Int64",
                    (_ANY, _ANY, _NO, _ANY): np.dtype("int64"),
                },
            ),
            (
                ["x", "y", "z"],
                np.dtype("O"),
                {
                    (_ANY, _YES, _ANY, _ANY): pd.StringDtype(),
                    (_ANY, _NO, _ANY, _ANY): np.dtype("O"),
                },
            ),
            (
                [True, False, np.nan],
                np.dtype("O"),
                {
                    (_ANY, _ANY, _ANY, _YES): pd.BooleanDtype(),
                    (_ANY, _ANY, _ANY, _NO): np.dtype("O"),
                },
            ),
            (
                ["h", "i", np.nan],
                np.dtype("O"),
                {
                    (_ANY, _YES, _ANY, _ANY): pd.StringDtype(),
                    (_ANY, _NO, _ANY, _ANY): np.dtype("O"),
                },
            ),
            (
                [10, np.nan, 20],
                np.dtype("float"),
                {
                    (_ANY, _ANY, _YES, _ANY): "Int64",
                    (_ANY, _ANY, _NO, _ANY): np.dtype("float"),
                },
            ),
            (
                [np.nan, 100.5, 200],
                np.dtype("float"),
                {(_ANY, _ANY, _ANY, _ANY): np.dtype("float")},
            ),
            (
                [3, 4, 5],
                "Int8",
                {(_ANY, _ANY, _ANY, _ANY): "Int8"},
            ),
            (
                [[1, 2], [3, 4], [5]],
                None,
                {(_ANY, _ANY, _ANY, _ANY): np.dtype("O")},
            ),
            (
                [4, 5, 6],
                np.dtype("uint32"),
                {
                    (_ANY, _ANY, _YES, _ANY): "UInt32",
                    (_ANY, _ANY, _NO, _ANY): np.dtype("uint32"),
                },
            ),
            (
                [-10, 12, 13],
                np.dtype("i1"),
                {
                    (_ANY, _ANY, _YES, _ANY): "Int8",
                    (_ANY, _ANY, _NO, _ANY): np.dtype("i1"),
                },
            ),
            (
                [1, 2.0],
                object,
                {
                    (_ANY, _ANY, _YES, _ANY): "Int64",
                    (_YES, _ANY, _NO, _ANY): np.dtype("float"),
                    (_NO, _ANY, _NO, _ANY): np.dtype("object"),
                },
            ),
            (
                ["a", "b"],
                pd.CategoricalDtype(),
                {(_ANY, _ANY, _ANY, _ANY): pd.CategoricalDtype()},
            ),
            (
                pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
                pd.DatetimeTZDtype(tz="UTC"),
                {(_ANY, _ANY, _ANY, _ANY): pd.DatetimeTZDtype(tz="UTC")},
            ),
            (
                pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
                "datetime64[ns]",
                {(_ANY, _ANY, _ANY, _ANY): np.dtype("datetime64[ns]")},
            ),
            (
                pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
                object,
                {
                    (_YES, _ANY, _ANY, _ANY): np.dtype("datetime64[ns]"),
                    (_NO, _ANY, _ANY, _ANY): np.dtype("O"),
                },
            ),
            (
                pd.period_range("1/1/2011", freq="M", periods=3),
                None,
                {(_ANY, _ANY, _ANY, _ANY): pd.PeriodDtype("M")},
            ),
            (
                pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]),
                None,
                {(_ANY, _ANY, _ANY, _ANY): pd.IntervalDtype("int64")},
            ),
        ],
    )
    @pytest.mark.parametrize("params", product((True, False), repeat=4))
    def test_convert_dtypes(self, data, maindtype, params, answerdict):
        """Convert with ``params`` and compare against the expanded answer table."""
        if maindtype is None:
            series = pd.Series(data)
        else:
            series = pd.Series(data, dtype=maindtype)

        # Expand every compact key into the concrete flag combinations it covers.
        expanded = {}
        for option_grid, dtype in answerdict.items():
            for combo in product(*option_grid):
                expanded[combo] = dtype

        converted = series.convert_dtypes(*params)
        expected = pd.Series(series.values, dtype=expanded[tuple(params)])
        tm.assert_series_equal(converted, expected)

        # Test that it is a copy: blank out the result ...
        snapshot = series.copy(deep=True)
        converted[converted.notna()] = np.nan

        # ... and make sure the original did not change.
        tm.assert_series_equal(series, snapshot)
("datetime64", [np.datetime64("2013-01-01"), np.nan, np.datetime64("2018-01-01")]), ("datetime", [pd.Timestamp("20130101"), np.nan, pd.Timestamp("20180101")]), ("date", [date(2013, 1, 1), np.nan, date(2018, 1, 1)]), # The following two dtypes are commented out due to GH 23554 # ('complex', [1 + 1j, np.nan, 2 + 2j]), # ('timedelta64', [np.timedelta64(1, 'D'), # np.nan, np.timedelta64(2, 'D')]), ("timedelta", [timedelta(1), np.nan, timedelta(2)]), ("time", [time(1), np.nan, time(2)]), ("period", [pd.Period(2013), pd.NaT, pd.Period(2018)]), ("interval", [pd.Interval(0, 1), np.nan, pd.Interval(0, 2)]), ] ids, _ = zip(*_any_skipna_inferred_dtype) # use inferred type as fixture-id @pytest.fixture(params=_any_skipna_inferred_dtype, ids=ids) def any_skipna_inferred_dtype(request): """ Fixture for all inferred dtypes from _libs.lib.infer_dtype The covered (inferred) types are: * 'string' * 'empty' * 'bytes' * 'mixed'
class TestContains:
    """Membership (``in``) checks against ``CategoricalIndex``."""

    def test_contains(self):
        """Only values actually present are contained; codes and unused categories are not."""
        index = CategoricalIndex(
            list("aabbca"), categories=list("cabdef"), ordered=False
        )

        assert "a" in index

        # "z" is no category, "e" is an unused category, NaN is absent, and
        # 0 / 1 are codes rather than values: none of them may be found.
        for absent in ("z", "e", np.nan, 0, 1):
            assert absent not in index

    def test_contains_nan(self):
        """NaN is contained once it is one of the values."""
        values = list("aabbca") + [np.nan]
        index = CategoricalIndex(values, categories=list("cabdef"))
        assert np.nan in index

    @pytest.mark.parametrize("unwrap", [True, False])
    def test_contains_na_dtype(self, unwrap):
        """Generic NAs always match; typed NaTs match only the matching dtype."""
        dti = pd.date_range("2016-01-01", periods=100).insert(0, pd.NaT)
        pi = dti.to_period("D")
        tdi = dti - dti[-1]

        # (source values, datetime64 NaT contained?, timedelta64 NaT contained?)
        cases = [
            (dti, True, False),
            (tdi, False, True),
            (pi, False, False),
        ]
        for values, has_dt64_nat, has_td64_nat in cases:
            container = CategoricalIndex(values)
            if unwrap:
                # Check the underlying ``_data`` object instead of the index.
                container = container._data

            for generic_na in (np.nan, None, pd.NaT):
                assert generic_na in container
            assert (np.datetime64("NaT") in container) is has_dt64_nat
            assert (np.timedelta64("NaT") in container) is has_td64_nat

    @pytest.mark.parametrize(
        "item, expected",
        [
            (pd.Interval(0, 1), True),
            (1.5, True),
            (pd.Interval(0.5, 1.5), False),
            ("a", False),
            (Timestamp(1), False),
            (pd.Timedelta(1), False),
        ],
        ids=str,
    )
    def test_contains_interval(self, item, expected):
        """Membership in an interval-valued CategoricalIndex is an exact bool."""
        # GH 23705
        index = CategoricalIndex(IntervalIndex.from_breaks(range(3)))
        assert (item in index) is expected

    def test_contains_list(self):
        """Testing a list for membership raises ``TypeError`` (unhashable)."""
        # GH#21729
        index = CategoricalIndex([1, 2, 3])

        assert "a" not in index

        for unhashable in (["a"], ["a", "b"]):
            with pytest.raises(TypeError, match="unhashable type"):
                unhashable in index
def _get(self, ep, symbol, start_date, end_date, retry, retry_wait, freq='6H'):
    """Yield pages of decoded JSON from ``/api/v1/{ep}`` for ``symbol``.

    When ``start_date`` is given, the span up to ``end_date`` (now, if
    ``end_date`` is falsy) is cut into ``freq``-sized windows and every window
    is paged through in ascending order. Without ``start_date`` a single
    request chain is made against the ``reverse=true`` endpoint.

    ``retry`` and ``retry_wait`` are passed to ``request_retry``;
    ``retry_wait`` is also the pause after a 502/504 response.
    """
    windows = [None]
    if start_date:
        if not end_date:
            end_date = pd.Timestamp.utcnow()
        first = API._timestamp(start_date)
        last = API._timestamp(end_date)
        windows = pd.interval_range(first, last, freq=freq).tolist()
        if not windows:
            # Span shorter than one ``freq`` step: use it as a single window.
            windows.append(pd.Interval(left=first, right=last))
        elif windows[-1].right < last:
            # Cover the remainder that did not fill a whole ``freq`` step.
            windows.append(pd.Interval(windows[-1].right, last))

    @request_retry(self.ID, retry, retry_wait)
    def helper(start, start_date, end_date):
        if not (start_date and end_date):
            endpoint = f'/api/v1/{ep}?symbol={symbol}&reverse=true'
        else:
            endpoint = f'/api/v1/{ep}?symbol={symbol}&count={API_MAX}&reverse=false&start={start}&startTime={start_date}&endTime={end_date}'

        signed = self.key_id and self.key_secret
        header = self._generate_signature("GET", endpoint) if signed else {}
        header['Accept'] = 'application/json'
        return requests.get(f'{self.api}{endpoint}', headers=header)

    for window in windows:
        start = 0
        if window is not None:
            # Pull the end back by 1ns so it falls just before the window's
            # right edge, then render both bounds as "<date>T<time>Z" strings.
            last_instant = window.right - pd.Timedelta(nanoseconds=1)
            start_date = str(window.left).replace(" ", "T") + "Z"
            end_date = str(last_instant).replace(" ", "T") + "Z"

        while True:
            r = helper(start, start_date, end_date)
            status = r.status_code

            if status in {502, 504}:
                LOG.warning("%s: %d for URL %s - %s", self.ID, status, r.url, r.text)
                sleep(retry_wait)
                continue
            if status == 429:
                # Pause for API_REFRESH, then retry the same request.
                sleep(API_REFRESH)
                continue
            if status != 200:
                self._handle_error(r, LOG)

            limit = int(r.headers['X-RateLimit-Remaining'])
            data = r.json()
            yield data

            received = len(data)
            if received != API_MAX:
                # A page that is not exactly API_MAX long ends this window.
                break
            if limit < 1:
                sleep(API_REFRESH)
            start += received
class TestSeriesReplace: def test_replace_explicit_none(self): # GH#36984 if the user explicitly passes value=None, give it to them ser = pd.Series([0, 0, ""], dtype=object) result = ser.replace("", None) expected = pd.Series([0, 0, None], dtype=object) tm.assert_series_equal(result, expected) df = pd.DataFrame(np.zeros((3, 3))) df.iloc[2, 2] = "" result = df.replace("", None) expected = pd.DataFrame({ 0: np.zeros(3), 1: np.zeros(3), 2: np.array([0.0, 0.0, None], dtype=object), }) assert expected.iloc[2, 2] is None tm.assert_frame_equal(result, expected) # GH#19998 same thing with object dtype ser = pd.Series([10, 20, 30, "a", "a", "b", "a"]) result = ser.replace("a", None) expected = pd.Series([10, 20, 30, None, None, "b", None]) assert expected.iloc[-1] is None tm.assert_series_equal(result, expected) def test_replace_noop_doesnt_downcast(self): # GH#44498 ser = pd.Series( [None, None, pd.Timestamp("2021-12-16 17:31")], dtype=object) res = ser.replace({np.nan: None}) # should be a no-op tm.assert_series_equal(res, ser) assert res.dtype == object # same thing but different calling convention res = ser.replace(np.nan, None) tm.assert_series_equal(res, ser) assert res.dtype == object def test_replace(self): N = 100 ser = pd.Series(np.random.randn(N)) ser[0:4] = np.nan ser[6:10] = 0 # replace list with a single value return_value = ser.replace([np.nan], -1, inplace=True) assert return_value is None exp = ser.fillna(-1) tm.assert_series_equal(ser, exp) rs = ser.replace(0.0, np.nan) ser[ser == 0.0] = np.nan tm.assert_series_equal(rs, ser) ser = pd.Series(np.fabs(np.random.randn(N)), tm.makeDateIndex(N), dtype=object) ser[:5] = np.nan ser[6:10] = "foo" ser[20:30] = "bar" # replace list with a single value rs = ser.replace([np.nan, "foo", "bar"], -1) assert (rs[:5] == -1).all() assert (rs[6:10] == -1).all() assert (rs[20:30] == -1).all() assert (pd.isna(ser[:5])).all() # replace with different values rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3}) assert (rs[:5] == 
-1).all() assert (rs[6:10] == -2).all() assert (rs[20:30] == -3).all() assert (pd.isna(ser[:5])).all() # replace with different values with 2 lists rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3]) tm.assert_series_equal(rs, rs2) # replace inplace return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True) assert return_value is None assert (ser[:5] == -1).all() assert (ser[6:10] == -1).all() assert (ser[20:30] == -1).all() def test_replace_nan_with_inf(self): ser = pd.Series([np.nan, 0, np.inf]) tm.assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0)) ser = pd.Series([np.nan, 0, "foo", "bar", np.inf, None, pd.NaT]) tm.assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0)) filled = ser.copy() filled[4] = 0 tm.assert_series_equal(ser.replace(np.inf, 0), filled) def test_replace_listlike_value_listlike_target(self, datetime_series): ser = pd.Series(datetime_series.index) tm.assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0)) # malformed msg = r"Replacement lists must match in length\. Expecting 3 got 2" with pytest.raises(ValueError, match=msg): ser.replace([1, 2, 3], [np.nan, 0]) # ser is dt64 so can't hold 1 or 2, so this replace is a no-op result = ser.replace([1, 2], [np.nan, 0]) tm.assert_series_equal(result, ser) ser = pd.Series([0, 1, 2, 3, 4]) result = ser.replace([0, 1, 2, 3, 4], [4, 3, 2, 1, 0]) tm.assert_series_equal(result, pd.Series([4, 3, 2, 1, 0])) def test_replace_gh5319(self): # API change from 0.12? 
# GH 5319 ser = pd.Series([0, np.nan, 2, 3, 4]) expected = ser.ffill() result = ser.replace([np.nan]) tm.assert_series_equal(result, expected) ser = pd.Series([0, np.nan, 2, 3, 4]) expected = ser.ffill() result = ser.replace(np.nan) tm.assert_series_equal(result, expected) def test_replace_datetime64(self): # GH 5797 ser = pd.Series(pd.date_range("20130101", periods=5)) expected = ser.copy() expected.loc[2] = pd.Timestamp("20120101") result = ser.replace( {pd.Timestamp("20130103"): pd.Timestamp("20120101")}) tm.assert_series_equal(result, expected) result = ser.replace(pd.Timestamp("20130103"), pd.Timestamp("20120101")) tm.assert_series_equal(result, expected) def test_replace_nat_with_tz(self): # GH 11792: Test with replacing NaT in a list with tz data ts = pd.Timestamp("2015/01/01", tz="UTC") s = pd.Series([pd.NaT, pd.Timestamp("2015/01/01", tz="UTC")]) result = s.replace([np.nan, pd.NaT], pd.Timestamp.min) expected = pd.Series([pd.Timestamp.min, ts], dtype=object) tm.assert_series_equal(expected, result) def test_replace_timedelta_td64(self): tdi = pd.timedelta_range(0, periods=5) ser = pd.Series(tdi) # Using a single dict argument means we go through replace_list result = ser.replace({ser[1]: ser[3]}) expected = pd.Series([ser[0], ser[3], ser[2], ser[3], ser[4]]) tm.assert_series_equal(result, expected) def test_replace_with_single_list(self): ser = pd.Series([0, 1, 2, 3, 4]) result = ser.replace([1, 2, 3]) tm.assert_series_equal(result, pd.Series([0, 0, 0, 0, 4])) s = ser.copy() return_value = s.replace([1, 2, 3], inplace=True) assert return_value is None tm.assert_series_equal(s, pd.Series([0, 0, 0, 0, 4])) # make sure things don't get corrupted when fillna call fails s = ser.copy() msg = (r"Invalid fill method\. Expecting pad \(ffill\) or backfill " r"\(bfill\)\. 
Got crash_cymbal") with pytest.raises(ValueError, match=msg): return_value = s.replace([1, 2, 3], inplace=True, method="crash_cymbal") assert return_value is None tm.assert_series_equal(s, ser) def test_replace_mixed_types(self): ser = pd.Series(np.arange(5), dtype="int64") def check_replace(to_rep, val, expected): sc = ser.copy() result = ser.replace(to_rep, val) return_value = sc.replace(to_rep, val, inplace=True) assert return_value is None tm.assert_series_equal(expected, result) tm.assert_series_equal(expected, sc) # 3.0 can still be held in our int64 series, so we do not upcast GH#44940 tr, v = [3], [3.0] check_replace(tr, v, ser) # Note this matches what we get with the scalars 3 and 3.0 check_replace(tr[0], v[0], ser) # MUST upcast to float e = pd.Series([0, 1, 2, 3.5, 4]) tr, v = [3], [3.5] check_replace(tr, v, e) # casts to object e = pd.Series([0, 1, 2, 3.5, "a"]) tr, v = [3, 4], [3.5, "a"] check_replace(tr, v, e) # again casts to object e = pd.Series([0, 1, 2, 3.5, pd.Timestamp("20130101")]) tr, v = [3, 4], [3.5, pd.Timestamp("20130101")] check_replace(tr, v, e) # casts to object e = pd.Series([0, 1, 2, 3.5, True], dtype="object") tr, v = [3, 4], [3.5, True] check_replace(tr, v, e) # test an object with dates + floats + integers + strings dr = pd.Series(pd.date_range("1/1/2001", "1/10/2001", freq="D")) result = dr.astype(object).replace([dr[0], dr[1], dr[2]], [1.0, 2, "a"]) expected = pd.Series([1.0, 2, "a"] + dr[3:].tolist(), dtype=object) tm.assert_series_equal(result, expected) def test_replace_bool_with_string_no_op(self): s = pd.Series([True, False, True]) result = s.replace("fun", "in-the-sun") tm.assert_series_equal(s, result) def test_replace_bool_with_string(self): # nonexistent elements s = pd.Series([True, False, True]) result = s.replace(True, "2u") expected = pd.Series(["2u", False, "2u"]) tm.assert_series_equal(expected, result) def test_replace_bool_with_bool(self): s = pd.Series([True, False, True]) result = s.replace(True, False) 
expected = pd.Series([False] * len(s)) tm.assert_series_equal(expected, result) def test_replace_with_dict_with_bool_keys(self): s = pd.Series([True, False, True]) result = s.replace({"asdf": "asdb", True: "yes"}) expected = pd.Series(["yes", False, "yes"]) tm.assert_series_equal(result, expected) def test_replace_Int_with_na(self, any_int_ea_dtype): # GH 38267 result = pd.Series([0, None], dtype=any_int_ea_dtype).replace(0, pd.NA) expected = pd.Series([pd.NA, pd.NA], dtype=any_int_ea_dtype) tm.assert_series_equal(result, expected) result = pd.Series([0, 1], dtype=any_int_ea_dtype).replace(0, pd.NA) result.replace(1, pd.NA, inplace=True) tm.assert_series_equal(result, expected) def test_replace2(self): N = 100 ser = pd.Series(np.fabs(np.random.randn(N)), tm.makeDateIndex(N), dtype=object) ser[:5] = np.nan ser[6:10] = "foo" ser[20:30] = "bar" # replace list with a single value rs = ser.replace([np.nan, "foo", "bar"], -1) assert (rs[:5] == -1).all() assert (rs[6:10] == -1).all() assert (rs[20:30] == -1).all() assert (pd.isna(ser[:5])).all() # replace with different values rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3}) assert (rs[:5] == -1).all() assert (rs[6:10] == -2).all() assert (rs[20:30] == -3).all() assert (pd.isna(ser[:5])).all() # replace with different values with 2 lists rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3]) tm.assert_series_equal(rs, rs2) # replace inplace return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True) assert return_value is None assert (ser[:5] == -1).all() assert (ser[6:10] == -1).all() assert (ser[20:30] == -1).all() def test_replace_with_dictlike_and_string_dtype(self, nullable_string_dtype): # GH 32621, GH#44940 ser = pd.Series(["one", "two", np.nan], dtype=nullable_string_dtype) expected = pd.Series(["1", "2", np.nan], dtype=nullable_string_dtype) result = ser.replace({"one": "1", "two": "2"}) tm.assert_series_equal(expected, result) def test_replace_with_empty_dictlike(self): # GH 15289 s = 
pd.Series(list("abcd")) tm.assert_series_equal(s, s.replace({})) with tm.assert_produces_warning(FutureWarning): empty_series = pd.Series([]) tm.assert_series_equal(s, s.replace(empty_series)) def test_replace_string_with_number(self): # GH 15743 s = pd.Series([1, 2, 3]) result = s.replace("2", np.nan) expected = pd.Series([1, 2, 3]) tm.assert_series_equal(expected, result) def test_replace_replacer_equals_replacement(self): # GH 20656 # make sure all replacers are matching against original values s = pd.Series(["a", "b"]) expected = pd.Series(["b", "a"]) result = s.replace({"a": "b", "b": "a"}) tm.assert_series_equal(expected, result) def test_replace_unicode_with_number(self): # GH 15743 s = pd.Series([1, 2, 3]) result = s.replace("2", np.nan) expected = pd.Series([1, 2, 3]) tm.assert_series_equal(expected, result) def test_replace_mixed_types_with_string(self): # Testing mixed s = pd.Series([1, 2, 3, "4", 4, 5]) result = s.replace([2, "4"], np.nan) expected = pd.Series([1, np.nan, 3, np.nan, 4, 5]) tm.assert_series_equal(expected, result) @pytest.mark.parametrize( "categorical, numeric", [ (pd.Categorical(["A"], categories=["A", "B"]), [1]), (pd.Categorical(["A", "B"], categories=["A", "B"]), [1, 2]), ], ) def test_replace_categorical(self, categorical, numeric): # GH 24971, GH#23305 ser = pd.Series(categorical) result = ser.replace({"A": 1, "B": 2}) expected = pd.Series(numeric).astype("category") if 2 not in expected.cat.categories: # i.e. 
categories should be [1, 2] even if there are no "B"s present # GH#44940 expected = expected.cat.add_categories(2) tm.assert_series_equal(expected, result) def test_replace_categorical_single(self): # GH 26988 dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific") s = pd.Series(dti) c = s.astype("category") expected = c.copy() expected = expected.cat.add_categories("foo") expected[2] = "foo" expected = expected.cat.remove_unused_categories() assert c[2] != "foo" result = c.replace(c[2], "foo") tm.assert_series_equal(expected, result) assert c[2] != "foo" # ensure non-inplace call does not alter original return_value = c.replace(c[2], "foo", inplace=True) assert return_value is None tm.assert_series_equal(expected, c) first_value = c[0] return_value = c.replace(c[1], c[0], inplace=True) assert return_value is None assert c[0] == c[1] == first_value # test replacing with existing value def test_replace_with_no_overflowerror(self): # GH 25616 # casts to object without Exception from OverflowError s = pd.Series([0, 1, 2, 3, 4]) result = s.replace([3], ["100000000000000000000"]) expected = pd.Series([0, 1, 2, "100000000000000000000", 4]) tm.assert_series_equal(result, expected) s = pd.Series([0, "100000000000000000000", "100000000000000000001"]) result = s.replace(["100000000000000000000"], [1]) expected = pd.Series([0, 1, "100000000000000000001"]) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "ser, to_replace, exp", [ ([1, 2, 3], { 1: 2, 2: 3, 3: 4 }, [2, 3, 4]), (["1", "2", "3"], { "1": "2", "2": "3", "3": "4" }, ["2", "3", "4"]), ], ) def test_replace_commutative(self, ser, to_replace, exp): # GH 16051 # DataFrame.replace() overwrites when values are non-numeric series = pd.Series(ser) expected = pd.Series(exp) result = series.replace(to_replace) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("ser, exp", [([1, 2, 3], [1, True, 3]), (["x", 2, 3], ["x", True, 3])]) def test_replace_no_cast(self, ser, exp): # GH 9113 # BUG: 
replace int64 dtype with bool coerces to int64 series = pd.Series(ser) result = series.replace(2, True) expected = pd.Series(exp) tm.assert_series_equal(result, expected) def test_replace_invalid_to_replace(self): # GH 18634 # API: replace() should raise an exception if invalid argument is given series = pd.Series(["a", "b", "c "]) msg = (r"Expecting 'to_replace' to be either a scalar, array-like, " r"dict or None, got invalid type.*") with pytest.raises(TypeError, match=msg): series.replace(lambda x: x.strip()) @pytest.mark.parametrize("frame", [False, True]) def test_replace_nonbool_regex(self, frame): obj = pd.Series(["a", "b", "c "]) if frame: obj = obj.to_frame() msg = "'to_replace' must be 'None' if 'regex' is not a bool" with pytest.raises(ValueError, match=msg): obj.replace(to_replace=["a"], regex="foo") @pytest.mark.parametrize("frame", [False, True]) def test_replace_empty_copy(self, frame): obj = pd.Series([], dtype=np.float64) if frame: obj = obj.to_frame() res = obj.replace(4, 5, inplace=True) assert res is None res = obj.replace(4, 5, inplace=False) tm.assert_equal(res, obj) assert res is not obj def test_replace_only_one_dictlike_arg(self, fixed_now_ts): # GH#33340 ser = pd.Series([1, 2, "A", fixed_now_ts, True]) to_replace = {0: 1, 2: "A"} value = "foo" msg = "Series.replace cannot use dict-like to_replace and non-None value" with pytest.raises(ValueError, match=msg): ser.replace(to_replace, value) to_replace = 1 value = {0: "foo", 2: "bar"} msg = "Series.replace cannot use dict-value and non-None to_replace" with pytest.raises(ValueError, match=msg): ser.replace(to_replace, value) def test_replace_extension_other(self, frame_or_series): # https://github.com/pandas-dev/pandas/issues/34530 obj = frame_or_series(pd.array([1, 2, 3], dtype="Int64")) result = obj.replace("", "") # no exception # should not have changed dtype tm.assert_equal(obj, result) def _check_replace_with_method(self, ser: pd.Series): df = ser.to_frame() res = ser.replace(ser[1], 
method="pad") expected = pd.Series([ser[0], ser[0]] + list(ser[2:]), dtype=ser.dtype) tm.assert_series_equal(res, expected) res_df = df.replace(ser[1], method="pad") tm.assert_frame_equal(res_df, expected.to_frame()) ser2 = ser.copy() res2 = ser2.replace(ser[1], method="pad", inplace=True) assert res2 is None tm.assert_series_equal(ser2, expected) res_df2 = df.replace(ser[1], method="pad", inplace=True) assert res_df2 is None tm.assert_frame_equal(df, expected.to_frame()) def test_replace_ea_dtype_with_method(self, any_numeric_ea_dtype): arr = pd.array([1, 2, pd.NA, 4], dtype=any_numeric_ea_dtype) ser = pd.Series(arr) self._check_replace_with_method(ser) @pytest.mark.parametrize("as_categorical", [True, False]) def test_replace_interval_with_method(self, as_categorical): # in particular interval that can't hold NA idx = pd.IntervalIndex.from_breaks(range(4)) ser = pd.Series(idx) if as_categorical: ser = ser.astype("category") self._check_replace_with_method(ser) @pytest.mark.parametrize("as_period", [True, False]) @pytest.mark.parametrize("as_categorical", [True, False]) def test_replace_datetimelike_with_method(self, as_period, as_categorical): idx = pd.date_range("2016-01-01", periods=5, tz="US/Pacific") if as_period: idx = idx.tz_localize(None).to_period("D") ser = pd.Series(idx) ser.iloc[-2] = pd.NaT if as_categorical: ser = ser.astype("category") self._check_replace_with_method(ser) def test_replace_with_compiled_regex(self): # https://github.com/pandas-dev/pandas/issues/35680 s = pd.Series(["a", "b", "c"]) regex = re.compile("^a$") result = s.replace({regex: "z"}, regex=True) expected = pd.Series(["z", "b", "c"]) tm.assert_series_equal(result, expected) def test_pandas_replace_na(self): # GH#43344 ser = pd.Series(["AA", "BB", "CC", "DD", "EE", "", pd.NA], dtype="string") regex_mapping = { "AA": "CC", "BB": "CC", "EE": "CC", "CC": "CC-REPL", } result = ser.replace(regex_mapping, regex=True) exp = pd.Series(["CC", "CC", "CC-REPL", "DD", "CC", "", pd.NA], 
dtype="string") tm.assert_series_equal(result, exp) @pytest.mark.parametrize( "dtype, input_data, to_replace, expected_data", [ ("bool", [True, False], { True: False }, [False, False]), ("int64", [1, 2], { 1: 10, 2: 20 }, [10, 20]), ("Int64", [1, 2], { 1: 10, 2: 20 }, [10, 20]), ("float64", [1.1, 2.2], { 1.1: 10.1, 2.2: 20.5 }, [10.1, 20.5]), ("Float64", [1.1, 2.2], { 1.1: 10.1, 2.2: 20.5 }, [10.1, 20.5]), ("string", ["one", "two"], { "one": "1", "two": "2" }, ["1", "2"]), ( pd.IntervalDtype("int64"), IntervalArray([pd.Interval(1, 2), pd.Interval(2, 3)]), { pd.Interval(1, 2): pd.Interval(10, 20) }, IntervalArray([pd.Interval(10, 20), pd.Interval(2, 3)]), ), ( pd.IntervalDtype("float64"), IntervalArray([pd.Interval(1.0, 2.7), pd.Interval(2.8, 3.1)]), { pd.Interval(1.0, 2.7): pd.Interval(10.6, 20.8) }, IntervalArray([pd.Interval(10.6, 20.8), pd.Interval(2.8, 3.1)]), ), ( pd.PeriodDtype("M"), [pd.Period("2020-05", freq="M")], { pd.Period("2020-05", freq="M"): pd.Period("2020-06", freq="M") }, [pd.Period("2020-06", freq="M")], ), ], ) def test_replace_dtype(self, dtype, input_data, to_replace, expected_data): # GH#33484 ser = pd.Series(input_data, dtype=dtype) result = ser.replace(to_replace) expected = pd.Series(expected_data, dtype=dtype) tm.assert_series_equal(result, expected) def test_replace_string_dtype(self): # GH#40732, GH#44940 ser = pd.Series(["one", "two", np.nan], dtype="string") res = ser.replace({"one": "1", "two": "2"}) expected = pd.Series(["1", "2", np.nan], dtype="string") tm.assert_series_equal(res, expected) # GH#31644 ser2 = pd.Series(["A", np.nan], dtype="string") res2 = ser2.replace("A", "B") expected2 = pd.Series(["B", np.nan], dtype="string") tm.assert_series_equal(res2, expected2) ser3 = pd.Series(["A", "B"], dtype="string") res3 = ser3.replace("A", pd.NA) expected3 = pd.Series([pd.NA, "B"], dtype="string") tm.assert_series_equal(res3, expected3) def test_replace_string_dtype_list_to_replace(self): # GH#41215, GH#44940 ser = pd.Series(["abc", 
"def"], dtype="string") res = ser.replace(["abc", "any other string"], "xyz") expected = pd.Series(["xyz", "def"], dtype="string") tm.assert_series_equal(res, expected) def test_replace_string_dtype_regex(self): # GH#31644 ser = pd.Series(["A", "B"], dtype="string") res = ser.replace(r".", "C", regex=True) expected = pd.Series(["C", "C"], dtype="string") tm.assert_series_equal(res, expected) def test_replace_nullable_numeric(self): # GH#40732, GH#44940 floats = pd.Series([1.0, 2.0, 3.999, 4.4], dtype=pd.Float64Dtype()) assert floats.replace({1.0: 9}).dtype == floats.dtype assert floats.replace(1.0, 9).dtype == floats.dtype assert floats.replace({1.0: 9.0}).dtype == floats.dtype assert floats.replace(1.0, 9.0).dtype == floats.dtype res = floats.replace(to_replace=[1.0, 2.0], value=[9.0, 10.0]) assert res.dtype == floats.dtype ints = pd.Series([1, 2, 3, 4], dtype=pd.Int64Dtype()) assert ints.replace({1: 9}).dtype == ints.dtype assert ints.replace(1, 9).dtype == ints.dtype assert ints.replace({1: 9.0}).dtype == ints.dtype assert ints.replace(1, 9.0).dtype == ints.dtype # nullable (for now) raises instead of casting with pytest.raises(TypeError, match="Invalid value"): ints.replace({1: 9.5}) with pytest.raises(TypeError, match="Invalid value"): ints.replace(1, 9.5) @pytest.mark.parametrize("regex", [False, True]) def test_replace_regex_dtype_series(self, regex): # GH-48644 series = pd.Series(["0"]) expected = pd.Series([1]) result = series.replace(to_replace="0", value=1, regex=regex) tm.assert_series_equal(result, expected) def test_replace_different_int_types(self, any_int_numpy_dtype): # GH#45311 labs = pd.Series([1, 1, 1, 0, 0, 2, 2, 2], dtype=any_int_numpy_dtype) maps = pd.Series([0, 2, 1], dtype=any_int_numpy_dtype) map_dict = {old: new for (old, new) in zip(maps.values, maps.index)} result = labs.replace(map_dict) expected = labs.replace({0: 0, 2: 1, 1: 2}) tm.assert_series_equal(result, expected)
def test_slicing_agg_min_max(s1_fix):
    """Check the min/max aggregation over slices of ``s1_fix``.

    ``s1_fix`` is sliced at ``range(-4, 11, 2)`` and aggregated with
    ``["min", "max"]``.  Each of the two results is compared against a
    hard-coded Series keyed by the seven left-closed, width-2 intervals
    running from -4 to 10.  Series names and index types are not compared.
    """
    result = s1_fix.slice(range(-4, 11, 2)).agg(["min", "max"])

    # Left-closed windows [-4, -2), [-2, 0), ..., [8, 10).
    windows = [
        pd.Interval(left, left + 2, closed="left")
        for left in range(-4, 10, 2)
    ]

    # Expected value per window, listed in the same order as ``windows``.
    expected_by_stat = (
        ("min", [-1.75, -1.75, -1.75, 0.25, 2.0, -0.5, -0.5]),
        ("max", [-1.75, -1.75, 0.25, 2.75, 2.75, -0.5, -0.5]),
    )

    for stat, values in expected_by_stat:
        pd.testing.assert_series_equal(
            result[stat],
            pd.Series(dict(zip(windows, values))),
            check_names=False,
            check_index_type=False,
        )
class TestHashing(object):
    """Tests for the hashing helpers used in this file.

    Exercises ``hash_array``, ``hash_tuples``, ``hash_tuple``,
    ``_hash_scalar`` and ``hash_pandas_object``.
    """

    @pytest.fixture(params=[
        Series([1, 2, 3] * 3, dtype='int32'),
        Series([None, 2.5, 3.5] * 3, dtype='float32'),
        Series(['a', 'b', 'c'] * 3, dtype='category'),
        Series(['d', 'e', 'f'] * 3),
        Series([True, False, True] * 3),
        Series(pd.date_range('20130101', periods=9)),
        Series(pd.date_range('20130101', periods=9, tz='US/Eastern')),
        Series(pd.timedelta_range('2000', periods=9))
    ])
    def series(self, request):
        """Return the Series selected by the current fixture param."""
        return request.param

    def test_consistency(self):
        """Pin the hashes of a fixed Index to hard-coded values.

        The constants are the ground truth: the hash must not change
        because of a mistake in the actual code.
        """
        labels = ['foo', 'bar', 'baz']
        ground_truth = np.array(
            [3600424527151052760, 1374399572096150070, 477881037637427054],
            dtype='uint64')

        result = hash_pandas_object(Index(labels))
        expected = Series(ground_truth, index=labels)
        tm.assert_series_equal(result, expected)

    def test_hash_array(self, series):
        """Hashing the same underlying values twice gives equal arrays."""
        values = series.values
        tm.assert_numpy_array_equal(hash_array(values), hash_array(values))

    def test_hash_array_mixed(self):
        """Mixed, all-string and object arrays must hash identically."""
        from_mixed = hash_array(np.array([3, 4, 'All']))
        from_strings = hash_array(np.array(['3', '4', 'All']))
        from_objects = hash_array(np.array([3, 4, 'All'], dtype=object))

        tm.assert_numpy_array_equal(from_mixed, from_strings)
        tm.assert_numpy_array_equal(from_mixed, from_objects)

    @pytest.mark.parametrize('val', [5, 'foo', pd.Timestamp('20130101')])
    def test_hash_array_errors(self, val):
        """``hash_array`` raises TypeError for these scalar inputs."""
        with tm.assert_raises_regex(TypeError, 'must pass a ndarray-like'):
            hash_array(val)

    def check_equal(self, obj, **kwargs):
        """Assert that hashing ``obj`` twice yields equal Series.

        The comparison is done once with ``kwargs`` as given and once more
        after any ``'index'`` entry has been removed from ``kwargs``.
        """
        first = hash_pandas_object(obj, **kwargs)
        second = hash_pandas_object(obj, **kwargs)
        tm.assert_series_equal(first, second)

        # Second round: same check without an explicit ``index`` option.
        kwargs.pop('index', None)
        first = hash_pandas_object(obj, **kwargs)
        second = hash_pandas_object(obj, **kwargs)
        tm.assert_series_equal(first, second)

    def check_not_equal_with_index(self, obj):
        """Assert that including the index changes the hash of ``obj``.

        Nothing is checked when ``obj`` is itself an ``Index`` or is empty.
        """
        if isinstance(obj, Index):
            return

        with_index = hash_pandas_object(obj, index=True)
        without_index = hash_pandas_object(obj, index=False)
        if len(obj):
            assert not (with_index == without_index).all()

    def test_hash_tuples(self):
        """``hash_tuples`` agrees with hashing the equivalent MultiIndex."""
        pairs = [(1, 'one'), (1, 'two'), (2, 'one')]

        hashed = hash_tuples(pairs)
        via_multiindex = hash_pandas_object(
            MultiIndex.from_tuples(pairs)).values
        tm.assert_numpy_array_equal(hashed, via_multiindex)

        # A single tuple is accepted as well and matches the first entry.
        single = hash_tuples(pairs[0])
        assert single == via_multiindex[0]

    @pytest.mark.parametrize('tup', [
        (1, 'one'),
        (1, np.nan),
        (1.0, pd.NaT, 'A'),
        ('A', pd.Timestamp("2012-01-01"))
    ])
    def test_hash_tuple(self, tup):
        """``hash_tuple`` is equivalent to ``hash_tuples`` on one tuple."""
        single = hash_tuple(tup)
        batched = hash_tuples([tup])[0]
        assert single == batched

    @pytest.mark.parametrize('val', [
        1,
        1.4,
        'A',
        b'A',
        u'A',
        pd.Timestamp("2012-01-01"),
        pd.Timestamp("2012-01-01", tz='Europe/Brussels'),
        datetime.datetime(2012, 1, 1),
        pd.Timestamp("2012-01-01", tz='EST').to_pydatetime(),
        pd.Timedelta('1 days'),
        datetime.timedelta(1),
        pd.Period('2012-01-01', freq='D'),
        pd.Interval(0, 1),
        np.nan,
        pd.NaT,
        None
    ])
    def test_hash_scalar(self, val):
        """``_hash_scalar`` matches ``hash_array`` on a one-element array."""
        scalar_hash = _hash_scalar(val)
        array_hash = hash_array(np.array([val], dtype=object),
                                categorize=True)
        assert scalar_hash[0] == array_hash[0]

    @pytest.mark.parametrize('val', [5, 'foo', pd.Timestamp('20130101')])
    def test_hash_tuples_err(self, val):
        """``hash_tuples`` raises TypeError for these scalar inputs."""
        with tm.assert_raises_regex(
                TypeError, 'must be convertible to a list-of-tuples'):
            hash_tuples(val)

    def test_multiindex_unique(self):
        """A unique MultiIndex produces unique hashes."""
        mi = MultiIndex.from_tuples(
            [(118, 472), (236, 118), (51, 204), (102, 51)])
        assert mi.is_unique

        hashed = hash_pandas_object(mi)
        assert hashed.is_unique

    def test_multiindex_objects(self):
        """Compare hashes of a MultiIndex and its level-sorted copy."""
        mi = MultiIndex(
            levels=[['b', 'd', 'a'], [1, 2, 3]],
            labels=[[0, 1, 0, 2], [2, 0, 0, 1]],
            names=['col1', 'col2'])
        recons = mi._sort_levels_monotonic()

        # these are equal
        assert mi.equals(recons)
        assert Index(mi.values).equals(Index(recons.values))

        # _hashed_values and hash_pandas_object(..., index=False)
        # equivalency, for both the original and the reconstruction
        for index in (mi, recons):
            expected = hash_pandas_object(index, index=False).values
            tm.assert_numpy_array_equal(index._hashed_values, expected)

        expected = mi._hashed_values
        result = recons._hashed_values

        # values should match, but in different order
        tm.assert_numpy_array_equal(np.sort(result), np.sort(expected))

    @pytest.mark.parametrize('obj', [
        Series([1, 2, 3]),
        Series([1.0, 1.5, 3.2]),
        Series([1.0, 1.5, np.nan]),
        Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
        Series(['a', 'b', 'c']),
        Series(['a', np.nan, 'c']),
        Series(['a', None, 'c']),
        Series([True, False, True]),
        Series(),
        Index([1, 2, 3]),
        Index([True, False, True]),
        DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}),
        DataFrame(),
        tm.makeMissingDataframe(),
        tm.makeMixedDataFrame(),
        tm.makeTimeDataFrame(),
        tm.makeTimeSeries(),
        tm.makeTimedeltaIndex(),
        tm.makePeriodIndex(),
        Series(tm.makePeriodIndex()),
        Series(pd.date_range('20130101', periods=3, tz='US/Eastern')),
        MultiIndex.from_product([
            range(5),
            ['foo', 'bar', 'baz'],
            pd.date_range('20130101', periods=2)
        ]),
        MultiIndex.from_product([
            pd.CategoricalIndex(list('aabc')),
            range(3)
        ])
    ])
    def test_hash_pandas_object(self, obj):
        """Run both hashing checks on a range of pandas objects."""
        self.check_equal(obj)
        self.check_not_equal_with_index(obj)

    def test_hash_pandas_object2(self, series):
        """Run both hashing checks on every ``series`` fixture value."""
        self.check_equal(series)
        self.check_not_equal_with_index(series)

    @pytest.mark.parametrize('obj', [
        Series([], dtype='float64'),
        Series([], dtype='object'),
        Index([])
    ])
    def test_hash_pandas_empty_object(self, obj):
        """Empty objects only get the repeatability check.

        These are by definition the same with or without the index, as
        the data is empty.
        """
        self.check_equal(obj)

    @pytest.mark.parametrize('s1', [
        Series(['a', 'b', 'c', 'd']),
        Series([1000, 2000, 3000, 4000]),
        Series(pd.date_range(0, periods=4))
    ])
    @pytest.mark.parametrize('categorize', [True, False])
    def test_categorical_consistency(self, s1, categorize):
        """Categoricals hash consistently with their values, not codes.

        GH15143.  This should work for categoricals of any dtype.
        """
        s2 = s1.astype('category').cat.set_categories(s1)
        s3 = s2.cat.set_categories(list(reversed(s1)))

        # These should all hash identically
        hashes = [hash_pandas_object(candidate, categorize=categorize)
                  for candidate in (s1, s2, s3)]
        reference = hashes[0]
        for other in hashes[1:]:
            tm.assert_series_equal(reference, other)

    def test_categorical_with_nan_consistency(self):
        """Hashes of a small categorical appear among those of a larger one.

        Both categoricals are built from codes that start with ``-1``.
        """
        wide = pd.Categorical.from_codes(
            [-1, 0, 1, 2, 3, 4],
            categories=pd.date_range('2012-01-01', periods=5, name='B'))
        expected = hash_array(wide, categorize=False)

        narrow = pd.Categorical.from_codes(
            [-1, 0],
            categories=[pd.Timestamp('2012-01-01')])
        result = hash_array(narrow, categorize=False)

        for position in (0, 1):
            assert result[position] in expected

    @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
    def test_pandas_errors(self):
        """``hash_pandas_object`` raises TypeError for these inputs."""
        with pytest.raises(TypeError):
            hash_pandas_object(pd.Timestamp('20130101'))

        panel = tm.makePanel()

        with pytest.raises(TypeError):
            hash_pandas_object(panel)

    def test_hash_keys(self):
        """Different hash keys give different hashes for the same data.

        This only matters for object dtypes.
        """
        letters = Series(list('abc'))
        hashed_a = hash_pandas_object(letters, hash_key='9876543210123456')
        hashed_b = hash_pandas_object(letters, hash_key='9876543210123465')
        assert (hashed_a != hashed_b).all()

    def test_invalid_key(self):
        """A hash key of the wrong length raises ValueError.

        This only matters for object dtypes.
        """
        expected_message = 'key should be a 16-byte string encoded'
        with tm.assert_raises_regex(ValueError, expected_message):
            hash_pandas_object(Series(list('abc')), hash_key='foo')

    def test_alread_encoded(self):
        """Values that are already encoded are hashed without error."""
        encoded = Series(list('abc')).str.encode('utf8')
        self.check_equal(encoded)

    def test_alternate_encoding(self):
        """Hashing is repeatable when ``encoding='ascii'`` is passed."""
        letters = Series(list('abc'))
        self.check_equal(letters, encoding='ascii')

    @pytest.mark.parametrize('l_exp', range(8))
    @pytest.mark.parametrize('l_add', [0, 1])
    def test_same_len_hash_collisions(self, l_exp, l_add):
        """The first two of many random strings must not collide."""
        string_length = 2 ** (l_exp + 8) + l_add
        strings = tm.rands_array(string_length, 2)
        hashed = hash_array(strings, 'utf8')
        assert hashed[0] != hashed[1]

    def test_hash_collisions(self):
        """Two long, distinct strings keep their pinned, distinct hashes.

        hash collisions are bad
        https://github.com/pandas-dev/pandas/issues/14711#issuecomment-264885726
        """
        long_names = [
            'Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9',  # noqa
            'Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe',  # noqa
        ]

        # these should be different!
        first_hash = hash_array(
            np.asarray(long_names[:1], dtype=object), 'utf8')
        first_expected = np.array([14963968704024874985], dtype=np.uint64)
        tm.assert_numpy_array_equal(first_hash, first_expected)

        second_hash = hash_array(
            np.asarray(long_names[1:], dtype=object), 'utf8')
        second_expected = np.array([16428432627716348016], dtype=np.uint64)
        tm.assert_numpy_array_equal(second_hash, second_expected)

        # Hashing both together must give the two pinned values in order.
        both_hashes = hash_array(
            np.asarray(long_names, dtype=object), 'utf8')
        tm.assert_numpy_array_equal(
            both_hashes,
            np.concatenate([first_expected, second_expected], axis=0))
class TestDataFrameDataTypes(TestData): def test_concat_empty_dataframe_dtypes(self): df = DataFrame(columns=list("abc")) df['a'] = df['a'].astype(np.bool_) df['b'] = df['b'].astype(np.int32) df['c'] = df['c'].astype(np.float64) result = pd.concat([df, df]) assert result['a'].dtype == np.bool_ assert result['b'].dtype == np.int32 assert result['c'].dtype == np.float64 result = pd.concat([df, df.astype(np.float64)]) assert result['a'].dtype == np.object_ assert result['b'].dtype == np.float64 assert result['c'].dtype == np.float64 def test_empty_frame_dtypes_ftypes(self): empty_df = pd.DataFrame() assert_series_equal(empty_df.dtypes, pd.Series(dtype=np.object)) assert_series_equal(empty_df.ftypes, pd.Series(dtype=np.object)) nocols_df = pd.DataFrame(index=[1, 2, 3]) assert_series_equal(nocols_df.dtypes, pd.Series(dtype=np.object)) assert_series_equal(nocols_df.ftypes, pd.Series(dtype=np.object)) norows_df = pd.DataFrame(columns=list("abc")) assert_series_equal(norows_df.dtypes, pd.Series( np.object, index=list("abc"))) assert_series_equal(norows_df.ftypes, pd.Series( 'object:dense', index=list("abc"))) norows_int_df = pd.DataFrame(columns=list("abc")).astype(np.int32) assert_series_equal(norows_int_df.dtypes, pd.Series( np.dtype('int32'), index=list("abc"))) assert_series_equal(norows_int_df.ftypes, pd.Series( 'int32:dense', index=list("abc"))) odict = compat.OrderedDict df = pd.DataFrame(odict([('a', 1), ('b', True), ('c', 1.0)]), index=[1, 2, 3]) ex_dtypes = pd.Series(odict([('a', np.int64), ('b', np.bool), ('c', np.float64)])) ex_ftypes = pd.Series(odict([('a', 'int64:dense'), ('b', 'bool:dense'), ('c', 'float64:dense')])) assert_series_equal(df.dtypes, ex_dtypes) assert_series_equal(df.ftypes, ex_ftypes) # same but for empty slice of df assert_series_equal(df[:0].dtypes, ex_dtypes) assert_series_equal(df[:0].ftypes, ex_ftypes) def test_datetime_with_tz_dtypes(self): tzframe = DataFrame({'A': date_range('20130101', periods=3), 'B': date_range('20130101', 
periods=3, tz='US/Eastern'), 'C': date_range('20130101', periods=3, tz='CET')}) tzframe.iloc[1, 1] = pd.NaT tzframe.iloc[1, 2] = pd.NaT result = tzframe.dtypes.sort_index() expected = Series([np.dtype('datetime64[ns]'), DatetimeTZDtype('datetime64[ns, US/Eastern]'), DatetimeTZDtype('datetime64[ns, CET]')], ['A', 'B', 'C']) assert_series_equal(result, expected) def test_dtypes_are_correct_after_column_slice(self): # GH6525 df = pd.DataFrame(index=range(5), columns=list("abc"), dtype=np.float_) odict = compat.OrderedDict assert_series_equal(df.dtypes, pd.Series(odict([('a', np.float_), ('b', np.float_), ('c', np.float_)]))) assert_series_equal(df.iloc[:, 2:].dtypes, pd.Series(odict([('c', np.float_)]))) assert_series_equal(df.dtypes, pd.Series(odict([('a', np.float_), ('b', np.float_), ('c', np.float_)]))) def test_select_dtypes_include_using_list_like(self): df = DataFrame({'a': list('abc'), 'b': list(range(1, 4)), 'c': np.arange(3, 6).astype('u1'), 'd': np.arange(4.0, 7.0, dtype='float64'), 'e': [True, False, True], 'f': pd.Categorical(list('abc')), 'g': pd.date_range('20130101', periods=3), 'h': pd.date_range('20130101', periods=3, tz='US/Eastern'), 'i': pd.date_range('20130101', periods=3, tz='CET'), 'j': pd.period_range('2013-01', periods=3, freq='M'), 'k': pd.timedelta_range('1 day', periods=3)}) ri = df.select_dtypes(include=[np.number]) ei = df[['b', 'c', 'd', 'k']] assert_frame_equal(ri, ei) ri = df.select_dtypes(include=[np.number], exclude=['timedelta']) ei = df[['b', 'c', 'd']] assert_frame_equal(ri, ei) ri = df.select_dtypes(include=[np.number, 'category'], exclude=['timedelta']) ei = df[['b', 'c', 'd', 'f']] assert_frame_equal(ri, ei) ri = df.select_dtypes(include=['datetime']) ei = df[['g']] assert_frame_equal(ri, ei) ri = df.select_dtypes(include=['datetime64']) ei = df[['g']] assert_frame_equal(ri, ei) ri = df.select_dtypes(include=['datetimetz']) ei = df[['h', 'i']] assert_frame_equal(ri, ei) pytest.raises(NotImplementedError, lambda: 
df.select_dtypes(include=['period'])) def test_select_dtypes_exclude_using_list_like(self): df = DataFrame({'a': list('abc'), 'b': list(range(1, 4)), 'c': np.arange(3, 6).astype('u1'), 'd': np.arange(4.0, 7.0, dtype='float64'), 'e': [True, False, True]}) re = df.select_dtypes(exclude=[np.number]) ee = df[['a', 'e']] assert_frame_equal(re, ee) def test_select_dtypes_exclude_include_using_list_like(self): df = DataFrame({'a': list('abc'), 'b': list(range(1, 4)), 'c': np.arange(3, 6).astype('u1'), 'd': np.arange(4.0, 7.0, dtype='float64'), 'e': [True, False, True], 'f': pd.date_range('now', periods=3).values}) exclude = np.datetime64, include = np.bool_, 'integer' r = df.select_dtypes(include=include, exclude=exclude) e = df[['b', 'c', 'e']] assert_frame_equal(r, e) exclude = 'datetime', include = 'bool', 'int64', 'int32' r = df.select_dtypes(include=include, exclude=exclude) e = df[['b', 'e']] assert_frame_equal(r, e) def test_select_dtypes_include_using_scalars(self): df = DataFrame({'a': list('abc'), 'b': list(range(1, 4)), 'c': np.arange(3, 6).astype('u1'), 'd': np.arange(4.0, 7.0, dtype='float64'), 'e': [True, False, True], 'f': pd.Categorical(list('abc')), 'g': pd.date_range('20130101', periods=3), 'h': pd.date_range('20130101', periods=3, tz='US/Eastern'), 'i': pd.date_range('20130101', periods=3, tz='CET'), 'j': pd.period_range('2013-01', periods=3, freq='M'), 'k': pd.timedelta_range('1 day', periods=3)}) ri = df.select_dtypes(include=np.number) ei = df[['b', 'c', 'd', 'k']] assert_frame_equal(ri, ei) ri = df.select_dtypes(include='datetime') ei = df[['g']] assert_frame_equal(ri, ei) ri = df.select_dtypes(include='datetime64') ei = df[['g']] assert_frame_equal(ri, ei) ri = df.select_dtypes(include='category') ei = df[['f']] assert_frame_equal(ri, ei) pytest.raises(NotImplementedError, lambda: df.select_dtypes(include='period')) def test_select_dtypes_exclude_using_scalars(self): df = DataFrame({'a': list('abc'), 'b': list(range(1, 4)), 'c': np.arange(3, 
6).astype('u1'), 'd': np.arange(4.0, 7.0, dtype='float64'), 'e': [True, False, True], 'f': pd.Categorical(list('abc')), 'g': pd.date_range('20130101', periods=3), 'h': pd.date_range('20130101', periods=3, tz='US/Eastern'), 'i': pd.date_range('20130101', periods=3, tz='CET'), 'j': pd.period_range('2013-01', periods=3, freq='M'), 'k': pd.timedelta_range('1 day', periods=3)}) ri = df.select_dtypes(exclude=np.number) ei = df[['a', 'e', 'f', 'g', 'h', 'i', 'j']] assert_frame_equal(ri, ei) ri = df.select_dtypes(exclude='category') ei = df[['a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'j', 'k']] assert_frame_equal(ri, ei) pytest.raises(NotImplementedError, lambda: df.select_dtypes(exclude='period')) def test_select_dtypes_include_exclude_using_scalars(self): df = DataFrame({'a': list('abc'), 'b': list(range(1, 4)), 'c': np.arange(3, 6).astype('u1'), 'd': np.arange(4.0, 7.0, dtype='float64'), 'e': [True, False, True], 'f': pd.Categorical(list('abc')), 'g': pd.date_range('20130101', periods=3), 'h': pd.date_range('20130101', periods=3, tz='US/Eastern'), 'i': pd.date_range('20130101', periods=3, tz='CET'), 'j': pd.period_range('2013-01', periods=3, freq='M'), 'k': pd.timedelta_range('1 day', periods=3)}) ri = df.select_dtypes(include=np.number, exclude='floating') ei = df[['b', 'c', 'k']] assert_frame_equal(ri, ei) def test_select_dtypes_include_exclude_mixed_scalars_lists(self): df = DataFrame({'a': list('abc'), 'b': list(range(1, 4)), 'c': np.arange(3, 6).astype('u1'), 'd': np.arange(4.0, 7.0, dtype='float64'), 'e': [True, False, True], 'f': pd.Categorical(list('abc')), 'g': pd.date_range('20130101', periods=3), 'h': pd.date_range('20130101', periods=3, tz='US/Eastern'), 'i': pd.date_range('20130101', periods=3, tz='CET'), 'j': pd.period_range('2013-01', periods=3, freq='M'), 'k': pd.timedelta_range('1 day', periods=3)}) ri = df.select_dtypes(include=np.number, exclude=['floating', 'timedelta']) ei = df[['b', 'c']] assert_frame_equal(ri, ei) ri = 
df.select_dtypes(include=[np.number, 'category'], exclude='floating') ei = df[['b', 'c', 'f', 'k']] assert_frame_equal(ri, ei) def test_select_dtypes_duplicate_columns(self): # GH20839 odict = compat.OrderedDict df = DataFrame(odict([('a', list('abc')), ('b', list(range(1, 4))), ('c', np.arange(3, 6).astype('u1')), ('d', np.arange(4.0, 7.0, dtype='float64')), ('e', [True, False, True]), ('f', pd.date_range('now', periods=3).values)])) df.columns = ['a', 'a', 'b', 'b', 'b', 'c'] expected = DataFrame({'a': list(range(1, 4)), 'b': np.arange(3, 6).astype('u1')}) result = df.select_dtypes(include=[np.number], exclude=['floating']) assert_frame_equal(result, expected) def test_select_dtypes_not_an_attr_but_still_valid_dtype(self): df = DataFrame({'a': list('abc'), 'b': list(range(1, 4)), 'c': np.arange(3, 6).astype('u1'), 'd': np.arange(4.0, 7.0, dtype='float64'), 'e': [True, False, True], 'f': pd.date_range('now', periods=3).values}) df['g'] = df.f.diff() assert not hasattr(np, 'u8') r = df.select_dtypes(include=['i8', 'O'], exclude=['timedelta']) e = df[['a', 'b']] assert_frame_equal(r, e) r = df.select_dtypes(include=['i8', 'O', 'timedelta64[ns]']) e = df[['a', 'b', 'g']] assert_frame_equal(r, e) def test_select_dtypes_empty(self): df = DataFrame({'a': list('abc'), 'b': list(range(1, 4))}) with tm.assert_raises_regex(ValueError, 'at least one of ' 'include or exclude ' 'must be nonempty'): df.select_dtypes() def test_select_dtypes_bad_datetime64(self): df = DataFrame({'a': list('abc'), 'b': list(range(1, 4)), 'c': np.arange(3, 6).astype('u1'), 'd': np.arange(4.0, 7.0, dtype='float64'), 'e': [True, False, True], 'f': pd.date_range('now', periods=3).values}) with tm.assert_raises_regex(ValueError, '.+ is too specific'): df.select_dtypes(include=['datetime64[D]']) with tm.assert_raises_regex(ValueError, '.+ is too specific'): df.select_dtypes(exclude=['datetime64[as]']) def test_select_dtypes_datetime_with_tz(self): df2 = DataFrame(dict(A=Timestamp('20130102', 
tz='US/Eastern'), B=Timestamp('20130603', tz='CET')), index=range(5)) df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1) result = df3.select_dtypes(include=['datetime64[ns]']) expected = df3.reindex(columns=[]) assert_frame_equal(result, expected) @pytest.mark.parametrize( "dtype", [str, "str", np.string_, "S1", "unicode", np.unicode_, "U1"] + ([unicode] if PY2 else [])) @pytest.mark.parametrize("arg", ["include", "exclude"]) def test_select_dtypes_str_raises(self, dtype, arg): df = DataFrame({"a": list("abc"), "g": list(u("abc")), "b": list(range(1, 4)), "c": np.arange(3, 6).astype("u1"), "d": np.arange(4.0, 7.0, dtype="float64"), "e": [True, False, True], "f": pd.date_range("now", periods=3).values}) msg = "string dtypes are not allowed" kwargs = {arg: [dtype]} with tm.assert_raises_regex(TypeError, msg): df.select_dtypes(**kwargs) def test_select_dtypes_bad_arg_raises(self): df = DataFrame({'a': list('abc'), 'g': list(u('abc')), 'b': list(range(1, 4)), 'c': np.arange(3, 6).astype('u1'), 'd': np.arange(4.0, 7.0, dtype='float64'), 'e': [True, False, True], 'f': pd.date_range('now', periods=3).values}) with tm.assert_raises_regex(TypeError, 'data type.' 
'*not understood'): df.select_dtypes(['blargy, blarg, blarg']) def test_select_dtypes_typecodes(self): # GH 11990 df = mkdf(30, 3, data_gen_f=lambda x, y: np.random.random()) expected = df FLOAT_TYPES = list(np.typecodes['AllFloat']) assert_frame_equal(df.select_dtypes(FLOAT_TYPES), expected) def test_dtypes_gh8722(self): self.mixed_frame['bool'] = self.mixed_frame['A'] > 0 result = self.mixed_frame.dtypes expected = Series(dict((k, v.dtype) for k, v in compat.iteritems(self.mixed_frame)), index=result.index) assert_series_equal(result, expected) # compat, GH 8722 with option_context('use_inf_as_na', True): df = DataFrame([[1]]) result = df.dtypes assert_series_equal(result, Series({0: np.dtype('int64')})) def test_ftypes(self): frame = self.mixed_float expected = Series(dict(A='float32:dense', B='float32:dense', C='float16:dense', D='float64:dense')).sort_values() result = frame.ftypes.sort_values() assert_series_equal(result, expected) def test_astype(self): casted = self.frame.astype(int) expected = DataFrame(self.frame.values.astype(int), index=self.frame.index, columns=self.frame.columns) assert_frame_equal(casted, expected) casted = self.frame.astype(np.int32) expected = DataFrame(self.frame.values.astype(np.int32), index=self.frame.index, columns=self.frame.columns) assert_frame_equal(casted, expected) self.frame['foo'] = '5' casted = self.frame.astype(int) expected = DataFrame(self.frame.values.astype(int), index=self.frame.index, columns=self.frame.columns) assert_frame_equal(casted, expected) # mixed casting def _check_cast(df, v): assert (list(set(s.dtype.name for _, s in compat.iteritems(df)))[0] == v) mn = self.all_mixed._get_numeric_data().copy() mn['little_float'] = np.array(12345., dtype='float16') mn['big_float'] = np.array(123456789101112., dtype='float64') casted = mn.astype('float64') _check_cast(casted, 'float64') casted = mn.astype('int64') _check_cast(casted, 'int64') casted = self.mixed_float.reindex(columns=['A', 'B']).astype('float32') 
_check_cast(casted, 'float32') casted = mn.reindex(columns=['little_float']).astype('float16') _check_cast(casted, 'float16') casted = self.mixed_float.reindex(columns=['A', 'B']).astype('float16') _check_cast(casted, 'float16') casted = mn.astype('float32') _check_cast(casted, 'float32') casted = mn.astype('int32') _check_cast(casted, 'int32') # to object casted = mn.astype('O') _check_cast(casted, 'object') def test_astype_with_exclude_string(self): df = self.frame.copy() expected = self.frame.astype(int) df['string'] = 'foo' casted = df.astype(int, errors='ignore') expected['string'] = 'foo' assert_frame_equal(casted, expected) df = self.frame.copy() expected = self.frame.astype(np.int32) df['string'] = 'foo' casted = df.astype(np.int32, errors='ignore') expected['string'] = 'foo' assert_frame_equal(casted, expected) def test_astype_with_view(self): tf = self.mixed_float.reindex(columns=['A', 'B', 'C']) casted = tf.astype(np.int64) casted = tf.astype(np.float32) # this is the only real reason to do it this way tf = np.round(self.frame).astype(np.int32) casted = tf.astype(np.float32, copy=False) # TODO(wesm): verification? tf = self.frame.astype(np.float64) casted = tf.astype(np.int64, copy=False) # noqa @pytest.mark.parametrize("dtype", [np.int32, np.int64]) @pytest.mark.parametrize("val", [np.nan, np.inf]) def test_astype_cast_nan_inf_int(self, val, dtype): # see gh-14265 # # Check NaN and inf --> raise error when converting to int. 
msg = "Cannot convert non-finite values \\(NA or inf\\) to integer" df = DataFrame([val]) with tm.assert_raises_regex(ValueError, msg): df.astype(dtype) def test_astype_str(self, text_dtype): # see gh-9757 a = Series(date_range("2010-01-04", periods=5)) b = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern")) c = Series([Timedelta(x, unit="d") for x in range(5)]) d = Series(range(5)) e = Series([0.0, 0.2, 0.4, 0.6, 0.8]) df = DataFrame({"a": a, "b": b, "c": c, "d": d, "e": e}) # Datetime-like # Test str and unicode on Python 2.x and just str on Python 3.x result = df.astype(text_dtype) expected = DataFrame({ "a": list(map(text_dtype, map(lambda x: Timestamp(x)._date_repr, a._values))), "b": list(map(text_dtype, map(Timestamp, b._values))), "c": list(map(text_dtype, map(lambda x: Timedelta(x)._repr_base(format="all"), c._values))), "d": list(map(text_dtype, d._values)), "e": list(map(text_dtype, e._values)), }) assert_frame_equal(result, expected) def test_astype_str_float(self, text_dtype): # see gh-11302 result = DataFrame([np.NaN]).astype(text_dtype) expected = DataFrame(["nan"]) assert_frame_equal(result, expected) result = DataFrame([1.12345678901234567890]).astype(text_dtype) # < 1.14 truncates # >= 1.14 preserves the full repr val = ("1.12345678901" if _np_version_under1p14 else "1.1234567890123457") expected = DataFrame([val]) assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype_class", [dict, Series]) def test_astype_dict_like(self, dtype_class): # GH7271 & GH16717 a = Series(date_range('2010-01-04', periods=5)) b = Series(range(5)) c = Series([0.0, 0.2, 0.4, 0.6, 0.8]) d = Series(['1.0', '2', '3.14', '4', '5.4']) df = DataFrame({'a': a, 'b': b, 'c': c, 'd': d}) original = df.copy(deep=True) # change type of a subset of columns dt1 = dtype_class({'b': 'str', 'd': 'float32'}) result = df.astype(dt1) expected = DataFrame({ 'a': a, 'b': Series(['0', '1', '2', '3', '4']), 'c': c, 'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], 
dtype='float32')}) assert_frame_equal(result, expected) assert_frame_equal(df, original) dt2 = dtype_class({'b': np.float32, 'c': 'float32', 'd': np.float64}) result = df.astype(dt2) expected = DataFrame({ 'a': a, 'b': Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype='float32'), 'c': Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype='float32'), 'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float64')}) assert_frame_equal(result, expected) assert_frame_equal(df, original) # change all columns dt3 = dtype_class({'a': str, 'b': str, 'c': str, 'd': str}) assert_frame_equal(df.astype(dt3), df.astype(str)) assert_frame_equal(df, original) # error should be raised when using something other than column labels # in the keys of the dtype dict dt4 = dtype_class({'b': str, 2: str}) dt5 = dtype_class({'e': str}) pytest.raises(KeyError, df.astype, dt4) pytest.raises(KeyError, df.astype, dt5) assert_frame_equal(df, original) # if the dtypes provided are the same as the original dtypes, the # resulting DataFrame should be the same as the original DataFrame dt6 = dtype_class({col: df[col].dtype for col in df.columns}) equiv = df.astype(dt6) assert_frame_equal(df, equiv) assert_frame_equal(df, original) # GH 16717 # if dtypes provided is empty, the resulting DataFrame # should be the same as the original DataFrame dt7 = dtype_class({}) result = df.astype(dt7) assert_frame_equal(df, equiv) assert_frame_equal(df, original) def test_astype_duplicate_col(self): a1 = Series([1, 2, 3, 4, 5], name='a') b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name='b') a2 = Series([0, 1, 2, 3, 4], name='a') df = concat([a1, b, a2], axis=1) result = df.astype(str) a1_str = Series(['1', '2', '3', '4', '5'], dtype='str', name='a') b_str = Series(['0.1', '0.2', '0.4', '0.6', '0.8'], dtype=str, name='b') a2_str = Series(['0', '1', '2', '3', '4'], dtype='str', name='a') expected = concat([a1_str, b_str, a2_str], axis=1) assert_frame_equal(result, expected) result = df.astype({'a': 'str'}) expected = concat([a1_str, b, a2_str], 
axis=1) assert_frame_equal(result, expected) @pytest.mark.parametrize('dtype', [ 'category', CategoricalDtype(), CategoricalDtype(ordered=True), CategoricalDtype(ordered=False), CategoricalDtype(categories=list('abcdef')), CategoricalDtype(categories=list('edba'), ordered=False), CategoricalDtype(categories=list('edcb'), ordered=True)], ids=repr) def test_astype_categorical(self, dtype): # GH 18099 d = {'A': list('abbc'), 'B': list('bccd'), 'C': list('cdde')} df = DataFrame(d) result = df.astype(dtype) expected = DataFrame({k: Categorical(d[k], dtype=dtype) for k in d}) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("cls", [ pd.api.types.CategoricalDtype, pd.api.types.DatetimeTZDtype, pd.api.types.IntervalDtype ]) def test_astype_categoricaldtype_class_raises(self, cls): df = DataFrame({"A": ['a', 'a', 'b', 'c']}) xpr = "Expected an instance of {}".format(cls.__name__) with tm.assert_raises_regex(TypeError, xpr): df.astype({"A": cls}) with tm.assert_raises_regex(TypeError, xpr): df['A'].astype(cls) @pytest.mark.parametrize('dtype', [ {100: 'float64', 200: 'uint64'}, 'category', 'float64']) def test_astype_column_metadata(self, dtype): # GH 19920 columns = pd.UInt64Index([100, 200, 300], name='foo') df = DataFrame(np.arange(15).reshape(5, 3), columns=columns) df = df.astype(dtype) tm.assert_index_equal(df.columns, columns) @pytest.mark.parametrize("dtype", ["M8", "m8"]) @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D']) def test_astype_from_datetimelike_to_objectt(self, dtype, unit): # tests astype to object dtype # gh-19223 / gh-12425 dtype = "{}[{}]".format(dtype, unit) arr = np.array([[1, 2, 3]], dtype=dtype) df = DataFrame(arr) result = df.astype(object) assert (result.dtypes == object).all() if dtype.startswith('M8'): assert result.iloc[0, 0] == pd.to_datetime(1, unit=unit) else: assert result.iloc[0, 0] == pd.to_timedelta(1, unit=unit) @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64]) 
@pytest.mark.parametrize("dtype", ["M8", "m8"]) @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D']) def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit): # tests all units from numeric origination # gh-19223 / gh-12425 dtype = "{}[{}]".format(dtype, unit) arr = np.array([[1, 2, 3]], dtype=arr_dtype) df = DataFrame(arr) result = df.astype(dtype) expected = DataFrame(arr.astype(dtype)) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D']) def test_astype_to_datetime_unit(self, unit): # tests all units from datetime origination # gh-19223 dtype = "M8[{}]".format(unit) arr = np.array([[1, 2, 3]], dtype=dtype) df = DataFrame(arr) result = df.astype(dtype) expected = DataFrame(arr.astype(dtype)) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("unit", ['ns']) def test_astype_to_timedelta_unit_ns(self, unit): # preserver the timedelta conversion # gh-19223 dtype = "m8[{}]".format(unit) arr = np.array([[1, 2, 3]], dtype=dtype) df = DataFrame(arr) result = df.astype(dtype) expected = DataFrame(arr.astype(dtype)) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("unit", ['us', 'ms', 's', 'h', 'm', 'D']) def test_astype_to_timedelta_unit(self, unit): # coerce to float # gh-19223 dtype = "m8[{}]".format(unit) arr = np.array([[1, 2, 3]], dtype=dtype) df = DataFrame(arr) result = df.astype(dtype) expected = DataFrame(df.values.astype(dtype).astype(float)) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D']) def test_astype_to_incorrect_datetimelike(self, unit): # trying to astype a m to a M, or vice-versa # gh-19224 dtype = "M8[{}]".format(unit) other = "m8[{}]".format(unit) df = DataFrame(np.array([[1, 2, 3]], dtype=dtype)) with pytest.raises(TypeError): df.astype(other) df = DataFrame(np.array([[1, 2, 3]], dtype=other)) with pytest.raises(TypeError): df.astype(dtype) def test_timedeltas(self): 
df = DataFrame(dict(A=Series(date_range('2012-1-1', periods=3, freq='D')), B=Series([timedelta(days=i) for i in range(3)]))) result = df.get_dtype_counts().sort_index() expected = Series( {'datetime64[ns]': 1, 'timedelta64[ns]': 1}).sort_index() assert_series_equal(result, expected) df['C'] = df['A'] + df['B'] expected = Series( {'datetime64[ns]': 2, 'timedelta64[ns]': 1}).sort_values() result = df.get_dtype_counts().sort_values() assert_series_equal(result, expected) # mixed int types df['D'] = 1 expected = Series({'datetime64[ns]': 2, 'timedelta64[ns]': 1, 'int64': 1}).sort_values() result = df.get_dtype_counts().sort_values() assert_series_equal(result, expected) def test_arg_for_errors_in_astype(self): # issue #14878 df = DataFrame([1, 2, 3]) with pytest.raises(ValueError): df.astype(np.float64, errors=True) with tm.assert_produces_warning(FutureWarning): df.astype(np.int8, raise_on_error=False) df.astype(np.int8, errors='ignore') @pytest.mark.parametrize('input_vals', [ ([1, 2]), (['1', '2']), (list(pd.date_range('1/1/2011', periods=2, freq='H'))), (list(pd.date_range('1/1/2011', periods=2, freq='H', tz='US/Eastern'))), ([pd.Interval(left=0, right=5)]), ]) def test_constructor_list_str(self, input_vals, string_dtype): # GH 16605 # Ensure that data elements are converted to strings when # dtype is str, 'str', or 'U' result = DataFrame({'A': input_vals}, dtype=string_dtype) expected = DataFrame({'A': input_vals}).astype({'A': string_dtype}) assert_frame_equal(result, expected) def test_constructor_list_str_na(self, string_dtype): result = DataFrame({"A": [1.0, 2.0, None]}, dtype=string_dtype) expected = DataFrame({"A": ['1.0', '2.0', None]}, dtype=object) assert_frame_equal(result, expected)
result = hash_tuple(tup) expected = hash_tuples([tup])[0] assert result == expected @pytest.mark.parametrize("val", [ 1, 1.4, "A", b"A", u"A", pd.Timestamp("2012-01-01"), pd.Timestamp("2012-01-01", tz="Europe/Brussels"), datetime.datetime(2012, 1, 1), pd.Timestamp("2012-01-01", tz="EST").to_pydatetime(), pd.Timedelta("1 days"), datetime.timedelta(1), pd.Period("2012-01-01", freq="D"), pd.Interval(0, 1), np.nan, pd.NaT, None ]) def test_hash_scalar(val): result = _hash_scalar(val) expected = hash_array(np.array([val], dtype=object), categorize=True) assert result[0] == expected[0] @pytest.mark.parametrize("val", [5, "foo", pd.Timestamp("20130101")]) def test_hash_tuples_err(val): msg = "must be convertible to a list-of-tuples" with pytest.raises(TypeError, match=msg): hash_tuples(val)
class TestDataFrameDataTypes:
    """Tests for DataFrame dtypes, ``select_dtypes`` and ``astype``.

    The deprecated/removed NumPy aliases (``np.object``, ``np.bool``,
    ``np.float_``, ``np.string_``, ``np.unicode_``, ``np.NaN``) are spelled
    with the objects they alias, so the expectations are unchanged.
    """

    def test_concat_empty_dataframe_dtypes(self):
        """Concatenating empty frames keeps or upcasts column dtypes."""
        df = DataFrame(columns=list("abc"))
        df["a"] = df["a"].astype(np.bool_)
        df["b"] = df["b"].astype(np.int32)
        df["c"] = df["c"].astype(np.float64)

        result = pd.concat([df, df])
        assert result["a"].dtype == np.bool_
        assert result["b"].dtype == np.int32
        assert result["c"].dtype == np.float64

        result = pd.concat([df, df.astype(np.float64)])
        assert result["a"].dtype == np.object_
        assert result["b"].dtype == np.float64
        assert result["c"].dtype == np.float64

    def test_empty_frame_dtypes_ftypes(self):
        """``dtypes``/``ftypes`` of empty, column-less and row-less frames."""
        empty_df = pd.DataFrame()
        assert_series_equal(empty_df.dtypes, pd.Series(dtype=object))

        # GH 26705 - Assert .ftypes is deprecated
        # ``object`` (not ``np.object``) so no extra warning is emitted
        # inside the warning-checking context.
        with tm.assert_produces_warning(FutureWarning):
            assert_series_equal(empty_df.ftypes, pd.Series(dtype=object))

        nocols_df = pd.DataFrame(index=[1, 2, 3])
        assert_series_equal(nocols_df.dtypes, pd.Series(dtype=object))

        # GH 26705 - Assert .ftypes is deprecated
        with tm.assert_produces_warning(FutureWarning):
            assert_series_equal(nocols_df.ftypes, pd.Series(dtype=object))

        norows_df = pd.DataFrame(columns=list("abc"))
        assert_series_equal(norows_df.dtypes, pd.Series(object, index=list("abc")))

        # GH 26705 - Assert .ftypes is deprecated
        with tm.assert_produces_warning(FutureWarning):
            assert_series_equal(
                norows_df.ftypes, pd.Series("object:dense", index=list("abc"))
            )

        norows_int_df = pd.DataFrame(columns=list("abc")).astype(np.int32)
        assert_series_equal(
            norows_int_df.dtypes, pd.Series(np.dtype("int32"), index=list("abc"))
        )
        # GH 26705 - Assert .ftypes is deprecated
        with tm.assert_produces_warning(FutureWarning):
            assert_series_equal(
                norows_int_df.ftypes, pd.Series("int32:dense", index=list("abc"))
            )

        odict = OrderedDict
        df = pd.DataFrame(odict([("a", 1), ("b", True), ("c", 1.0)]), index=[1, 2, 3])
        ex_dtypes = pd.Series(
            odict([("a", np.int64), ("b", bool), ("c", np.float64)])
        )
        ex_ftypes = pd.Series(
            odict([("a", "int64:dense"), ("b", "bool:dense"), ("c", "float64:dense")])
        )
        assert_series_equal(df.dtypes, ex_dtypes)

        # GH 26705 - Assert .ftypes is deprecated
        with tm.assert_produces_warning(FutureWarning):
            assert_series_equal(df.ftypes, ex_ftypes)

        # same but for empty slice of df
        assert_series_equal(df[:0].dtypes, ex_dtypes)

        # GH 26705 - Assert .ftypes is deprecated
        with tm.assert_produces_warning(FutureWarning):
            assert_series_equal(df[:0].ftypes, ex_ftypes)

    def test_datetime_with_tz_dtypes(self):
        """tz-naive and tz-aware datetime columns report their own dtypes."""
        tzframe = DataFrame(
            {
                "A": date_range("20130101", periods=3),
                "B": date_range("20130101", periods=3, tz="US/Eastern"),
                "C": date_range("20130101", periods=3, tz="CET"),
            }
        )
        tzframe.iloc[1, 1] = pd.NaT
        tzframe.iloc[1, 2] = pd.NaT
        result = tzframe.dtypes.sort_index()
        expected = Series(
            [
                np.dtype("datetime64[ns]"),
                DatetimeTZDtype("ns", "US/Eastern"),
                DatetimeTZDtype("ns", "CET"),
            ],
            ["A", "B", "C"],
        )

        assert_series_equal(result, expected)

    def test_dtypes_are_correct_after_column_slice(self):
        """``dtypes`` is unchanged after taking a column slice."""
        # GH6525
        df = pd.DataFrame(index=range(5), columns=list("abc"), dtype=np.float64)
        odict = OrderedDict
        assert_series_equal(
            df.dtypes,
            pd.Series(odict([("a", np.float64), ("b", np.float64), ("c", np.float64)])),
        )
        assert_series_equal(df.iloc[:, 2:].dtypes, pd.Series(odict([("c", np.float64)])))
        assert_series_equal(
            df.dtypes,
            pd.Series(odict([("a", np.float64), ("b", np.float64), ("c", np.float64)])),
        )

    def test_select_dtypes_include_using_list_like(self):
        """``select_dtypes`` with list-like ``include``/``exclude``."""
        df = DataFrame(
            {
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.Categorical(list("abc")),
                "g": pd.date_range("20130101", periods=3),
                "h": pd.date_range("20130101", periods=3, tz="US/Eastern"),
                "i": pd.date_range("20130101", periods=3, tz="CET"),
                "j": pd.period_range("2013-01", periods=3, freq="M"),
                "k": pd.timedelta_range("1 day", periods=3),
            }
        )

        ri = df.select_dtypes(include=[np.number])
        ei = df[["b", "c", "d", "k"]]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=[np.number], exclude=["timedelta"])
        ei = df[["b", "c", "d"]]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=[np.number, "category"], exclude=["timedelta"])
        ei = df[["b", "c", "d", "f"]]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=["datetime"])
        ei = df[["g"]]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=["datetime64"])
        ei = df[["g"]]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=["datetimetz"])
        ei = df[["h", "i"]]
        assert_frame_equal(ri, ei)

        with pytest.raises(NotImplementedError, match=r"^$"):
            df.select_dtypes(include=["period"])

    def test_select_dtypes_exclude_using_list_like(self):
        """``select_dtypes`` with a list-like ``exclude`` only."""
        df = DataFrame(
            {
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
            }
        )
        re = df.select_dtypes(exclude=[np.number])
        ee = df[["a", "e"]]
        assert_frame_equal(re, ee)

    def test_select_dtypes_exclude_include_using_list_like(self):
        """``select_dtypes`` with tuple ``include`` and ``exclude`` together."""
        df = DataFrame(
            {
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.date_range("now", periods=3).values,
            }
        )
        exclude = (np.datetime64,)
        include = np.bool_, "integer"
        r = df.select_dtypes(include=include, exclude=exclude)
        e = df[["b", "c", "e"]]
        assert_frame_equal(r, e)

        exclude = ("datetime",)
        include = "bool", "int64", "int32"
        r = df.select_dtypes(include=include, exclude=exclude)
        e = df[["b", "e"]]
        assert_frame_equal(r, e)

    def test_select_dtypes_include_using_scalars(self):
        """``select_dtypes`` accepts a scalar ``include``."""
        df = DataFrame(
            {
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.Categorical(list("abc")),
                "g": pd.date_range("20130101", periods=3),
                "h": pd.date_range("20130101", periods=3, tz="US/Eastern"),
                "i": pd.date_range("20130101", periods=3, tz="CET"),
                "j": pd.period_range("2013-01", periods=3, freq="M"),
                "k": pd.timedelta_range("1 day", periods=3),
            }
        )

        ri = df.select_dtypes(include=np.number)
        ei = df[["b", "c", "d", "k"]]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include="datetime")
        ei = df[["g"]]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include="datetime64")
        ei = df[["g"]]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include="category")
        ei = df[["f"]]
        assert_frame_equal(ri, ei)

        with pytest.raises(NotImplementedError, match=r"^$"):
            df.select_dtypes(include="period")

    def test_select_dtypes_exclude_using_scalars(self):
        """``select_dtypes`` accepts a scalar ``exclude``."""
        df = DataFrame(
            {
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.Categorical(list("abc")),
                "g": pd.date_range("20130101", periods=3),
                "h": pd.date_range("20130101", periods=3, tz="US/Eastern"),
                "i": pd.date_range("20130101", periods=3, tz="CET"),
                "j": pd.period_range("2013-01", periods=3, freq="M"),
                "k": pd.timedelta_range("1 day", periods=3),
            }
        )

        ri = df.select_dtypes(exclude=np.number)
        ei = df[["a", "e", "f", "g", "h", "i", "j"]]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(exclude="category")
        ei = df[["a", "b", "c", "d", "e", "g", "h", "i", "j", "k"]]
        assert_frame_equal(ri, ei)

        with pytest.raises(NotImplementedError, match=r"^$"):
            df.select_dtypes(exclude="period")

    def test_select_dtypes_include_exclude_using_scalars(self):
        """``select_dtypes`` with scalar ``include`` and scalar ``exclude``."""
        df = DataFrame(
            {
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.Categorical(list("abc")),
                "g": pd.date_range("20130101", periods=3),
                "h": pd.date_range("20130101", periods=3, tz="US/Eastern"),
                "i": pd.date_range("20130101", periods=3, tz="CET"),
                "j": pd.period_range("2013-01", periods=3, freq="M"),
                "k": pd.timedelta_range("1 day", periods=3),
            }
        )

        ri = df.select_dtypes(include=np.number, exclude="floating")
        ei = df[["b", "c", "k"]]
        assert_frame_equal(ri, ei)

    def test_select_dtypes_include_exclude_mixed_scalars_lists(self):
        """``select_dtypes`` mixing scalar and list arguments."""
        df = DataFrame(
            {
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.Categorical(list("abc")),
                "g": pd.date_range("20130101", periods=3),
                "h": pd.date_range("20130101", periods=3, tz="US/Eastern"),
                "i": pd.date_range("20130101", periods=3, tz="CET"),
                "j": pd.period_range("2013-01", periods=3, freq="M"),
                "k": pd.timedelta_range("1 day", periods=3),
            }
        )

        ri = df.select_dtypes(include=np.number, exclude=["floating", "timedelta"])
        ei = df[["b", "c"]]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=[np.number, "category"], exclude="floating")
        ei = df[["b", "c", "f", "k"]]
        assert_frame_equal(ri, ei)

    def test_select_dtypes_duplicate_columns(self):
        """``select_dtypes`` works when column labels are duplicated."""
        # GH20839
        odict = OrderedDict
        df = DataFrame(
            odict(
                [
                    ("a", list("abc")),
                    ("b", list(range(1, 4))),
                    ("c", np.arange(3, 6).astype("u1")),
                    ("d", np.arange(4.0, 7.0, dtype="float64")),
                    ("e", [True, False, True]),
                    ("f", pd.date_range("now", periods=3).values),
                ]
            )
        )
        df.columns = ["a", "a", "b", "b", "b", "c"]

        expected = DataFrame(
            {"a": list(range(1, 4)), "b": np.arange(3, 6).astype("u1")}
        )

        result = df.select_dtypes(include=[np.number], exclude=["floating"])
        assert_frame_equal(result, expected)

    def test_select_dtypes_not_an_attr_but_still_valid_dtype(self):
        """dtype strings that are not ``np`` attributes are still accepted."""
        df = DataFrame(
            {
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.date_range("now", periods=3).values,
            }
        )
        df["g"] = df.f.diff()
        assert not hasattr(np, "u8")
        r = df.select_dtypes(include=["i8", "O"], exclude=["timedelta"])
        e = df[["a", "b"]]
        assert_frame_equal(r, e)

        r = df.select_dtypes(include=["i8", "O", "timedelta64[ns]"])
        e = df[["a", "b", "g"]]
        assert_frame_equal(r, e)

    def test_select_dtypes_empty(self):
        """``select_dtypes`` with no arguments raises ValueError."""
        df = DataFrame({"a": list("abc"), "b": list(range(1, 4))})
        msg = "at least one of include or exclude must be nonempty"
        with pytest.raises(ValueError, match=msg):
            df.select_dtypes()

    def test_select_dtypes_bad_datetime64(self):
        """Unit-specific datetime64 selectors are rejected as too specific."""
        df = DataFrame(
            {
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.date_range("now", periods=3).values,
            }
        )
        with pytest.raises(ValueError, match=".+ is too specific"):
            df.select_dtypes(include=["datetime64[D]"])

        with pytest.raises(ValueError, match=".+ is too specific"):
            df.select_dtypes(exclude=["datetime64[as]"])

    def test_select_dtypes_datetime_with_tz(self):
        """tz-aware columns are not selected by ``datetime64[ns]``."""
        df2 = DataFrame(
            dict(
                A=Timestamp("20130102", tz="US/Eastern"),
                B=Timestamp("20130603", tz="CET"),
            ),
            index=range(5),
        )
        df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1)
        result = df3.select_dtypes(include=["datetime64[ns]"])
        expected = df3.reindex(columns=[])
        assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "dtype", [str, "str", np.bytes_, "S1", "unicode", np.str_, "U1"]
    )
    @pytest.mark.parametrize("arg", ["include", "exclude"])
    def test_select_dtypes_str_raises(self, dtype, arg):
        """String dtypes in ``include``/``exclude`` raise TypeError."""
        df = DataFrame(
            {
                "a": list("abc"),
                "g": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.date_range("now", periods=3).values,
            }
        )
        msg = "string dtypes are not allowed"
        kwargs = {arg: [dtype]}

        with pytest.raises(TypeError, match=msg):
            df.select_dtypes(**kwargs)

    def test_select_dtypes_bad_arg_raises(self):
        """An unparseable dtype string raises TypeError."""
        df = DataFrame(
            {
                "a": list("abc"),
                "g": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.date_range("now", periods=3).values,
            }
        )

        msg = "data type.*not understood"
        with pytest.raises(TypeError, match=msg):
            df.select_dtypes(["blargy, blarg, blarg"])

    def test_select_dtypes_typecodes(self):
        """``select_dtypes`` accepts NumPy float typecodes."""
        # GH 11990
        df = mkdf(30, 3, data_gen_f=lambda x, y: np.random.random())
        expected = df
        FLOAT_TYPES = list(np.typecodes["AllFloat"])
        assert_frame_equal(df.select_dtypes(FLOAT_TYPES), expected)

    def test_dtypes_gh8722(self, float_string_frame):
        """``dtypes`` matches per-column dtypes, also with use_inf_as_na."""
        float_string_frame["bool"] = float_string_frame["A"] > 0
        result = float_string_frame.dtypes
        expected = Series(
            {k: v.dtype for k, v in float_string_frame.items()}, index=result.index
        )
        assert_series_equal(result, expected)

        # compat, GH 8722
        with option_context("use_inf_as_na", True):
            df = DataFrame([[1]])
            result = df.dtypes
            assert_series_equal(result, Series({0: np.dtype("int64")}))

    def test_ftypes(self, mixed_float_frame):
        """``ftypes`` warns and reports ``<dtype>:dense`` per column."""
        frame = mixed_float_frame
        expected = Series(
            dict(
                A="float32:dense",
                B="float32:dense",
                C="float16:dense",
                D="float64:dense",
            )
        ).sort_values()

        # GH 26705 - Assert .ftypes is deprecated
        with tm.assert_produces_warning(FutureWarning):
            result = frame.ftypes.sort_values()
        assert_series_equal(result, expected)

    def test_astype_float(self, float_frame):
        """``astype`` to int matches casting the underlying values."""
        casted = float_frame.astype(int)
        expected = DataFrame(
            float_frame.values.astype(int),
            index=float_frame.index,
            columns=float_frame.columns,
        )
        assert_frame_equal(casted, expected)

        casted = float_frame.astype(np.int32)
        expected = DataFrame(
            float_frame.values.astype(np.int32),
            index=float_frame.index,
            columns=float_frame.columns,
        )
        assert_frame_equal(casted, expected)

        float_frame["foo"] = "5"
        casted = float_frame.astype(int)
        expected = DataFrame(
            float_frame.values.astype(int),
            index=float_frame.index,
            columns=float_frame.columns,
        )
        assert_frame_equal(casted, expected)

    def test_astype_mixed_float(self, mixed_float_frame):
        """Casting mixed float columns gives one float dtype."""
        # mixed casting
        casted = mixed_float_frame.reindex(columns=["A", "B"]).astype("float32")
        _check_cast(casted, "float32")

        casted = mixed_float_frame.reindex(columns=["A", "B"]).astype("float16")
        _check_cast(casted, "float16")

    def test_astype_mixed_type(self, mixed_type_frame):
        """Casting mixed numeric columns gives the requested dtype."""
        # mixed casting
        mn = mixed_type_frame._get_numeric_data().copy()
        mn["little_float"] = np.array(12345.0, dtype="float16")
        mn["big_float"] = np.array(123456789101112.0, dtype="float64")

        casted = mn.astype("float64")
        _check_cast(casted, "float64")

        casted = mn.astype("int64")
        _check_cast(casted, "int64")

        casted = mn.reindex(columns=["little_float"]).astype("float16")
        _check_cast(casted, "float16")

        casted = mn.astype("float32")
        _check_cast(casted, "float32")

        casted = mn.astype("int32")
        _check_cast(casted, "int32")

        # to object
        casted = mn.astype("O")
        _check_cast(casted, "object")

    def test_astype_with_exclude_string(self, float_frame):
        """``astype(..., errors="ignore")`` leaves a string column untouched."""
        df = float_frame.copy()
        expected = float_frame.astype(int)
        df["string"] = "foo"
        casted = df.astype(int, errors="ignore")

        expected["string"] = "foo"
        assert_frame_equal(casted, expected)

        df = float_frame.copy()
        expected = float_frame.astype(np.int32)
        df["string"] = "foo"
        casted = df.astype(np.int32, errors="ignore")

        expected["string"] = "foo"
        assert_frame_equal(casted, expected)

    def test_astype_with_view_float(self, float_frame):
        """``astype(..., copy=False)`` returns the requested dtype."""
        # this is the only real reason to do it this way
        tf = np.round(float_frame).astype(np.int32)
        casted = tf.astype(np.float32, copy=False)
        _check_cast(casted, "float32")

        # Minimal verification: the requested dtype must be what comes back.
        tf = float_frame.astype(np.float64)
        casted = tf.astype(np.int64, copy=False)
        _check_cast(casted, "int64")

    def test_astype_with_view_mixed_float(self, mixed_float_frame):
        """Casting mixed float columns to int64 or float32 gives that dtype."""
        tf = mixed_float_frame.reindex(columns=["A", "B", "C"])

        casted = tf.astype(np.int64)
        _check_cast(casted, "int64")

        casted = tf.astype(np.float32)
        _check_cast(casted, "float32")

    @pytest.mark.parametrize("dtype", [np.int32, np.int64])
    @pytest.mark.parametrize("val", [np.nan, np.inf])
    def test_astype_cast_nan_inf_int(self, val, dtype):
        """Casting NaN or inf to an integer dtype raises ValueError."""
        # see gh-14265
        #
        # Check NaN and inf --> raise error when converting to int.
        msg = "Cannot convert non-finite values \\(NA or inf\\) to integer"
        df = DataFrame([val])

        with pytest.raises(ValueError, match=msg):
            df.astype(dtype)

    def test_astype_str(self):
        """``astype(str)`` stringifies datetime, timedelta and numeric columns."""
        # see gh-9757
        a = Series(date_range("2010-01-04", periods=5))
        b = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern"))
        c = Series([Timedelta(x, unit="d") for x in range(5)])
        d = Series(range(5))
        e = Series([0.0, 0.2, 0.4, 0.6, 0.8])

        df = DataFrame({"a": a, "b": b, "c": c, "d": d, "e": e})

        # Datetime-like
        result = df.astype(str)

        expected = DataFrame(
            {
                "a": list(map(str, map(lambda x: Timestamp(x)._date_repr, a._values))),
                "b": list(map(str, map(Timestamp, b._values))),
                "c": list(
                    map(
                        str,
                        map(lambda x: Timedelta(x)._repr_base(format="all"), c._values),
                    )
                ),
                "d": list(map(str, d._values)),
                "e": list(map(str, e._values)),
            }
        )

        assert_frame_equal(result, expected)

    def test_astype_str_float(self):
        """``astype(str)`` on floats yields "nan" and the float's repr."""
        # see gh-11302
        result = DataFrame([np.nan]).astype(str)
        expected = DataFrame(["nan"])

        assert_frame_equal(result, expected)
        result = DataFrame([1.12345678901234567890]).astype(str)

        # < 1.14 truncates
        # >= 1.14 preserves the full repr
        val = "1.12345678901" if _np_version_under1p14 else "1.1234567890123457"
        expected = DataFrame([val])
        assert_frame_equal(result, expected)

    @pytest.mark.parametrize("dtype_class", [dict, Series])
    def test_astype_dict_like(self, dtype_class):
        """``astype`` accepts a dict or Series mapping column labels to dtypes."""
        # GH7271 & GH16717
        a = Series(date_range("2010-01-04", periods=5))
        b = Series(range(5))
        c = Series([0.0, 0.2, 0.4, 0.6, 0.8])
        d = Series(["1.0", "2", "3.14", "4", "5.4"])
        df = DataFrame({"a": a, "b": b, "c": c, "d": d})
        original = df.copy(deep=True)

        # change type of a subset of columns
        dt1 = dtype_class({"b": "str", "d": "float32"})
        result = df.astype(dt1)
        expected = DataFrame(
            {
                "a": a,
                "b": Series(["0", "1", "2", "3", "4"]),
                "c": c,
                "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"),
            }
        )
        assert_frame_equal(result, expected)
        assert_frame_equal(df, original)

        dt2 = dtype_class({"b": np.float32, "c": "float32", "d": np.float64})
        result = df.astype(dt2)
        expected = DataFrame(
            {
                "a": a,
                "b": Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype="float32"),
                "c": Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype="float32"),
                "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float64"),
            }
        )
        assert_frame_equal(result, expected)
        assert_frame_equal(df, original)

        # change all columns
        dt3 = dtype_class({"a": str, "b": str, "c": str, "d": str})
        assert_frame_equal(df.astype(dt3), df.astype(str))
        assert_frame_equal(df, original)

        # error should be raised when using something other than column labels
        # in the keys of the dtype dict
        dt4 = dtype_class({"b": str, 2: str})
        dt5 = dtype_class({"e": str})
        msg = (
            "Only a column name can be used for the key in a dtype mappings"
            " argument"
        )
        with pytest.raises(KeyError, match=msg):
            df.astype(dt4)
        with pytest.raises(KeyError, match=msg):
            df.astype(dt5)
        assert_frame_equal(df, original)

        # if the dtypes provided are the same as the original dtypes, the
        # resulting DataFrame should be the same as the original DataFrame
        dt6 = dtype_class({col: df[col].dtype for col in df.columns})
        equiv = df.astype(dt6)
        assert_frame_equal(df, equiv)
        assert_frame_equal(df, original)

        # GH 16717
        # if dtypes provided is empty, the resulting DataFrame
        # should be the same as the original DataFrame
        dt7 = dtype_class({})
        result = df.astype(dt7)
        # Compare the frame produced from the empty mapping; previously the
        # stale ``equiv`` was re-checked and ``result`` was never used.
        assert_frame_equal(df, result)
        assert_frame_equal(df, original)

    def test_astype_duplicate_col(self):
        """``astype`` works on frames with duplicate column labels."""
        a1 = Series([1, 2, 3, 4, 5], name="a")
        b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name="b")
        a2 = Series([0, 1, 2, 3, 4], name="a")
        df = concat([a1, b, a2], axis=1)

        result = df.astype(str)
        a1_str = Series(["1", "2", "3", "4", "5"], dtype="str", name="a")
        b_str = Series(["0.1", "0.2", "0.4", "0.6", "0.8"], dtype=str, name="b")
        a2_str = Series(["0", "1", "2", "3", "4"], dtype="str", name="a")
        expected = concat([a1_str, b_str, a2_str], axis=1)
        assert_frame_equal(result, expected)

        result = df.astype({"a": "str"})
        expected = concat([a1_str, b, a2_str], axis=1)
        assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "dtype",
        [
            "category",
            CategoricalDtype(),
            CategoricalDtype(ordered=True),
            CategoricalDtype(ordered=False),
            CategoricalDtype(categories=list("abcdef")),
            CategoricalDtype(categories=list("edba"), ordered=False),
            CategoricalDtype(categories=list("edcb"), ordered=True),
        ],
        ids=repr,
    )
    def test_astype_categorical(self, dtype):
        """``astype`` to a categorical dtype converts every column."""
        # GH 18099
        d = {"A": list("abbc"), "B": list("bccd"), "C": list("cdde")}
        df = DataFrame(d)
        result = df.astype(dtype)
        expected = DataFrame({k: Categorical(d[k], dtype=dtype) for k in d})
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "cls",
        [
            pd.api.types.CategoricalDtype,
            pd.api.types.DatetimeTZDtype,
            pd.api.types.IntervalDtype,
        ],
    )
    def test_astype_categoricaldtype_class_raises(self, cls):
        """Passing a dtype class instead of an instance raises TypeError."""
        df = DataFrame({"A": ["a", "a", "b", "c"]})
        xpr = "Expected an instance of {}".format(cls.__name__)
        with pytest.raises(TypeError, match=xpr):
            df.astype({"A": cls})

        with pytest.raises(TypeError, match=xpr):
            df["A"].astype(cls)

    @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"])
    def test_astype_extension_dtypes(self, dtype):
        """Float/int frames cast to extension integer dtypes and back."""
        # GH 22578
        df = pd.DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], columns=["a", "b"])

        expected1 = pd.DataFrame(
            {
                "a": integer_array([1, 3, 5], dtype=dtype),
                "b": integer_array([2, 4, 6], dtype=dtype),
            }
        )
        tm.assert_frame_equal(df.astype(dtype), expected1)
        tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1)
        tm.assert_frame_equal(df.astype(dtype).astype("float64"), df)

        df = pd.DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], columns=["a", "b"])
        df["b"] = df["b"].astype(dtype)
        expected2 = pd.DataFrame(
            {"a": [1.0, 3.0, 5.0], "b": integer_array([2, 4, 6], dtype=dtype)}
        )
        tm.assert_frame_equal(df, expected2)

        tm.assert_frame_equal(df.astype(dtype), expected1)
        tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1)

    @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"])
    def test_astype_extension_dtypes_1d(self, dtype):
        """Single-column variant of the extension integer dtype casts."""
        # GH 22578
        df = pd.DataFrame({"a": [1.0, 2.0, 3.0]})

        expected1 = pd.DataFrame({"a": integer_array([1, 2, 3], dtype=dtype)})
        tm.assert_frame_equal(df.astype(dtype), expected1)
        tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1)

        df = pd.DataFrame({"a": [1.0, 2.0, 3.0]})
        df["a"] = df["a"].astype(dtype)
        expected2 = pd.DataFrame({"a": integer_array([1, 2, 3], dtype=dtype)})
        tm.assert_frame_equal(df, expected2)

        tm.assert_frame_equal(df.astype(dtype), expected1)
        tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1)

    @pytest.mark.parametrize("dtype", ["category", "Int64"])
    def test_astype_extension_dtypes_duplicate_col(self, dtype):
        """Extension-dtype ``astype`` with duplicate column labels."""
        # GH 24704
        a1 = Series([0, np.nan, 4], name="a")
        a2 = Series([np.nan, 3, 5], name="a")
        df = concat([a1, a2], axis=1)

        result = df.astype(dtype)
        expected = concat([a1.astype(dtype), a2.astype(dtype)], axis=1)
        assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "dtype", [{100: "float64", 200: "uint64"}, "category", "float64"]
    )
    def test_astype_column_metadata(self, dtype):
        """``astype`` leaves the columns index (dtype and name) unchanged."""
        # GH 19920
        columns = pd.UInt64Index([100, 200, 300], name="foo")
        df = DataFrame(np.arange(15).reshape(5, 3), columns=columns)
        df = df.astype(dtype)
        tm.assert_index_equal(df.columns, columns)

    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_from_datetimelike_to_objectt(self, dtype, unit):
        """``astype(object)`` on datetime/timedelta data yields object columns."""
        # tests astype to object dtype
        # gh-19223 / gh-12425
        dtype = "{}[{}]".format(dtype, unit)
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(object)
        assert (result.dtypes == object).all()

        if dtype.startswith("M8"):
            assert result.iloc[0, 0] == pd.to_datetime(1, unit=unit)
        else:
            assert result.iloc[0, 0] == pd.to_timedelta(1, unit=unit)

    @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64])
    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit):
        """Numeric data cast to a datetimelike unit matches the NumPy cast."""
        # tests all units from numeric origination
        # gh-19223 / gh-12425
        dtype = "{}[{}]".format(dtype, unit)
        arr = np.array([[1, 2, 3]], dtype=arr_dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(arr.astype(dtype))

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_to_datetime_unit(self, unit):
        """datetime64 data cast to its own unit matches the NumPy cast."""
        # tests all units from datetime origination
        # gh-19223
        dtype = "M8[{}]".format(unit)
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(arr.astype(dtype))

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ["ns"])
    def test_astype_to_timedelta_unit_ns(self, unit):
        """timedelta64[ns] data cast to m8[ns] matches the NumPy cast."""
        # preserve the timedelta conversion
        # gh-19223
        dtype = "m8[{}]".format(unit)
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(arr.astype(dtype))

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ["us", "ms", "s", "h", "m", "D"])
    def test_astype_to_timedelta_unit(self, unit):
        """Casting to a non-ns timedelta unit gives float values."""
        # coerce to float
        # gh-19223
        dtype = "m8[{}]".format(unit)
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(df.values.astype(dtype).astype(float))

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_to_incorrect_datetimelike(self, unit):
        """Casting datetime64 to timedelta64 (or back) raises TypeError."""
        # trying to astype a m to a M, or vice-versa
        # gh-19224
        dtype = "M8[{}]".format(unit)
        other = "m8[{}]".format(unit)

        df = DataFrame(np.array([[1, 2, 3]], dtype=dtype))
        msg = (
            r"cannot astype a datetimelike from \[datetime64\[ns\]\] to"
            r" \[timedelta64\[{}\]\]"
        ).format(unit)
        with pytest.raises(TypeError, match=msg):
            df.astype(other)

        msg = (
            r"cannot astype a timedelta from \[timedelta64\[ns\]\] to"
            r" \[datetime64\[{}\]\]"
        ).format(unit)
        df = DataFrame(np.array([[1, 2, 3]], dtype=other))
        with pytest.raises(TypeError, match=msg):
            df.astype(dtype)

    def test_timedeltas(self):
        """``dtypes`` reports datetime, timedelta and int columns correctly."""
        df = DataFrame(
            dict(
                A=Series(date_range("2012-1-1", periods=3, freq="D")),
                B=Series([timedelta(days=i) for i in range(3)]),
            )
        )
        result = df.dtypes
        expected = Series(
            [np.dtype("datetime64[ns]"), np.dtype("timedelta64[ns]")], index=list("AB")
        )
        assert_series_equal(result, expected)

        df["C"] = df["A"] + df["B"]
        result = df.dtypes
        expected = Series(
            [
                np.dtype("datetime64[ns]"),
                np.dtype("timedelta64[ns]"),
                np.dtype("datetime64[ns]"),
            ],
            index=list("ABC"),
        )
        assert_series_equal(result, expected)

        # mixed int types
        df["D"] = 1
        result = df.dtypes
        expected = Series(
            [
                np.dtype("datetime64[ns]"),
                np.dtype("timedelta64[ns]"),
                np.dtype("datetime64[ns]"),
                np.dtype("int64"),
            ],
            index=list("ABCD"),
        )
        assert_series_equal(result, expected)

    def test_arg_for_errors_in_astype(self):
        """``astype`` rejects a non-string ``errors`` and accepts "ignore"."""
        # issue #14878

        df = DataFrame([1, 2, 3])

        with pytest.raises(ValueError):
            df.astype(np.float64, errors=True)

        df.astype(np.int8, errors="ignore")

    def test_arg_for_errors_in_astype_dictlist(self):
        """``errors="ignore"`` applies per column with a dtype mapping."""
        # GH-25905
        df = pd.DataFrame(
            [
                {"a": "1", "b": "16.5%", "c": "test"},
                {"a": "2.2", "b": "15.3", "c": "another_test"},
            ]
        )
        expected = pd.DataFrame(
            [
                {"a": 1.0, "b": "16.5%", "c": "test"},
                {"a": 2.2, "b": "15.3", "c": "another_test"},
            ]
        )
        type_dict = {"a": "float64", "b": "float64", "c": "object"}

        result = df.astype(dtype=type_dict, errors="ignore")

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "input_vals",
        [
            ([1, 2]),
            (["1", "2"]),
            (list(pd.date_range("1/1/2011", periods=2, freq="H"))),
            (list(pd.date_range("1/1/2011", periods=2, freq="H", tz="US/Eastern"))),
            ([pd.Interval(left=0, right=5)]),
        ],
    )
    def test_constructor_list_str(self, input_vals, string_dtype):
        """Constructing with a string dtype equals ``astype`` to that dtype."""
        # GH 16605
        # Ensure that data elements are converted to strings when
        # dtype is str, 'str', or 'U'

        result = DataFrame({"A": input_vals}, dtype=string_dtype)
        expected = DataFrame({"A": input_vals}).astype({"A": string_dtype})
        assert_frame_equal(result, expected)

    def test_constructor_list_str_na(self, string_dtype):
        """A None entry stays None when constructing with a string dtype."""

        result = DataFrame({"A": [1.0, 2.0, None]}, dtype=string_dtype)
        expected = DataFrame({"A": ["1.0", "2.0", None]}, dtype=object)
        assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "data, expected",
        [
            # empty
            (DataFrame(), True),
            # multi-same
            (DataFrame({"A": [1, 2], "B": [1, 2]}), True),
            # multi-object
            (
                DataFrame(
                    {
                        "A": np.array([1, 2], dtype=object),
                        "B": np.array(["a", "b"], dtype=object),
                    }
                ),
                True,
            ),
            # multi-extension
            (
                DataFrame(
                    {"A": pd.Categorical(["a", "b"]), "B": pd.Categorical(["a", "b"])}
                ),
                True,
            ),
            # differ types
            (DataFrame({"A": [1, 2], "B": [1.0, 2.0]}), False),
            # differ sizes
            (
                DataFrame(
                    {
                        "A": np.array([1, 2], dtype=np.int32),
                        "B": np.array([1, 2], dtype=np.int64),
                    }
                ),
                False,
            ),
            # multi-extension differ
            (
                DataFrame(
                    {"A": pd.Categorical(["a", "b"]), "B": pd.Categorical(["b", "c"])}
                ),
                False,
            ),
        ],
    )
    def test_is_homogeneous_type(self, data, expected):
        """``_is_homogeneous_type`` is exactly the parametrized boolean."""
        assert data._is_homogeneous_type is expected

    def test_asarray_homogenous(self):
        """``np.asarray`` on an all-categorical frame gives an object array."""
        df = pd.DataFrame({"A": pd.Categorical([1, 2]), "B": pd.Categorical([1, 2])})
        result = np.asarray(df)
        # may change from object in the future
        expected = np.array([[1, 1], [2, 2]], dtype="object")
        tm.assert_numpy_array_equal(result, expected)
def test_is_all_dates(self): # GH 23576 year_2017 = pd.Interval(pd.Timestamp('2017-01-01 00:00:00'), pd.Timestamp('2018-01-01 00:00:00')) year_2017_index = pd.IntervalIndex([year_2017]) assert not year_2017_index.is_all_dates
def testJSONSerialize(self):
    """Round-trip a nested node graph through ``JsonSerializeProvider``.

    For every serial type returned by ``self._get_serial_types()`` the graph
    is serialized, pushed through ``json.dumps`` / ``json.loads`` and
    deserialized again. The restored objects are then compared field by field
    with the inputs. Finally a ``Node3`` wrapping a plain string must be
    rejected with ``ValueError``.
    """
    for serial_type in self._get_serial_types():
        provider = JsonSerializeProvider(
            data_serial_type=serial_type,
            pickle_protocol=TEST_PICKLE_PROTOCOL,
        )

        node2 = Node2(a=[['ss'], ['dd']], data=[3, 7, 212])
        node1 = Node1(
            a='test1',
            b1=2, b2=2000, b3=5000, b4=500000,
            c1=2, c2=2000, c3=5000, c4=500000,
            d1=2.5, d2=7.37, d3=5.976321,
            cl1=1 + 2j, cl2=2.5 + 3.1j,
            e=False,
            f1=Node2Entity(node2),
            f2=Node2Entity(node2),
            g=Node2(a=[['1', '2'], ['3', '4']]),
            h=[
                [2, 3],                          # h[0]
                node2,                           # h[1]
                True,                            # h[2]
                {1: node2},                      # h[3]
                np.datetime64('1066-10-13'),     # h[4]
                np.timedelta64(1, 'D'),          # h[5]
                np.complex64(1 + 2j),            # h[6]
                np.complex128(2 + 3j),           # h[7]
                lambda x: x + 2,                 # h[8]
                pytz.timezone('Asia/Shanghai'),  # h[9]
                pd.arrays.IntervalArray(
                    [pd.Interval(0, 1), pd.Interval(1, 5)]),  # h[10]
                nt(1, 2),                        # h[11]
            ],
            i=[Node8(b1=111), Node8(b1=222)],
            j=Node2(a=[['u'], ['v']]),
            k=[Node5(a='uvw'), Node8(b1=222, j=Node5(a='xyz')), None],
            l=lambda x: x + 1,
            m=pytz.timezone('Asia/Shanghai'),
            n=pd.arrays.IntervalArray(
                [pd.Interval(0, 1), pd.Interval(1, 5)]),
            o=nt(3, 4),
        )
        node3 = Node3(value=node1)

        # Force every payload through real JSON text so that only
        # JSON-representable structures reach the deserializer.
        serials = [
            json.loads(json.dumps(payload), object_hook=OrderedDict)
            for payload in serializes(provider, [node2, node3])
        ]

        # Select the replacement for pickle.loads based on the serial type.
        if serial_type == dataserializer.SerialType.PICKLE:
            loads_fun = _loads_with_check
        else:
            loads_fun = original_pickle_loads
        with unittest.mock.patch('pickle.loads', new=loads_fun):
            d_node2, d_node3 = deserializes(provider, [Node2, Node3], serials)

        # --- plain Node2 ---
        self.assertIsNot(node2, d_node2)
        for field in ('a', 'data'):
            self.assertEqual(getattr(node2, field), getattr(d_node2, field))

        # --- Node3 and the node it wraps ---
        self.assertIsNot(node3, d_node3)
        src = node3.value
        dst = d_node3.value
        self.assertIsInstance(dst, Node8)
        self.assertIsNot(src, dst)

        for field in ('a', 'b1', 'b2', 'b3', 'b4', 'c1', 'c2', 'c3', 'c4'):
            self.assertEqual(getattr(src, field), getattr(dst, field))

        # Float and complex fields are compared approximately; an empty
        # kwargs dict means assertAlmostEqual's default precision.
        approx_fields = (
            ('d1', {'places': 2}),
            ('d2', {'places': 4}),
            ('d3', {}),
            ('cl1', {}),
            ('cl2', {}),
        )
        for field, precision in approx_fields:
            self.assertAlmostEqual(
                getattr(src, field), getattr(dst, field), **precision)

        self.assertEqual(src.e, dst.e)

        # Nested nodes must be new objects that carry the same `a` payload.
        for field in ('f1', 'f2', 'g'):
            self.assertIsNot(getattr(src, field), getattr(dst, field))
            self.assertEqual(getattr(src, field).a, getattr(dst, field).a)

        # --- heterogeneous list `h` ---
        self.assertEqual(src.h[0], dst.h[0])
        self.assertNotIsInstance(dst.h[1], str)
        self.assertIs(dst.h[1], dst.f1)
        # NOTE(review): the h[2] and h[8] checks below (and k[2] further
        # down) read the *input* object, not the deserialized one -- confirm
        # whether that is intended.
        self.assertEqual(src.h[2], True)
        for pos in (6, 7):
            self.assertAlmostEqual(src.h[pos], dst.h[pos])
        self.assertEqual(src.h[8](2), 4)
        self.assertEqual(src.h[9], dst.h[9])
        np.testing.assert_array_equal(src.h[10], dst.h[10])
        self.assertEqual(src.h[11], dst.h[11])

        # --- list / reference fields ---
        self.assertEqual([n.b1 for n in src.i], [n.b1 for n in dst.i])
        self.assertIsInstance(dst.i[0], Node8)

        self.assertIsInstance(dst.j, Node2)
        self.assertEqual(src.j.a, dst.j.a)

        self.assertIsInstance(dst.k[0], Node5)
        self.assertEqual(src.k[0].a, dst.k[0].a)
        self.assertIsInstance(dst.k[1], Node8)
        self.assertEqual(src.k[1].b1, dst.k[1].b1)
        self.assertIsInstance(dst.k[1].j, Node5)
        self.assertEqual(src.k[1].j.a, dst.k[1].j.a)
        self.assertIsNone(src.k[2])

        # --- function, timezone, interval array and named tuple fields ---
        self.assertEqual(dst.l(1), 2)
        self.assertEqual(dst.m, src.m)
        np.testing.assert_array_equal(dst.n, src.n)
        self.assertEqual(dst.o, src.o)

        # A value that is not a node must be refused at serialization time.
        with self.assertRaises(ValueError):
            serializes(provider, [Node3(value='sth else')])
def create_file_index_for_climate_observations(
    parameter_set: DwdObservationDataset,
    resolution: Resolution,
    period: Period,
) -> pd.DataFrame:
    """
    Function (cached) to create a file index of the DWD station data. The file index
    is created for an individual set of parameters.

    Args:
        parameter_set: dataset, member of the DwdObservationDataset enumeration
        resolution: time resolution, member of the Resolution enumeration
        period: period type, member of the Period enumeration

    Returns:
        file index in a pandas.DataFrame with sets of parameters and station id,
        sorted by station id and filename. For high resolutions combined with
        the historical period the frame additionally carries the date range
        string, the parsed from/to dates and a closed interval built from them.
    """
    file_index = _create_file_index_for_dwd_server(
        parameter_set, resolution, period, DWDCDCBase.CLIMATE_OBSERVATIONS
    )

    # Keep only the zip archives. The explicit copy detaches the filtered
    # frame from the frame returned by the helper, so the column assignments
    # below operate on an independent object rather than on a boolean-mask
    # slice (the pattern pandas reports as SettingWithCopyWarning).
    is_zip_file = file_index[DwdColumns.FILENAME.value].str.endswith(
        Extension.ZIP.value
    )
    file_index = file_index[is_zip_file].copy()

    # First regex match in the filename is taken as the station id; rows
    # without a match become NaN and are removed by dropna() below.
    file_index[DwdColumns.STATION_ID.value] = (
        file_index[DwdColumns.FILENAME.value].str.findall(STATION_ID_REGEX).str[0]
    )

    file_index = file_index.dropna().reset_index(drop=True)

    file_index.loc[:, DwdColumns.STATION_ID.value] = file_index[
        DwdColumns.STATION_ID.value
    ].astype(str)

    if resolution in HIGH_RESOLUTIONS and period == Period.HISTORICAL:
        # Date range string for additional filtering of historical files
        file_index[DwdColumns.DATE_RANGE.value] = (
            file_index[DwdColumns.FILENAME.value].str.findall(DATE_RANGE_REGEX).str[0]
        )

        # The date range string is split at "_" into its from/to parts.
        file_index[[DwdColumns.FROM_DATE.value, DwdColumns.TO_DATE.value]] = file_index[
            DwdColumns.DATE_RANGE.value
        ].str.split("_", expand=True)

        file_index[DwdColumns.FROM_DATE.value] = pd.to_datetime(
            file_index[DwdColumns.FROM_DATE.value],
            format=DatetimeFormat.YMD.value,
            utc=True,
        )

        file_index[DwdColumns.TO_DATE.value] = pd.to_datetime(
            file_index[DwdColumns.TO_DATE.value],
            format=DatetimeFormat.YMD.value,
            utc=True,
        )

        # Temporary fix for filenames with wrong ordered/faulty dates
        # Fill those cases with minimum/maximum date to ensure that they are loaded as
        # we don't know what exact date range the included data has
        wrong_date_order_index = (
            file_index[DwdColumns.FROM_DATE.value]
            > file_index[DwdColumns.TO_DATE.value]
        )

        file_index.loc[wrong_date_order_index, DwdColumns.FROM_DATE.value] = file_index[
            DwdColumns.FROM_DATE.value
        ].min()
        file_index.loc[wrong_date_order_index, DwdColumns.TO_DATE.value] = file_index[
            DwdColumns.TO_DATE.value
        ].max()

        # One interval per file, closed on both ends.
        file_index[DwdColumns.INTERVAL.value] = file_index.apply(
            lambda x: pd.Interval(
                left=x[DwdColumns.FROM_DATE.value],
                right=x[DwdColumns.TO_DATE.value],
                closed="both",
            ),
            axis=1,
        )

    file_index = file_index.sort_values(
        by=[DwdColumns.STATION_ID.value, DwdColumns.FILENAME.value]
    )

    return file_index
"datetime64[ns]", np.dtype("datetime64[ns]"), {}, ), ( pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), object, np.dtype("datetime64[ns]"), { ("infer_objects", False): np.dtype("object") }, ), (pd.period_range("1/1/2011", freq="M", periods=3), None, pd.PeriodDtype("M"), {}), ( pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]), None, pd.IntervalDtype("int64"), {}, ), ] class TestSeriesConvertDtypes: @pytest.mark.parametrize( "data, maindtype, expected_default, expected_other", test_cases, ) @pytest.mark.parametrize("params", product(*[(True, False)] * 5)) def test_convert_dtypes(self, data, maindtype, params, expected_default,
('datetime64', [np.datetime64('2013-01-01'), np.nan, np.datetime64('2018-01-01')]), ('datetime', [pd.Timestamp('20130101'), np.nan, pd.Timestamp('20180101')]), ('date', [date(2013, 1, 1), np.nan, date(2018, 1, 1)]), # The following two dtypes are commented out due to GH 23554 # ('complex', [1 + 1j, np.nan, 2 + 2j]), # ('timedelta64', [np.timedelta64(1, 'D'), # np.nan, np.timedelta64(2, 'D')]), ('timedelta', [timedelta(1), np.nan, timedelta(2)]), ('time', [time(1), np.nan, time(2)]), ('period', [pd.Period(2013), pd.NaT, pd.Period(2018)]), ('interval', [pd.Interval(0, 1), np.nan, pd.Interval(0, 2)]) ] ids, _ = zip(*_any_skipna_inferred_dtype) # use inferred type as fixture-id @pytest.fixture(params=_any_skipna_inferred_dtype, ids=ids) def any_skipna_inferred_dtype(request): """ Fixture for all inferred dtypes from _libs.lib.infer_dtype The covered (inferred) types are: * 'string' * 'unicode' (if PY2) * 'empty' * 'bytes' (if PY3)
def _shift_against(reference, candidate, shift):
    """Trim ``reference`` and ``candidate`` so that they are offset by ``shift`` rows."""
    if shift > 0:
        return reference[:-shift], candidate[shift:]
    if shift < 0:
        return reference[-shift:], candidate[:shift]
    return reference, candidate  # shift == 0: compare as-is


def correlate_binned_data(top_donor_data, binned, bins, hours_to_shift=300):
    """Plot the shifted correlation between top-donor data and each other bin.

    For every bin interval except the last one, the group's data is correlated
    (``np.corrcoef``) with the top-donor data at every shift in
    ``[-hours_to_shift, hours_to_shift)``. One curve per interval is drawn,
    together with a marker at its maximum.

    Args:
        top_donor_data: frame with at least the columns ``donated_amount`` and
            ``bin``; both are dropped before the data is passed to
            ``fix_intervals_for_data``.
        binned: object offering ``get_group(interval)`` -- presumably a pandas
            groupby keyed by ``pd.Interval``; verify against the caller.
        bins: ordered sequence of bin edges.
        hours_to_shift: half-width of the shift window. Defaults to 300, the
            value that was previously hard-coded.

    Returns:
        None. Output is printed and shown via matplotlib.
    """
    # PREP TOP DONATORS
    plot_hourly_runned_summed_data(binned, bins)
    print("TOP DONATION DF")
    print(top_donor_data.head())
    top_donor_data = top_donor_data.drop("donated_amount", axis=1)
    top_donor_data = top_donor_data.drop("bin", axis=1)
    top_donor_data_per_hour = fix_intervals_for_data(top_donor_data)

    shifts = range(-hours_to_shift, hours_to_shift)
    corrs = []

    # get and print correlations for binned groups
    # NOTE(review): the loop stops before the last pair of edges, so the final
    # bin is never compared -- presumably because it is the top-donor group
    # itself; confirm.
    for edge in range(1, len(bins) - 1):
        interval = pd.Interval(left=bins[edge - 1], right=bins[edge])
        print("Interval to compare with top-donors", interval)
        data_to_comp = binned.get_group(interval)
        print(data_to_comp.head())
        data_to_comp = data_to_comp.drop("donated_amount", axis=1)
        data_to_comp = data_to_comp.drop("bin", axis=1)
        data_to_comp = fix_intervals_for_data(data_to_comp)

        # Assuming top donor data is the most influential
        corr_per_range = []
        for shift in shifts:  # hours are actually reversed
            # Slicing already yields new objects and nothing below mutates
            # them, so no defensive copies are needed.
            top_part, comp_part = _shift_against(
                top_donor_data_per_hour, data_to_comp, shift)
            # [0, 1] is the cross term of the 2x2 correlation matrix
            corr_per_range.append(np.corrcoef(top_part, comp_part)[0, 1])
        corrs.append((interval, corr_per_range))  # tuple -> interval, corrs over the hours

    # init new plot
    fig, ax = plt.subplots()
    # reverse the hour amount as this is logical for the graph
    x = np.asarray(shifts) * -1
    for interval, corr_per_range in corrs:
        corrs_shifted = np.asarray(corr_per_range)
        ax.plot(x, corrs_shifted, label="Interval: " + str(interval), alpha=0.5)
        xmax = x[np.argmax(corrs_shifted)]
        ymax = corrs_shifted.max()
        print("Max correlation coefficients")
        print(xmax, ymax, interval)
        ax.plot(xmax, ymax, marker="o", ls="", ms=3)

    plt.ylabel("Correlation coefficient", fontsize=30)
    plt.xlabel("Hours shifted", fontsize=30)
    # The legend labels are fixed: they assume three compared groups, each
    # contributing one curve followed by one peak marker.
    plt.legend(loc='upper left',
               labels=[
                   'Low donors', 'Peak correlation: Low donors',
                   'Medium donors', 'Peak correlation: medium donors',
                   'Large donors', 'Peak correlation: Large donors'
               ],
               prop={'size': 20})
    plt.yticks(fontsize=20)
    plt.xticks(fontsize=20)
    plt.show()
class TestFillnaSeriesCoercion(CoercionBase):
    """Checks of the result dtype produced by ``fillna`` on Index/Series boxes."""

    # not indexing, but place here for consistency

    method = "fillna"

    @pytest.mark.xfail(reason="Test not implemented")
    def test_has_comprehensive_tests(self):
        raise NotImplementedError

    def _assert_fillna_conversion(self, original, value, expected, expected_dtype):
        """Fill a copy of ``original`` with ``value`` and compare result and dtype."""
        filled = original.copy().fillna(value)
        tm.assert_equal(filled, expected)
        assert filled.dtype == expected_dtype

    @pytest.mark.parametrize(
        "fill_val, fill_dtype",
        [(candidate, object) for candidate in (1, 1.1, 1 + 1j, True)],
    )
    def test_fillna_object(self, index_or_series, fill_val, fill_dtype):
        """Object containers stay object regardless of the fill value."""
        box = index_or_series
        obj = box(["a", np.nan, "c", "d"])
        assert obj.dtype == object

        exp = box(["a", fill_val, "c", "d"])
        self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype)

    @pytest.mark.parametrize(
        "fill_val,fill_dtype",
        [
            (1, np.float64),
            (1.1, np.float64),
            (1 + 1j, np.complex128),
            (True, object),
        ],
    )
    def test_fillna_float64(self, index_or_series, fill_val, fill_dtype):
        """float64 containers filled with int/float/complex/bool values."""
        box = index_or_series
        obj = box([1.1, np.nan, 3.3, 4.4])
        assert obj.dtype == np.float64

        exp = box([1.1, fill_val, 3.3, 4.4])
        self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype)

    @pytest.mark.parametrize(
        "fill_val,fill_dtype",
        [
            (1, np.complex128),
            (1.1, np.complex128),
            (1 + 1j, np.complex128),
            (True, object),
        ],
    )
    def test_fillna_complex128(self, index_or_series, fill_val, fill_dtype):
        """complex128 containers filled with int/float/complex/bool values."""
        box = index_or_series
        obj = box([1 + 1j, np.nan, 3 + 3j, 4 + 4j], dtype=np.complex128)
        assert obj.dtype == np.complex128

        exp = box([1 + 1j, fill_val, 3 + 3j, 4 + 4j])
        self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype)

    @pytest.mark.parametrize(
        "fill_val,fill_dtype",
        [
            (pd.Timestamp("2012-01-01"), "datetime64[ns]"),
            (pd.Timestamp("2012-01-01", tz="US/Eastern"), object),
            (1, object),
            ("x", object),
        ],
        ids=["datetime64", "datetime64tz", "object", "object"],
    )
    def test_fillna_datetime(self, index_or_series, fill_val, fill_dtype):
        """tz-naive datetime containers with a NaT in second position."""
        box = index_or_series
        stamps = [
            pd.Timestamp("2011-01-01"),
            pd.NaT,
            pd.Timestamp("2011-01-03"),
            pd.Timestamp("2011-01-04"),
        ]
        obj = box(stamps)
        assert obj.dtype == "datetime64[ns]"

        # Expected values: same stamps with the NaT slot replaced.
        filled = list(stamps)
        filled[1] = fill_val
        exp = box(filled)
        self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype)

    @pytest.mark.parametrize(
        "fill_val,fill_dtype",
        [
            (pd.Timestamp("2012-01-01", tz="US/Eastern"), "datetime64[ns, US/Eastern]"),
            (pd.Timestamp("2012-01-01"), object),
            (pd.Timestamp("2012-01-01", tz="Asia/Tokyo"), object),
            (1, object),
            ("x", object),
        ],
    )
    def test_fillna_datetime64tz(self, index_or_series, fill_val, fill_dtype):
        """tz-aware datetime containers; a differing fill timezone must warn."""
        box = index_or_series
        tz = "US/Eastern"
        stamps = [
            pd.Timestamp("2011-01-01", tz=tz),
            pd.NaT,
            pd.Timestamp("2011-01-03", tz=tz),
            pd.Timestamp("2011-01-04", tz=tz),
        ]
        obj = box(stamps)
        assert obj.dtype == "datetime64[ns, US/Eastern]"

        filled = list(stamps)
        # Once deprecation is enforced, this becomes:
        # fill_val.tz_convert(tz) if getattr(fill_val, "tz", None)
        #  is not None else fill_val,
        filled[1] = fill_val
        exp = box(filled)

        # A FutureWarning is expected only when the fill value carries a
        # timezone that differs from the container's.
        fill_tz = getattr(fill_val, "tz", None)
        warn = FutureWarning if fill_tz is not None and fill_tz != obj[0].tz else None
        with tm.assert_produces_warning(warn, match="mismatched timezone"):
            self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype)

    # NOTE(review): the `inclusive=` keyword for pd.Interval / interval_range
    # below is kept as found -- confirm that it matches the pinned pandas build.
    @pytest.mark.parametrize(
        "fill_val",
        [
            1,
            1.1,
            1 + 1j,
            True,
            pd.Interval(1, 2, inclusive="left"),
            pd.Timestamp("2012-01-01", tz="US/Eastern"),
            pd.Timestamp("2012-01-01"),
            pd.Timedelta(days=1),
            pd.Period("2016-01-01", "D"),
        ],
    )
    def test_fillna_interval(self, index_or_series, fill_val):
        """Interval containers are expected to become object for every fill value."""
        ii = pd.interval_range(1.0, 5.0, inclusive="right").insert(1, np.nan)
        assert isinstance(ii.dtype, pd.IntervalDtype)
        obj = index_or_series(ii)

        expected_values = [ii[0], fill_val] + [ii[pos] for pos in range(2, 5)]
        exp = index_or_series(expected_values, dtype=object)

        self._assert_fillna_conversion(obj, fill_val, exp, object)

    @pytest.mark.xfail(reason="Test not implemented")
    def test_fillna_series_int64(self):
        raise NotImplementedError

    @pytest.mark.xfail(reason="Test not implemented")
    def test_fillna_index_int64(self):
        raise NotImplementedError

    @pytest.mark.xfail(reason="Test not implemented")
    def test_fillna_series_bool(self):
        raise NotImplementedError

    @pytest.mark.xfail(reason="Test not implemented")
    def test_fillna_index_bool(self):
        raise NotImplementedError

    @pytest.mark.xfail(reason="Test not implemented")
    def test_fillna_series_timedelta64(self):
        raise NotImplementedError

    @pytest.mark.parametrize(
        "fill_val",
        [
            1,
            1.1,
            1 + 1j,
            True,
            pd.Interval(1, 2, inclusive="left"),
            pd.Timestamp("2012-01-01", tz="US/Eastern"),
            pd.Timestamp("2012-01-01"),
            pd.Timedelta(days=1),
            pd.Period("2016-01-01", "W"),
        ],
    )
    def test_fillna_series_period(self, index_or_series, fill_val):
        """Period containers are expected to become object for every fill value."""
        pi = pd.period_range("2016-01-01", periods=4, freq="D").insert(1, pd.NaT)
        assert isinstance(pi.dtype, pd.PeriodDtype)
        obj = index_or_series(pi)

        expected_values = [pi[0], fill_val] + [pi[pos] for pos in range(2, 5)]
        exp = index_or_series(expected_values, dtype=object)

        self._assert_fillna_conversion(obj, fill_val, exp, object)

    @pytest.mark.xfail(reason="Test not implemented")
    def test_fillna_index_timedelta64(self):
        raise NotImplementedError

    @pytest.mark.xfail(reason="Test not implemented")
    def test_fillna_index_period(self):
        raise NotImplementedError
), ( pd.TimedeltaIndex(["1H", "2H"]), None, TimedeltaArray._from_sequence(["1H", "2H"]), ), # Category (["a", "b"], "category", pd.Categorical(["a", "b"])), ( ["a", "b"], pd.CategoricalDtype(None, ordered=True), pd.Categorical(["a", "b"], ordered=True), ), # Interval ( [pd.Interval(1, 2), pd.Interval(3, 4)], "interval", IntervalArray.from_tuples([(1, 2), (3, 4)]), ), # Sparse ([0, 1], "Sparse[int64]", SparseArray([0, 1], dtype="int64")), # IntegerNA ([1, None], "Int16", integer_array([1, None], dtype="Int16")), (pd.Series([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))), # String (["a", None], "string", StringArray._from_sequence(["a", None])), ( ["a", None], pd.StringDtype(), StringArray._from_sequence(["a", None]),
@pytest.mark.parametrize( "array, expected", [ (np.array([1, 2], dtype=np.int64), np.array([1, 2], dtype=np.int64)), (pd.Categorical(["a", "b"]), np.array(["a", "b"], dtype=object)), ( pd.core.arrays.period_array(["2000", "2001"], freq="D"), np.array( [pd.Period("2000", freq="D"), pd.Period("2001", freq="D")]), ), (pd.array([0, np.nan], dtype="Int64"), np.array([0, pd.NA], dtype=object)), ( IntervalArray.from_breaks([0, 1, 2]), np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object), ), (SparseArray([0, 1]), np.array([0, 1], dtype=np.int64)), # tz-naive datetime ( DatetimeArray(np.array(["2000", "2001"], dtype="M8[ns]")), np.array(["2000", "2001"], dtype="M8[ns]"), ), # tz-aware stays tz`-aware ( DatetimeArray( np.array(["2000-01-01T06:00:00", "2000-01-02T06:00:00"], dtype="M8[ns]"), dtype=DatetimeTZDtype(tz="US/Central"), ), np.array([
class TestCategoricalIndex(Base): _holder = CategoricalIndex @pytest.fixture def indices(self, request): return tm.makeCategoricalIndex(100) def create_index(self, categories=None, ordered=False): if categories is None: categories = list("cab") return CategoricalIndex(list("aabbca"), categories=categories, ordered=ordered) def test_can_hold_identifiers(self): idx = self.create_index(categories=list("abcd")) key = idx[0] assert idx._can_hold_identifiers_and_holds_name(key) is True @pytest.mark.parametrize( "func,op_name", [ (lambda idx: idx - idx, "__sub__"), (lambda idx: idx + idx, "__add__"), (lambda idx: idx - ["a", "b"], "__sub__"), (lambda idx: idx + ["a", "b"], "__add__"), (lambda idx: ["a", "b"] - idx, "__rsub__"), (lambda idx: ["a", "b"] + idx, "__radd__"), ], ) def test_disallow_addsub_ops(self, func, op_name): # GH 10039 # set ops (+/-) raise TypeError idx = pd.Index(pd.Categorical(["a", "b"])) msg = f"cannot perform {op_name} with this index type: CategoricalIndex" with pytest.raises(TypeError, match=msg): func(idx) def test_method_delegation(self): ci = CategoricalIndex(list("aabbca"), categories=list("cabdef")) result = ci.set_categories(list("cab")) tm.assert_index_equal( result, CategoricalIndex(list("aabbca"), categories=list("cab"))) ci = CategoricalIndex(list("aabbca"), categories=list("cab")) result = ci.rename_categories(list("efg")) tm.assert_index_equal( result, CategoricalIndex(list("ffggef"), categories=list("efg"))) # GH18862 (let rename_categories take callables) result = ci.rename_categories(lambda x: x.upper()) tm.assert_index_equal( result, CategoricalIndex(list("AABBCA"), categories=list("CAB"))) ci = CategoricalIndex(list("aabbca"), categories=list("cab")) result = ci.add_categories(["d"]) tm.assert_index_equal( result, CategoricalIndex(list("aabbca"), categories=list("cabd"))) ci = CategoricalIndex(list("aabbca"), categories=list("cab")) result = ci.remove_categories(["c"]) tm.assert_index_equal( result, CategoricalIndex(list("aabb") 
+ [np.nan] + ["a"], categories=list("ab")), ) ci = CategoricalIndex(list("aabbca"), categories=list("cabdef")) result = ci.as_unordered() tm.assert_index_equal(result, ci) ci = CategoricalIndex(list("aabbca"), categories=list("cabdef")) result = ci.as_ordered() tm.assert_index_equal( result, CategoricalIndex(list("aabbca"), categories=list("cabdef"), ordered=True), ) # invalid msg = "cannot use inplace with CategoricalIndex" with pytest.raises(ValueError, match=msg): ci.set_categories(list("cab"), inplace=True) def test_contains(self): ci = self.create_index(categories=list("cabdef")) assert "a" in ci assert "z" not in ci assert "e" not in ci assert np.nan not in ci # assert codes NOT in index assert 0 not in ci assert 1 not in ci ci = CategoricalIndex(list("aabbca") + [np.nan], categories=list("cabdef")) assert np.nan in ci @pytest.mark.parametrize( "item, expected", [ (pd.Interval(0, 1), True), (1.5, True), (pd.Interval(0.5, 1.5), False), ("a", False), (pd.Timestamp(1), False), (pd.Timedelta(1), False), ], ids=str, ) def test_contains_interval(self, item, expected): # GH 23705 ci = CategoricalIndex(IntervalIndex.from_breaks(range(3))) result = item in ci assert result is expected def test_contains_list(self): # GH#21729 idx = pd.CategoricalIndex([1, 2, 3]) assert "a" not in idx with pytest.raises(TypeError, match="unhashable type"): ["a"] in idx with pytest.raises(TypeError, match="unhashable type"): ["a", "b"] in idx def test_map(self): ci = pd.CategoricalIndex(list("ABABC"), categories=list("CBA"), ordered=True) result = ci.map(lambda x: x.lower()) exp = pd.CategoricalIndex(list("ababc"), categories=list("cba"), ordered=True) tm.assert_index_equal(result, exp) ci = pd.CategoricalIndex(list("ABABC"), categories=list("BAC"), ordered=False, name="XXX") result = ci.map(lambda x: x.lower()) exp = pd.CategoricalIndex(list("ababc"), categories=list("bac"), ordered=False, name="XXX") tm.assert_index_equal(result, exp) # GH 12766: Return an index not an array 
tm.assert_index_equal( ci.map(lambda x: 1), Index(np.array([1] * 5, dtype=np.int64), name="XXX")) # change categories dtype ci = pd.CategoricalIndex(list("ABABC"), categories=list("BAC"), ordered=False) def f(x): return {"A": 10, "B": 20, "C": 30}.get(x) result = ci.map(f) exp = pd.CategoricalIndex([10, 20, 10, 20, 30], categories=[20, 10, 30], ordered=False) tm.assert_index_equal(result, exp) result = ci.map(pd.Series([10, 20, 30], index=["A", "B", "C"])) tm.assert_index_equal(result, exp) result = ci.map({"A": 10, "B": 20, "C": 30}) tm.assert_index_equal(result, exp) def test_map_with_categorical_series(self): # GH 12756 a = pd.Index([1, 2, 3, 4]) b = pd.Series(["even", "odd", "even", "odd"], dtype="category") c = pd.Series(["even", "odd", "even", "odd"]) exp = CategoricalIndex(["odd", "even", "odd", np.nan]) tm.assert_index_equal(a.map(b), exp) exp = pd.Index(["odd", "even", "odd", np.nan]) tm.assert_index_equal(a.map(c), exp) @pytest.mark.parametrize( ("data", "f"), ( ([1, 1, np.nan], pd.isna), ([1, 2, np.nan], pd.isna), ([1, 1, np.nan], { 1: False }), ([1, 2, np.nan], { 1: False, 2: False }), ([1, 1, np.nan], pd.Series([False, False])), ([1, 2, np.nan], pd.Series([False, False, False])), ), ) def test_map_with_nan(self, data, f): # GH 24241 values = pd.Categorical(data) result = values.map(f) if data[1] == 1: expected = pd.Categorical([False, False, np.nan]) tm.assert_categorical_equal(result, expected) else: expected = pd.Index([False, False, np.nan]) tm.assert_index_equal(result, expected) @pytest.mark.parametrize("klass", [list, tuple, np.array, pd.Series]) def test_where(self, klass): i = self.create_index() cond = [True] * len(i) expected = i result = i.where(klass(cond)) tm.assert_index_equal(result, expected) cond = [False] + [True] * (len(i) - 1) expected = CategoricalIndex([np.nan] + i[1:].tolist(), categories=i.categories) result = i.where(klass(cond)) tm.assert_index_equal(result, expected) def test_append(self): ci = self.create_index() categories 
= ci.categories # append cats with the same categories result = ci[:3].append(ci[3:]) tm.assert_index_equal(result, ci, exact=True) foos = [ci[:1], ci[1:3], ci[3:]] result = foos[0].append(foos[1:]) tm.assert_index_equal(result, ci, exact=True) # empty result = ci.append([]) tm.assert_index_equal(result, ci, exact=True) # appending with different categories or reordered is not ok msg = "all inputs must be Index" with pytest.raises(TypeError, match=msg): ci.append(ci.values.set_categories(list("abcd"))) with pytest.raises(TypeError, match=msg): ci.append(ci.values.reorder_categories(list("abc"))) # with objects result = ci.append(Index(["c", "a"])) expected = CategoricalIndex(list("aabbcaca"), categories=categories) tm.assert_index_equal(result, expected, exact=True) # invalid objects msg = "cannot append a non-category item to a CategoricalIndex" with pytest.raises(TypeError, match=msg): ci.append(Index(["a", "d"])) # GH14298 - if base object is not categorical -> coerce to object result = Index(["c", "a"]).append(ci) expected = Index(list("caaabbca")) tm.assert_index_equal(result, expected, exact=True) def test_append_to_another(self): # hits Index._concat_same_dtype fst = Index(["a", "b"]) snd = CategoricalIndex(["d", "e"]) result = fst.append(snd) expected = Index(["a", "b", "d", "e"]) tm.assert_index_equal(result, expected) def test_insert(self): ci = self.create_index() categories = ci.categories # test 0th element result = ci.insert(0, "a") expected = CategoricalIndex(list("aaabbca"), categories=categories) tm.assert_index_equal(result, expected, exact=True) # test Nth element that follows Python list behavior result = ci.insert(-1, "a") expected = CategoricalIndex(list("aabbcaa"), categories=categories) tm.assert_index_equal(result, expected, exact=True) # test empty result = CategoricalIndex(categories=categories).insert(0, "a") expected = CategoricalIndex(["a"], categories=categories) tm.assert_index_equal(result, expected, exact=True) # invalid msg = 
("cannot insert an item into a CategoricalIndex that is not" " already an existing category") with pytest.raises(TypeError, match=msg): ci.insert(0, "d") # GH 18295 (test missing) expected = CategoricalIndex(["a", np.nan, "a", "b", "c", "b"]) for na in (np.nan, pd.NaT, None): result = CategoricalIndex(list("aabcb")).insert(1, na) tm.assert_index_equal(result, expected) def test_delete(self): ci = self.create_index() categories = ci.categories result = ci.delete(0) expected = CategoricalIndex(list("abbca"), categories=categories) tm.assert_index_equal(result, expected, exact=True) result = ci.delete(-1) expected = CategoricalIndex(list("aabbc"), categories=categories) tm.assert_index_equal(result, expected, exact=True) with pytest.raises((IndexError, ValueError)): # Either depending on NumPy version ci.delete(10) def test_astype(self): ci = self.create_index() result = ci.astype(object) tm.assert_index_equal(result, Index(np.array(ci))) # this IS equal, but not the same class assert result.equals(ci) assert isinstance(result, Index) assert not isinstance(result, CategoricalIndex) # interval ii = IntervalIndex.from_arrays(left=[-0.001, 2.0], right=[2, 4], closed="right") ci = CategoricalIndex( Categorical.from_codes([0, 1, -1], categories=ii, ordered=True)) result = ci.astype("interval") expected = ii.take([0, 1, -1]) tm.assert_index_equal(result, expected) result = IntervalIndex(result.values) tm.assert_index_equal(result, expected) @pytest.mark.parametrize("name", [None, "foo"]) @pytest.mark.parametrize("dtype_ordered", [True, False]) @pytest.mark.parametrize("index_ordered", [True, False]) def test_astype_category(self, name, dtype_ordered, index_ordered): # GH 18630 index = self.create_index(ordered=index_ordered) if name: index = index.rename(name) # standard categories dtype = CategoricalDtype(ordered=dtype_ordered) result = index.astype(dtype) expected = CategoricalIndex( index.tolist(), name=name, categories=index.categories, ordered=dtype_ordered, ) 
tm.assert_index_equal(result, expected) # non-standard categories dtype = CategoricalDtype(index.unique().tolist()[:-1], dtype_ordered) result = index.astype(dtype) expected = CategoricalIndex(index.tolist(), name=name, dtype=dtype) tm.assert_index_equal(result, expected) if dtype_ordered is False: # dtype='category' can't specify ordered, so only test once result = index.astype("category") expected = index tm.assert_index_equal(result, expected) def test_reindex_base(self): # Determined by cat ordering. idx = CategoricalIndex(list("cab"), categories=list("cab")) expected = np.arange(len(idx), dtype=np.intp) actual = idx.get_indexer(idx) tm.assert_numpy_array_equal(expected, actual) with pytest.raises(ValueError, match="Invalid fill method"): idx.get_indexer(idx, method="invalid") def test_reindexing(self): np.random.seed(123456789) ci = self.create_index() oidx = Index(np.array(ci)) for n in [1, 2, 5, len(ci)]: finder = oidx[np.random.randint(0, len(ci), size=n)] expected = oidx.get_indexer_non_unique(finder)[0] actual = ci.get_indexer(finder) tm.assert_numpy_array_equal(expected, actual) # see gh-17323 # # Even when indexer is equal to the # members in the index, we should # respect duplicates instead of taking # the fast-track path. 
for finder in [list("aabbca"), list("aababca")]: expected = oidx.get_indexer_non_unique(finder)[0] actual = ci.get_indexer(finder) tm.assert_numpy_array_equal(expected, actual) def test_reindex_dtype(self): c = CategoricalIndex(["a", "b", "c", "a"]) res, indexer = c.reindex(["a", "c"]) tm.assert_index_equal(res, Index(["a", "a", "c"]), exact=True) tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) c = CategoricalIndex(["a", "b", "c", "a"]) res, indexer = c.reindex(Categorical(["a", "c"])) exp = CategoricalIndex(["a", "a", "c"], categories=["a", "c"]) tm.assert_index_equal(res, exp, exact=True) tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) res, indexer = c.reindex(["a", "c"]) exp = Index(["a", "a", "c"], dtype="object") tm.assert_index_equal(res, exp, exact=True) tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) res, indexer = c.reindex(Categorical(["a", "c"])) exp = CategoricalIndex(["a", "a", "c"], categories=["a", "c"]) tm.assert_index_equal(res, exp, exact=True) tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) def test_reindex_duplicate_target(self): # See GH25459 cat = CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c", "d"]) res, indexer = cat.reindex(["a", "c", "c"]) exp = Index(["a", "c", "c"], dtype="object") tm.assert_index_equal(res, exp, exact=True) tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2], dtype=np.intp)) res, indexer = cat.reindex( CategoricalIndex(["a", "c", "c"], categories=["a", "b", "c", "d"])) exp = CategoricalIndex(["a", "c", "c"], categories=["a", "b", "c", "d"]) tm.assert_index_equal(res, exp, exact=True) tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2], dtype=np.intp)) def test_reindex_empty_index(self): # See GH16770 c = CategoricalIndex([]) res, indexer = 
c.reindex(["a", "b"]) tm.assert_index_equal(res, Index(["a", "b"]), exact=True) tm.assert_numpy_array_equal(indexer, np.array([-1, -1], dtype=np.intp)) @pytest.mark.parametrize( "data, non_lexsorted_data", [[[1, 2, 3], [9, 0, 1, 2, 3]], [list("abc"), list("fabcd")]], ) def test_is_monotonic(self, data, non_lexsorted_data): c = CategoricalIndex(data) assert c.is_monotonic_increasing is True assert c.is_monotonic_decreasing is False c = CategoricalIndex(data, ordered=True) assert c.is_monotonic_increasing is True assert c.is_monotonic_decreasing is False c = CategoricalIndex(data, categories=reversed(data)) assert c.is_monotonic_increasing is False assert c.is_monotonic_decreasing is True c = CategoricalIndex(data, categories=reversed(data), ordered=True) assert c.is_monotonic_increasing is False assert c.is_monotonic_decreasing is True # test when data is neither monotonic increasing nor decreasing reordered_data = [data[0], data[2], data[1]] c = CategoricalIndex(reordered_data, categories=reversed(data)) assert c.is_monotonic_increasing is False assert c.is_monotonic_decreasing is False # non lexsorted categories categories = non_lexsorted_data c = CategoricalIndex(categories[:2], categories=categories) assert c.is_monotonic_increasing is True assert c.is_monotonic_decreasing is False c = CategoricalIndex(categories[1:3], categories=categories) assert c.is_monotonic_increasing is True assert c.is_monotonic_decreasing is False def test_has_duplicates(self): idx = CategoricalIndex([0, 0, 0], name="foo") assert idx.is_unique is False assert idx.has_duplicates is True def test_drop_duplicates(self): idx = CategoricalIndex([0, 0, 0], name="foo") expected = CategoricalIndex([0], name="foo") tm.assert_index_equal(idx.drop_duplicates(), expected) tm.assert_index_equal(idx.unique(), expected) def test_get_indexer(self): idx1 = CategoricalIndex(list("aabcde"), categories=list("edabc")) idx2 = CategoricalIndex(list("abf")) for indexer in [idx2, list("abf"), 
Index(list("abf"))]: r1 = idx1.get_indexer(idx2) tm.assert_almost_equal(r1, np.array([0, 1, 2, -1], dtype=np.intp)) msg = ("method='pad' and method='backfill' not implemented yet for" " CategoricalIndex") with pytest.raises(NotImplementedError, match=msg): idx2.get_indexer(idx1, method="pad") with pytest.raises(NotImplementedError, match=msg): idx2.get_indexer(idx1, method="backfill") msg = "method='nearest' not implemented yet for CategoricalIndex" with pytest.raises(NotImplementedError, match=msg): idx2.get_indexer(idx1, method="nearest") def test_get_loc(self): # GH 12531 cidx1 = CategoricalIndex(list("abcde"), categories=list("edabc")) idx1 = Index(list("abcde")) assert cidx1.get_loc("a") == idx1.get_loc("a") assert cidx1.get_loc("e") == idx1.get_loc("e") for i in [cidx1, idx1]: with pytest.raises(KeyError, match="'NOT-EXIST'"): i.get_loc("NOT-EXIST") # non-unique cidx2 = CategoricalIndex(list("aacded"), categories=list("edabc")) idx2 = Index(list("aacded")) # results in bool array res = cidx2.get_loc("d") tm.assert_numpy_array_equal(res, idx2.get_loc("d")) tm.assert_numpy_array_equal( res, np.array([False, False, False, True, False, True])) # unique element results in scalar res = cidx2.get_loc("e") assert res == idx2.get_loc("e") assert res == 4 for i in [cidx2, idx2]: with pytest.raises(KeyError, match="'NOT-EXIST'"): i.get_loc("NOT-EXIST") # non-unique, sliceable cidx3 = CategoricalIndex(list("aabbb"), categories=list("abc")) idx3 = Index(list("aabbb")) # results in slice res = cidx3.get_loc("a") assert res == idx3.get_loc("a") assert res == slice(0, 2, None) res = cidx3.get_loc("b") assert res == idx3.get_loc("b") assert res == slice(2, 5, None) for i in [cidx3, idx3]: with pytest.raises(KeyError, match="'c'"): i.get_loc("c") def test_repr_roundtrip(self): ci = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True) str(ci) tm.assert_index_equal(eval(repr(ci)), ci, exact=True) # formatting str(ci) # long format # this is not reprable ci = 
CategoricalIndex(np.random.randint(0, 5, size=100)) str(ci) def test_isin(self): ci = CategoricalIndex(list("aabca") + [np.nan], categories=["c", "a", "b"]) tm.assert_numpy_array_equal( ci.isin(["c"]), np.array([False, False, False, True, False, False])) tm.assert_numpy_array_equal(ci.isin(["c", "a", "b"]), np.array([True] * 5 + [False])) tm.assert_numpy_array_equal(ci.isin(["c", "a", "b", np.nan]), np.array([True] * 6)) # mismatched categorical -> coerced to ndarray so doesn't matter result = ci.isin(ci.set_categories(list("abcdefghi"))) expected = np.array([True] * 6) tm.assert_numpy_array_equal(result, expected) result = ci.isin(ci.set_categories(list("defghi"))) expected = np.array([False] * 5 + [True]) tm.assert_numpy_array_equal(result, expected) def test_identical(self): ci1 = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True) ci2 = CategoricalIndex(["a", "b"], categories=["a", "b", "c"], ordered=True) assert ci1.identical(ci1) assert ci1.identical(ci1.copy()) assert not ci1.identical(ci2) def test_ensure_copied_data(self, indices): # gh-12309: Check the "copy" argument of each # Index.__new__ is honored. # # Must be tested separately from other indexes because # self.values is not an ndarray. # GH#29918 Index.base has been removed # FIXME: is this test still meaningful? 
_base = lambda ar: ar if getattr(ar, "base", None) is None else ar.base result = CategoricalIndex(indices.values, copy=True) tm.assert_index_equal(indices, result) assert _base(indices.values) is not _base(result.values) result = CategoricalIndex(indices.values, copy=False) assert _base(indices.values) is _base(result.values) def test_equals_categorical(self): ci1 = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True) ci2 = CategoricalIndex(["a", "b"], categories=["a", "b", "c"], ordered=True) assert ci1.equals(ci1) assert not ci1.equals(ci2) assert ci1.equals(ci1.astype(object)) assert ci1.astype(object).equals(ci1) assert (ci1 == ci1).all() assert not (ci1 != ci1).all() assert not (ci1 > ci1).all() assert not (ci1 < ci1).all() assert (ci1 <= ci1).all() assert (ci1 >= ci1).all() assert not (ci1 == 1).all() assert (ci1 == Index(["a", "b"])).all() assert (ci1 == ci1.values).all() # invalid comparisons with pytest.raises(ValueError, match="Lengths must match"): ci1 == Index(["a", "b", "c"]) msg = ( "categorical index comparisons must have the same categories" " and ordered attributes" "|" "Categoricals can only be compared if 'categories' are the same. 
" "Categories are different lengths" "|" "Categoricals can only be compared if 'ordered' is the same") with pytest.raises(TypeError, match=msg): ci1 == ci2 with pytest.raises(TypeError, match=msg): ci1 == Categorical(ci1.values, ordered=False) with pytest.raises(TypeError, match=msg): ci1 == Categorical(ci1.values, categories=list("abc")) # tests # make sure that we are testing for category inclusion properly ci = CategoricalIndex(list("aabca"), categories=["c", "a", "b"]) assert not ci.equals(list("aabca")) # Same categories, but different order # Unordered assert ci.equals(CategoricalIndex(list("aabca"))) # Ordered assert not ci.equals(CategoricalIndex(list("aabca"), ordered=True)) assert ci.equals(ci.copy()) ci = CategoricalIndex(list("aabca") + [np.nan], categories=["c", "a", "b"]) assert not ci.equals(list("aabca")) assert not ci.equals(CategoricalIndex(list("aabca"))) assert ci.equals(ci.copy()) ci = CategoricalIndex(list("aabca") + [np.nan], categories=["c", "a", "b"]) assert not ci.equals(list("aabca") + [np.nan]) assert ci.equals(CategoricalIndex(list("aabca") + [np.nan])) assert not ci.equals( CategoricalIndex(list("aabca") + [np.nan], ordered=True)) assert ci.equals(ci.copy()) def test_equals_categoridcal_unordered(self): # https://github.com/pandas-dev/pandas/issues/16603 a = pd.CategoricalIndex(["A"], categories=["A", "B"]) b = pd.CategoricalIndex(["A"], categories=["B", "A"]) c = pd.CategoricalIndex(["C"], categories=["B", "A"]) assert a.equals(b) assert not a.equals(c) assert not b.equals(c) def test_frame_repr(self): df = pd.DataFrame({"A": [1, 2, 3]}, index=pd.CategoricalIndex(["a", "b", "c"])) result = repr(df) expected = " A\na 1\nb 2\nc 3" assert result == expected def test_string_categorical_index_repr(self): # short idx = pd.CategoricalIndex(["a", "bb", "ccc"]) expected = """CategoricalIndex(['a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" # noqa assert repr(idx) == expected # multiple lines idx = 
pd.CategoricalIndex(["a", "bb", "ccc"] * 10) expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" # noqa assert repr(idx) == expected # truncated idx = pd.CategoricalIndex(["a", "bb", "ccc"] * 100) expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', ... 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category', length=300)""" # noqa assert repr(idx) == expected # larger categories idx = pd.CategoricalIndex(list("abcdefghijklmmo")) expected = """CategoricalIndex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'm', 'o'], categories=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', ...], ordered=False, dtype='category')""" # noqa assert repr(idx) == expected # short idx = pd.CategoricalIndex(["あ", "いい", "ううう"]) expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa assert repr(idx) == expected # multiple lines idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 10) expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa assert repr(idx) == expected # truncated idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 100) expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', ... 
'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)""" # noqa assert repr(idx) == expected # larger categories idx = pd.CategoricalIndex(list("あいうえおかきくけこさしすせそ")) expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', 'す', 'せ', 'そ'], categories=['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', ...], ordered=False, dtype='category')""" # noqa assert repr(idx) == expected # Emable Unicode option ----------------------------------------- with cf.option_context("display.unicode.east_asian_width", True): # short idx = pd.CategoricalIndex(["あ", "いい", "ううう"]) expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa assert repr(idx) == expected # multiple lines idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 10) expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa assert repr(idx) == expected # truncated idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 100) expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', ... 
'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)""" # noqa assert repr(idx) == expected # larger categories idx = pd.CategoricalIndex(list("あいうえおかきくけこさしすせそ")) expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', 'す', 'せ', 'そ'], categories=['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', ...], ordered=False, dtype='category')""" # noqa assert repr(idx) == expected def test_fillna_categorical(self): # GH 11343 idx = CategoricalIndex([1.0, np.nan, 3.0, 1.0], name="x") # fill by value in categories exp = CategoricalIndex([1.0, 1.0, 3.0, 1.0], name="x") tm.assert_index_equal(idx.fillna(1.0), exp) # fill by value not in categories raises ValueError msg = "fill value must be in categories" with pytest.raises(ValueError, match=msg): idx.fillna(2.0) def test_take_fill_value(self): # GH 12631 # numeric category idx = pd.CategoricalIndex([1, 2, 3], name="xxx") result = idx.take(np.array([1, 0, -1])) expected = pd.CategoricalIndex([2, 1, 3], name="xxx") tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, expected.values) # fill_value result = idx.take(np.array([1, 0, -1]), fill_value=True) expected = pd.CategoricalIndex([2, 1, np.nan], categories=[1, 2, 3], name="xxx") tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, expected.values) # allow_fill=False result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) expected = pd.CategoricalIndex([2, 1, 3], name="xxx") tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, expected.values) # object category idx = pd.CategoricalIndex(list("CBA"), categories=list("ABC"), ordered=True, name="xxx") result = idx.take(np.array([1, 0, -1])) expected = pd.CategoricalIndex(list("BCA"), categories=list("ABC"), ordered=True, name="xxx") tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, 
expected.values) # fill_value result = idx.take(np.array([1, 0, -1]), fill_value=True) expected = pd.CategoricalIndex(["B", "C", np.nan], categories=list("ABC"), ordered=True, name="xxx") tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, expected.values) # allow_fill=False result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) expected = pd.CategoricalIndex(list("BCA"), categories=list("ABC"), ordered=True, name="xxx") tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, expected.values) msg = ("When allow_fill=True and fill_value is not None, " "all indices must be >= -1") with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -5]), fill_value=True) with pytest.raises(IndexError): idx.take(np.array([1, -5])) def test_take_fill_value_datetime(self): # datetime category idx = pd.DatetimeIndex(["2011-01-01", "2011-02-01", "2011-03-01"], name="xxx") idx = pd.CategoricalIndex(idx) result = idx.take(np.array([1, 0, -1])) expected = pd.DatetimeIndex(["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx") expected = pd.CategoricalIndex(expected) tm.assert_index_equal(result, expected) # fill_value result = idx.take(np.array([1, 0, -1]), fill_value=True) expected = pd.DatetimeIndex(["2011-02-01", "2011-01-01", "NaT"], name="xxx") exp_cats = pd.DatetimeIndex(["2011-01-01", "2011-02-01", "2011-03-01"]) expected = pd.CategoricalIndex(expected, categories=exp_cats) tm.assert_index_equal(result, expected) # allow_fill=False result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) expected = pd.DatetimeIndex(["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx") expected = pd.CategoricalIndex(expected) tm.assert_index_equal(result, expected) msg = ("When allow_fill=True and fill_value is not None, " "all indices must be >= -1") with pytest.raises(ValueError, match=msg): 
idx.take(np.array([1, 0, -2]), fill_value=True) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -5]), fill_value=True) with pytest.raises(IndexError): idx.take(np.array([1, -5])) def test_take_invalid_kwargs(self): idx = pd.CategoricalIndex([1, 2, 3], name="foo") indices = [1, 0, -1] msg = r"take\(\) got an unexpected keyword argument 'foo'" with pytest.raises(TypeError, match=msg): idx.take(indices, foo=2) msg = "the 'out' parameter is not supported" with pytest.raises(ValueError, match=msg): idx.take(indices, out=indices) msg = "the 'mode' parameter is not supported" with pytest.raises(ValueError, match=msg): idx.take(indices, mode="clip") @pytest.mark.parametrize( "dtype, engine_type", [ (np.int8, libindex.Int8Engine), (np.int16, libindex.Int16Engine), (np.int32, libindex.Int32Engine), (np.int64, libindex.Int64Engine), ], ) def test_engine_type(self, dtype, engine_type): if dtype != np.int64: # num. of uniques required to push CategoricalIndex.codes to a # dtype (128 categories required for .codes dtype to be int16 etc.) num_uniques = {np.int8: 1, np.int16: 128, np.int32: 32768}[dtype] ci = pd.CategoricalIndex(range(num_uniques)) else: # having 2**32 - 2**31 categories would be very memory-intensive, # so we cheat a bit with the dtype ci = pd.CategoricalIndex(range(32768)) # == 2**16 - 2**(16 - 1) ci.values._codes = ci.values._codes.astype("int64") assert np.issubdtype(ci.codes.dtype, dtype) assert isinstance(ci._engine, engine_type)
class TestSeriesConstructors:
    """Tests for the different ways of constructing a Series."""

    @pytest.mark.parametrize(
        "constructor,check_index_type",
        [
            # NOTE: some overlap with test_constructor_empty but that test does not
            # test for None or an empty generator.
            # test_constructor_pass_none tests None but only with the index also
            # passed.
            (lambda: Series(), True),
            (lambda: Series(None), True),
            (lambda: Series({}), True),
            (lambda: Series(()), False),  # creates a RangeIndex
            (lambda: Series([]), False),  # creates a RangeIndex
            (lambda: Series((x for x in [])), False),  # creates a RangeIndex
            (lambda: Series(data=None), True),
            (lambda: Series(data={}), True),
            (lambda: Series(data=()), False),  # creates a RangeIndex
            (lambda: Series(data=[]), False),  # creates a RangeIndex
            (lambda: Series(data=(x for x in [])), False),  # creates a RangeIndex
        ],
    )
    def test_empty_constructor(self, constructor, check_index_type):
        """Every empty-input spelling must equal ``Series()``."""
        expected = Series()
        result = constructor()
        assert len(result.index) == 0
        tm.assert_series_equal(result, expected, check_index_type=check_index_type)

    def test_invalid_dtype(self):
        """Unrecognized dtype arguments raise TypeError."""
        # GH15520
        msg = "not understood"
        invalid_list = [pd.Timestamp, "pd.Timestamp", list]
        for dtype in invalid_list:
            with pytest.raises(TypeError, match=msg):
                Series([], name="time", dtype=dtype)

    def test_scalar_conversion(self):
        """A scalar input still yields a Series; length-1 Series coerce."""
        # Pass in scalar is disabled
        scalar = Series(0.5)
        assert not isinstance(scalar, float)

        # Coercion
        assert float(Series([1.0])) == 1.0
        assert int(Series([1.0])) == 1

    def test_constructor(self, datetime_series):
        """Basic construction from a Series, mixed data, 2-D data, MultiIndex."""
        empty_series = Series()
        assert datetime_series.index.is_all_dates

        # Pass in Series
        derived = Series(datetime_series)
        assert derived.index.is_all_dates

        assert tm.equalContents(derived.index, datetime_series.index)
        # Ensure new index is not created
        assert id(datetime_series.index) == id(derived.index)

        # Mixed type Series
        mixed = Series(["hello", np.NaN], index=[0, 1])
        assert mixed.dtype == np.object_
        assert mixed[1] is np.NaN

        assert not empty_series.index.is_all_dates
        assert not Series().index.is_all_dates

        # exception raised is of type Exception
        with pytest.raises(Exception, match="Data must be 1-dimensional"):
            Series(np.random.randn(3, 3), index=np.arange(3))

        mixed.name = "Series"
        rs = Series(mixed).name
        xp = "Series"
        assert rs == xp

        # raise on MultiIndex GH4187
        m = MultiIndex.from_arrays([[1, 2], [3, 4]])
        msg = "initializing a Series from a MultiIndex is not supported"
        with pytest.raises(NotImplementedError, match=msg):
            Series(m)

    @pytest.mark.parametrize("input_class", [list, dict, OrderedDict])
    def test_constructor_empty(self, input_class):
        """An empty container must build the same Series as passing no data."""
        empty = Series()
        empty2 = Series(input_class())

        # these are Index() and RangeIndex() which don't compare type equal
        # but are just .equals
        assert_series_equal(empty, empty2, check_index_type=False)

        # With explicit dtype:
        empty = Series(dtype="float64")
        empty2 = Series(input_class(), dtype="float64")
        assert_series_equal(empty, empty2, check_index_type=False)

        # GH 18515 : with dtype=category:
        empty = Series(dtype="category")
        empty2 = Series(input_class(), dtype="category")
        assert_series_equal(empty, empty2, check_index_type=False)

        if input_class is not list:
            # With index:
            empty = Series(index=range(10))
            empty2 = Series(input_class(), index=range(10))
            assert_series_equal(empty, empty2)

            # With index and dtype float64:
            empty = Series(np.nan, index=range(10))
            empty2 = Series(input_class(), index=range(10), dtype="float64")
            assert_series_equal(empty, empty2)

            # GH 19853 : with empty string, index and dtype str
            empty = Series("", dtype=str, index=range(3))
            empty2 = Series("", index=range(3))
            assert_series_equal(empty, empty2)

    @pytest.mark.parametrize("input_arg", [np.nan, float("nan")])
    def test_constructor_nan(self, input_arg):
        """A NaN scalar with an index equals an empty float64 Series on it."""
        empty = Series(dtype="float64", index=range(10))
        empty2 = Series(input_arg, index=range(10))

        assert_series_equal(empty, empty2, check_index_type=False)

    @pytest.mark.parametrize(
        "dtype",
        ["f8", "i8", "M8[ns]", "m8[ns]", "category", "object", "datetime64[ns, UTC]"],
    )
    @pytest.mark.parametrize("index", [None, pd.Index([])])
    def test_constructor_dtype_only(self, dtype, index):
        """Passing only a dtype gives an empty Series of that dtype."""
        # GH-20865
        result = pd.Series(dtype=dtype, index=index)
        assert result.dtype == dtype
        assert len(result) == 0

    def test_constructor_no_data_index_order(self):
        """Index order is preserved when no data is given."""
        result = pd.Series(index=["b", "a", "c"])
        assert result.index.tolist() == ["b", "a", "c"]

    def test_constructor_no_data_string_type(self):
        """No data with dtype=str yields NaN values."""
        # GH 22477
        result = pd.Series(index=[1], dtype=str)
        assert np.isnan(result.iloc[0])

    @pytest.mark.parametrize("item", ["entry", "ѐ", 13])
    def test_constructor_string_element_string_type(self, item):
        """A scalar with dtype=str is stored as its str() form."""
        # GH 22477
        result = pd.Series(item, index=[1], dtype=str)
        assert result.iloc[0] == str(item)

    def test_constructor_dtype_str_na_values(self, string_dtype):
        """None and NaN stay missing when a string dtype is requested."""
        # https://github.com/pandas-dev/pandas/issues/21083
        ser = Series(["x", None], dtype=string_dtype)
        result = ser.isna()
        expected = Series([False, True])
        tm.assert_series_equal(result, expected)
        assert ser.iloc[1] is None

        ser = Series(["x", np.nan], dtype=string_dtype)
        assert np.isnan(ser.iloc[1])

    def test_constructor_series(self):
        """Constructing from a Series with a new index realigns the data."""
        index1 = ["d", "b", "a", "c"]
        index2 = sorted(index1)
        s1 = Series([4, 7, -5, 3], index=index1)
        s2 = Series(s1, index=index2)

        assert_series_equal(s2, s1.sort_index())

    def test_constructor_iterable(self):
        """An object that only defines ``__iter__`` is accepted as data."""
        # GH 21987
        class Iter:
            def __iter__(self):
                yield from range(10)

        expected = Series(list(range(10)), dtype="int64")
        result = Series(Iter(), dtype="int64")
        assert_series_equal(result, expected)

    def test_constructor_sequence(self):
        """A range object is accepted as data."""
        # GH 21987
        expected = Series(list(range(10)), dtype="int64")
        result = Series(range(10), dtype="int64")
        assert_series_equal(result, expected)

    def test_constructor_single_str(self):
        """A single string is treated as one element."""
        # GH 21987
        expected = Series(["abc"])
        result = Series("abc")
        assert_series_equal(result, expected)

    def test_constructor_list_like(self):
        """List, tuple and ndarray inputs all produce an int64 Series."""
        # make sure that we are coercing different
        # list-likes to standard dtypes and not
        # platform specific
        expected = Series([1, 2, 3], dtype="int64")
        for obj in [[1, 2, 3], (1, 2, 3), np.array([1, 2, 3], dtype="int64")]:
            result = Series(obj, index=[0, 1, 2])
            assert_series_equal(result, expected)

    @pytest.mark.parametrize("dtype", ["bool", "int32", "int64", "float64"])
    def test_constructor_index_dtype(self, dtype):
        """An explicit dtype is honored when the data is an Index."""
        # GH 17088
        s = Series(Index([0, 2, 4]), dtype=dtype)
        assert s.dtype == dtype

    @pytest.mark.parametrize(
        "input_vals",
        [
            ([1, 2]),
            (["1", "2"]),
            (list(pd.date_range("1/1/2011", periods=2, freq="H"))),
            (list(pd.date_range("1/1/2011", periods=2, freq="H", tz="US/Eastern"))),
            ([pd.Interval(left=0, right=5)]),
        ],
    )
    def test_constructor_list_str(self, input_vals, string_dtype):
        """Constructing with a string dtype matches astype on the same data."""
        # GH 16605
        # Ensure that data elements from a list are converted to strings
        # when dtype is str, 'str', or 'U'
        result = Series(input_vals, dtype=string_dtype)
        expected = Series(input_vals).astype(string_dtype)
        assert_series_equal(result, expected)

    def test_constructor_list_str_na(self, string_dtype):
        """NaN is preserved when floats are converted to a string dtype."""
        result = Series([1.0, 2.0, np.nan], dtype=string_dtype)
        expected = Series(["1.0", "2.0", np.nan], dtype=object)
        assert_series_equal(result, expected)
        assert np.isnan(result[2])

    def test_constructor_generator(self):
        """A generator is accepted as data, with or without an index."""
        gen = (i for i in range(10))

        result = Series(gen)
        exp = Series(range(10))
        assert_series_equal(result, exp)

        gen = (i for i in range(10))
        result = Series(gen, index=range(10, 20))
        exp.index = range(10, 20)
        assert_series_equal(result, exp)

    def test_constructor_map(self):
        """A map object is accepted as data, with or without an index."""
        # GH8909
        m = map(lambda x: x, range(10))

        result = Series(m)
        exp = Series(range(10))
        assert_series_equal(result, exp)

        m = map(lambda x: x, range(10))
        result = Series(m, index=range(10, 20))
        exp.index = range(10, 20)
        assert_series_equal(result, exp)

    def test_constructor_categorical(self):
        """Construct from a Categorical, with and without a dtype."""
        cat = pd.Categorical([0, 1, 2, 0, 1, 2], ["a", "b", "c"], fastpath=True)
        res = Series(cat)
        tm.assert_categorical_equal(res.values, cat)

        # can cast to a new dtype
        result = Series(pd.Categorical([1, 2, 3]), dtype="int64")
        expected = pd.Series([1, 2, 3], dtype="int64")
        tm.assert_series_equal(result, expected)

        # GH12574
        cat = Series(pd.Categorical([1, 2, 3]), dtype="category")
        assert is_categorical_dtype(cat)
        assert is_categorical_dtype(cat.dtype)
        s = Series([1, 2, 3], dtype="category")
        assert is_categorical_dtype(s)
        assert is_categorical_dtype(s.dtype)

    def test_constructor_categorical_with_coercion(self):
        """Categoricals survive Series/DataFrame construction and lookup."""
        factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])
        # test basic creation / coercion of categoricals
        s = Series(factor, name="A")
        assert s.dtype == "category"
        assert len(s) == len(factor)
        str(s.values)
        str(s)

        # in a frame
        df = DataFrame({"A": factor})
        result = df["A"]
        tm.assert_series_equal(result, s)
        result = df.iloc[:, 0]
        tm.assert_series_equal(result, s)
        assert len(df) == len(factor)
        str(df.values)
        str(df)

        df = DataFrame({"A": s})
        result = df["A"]
        tm.assert_series_equal(result, s)
        assert len(df) == len(factor)
        str(df.values)
        str(df)

        # multiples
        df = DataFrame({"A": s, "B": s, "C": 1})
        result1 = df["A"]
        result2 = df["B"]
        tm.assert_series_equal(result1, s)
        tm.assert_series_equal(result2, s, check_names=False)
        assert result2.name == "B"
        assert len(df) == len(factor)
        str(df.values)
        str(df)

        # GH8623
        x = DataFrame(
            [[1, "John P. Doe"], [2, "Jane Dove"], [1, "John P. Doe"]],
            columns=["person_id", "person_name"],
        )
        x["person_name"] = Categorical(x.person_name)  # doing this breaks transform

        expected = x.iloc[0].person_name
        result = x.person_name.iloc[0]
        assert result == expected

        result = x.person_name[0]
        assert result == expected

        result = x.person_name.loc[0]
        assert result == expected

    def test_constructor_categorical_dtype(self):
        """A CategoricalDtype passed as dtype fixes categories and ordering."""
        result = pd.Series(
            ["a", "b"], dtype=CategoricalDtype(["a", "b", "c"], ordered=True)
        )
        assert is_categorical_dtype(result) is True
        tm.assert_index_equal(result.cat.categories, pd.Index(["a", "b", "c"]))
        assert result.cat.ordered

        result = pd.Series(["a", "b"], dtype=CategoricalDtype(["b", "a"]))
        assert is_categorical_dtype(result)
        tm.assert_index_equal(result.cat.categories, pd.Index(["b", "a"]))
        assert result.cat.ordered is False

        # GH 19565 - Check broadcasting of scalar with Categorical dtype
        result = Series(
            "a", index=[0, 1], dtype=CategoricalDtype(["a", "b"], ordered=True)
        )
        expected = Series(
            ["a", "a"], index=[0, 1], dtype=CategoricalDtype(["a", "b"], ordered=True)
        )
        tm.assert_series_equal(result, expected, check_categorical=True)

    def test_constructor_categorical_string(self):
        """dtype='category' keeps the dtype of already-categorical data."""
        # GH 26336: the string 'category' maintains existing CategoricalDtype
        cdt = CategoricalDtype(categories=list("dabc"), ordered=True)
        expected = Series(list("abcabc"), dtype=cdt)

        # Series(Categorical, dtype='category') keeps existing dtype
        cat = Categorical(list("abcabc"), dtype=cdt)
        result = Series(cat, dtype="category")
        tm.assert_series_equal(result, expected)

        # Series(Series[Categorical], dtype='category') keeps existing dtype
        result = Series(result, dtype="category")
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "none, warning", [(None, None), (ordered_sentinel, FutureWarning)]
    )
    def test_categorical_ordered_none_deprecated(self, none, warning):
        """Changing dtype warns only when ``ordered`` was left as the sentinel."""
        # GH 26336: only warn if None is not explicitly passed
        cdt1 = CategoricalDtype(categories=list("cdab"), ordered=True)
        cdt2 = CategoricalDtype(categories=list("cedafb"), ordered=none)

        cat = Categorical(list("abcdaba"), dtype=cdt1)
        with tm.assert_produces_warning(warning, check_stacklevel=False):
            Series(cat, dtype=cdt2)

        s = Series(cat)
        with tm.assert_produces_warning(warning, check_stacklevel=False):
            Series(s, dtype=cdt2)

    def test_categorical_sideeffects_free(self):
        """Mutations are isolated with copy=True and shared without it."""
        # Passing a categorical to a Series and then changing values in either
        # the series or the categorical should not change the values in the
        # other one, IF you specify copy!
        cat = Categorical(["a", "b", "c", "a"])
        s = Series(cat, copy=True)
        # Compare the stored values rather than the ``.cat`` accessor: the
        # accessor is never the Categorical itself, so that check could not
        # fail. This mirrors the ``s.values is cat`` check further down.
        assert s.values is not cat
        s.cat.categories = [1, 2, 3]
        exp_s = np.array([1, 2, 3, 1], dtype=np.int64)
        exp_cat = np.array(["a", "b", "c", "a"], dtype=np.object_)
        tm.assert_numpy_array_equal(s.__array__(), exp_s)
        tm.assert_numpy_array_equal(cat.__array__(), exp_cat)

        # setting
        s[0] = 2
        exp_s2 = np.array([2, 2, 3, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(s.__array__(), exp_s2)
        tm.assert_numpy_array_equal(cat.__array__(), exp_cat)

        # however, copy is False by default
        # so this WILL change values
        cat = Categorical(["a", "b", "c", "a"])
        s = Series(cat)
        assert s.values is cat
        s.cat.categories = [1, 2, 3]
        exp_s = np.array([1, 2, 3, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(s.__array__(), exp_s)
        tm.assert_numpy_array_equal(cat.__array__(), exp_s)

        s[0] = 2
        exp_s2 = np.array([2, 2, 3, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(s.__array__(), exp_s2)
        tm.assert_numpy_array_equal(cat.__array__(), exp_s2)

    def test_unordered_compare_equal(self):
        """A value outside the dtype's categories compares equal to NaN."""
        left = pd.Series(["a", "b", "c"], dtype=CategoricalDtype(["a", "b"]))
        right = pd.Series(pd.Categorical(["a", "b", np.nan], categories=["a", "b"]))
        tm.assert_series_equal(left, right)

    def test_constructor_maskedarray(self):
        """Masked arrays of float, int, bool and datetime dtype as data."""
        data = ma.masked_all((3,), dtype=float)
        result = Series(data)
        expected = Series([nan, nan, nan])
        assert_series_equal(result, expected)

        data[0] = 0.0
        data[2] = 2.0
        index = ["a", "b", "c"]
        result = Series(data, index=index)
        expected = Series([0.0, nan, 2.0], index=index)
        assert_series_equal(result, expected)

        data[1] = 1.0
        result = Series(data, index=index)
        expected = Series([0.0, 1.0, 2.0], index=index)
        assert_series_equal(result, expected)

        data = ma.masked_all((3,), dtype=int)
        result = Series(data)
        expected = Series([nan, nan, nan], dtype=float)
        assert_series_equal(result, expected)

        data[0] = 0
        data[2] = 2
        index = ["a", "b", "c"]
        result = Series(data, index=index)
        expected = Series([0, nan, 2], index=index, dtype=float)
        assert_series_equal(result, expected)

        data[1] = 1
        result = Series(data, index=index)
        expected = Series([0, 1, 2], index=index, dtype=int)
        assert_series_equal(result, expected)

        data = ma.masked_all((3,), dtype=bool)
        result = Series(data)
        expected = Series([nan, nan, nan], dtype=object)
        assert_series_equal(result, expected)

        data[0] = True
        data[2] = False
        index = ["a", "b", "c"]
        result = Series(data, index=index)
        expected = Series([True, nan, False], index=index, dtype=object)
        assert_series_equal(result, expected)

        data[1] = True
        result = Series(data, index=index)
        expected = Series([True, True, False], index=index, dtype=bool)
        assert_series_equal(result, expected)

        data = ma.masked_all((3,), dtype="M8[ns]")
        result = Series(data)
        expected = Series([iNaT, iNaT, iNaT], dtype="M8[ns]")
        assert_series_equal(result, expected)

        data[0] = datetime(2001, 1, 1)
        data[2] = datetime(2001, 1, 3)
        index = ["a", "b", "c"]
        result = Series(data, index=index)
        expected = Series(
            [datetime(2001, 1, 1), iNaT, datetime(2001, 1, 3)],
            index=index,
            dtype="M8[ns]",
        )
        assert_series_equal(result, expected)

        data[1] = datetime(2001, 1, 2)
        result = Series(data, index=index)
        expected = Series(
            [datetime(2001, 1, 1), datetime(2001, 1, 2), datetime(2001, 1, 3)],
            index=index,
            dtype="M8[ns]",
        )
        assert_series_equal(result, expected)

    def test_constructor_maskedarray_hardened(self):
        """A fully masked array with a hard mask becomes all-NaN."""
        # Check numpy masked arrays with hard masks -- from GH24574
        data = ma.masked_all((3,), dtype=float).harden_mask()
        result = pd.Series(data)
        expected = pd.Series([nan, nan, nan])
        tm.assert_series_equal(result, expected)

    def test_series_ctor_plus_datetimeindex(self):
        """A passed DatetimeIndex is reused as the Series index."""
        rng = date_range("20090415", "20090519", freq="B")
        data = {k: 1 for k in rng}

        result = Series(data, index=rng)
        assert result.index is rng

    def test_constructor_default_index(self):
        """Without an index the Series gets positions 0..n-1."""
        s = Series([0, 1, 2])
        tm.assert_index_equal(s.index, pd.Index(np.arange(3)))

    @pytest.mark.parametrize(
        "input",
        [
            [1, 2, 3],
            (1, 2, 3),
            list(range(3)),
            pd.Categorical(["a", "b", "a"]),
            (i for i in range(3)),
            map(lambda x: x, range(3)),
        ],
    )
    def test_constructor_index_mismatch(self, input):
        """Data and index of different lengths raise ValueError."""
        # GH 19342
        # test that construction of a Series with an index of different length
        # raises an error
        msg = "Length of passed values is 3, index implies 4"
        with pytest.raises(ValueError, match=msg):
            Series(input, index=np.arange(4))

    def test_constructor_numpy_scalar(self):
        """A zero-dimensional array is broadcast over the index."""
        # GH 19342
        # construction with a numpy scalar
        # should not raise
        result = Series(np.array(100), index=np.arange(4), dtype="int64")
        expected = Series(100, index=np.arange(4), dtype="int64")
        tm.assert_series_equal(result, expected)

    def test_constructor_broadcast_list(self):
        """A single-element list is not broadcast over a longer index."""
        # GH 19342
        # construction with single-element container and index
        # should raise
        msg = "Length of passed values is 1, index implies 3"
        with pytest.raises(ValueError, match=msg):
            Series(["foo"], index=["a", "b", "c"])

    def test_constructor_corner(self):
        """A list of DataFrames can be used as Series data."""
        df = tm.makeTimeDataFrame()
        objs = [df, df]
        s = Series(objs, index=[0, 1])
        assert isinstance(s, Series)

    def test_constructor_sanitize(self):
        """Float data with dtype='i8' becomes int only when it holds no NaN."""
        s = Series(np.array([1.0, 1.0, 8.0]), dtype="i8")
        assert s.dtype == np.dtype("i8")

        s = Series(np.array([1.0, 1.0, np.nan]), copy=True, dtype="i8")
        assert s.dtype == np.dtype("f8")

    def test_constructor_copy(self):
        # GH15125
        # test dtype parameter has no side effects on copy=True
        for data in [[1.0], np.array([1.0])]:
            x = Series(data)
            y = pd.Series(x, copy=True, dtype=float)

            # copy=True maintains original data in Series
            tm.assert_series_equal(x, y)

            # changes to
origin of copy does not affect the copy x[0] = 2.0 assert not x.equals(y) assert x[0] == 2.0 assert y[0] == 1.0 @pytest.mark.parametrize( "index", [ pd.date_range("20170101", periods=3, tz="US/Eastern"), pd.date_range("20170101", periods=3), pd.timedelta_range("1 day", periods=3), pd.period_range("2012Q1", periods=3, freq="Q"), pd.Index(list("abc")), pd.Int64Index([1, 2, 3]), pd.RangeIndex(0, 3), ], ids=lambda x: type(x).__name__, ) def test_constructor_limit_copies(self, index): # GH 17449 # limit copies of input s = pd.Series(index) # we make 1 copy; this is just a smoke test here assert s._data.blocks[0].values is not index def test_constructor_pass_none(self): s = Series(None, index=range(5)) assert s.dtype == np.float64 s = Series(None, index=range(5), dtype=object) assert s.dtype == np.object_ # GH 7431 # inference on the index s = Series(index=np.array([None])) expected = Series(index=Index([None])) assert_series_equal(s, expected) def test_constructor_pass_nan_nat(self): # GH 13467 exp = Series([np.nan, np.nan], dtype=np.float64) assert exp.dtype == np.float64 tm.assert_series_equal(Series([np.nan, np.nan]), exp) tm.assert_series_equal(Series(np.array([np.nan, np.nan])), exp) exp = Series([pd.NaT, pd.NaT]) assert exp.dtype == "datetime64[ns]" tm.assert_series_equal(Series([pd.NaT, pd.NaT]), exp) tm.assert_series_equal(Series(np.array([pd.NaT, pd.NaT])), exp) tm.assert_series_equal(Series([pd.NaT, np.nan]), exp) tm.assert_series_equal(Series(np.array([pd.NaT, np.nan])), exp) tm.assert_series_equal(Series([np.nan, pd.NaT]), exp) tm.assert_series_equal(Series(np.array([np.nan, pd.NaT])), exp) def test_constructor_cast(self): msg = "could not convert string to float" with pytest.raises(ValueError, match=msg): Series(["a", "b", "c"], dtype=float) def test_constructor_unsigned_dtype_overflow(self, uint_dtype): # see gh-15832 msg = "Trying to coerce negative values to unsigned integers" with pytest.raises(OverflowError, match=msg): Series([-1], dtype=uint_dtype) 
def test_constructor_coerce_float_fail(self, any_int_dtype):
    """Non-integral floats cannot be coerced to an integer dtype."""
    # see gh-15832
    msg = "Trying to coerce float values to integers"

    with pytest.raises(ValueError, match=msg):
        Series([1, 2, 3.5], dtype=any_int_dtype)


def test_constructor_coerce_float_valid(self, float_dtype):
    """Passing a float dtype matches constructing and then astype-ing."""
    s = Series([1, 2, 3.5], dtype=float_dtype)
    expected = Series([1, 2, 3.5]).astype(float_dtype)
    assert_series_equal(s, expected)


def test_constructor_dtype_no_cast(self):
    """Re-wrapping with the same dtype shares data with the source Series."""
    # see gh-1572
    s = Series([1, 2, 3])
    s2 = Series(s, dtype=np.int64)

    # writing through s2 must be visible in s
    s2[1] = 5
    assert s[1] == 5


def test_constructor_datelike_coercion(self):
    """An explicit object dtype is kept even when values look datetime-like."""
    # GH 9477
    # incorrectly inferring on datetimelike looking when object dtype is
    # specified
    s = Series([Timestamp("20130101"), "NOV"], dtype=object)
    assert s.iloc[0] == Timestamp("20130101")
    assert s.iloc[1] == "NOV"
    assert s.dtype == object

    # the dtype was being reset on the slicing and re-inferred to datetime
    # even though the blocks are mixed
    belly = "216 3T19".split()
    wing1 = "2T15 4H19".split()
    wing2 = "416 4T20".split()
    mat = pd.to_datetime("2016-01-22 2019-09-07".split())
    df = pd.DataFrame({
        "wing1": wing1,
        "wing2": wing2,
        "mat": mat
    }, index=belly)

    result = df.loc["3T19"]
    assert result.dtype == object
    result = df.loc["216"]
    assert result.dtype == object


def test_constructor_datetimes_with_nulls(self):
    """Object arrays of datetimes and None are inferred as M8[ns]."""
    # gh-15869
    for arr in [
            np.array([None, None, None, None, datetime.now(), None]),
            np.array([None, None, datetime.now(), None]),
    ]:
        result = Series(arr)
        assert result.dtype == "M8[ns]"


def test_constructor_dtype_datetime64(self):
    """Exercise datetime64 construction: explicit dtype, inference, tz, NaT."""
    s = Series(iNaT, dtype="M8[ns]", index=range(5))
    assert isna(s).all()

    # in theory this should be all nulls, but since
    # we are not specifying a dtype is ambiguous
    s = Series(iNaT, index=range(5))
    assert not isna(s).all()

    s = Series(nan, dtype="M8[ns]", index=range(5))
    assert isna(s).all()

    s = Series([datetime(2001, 1, 2, 0, 0), iNaT], dtype="M8[ns]")
    assert isna(s[1])
    assert s.dtype == "M8[ns]"

    s = Series([datetime(2001, 1, 2, 0, 0), nan], dtype="M8[ns]")
    assert isna(s[1])
    assert s.dtype == "M8[ns]"

    # GH3416
    dates = [
        np.datetime64(datetime(2013, 1, 1)),
        np.datetime64(datetime(2013, 1, 2)),
        np.datetime64(datetime(2013, 1, 3)),
    ]

    s = Series(dates)
    assert s.dtype == "M8[ns]"

    s.iloc[0] = np.nan
    assert s.dtype == "M8[ns]"

    # GH3414 related
    expected = Series(
        [datetime(2013, 1, 1), datetime(2013, 1, 2), datetime(2013, 1, 3)],
        dtype="datetime64[ns]",
    )

    result = Series(Series(dates).astype(np.int64) / 1000000, dtype="M8[ms]")
    tm.assert_series_equal(result, expected)

    result = Series(dates, dtype="datetime64[ns]")
    tm.assert_series_equal(result, expected)

    expected = Series(
        [pd.NaT, datetime(2013, 1, 2), datetime(2013, 1, 3)],
        dtype="datetime64[ns]")
    result = Series([np.nan] + dates[1:], dtype="datetime64[ns]")
    tm.assert_series_equal(result, expected)

    dts = Series(dates, dtype="datetime64[ns]")

    # valid astype
    dts.astype("int64")

    # invalid casting
    msg = r"cannot astype a datetimelike from \[datetime64\[ns\]\] to" r" \[int32\]"
    with pytest.raises(TypeError, match=msg):
        dts.astype("int32")

    # ints are ok
    # we test with np.int64 to get similar results on
    # windows / 32-bit platforms
    result = Series(dts, dtype=np.int64)
    expected = Series(dts.astype(np.int64))
    tm.assert_series_equal(result, expected)

    # invalid dates can be held as object
    result = Series([datetime(2, 1, 1)])
    assert result[0] == datetime(2, 1, 1, 0, 0)

    result = Series([datetime(3000, 1, 1)])
    assert result[0] == datetime(3000, 1, 1, 0, 0)

    # don't mix types
    result = Series([Timestamp("20130101"), 1], index=["a", "b"])
    assert result["a"] == Timestamp("20130101")
    assert result["b"] == 1

    # GH6529
    # coerce datetime64 non-ns properly
    dates = date_range("01-Jan-2015", "01-Dec-2015", freq="M")
    values2 = dates.view(np.ndarray).astype("datetime64[ns]")
    expected = Series(values2, index=dates)

    for dtype in ["s", "D", "ms", "us", "ns"]:
        values1 = dates.view(np.ndarray).astype("M8[{0}]".format(dtype))
        result = Series(values1, dates)
        assert_series_equal(result, expected)

    # GH 13876
    # coerce to non-ns to object properly
    expected = Series(values2, index=dates, dtype=object)
    for dtype in ["s", "D", "ms", "us", "ns"]:
        values1 = dates.view(np.ndarray).astype("M8[{0}]".format(dtype))
        result = Series(values1, index=dates, dtype=object)
        assert_series_equal(result, expected)

    # leave datetime.date alone
    dates2 = np.array([d.date() for d in dates.to_pydatetime()], dtype=object)
    series1 = Series(dates2, dates)
    tm.assert_numpy_array_equal(series1.values, dates2)
    assert series1.dtype == object

    # these will correctly infer a datetime
    s = Series([None, pd.NaT, "2013-08-05 15:30:00.000001"])
    assert s.dtype == "datetime64[ns]"
    s = Series([np.nan, pd.NaT, "2013-08-05 15:30:00.000001"])
    assert s.dtype == "datetime64[ns]"
    s = Series([pd.NaT, None, "2013-08-05 15:30:00.000001"])
    assert s.dtype == "datetime64[ns]"
    s = Series([pd.NaT, np.nan, "2013-08-05 15:30:00.000001"])
    assert s.dtype == "datetime64[ns]"

    # tz-aware (UTC and other tz's)
    # GH 8411
    dr = date_range("20130101", periods=3)
    assert Series(dr).iloc[0].tz is None
    dr = date_range("20130101", periods=3, tz="UTC")
    assert str(Series(dr).iloc[0].tz) == "UTC"
    dr = date_range("20130101", periods=3, tz="US/Eastern")
    assert str(Series(dr).iloc[0].tz) == "US/Eastern"

    # non-convertible
    s = Series([1479596223000, -1479590, pd.NaT])
    assert s.dtype == "object"
    assert s[2] is pd.NaT
    assert "NaT" in str(s)

    # if we passed a NaT it remains
    s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), pd.NaT])
    assert s.dtype == "object"
    assert s[2] is pd.NaT
    assert "NaT" in str(s)

    # if we passed a nan it remains
    s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), np.nan])
    assert s.dtype == "object"
    assert s[2] is np.nan
    assert "NaN" in str(s)


def test_constructor_with_datetime_tz(self):
    """Exercise tz-aware datetime64 Series: dtype, export, indexing, repr."""
    # 8260
    # support datetime64 with tz
    dr = date_range("20130101", periods=3, tz="US/Eastern")
    s = Series(dr)
    assert s.dtype.name == "datetime64[ns, US/Eastern]"
    assert s.dtype == "datetime64[ns, US/Eastern]"
    assert is_datetime64tz_dtype(s.dtype)
    assert "datetime64[ns, US/Eastern]" in str(s)

    # export
    result = s.values
    assert isinstance(result, np.ndarray)
    assert result.dtype == "datetime64[ns]"

    exp = pd.DatetimeIndex(result)
    exp = exp.tz_localize("UTC").tz_convert(tz=s.dt.tz)
    tm.assert_index_equal(dr, exp)

    # indexing
    result = s.iloc[0]
    assert result == Timestamp("2013-01-01 00:00:00-0500",
                               tz="US/Eastern",
                               freq="D")
    result = s[0]
    assert result == Timestamp("2013-01-01 00:00:00-0500",
                               tz="US/Eastern",
                               freq="D")

    result = s[Series([True, True, False], index=s.index)]
    assert_series_equal(result, s[0:2])

    result = s.iloc[0:1]
    assert_series_equal(result, Series(dr[0:1]))

    # concat
    result = pd.concat([s.iloc[0:1], s.iloc[1:]])
    assert_series_equal(result, s)

    # short str
    assert "datetime64[ns, US/Eastern]" in str(s)

    # formatting with NaT
    result = s.shift()
    assert "datetime64[ns, US/Eastern]" in str(result)
    assert "NaT" in str(result)

    # long str
    t = Series(date_range("20130101", periods=1000, tz="US/Eastern"))
    assert "datetime64[ns, US/Eastern]" in str(t)

    result = pd.DatetimeIndex(s, freq="infer")
    tm.assert_index_equal(result, dr)

    # inference
    s = Series([
        pd.Timestamp("2013-01-01 13:00:00-0800", tz="US/Pacific"),
        pd.Timestamp("2013-01-02 14:00:00-0800", tz="US/Pacific"),
    ])
    assert s.dtype == "datetime64[ns, US/Pacific]"
    assert lib.infer_dtype(s, skipna=True) == "datetime64"

    s = Series([
        pd.Timestamp("2013-01-01 13:00:00-0800", tz="US/Pacific"),
        pd.Timestamp("2013-01-02 14:00:00-0800", tz="US/Eastern"),
    ])
    assert s.dtype == "object"
    assert lib.infer_dtype(s, skipna=True) == "datetime"

    # with all NaT
    s = Series(pd.NaT, index=[0, 1], dtype="datetime64[ns, US/Eastern]")
    expected = Series(pd.DatetimeIndex(["NaT", "NaT"], tz="US/Eastern"))
    assert_series_equal(s, expected)


@pytest.mark.parametrize("arr_dtype", [np.int64, np.float64])
@pytest.mark.parametrize("dtype", ["M8", "m8"])
@pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
def test_construction_to_datetimelike_unit(self, arr_dtype, dtype, unit):
    """Series.astype to M8/m8 with a unit matches astype-ing the array first."""
    # tests all units
    # gh-19223
    dtype = "{}[{}]".format(dtype, unit)
    arr = np.array([1, 2, 3], dtype=arr_dtype)
    s = Series(arr)
    result = s.astype(dtype)
    expected = Series(arr.astype(dtype))

    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("arg",
                         ["2013-01-01 00:00:00", pd.NaT, np.nan, None])
def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg):
    """A naive value with a tz-aware dtype equals localizing the Timestamp."""
    # GH 17415: With naive string
    result = Series([arg], dtype="datetime64[ns, CET]")
    expected = Series(pd.Timestamp(arg)).dt.tz_localize("CET")
    assert_series_equal(result, expected)


def test_construction_interval(self):
    """Series built from an IntervalIndex or its values round-trips."""
    # construction from interval & array of intervals
    index = IntervalIndex.from_breaks(np.arange(3), closed="right")
    result = Series(index)
    # smoke-test the string representations
    repr(result)
    str(result)
    tm.assert_index_equal(Index(result.values), index)

    result = Series(index.values)
    tm.assert_index_equal(Index(result.values), index)


def test_construction_consistency(self):
    """Reconstructing a tz-aware Series with its own dtype is a no-op."""
    # make sure that we are not re-localizing upon construction
    # GH 14928
    s = Series(pd.date_range("20130101", periods=3, tz="US/Eastern"))

    result = Series(s, dtype=s.dtype)
    tm.assert_series_equal(result, s)

    result = Series(s.dt.tz_convert("UTC"), dtype=s.dtype)
    tm.assert_series_equal(result, s)

    result = Series(s.values, dtype=s.dtype)
    tm.assert_series_equal(result, s)


def test_constructor_infer_period(self):
    """Period[D] is inferred from a list and from an object ndarray."""
    data = [pd.Period("2000", "D"), pd.Period("2001", "D"), None]
    result = pd.Series(data)
    expected = pd.Series(period_array(data))
    tm.assert_series_equal(result, expected)
    assert result.dtype == "Period[D]"

    data = np.asarray(data, dtype=object)
    # Rebuild from the ndarray; without this the assertions below would
    # only re-check the list-based result and never test the ndarray path.
    result = pd.Series(data)
    tm.assert_series_equal(result, expected)
    assert result.dtype == "Period[D]"


def test_constructor_period_incompatible_frequency(self):
    """Periods with different frequencies stay as object dtype."""
    data = [pd.Period("2000", "D"), pd.Period("2001", "A")]
    result = pd.Series(data)
    assert result.dtype == object
    assert result.tolist() == data


def test_constructor_periodindex(self):
    """A PeriodIndex placed in a Series keeps the Period[D] dtype."""
    # GH7932
    # converting a PeriodIndex when put in a Series
    pi = period_range("20130101", periods=5, freq="D")
    s = Series(pi)
    assert s.dtype == "Period[D]"
    expected = Series(pi.astype(object))
    assert_series_equal(s, expected)


def test_constructor_dict(self):
    """Dict data is aligned to the index; missing keys become NaN."""
    d = {"a": 0.0, "b": 1.0, "c": 2.0}
    result = Series(d, index=["b", "c", "d", "a"])
    expected = Series([1, 2, nan, 0], index=["b", "c", "d", "a"])
    assert_series_equal(result, expected)

    pidx = tm.makePeriodIndex(100)
    d = {pidx[0]: 0, pidx[1]: 1}
    result = Series(d, index=pidx)
    expected = Series(np.nan, pidx)
    expected.iloc[0] = 0
    expected.iloc[1] = 1
    assert_series_equal(result, expected)


def test_constructor_dict_order(self):
    """Dict construction keeps insertion order on PY36, else sorts keys."""
    # GH19018
    # initialization ordering: by insertion order if python>= 3.6, else
    # order by value
    d = {"b": 1, "a": 0, "c": 2}
    result = Series(d)
    if PY36:
        expected = Series([1, 0, 2], index=list("bac"))
    else:
        expected = Series([0, 1, 2], index=list("abc"))
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("value", [2, np.nan, None, float("nan")])
def test_constructor_dict_nan_key(self, value):
    """NaN-like dict keys are preserved, for flat and tuple keys alike."""
    # GH 18480
    d = {1: "a", value: "b", float("nan"): "c", 4: "d"}
    result = Series(d).sort_values()
    expected = Series(["a", "b", "c", "d"], index=[1, value, np.nan, 4])
    assert_series_equal(result, expected)

    # MultiIndex:
    d = {(1, 1): "a", (2, np.nan): "b", (3, value): "c"}
    result = Series(d).sort_values()
    expected = Series(["a", "b", "c"],
                      index=Index([(1, 1), (2, np.nan), (3, value)]))
    assert_series_equal(result, expected)


def test_constructor_dict_datetime64_index(self):
    """datetime64, datetime and Timestamp dict keys give the same Series."""
    # GH 9456

    dates_as_str = ["1984-02-19", "1988-11-06", "1989-12-03", "1990-03-15"]
    values = [42544017.198965244, 1234565, 40512335.181958228, -1]

    def create_data(constructor):
        return dict(zip((constructor(x) for x in dates_as_str), values))

    data_datetime64 = create_data(np.datetime64)
    data_datetime = create_data(lambda x: datetime.strptime(x, "%Y-%m-%d"))
    data_Timestamp = create_data(Timestamp)

    expected = Series(values, (Timestamp(x) for x in dates_as_str))

    result_datetime64 = Series(data_datetime64)
    result_datetime = Series(data_datetime)
    result_Timestamp = Series(data_Timestamp)

    assert_series_equal(result_datetime64, expected)
    assert_series_equal(result_datetime, expected)
    assert_series_equal(result_Timestamp, expected)


def test_constructor_list_of_tuples(self):
    """A list of tuples is stored element-wise as tuples."""
    data = [(1, 1), (2, 2), (2, 3)]
    s = Series(data)
    assert list(s) == data


def test_constructor_tuple_of_tuples(self):
    """A tuple of tuples is stored element-wise as tuples."""
    data = ((1, 1), (2, 2), (2, 3))
    s = Series(data)
    assert tuple(s) == data


def test_constructor_dict_of_tuples(self):
    """Tuple dict keys produce a MultiIndex."""
    data = {(1, 2): 3, (None, 5): 6}
    result = Series(data).sort_values()
    expected = Series([3, 6],
                      index=MultiIndex.from_tuples([(1, 2), (None, 5)]))
    tm.assert_series_equal(result, expected)


def test_constructor_set(self):
    """Sets and frozensets are rejected with a TypeError."""
    values = {1, 2, 3, 4, 5}
    with pytest.raises(TypeError, match="'set' type is unordered"):
        Series(values)
    values = frozenset(values)
    with pytest.raises(TypeError, match="'frozenset' type is unordered"):
        Series(values)


# https://github.com/pandas-dev/pandas/issues/22698
@pytest.mark.filterwarnings("ignore:elementwise comparison:FutureWarning")
def test_fromDict(self):
    """Check index ordering and dtype inference for dict input."""
    data = {"a": 0, "b": 1, "c": 2, "d": 3}

    series = Series(data)
    tm.assert_is_sorted(series.index)

    data = {"a": 0, "b": "1", "c": "2", "d": datetime.now()}
    series = Series(data)
    assert series.dtype == np.object_

    data = {"a": 0, "b": "1", "c": "2", "d": "3"}
    series = Series(data)
    assert series.dtype == np.object_

    data = {"a": "0", "b": "1"}
    series = Series(data, dtype=float)
    assert series.dtype == np.float64


def test_fromValue(self, datetime_series):
    """A scalar is broadcast over the index with the expected dtype."""
    nans = Series(np.NaN, index=datetime_series.index)
    assert nans.dtype == np.float_
    assert len(nans) == len(datetime_series)

    strings = Series("foo", index=datetime_series.index)
    assert strings.dtype == np.object_
    assert len(strings) == len(datetime_series)

    d = datetime.now()
    dates = Series(d, index=datetime_series.index)
    assert dates.dtype == "M8[ns]"
    assert len(dates) == len(datetime_series)

    # GH12336
    # Test construction of categorical series from value
    categorical = Series(0, index=datetime_series.index, dtype="category")
    expected = Series(0, index=datetime_series.index).astype("category")
    assert categorical.dtype == "category"
    assert len(categorical) == len(datetime_series)
    tm.assert_series_equal(categorical, expected)


def test_constructor_dtype_timedelta64(self):
    """Exercise timedelta64 construction: inference, NaT mixes, bad casts."""
    # basic
    td = Series([timedelta(days=i) for i in range(3)])
    assert td.dtype == "timedelta64[ns]"

    td = Series([timedelta(days=1)])
    assert td.dtype == "timedelta64[ns]"

    td = Series(
        [timedelta(days=1), timedelta(days=2), np.timedelta64(1, "s")])

    assert td.dtype == "timedelta64[ns]"

    # mixed with NaT
    td = Series([timedelta(days=1), NaT], dtype="m8[ns]")
    assert td.dtype == "timedelta64[ns]"

    td = Series([timedelta(days=1), np.nan], dtype="m8[ns]")
    assert td.dtype == "timedelta64[ns]"

    td = Series([np.timedelta64(300000000), pd.NaT], dtype="m8[ns]")
    assert td.dtype == "timedelta64[ns]"

    # improved inference
    # GH5689
    td = Series([np.timedelta64(300000000), NaT])
    assert td.dtype == "timedelta64[ns]"

    # because iNaT is int, not coerced to timedelta
    td = Series([np.timedelta64(300000000), iNaT])
    assert td.dtype == "object"

    td = Series([np.timedelta64(300000000), np.nan])
    assert td.dtype == "timedelta64[ns]"

    td = Series([pd.NaT, np.timedelta64(300000000)])
    assert td.dtype == "timedelta64[ns]"

    td = Series([np.timedelta64(1, "s")])
    assert td.dtype == "timedelta64[ns]"

    # these are frequency conversion astypes
    # for t in ['s', 'D', 'us', 'ms']:
    #     with pytest.raises(TypeError):
    #         td.astype('m8[%s]' % t)

    # valid astype
    td.astype("int64")

    # invalid casting
    msg = r"cannot astype a timedelta from \[timedelta64\[ns\]\] to" r" \[int32\]"
    with pytest.raises(TypeError, match=msg):
        td.astype("int32")

    # this is an invalid casting
    msg = "Could not convert object to NumPy timedelta"
    with pytest.raises(ValueError, match=msg):
        Series([timedelta(days=1), "foo"], dtype="m8[ns]")

    # leave as object here
    td = Series([timedelta(days=i) for i in range(3)] + ["foo"])
    assert td.dtype == "object"

    # these will correctly infer a timedelta
    s = Series([None, pd.NaT, "1 Day"])
    assert s.dtype == "timedelta64[ns]"
    s = Series([np.nan, pd.NaT, "1 Day"])
    assert s.dtype == "timedelta64[ns]"
    s = Series([pd.NaT, None, "1 Day"])
    assert s.dtype == "timedelta64[ns]"
    s = Series([pd.NaT, np.nan, "1 Day"])
    assert s.dtype == "timedelta64[ns]"


# GH 16406
def test_constructor_mixed_tz(self):
    """Mixing naive and tz-aware Timestamps yields object dtype."""
    s = Series(
        [Timestamp("20130101"), Timestamp("20130101", tz="US/Eastern")])
    expected = Series(
        [Timestamp("20130101"), Timestamp("20130101", tz="US/Eastern")],
        dtype="object",
    )
    assert_series_equal(s, expected)


def test_NaT_scalar(self):
    """A NaT read from a Series can be assigned back and stays null."""
    series = Series([0, 1000, 2000, iNaT], dtype="M8[ns]")

    val = series[3]
    assert isna(val)

    series[2] = val
    assert isna(series[2])


def test_NaT_cast(self):
    """Casting a NaN float Series to M8[ns] gives NaT."""
    # GH10747
    result = Series([np.nan]).astype("M8[ns]")
    expected = Series([NaT])
    assert_series_equal(result, expected)


def test_constructor_name_hashable(self):
    """Hashable names of various types are accepted and stored as given."""
    for n in [777, 777.0, "name", datetime(2001, 11, 11), (1, ), "\u05D0"]:
        for data in [[1, 2, 3], np.ones(3), {"a": 0, "b": 1}]:
            s = Series(data, name=n)
            assert s.name == n


def test_constructor_name_unhashable(self):
    """Unhashable names raise a TypeError for every kind of data."""
    msg = r"Series\.name must be a hashable type"
    for n in [["name_list"], np.ones(2), {1: 2}]:
        for data in [["name_list"], np.ones(2), {1: 2}]:
            with pytest.raises(TypeError, match=msg):
                Series(data, name=n)


def test_auto_conversion(self):
    """A list of the elements of a date_range is inferred as M8[ns]."""
    series = Series(list(date_range("1/1/2000", periods=10)))
    assert series.dtype == "M8[ns]"


def test_convert_non_ns(self):
    """Non-nanosecond timedelta64/datetime64 arrays are converted."""
    # convert from a numpy array of non-ns timedelta64
    arr = np.array([1, 2, 3], dtype="timedelta64[s]")
    s = Series(arr)
    expected = Series(pd.timedelta_range("00:00:01", periods=3, freq="s"))
    assert_series_equal(s, expected)

    # convert from a numpy array of non-ns datetime64
    # note that creating a numpy datetime64 is in LOCAL time!!!!
    # seems to work for M8[D], but not for M8[s]

    s = Series(
        np.array(["2013-01-01", "2013-01-02", "2013-01-03"],
                 dtype="datetime64[D]"))
    assert_series_equal(
        s, Series(date_range("20130101", periods=3, freq="D")))

    # s = Series(np.array(['2013-01-01 00:00:01','2013-01-01
    # 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]'))

    # assert_series_equal(s,date_range('20130101
    # 00:00:01',period=3,freq='s'))


@pytest.mark.parametrize(
    "index",
    [
        date_range("1/1/2000", periods=10),
        timedelta_range("1 day", periods=10),
        period_range("2000-Q1", periods=10, freq="Q"),
    ],
    ids=lambda x: type(x).__name__,
)
def test_constructor_cant_cast_datetimelike(self, index):
    """Datetime-like indexes cannot be cast to float but can to int64."""
    # floats are not ok

    # Drop the "Index" suffix to convert e.g. PeriodIndex -> Period.
    # We don't care whether the error message says
    # PeriodIndex or PeriodArray.
    # str.rstrip("Index") is deliberately avoided: it strips a *set* of
    # characters and would also eat trailing letters of the stem
    # ("DatetimeIndex" -> "Datetim", "PeriodIndex" -> "Perio").
    klass_name = type(index).__name__
    suffix = "Index"
    if klass_name.endswith(suffix):
        klass_name = klass_name[:-len(suffix)]
    msg = "Cannot cast {}.*? to ".format(klass_name)
    with pytest.raises(TypeError, match=msg):
        Series(index, dtype=float)

    # ints are ok
    # we test with np.int64 to get similar results on
    # windows / 32-bit platforms
    result = Series(index, dtype=np.int64)
    expected = Series(index.astype(np.int64))
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "index",
    [
        date_range("1/1/2000", periods=10),
        timedelta_range("1 day", periods=10),
        period_range("2000-Q1", periods=10, freq="Q"),
    ],
    ids=lambda x: type(x).__name__,
)
def test_constructor_cast_object(self, index):
    """dtype=object at construction matches astype(object) afterwards."""
    s = Series(index, dtype=object)
    exp = Series(index).astype(object)
    tm.assert_series_equal(s, exp)

    s = Series(pd.Index(index, dtype=object), dtype=object)
    exp = Series(index).astype(object)
    tm.assert_series_equal(s, exp)

    s = Series(index.astype(object), dtype=object)
    exp = Series(index).astype(object)
    tm.assert_series_equal(s, exp)


@pytest.mark.parametrize("dtype", [np.datetime64, np.timedelta64])
def test_constructor_generic_timestamp_no_frequency(self, dtype):
    """A unit-less datetime64/timedelta64 dtype raises a ValueError."""
    # see gh-15524, gh-15987
    msg = "dtype has no unit. Please pass in"

    with pytest.raises(ValueError, match=msg):
        Series([], dtype=dtype)


@pytest.mark.parametrize(
    "dtype,msg",
    [
        ("m8[ps]", "cannot convert timedeltalike"),
        ("M8[ps]", "cannot convert datetimelike"),
    ],
)
def test_constructor_generic_timestamp_bad_frequency(self, dtype, msg):
    """The picosecond-unit dtypes tested here raise a TypeError."""
    # see gh-15524, gh-15987

    with pytest.raises(TypeError, match=msg):
        Series([], dtype=dtype)


@pytest.mark.parametrize("dtype", [None, "uint8", "category"])
def test_constructor_range_dtype(self, dtype):
    """A range input honours the dtype and defaults to int64."""
    # GH 16804
    expected = Series([0, 1, 2, 3, 4], dtype=dtype or "int64")
    result = Series(range(5), dtype=dtype)
    tm.assert_series_equal(result, expected)


def test_constructor_tz_mixed_data(self):
    """A naive plus a tz-aware Timestamp are inferred as object dtype."""
    # GH 13051
    dt_list = [
        Timestamp("2016-05-01 02:03:37"),
        Timestamp("2016-04-30 19:03:37-0700", tz="US/Pacific"),
    ]
    result = Series(dt_list)
    expected = Series(dt_list, dtype=object)
    tm.assert_series_equal(result, expected)
def test_survival_table_from_events_will_collapse_if_asked():
    """With collapse=True the table index is two right-closed intervals."""
    durations = np.array([1, 3, 4, 5])
    observed = np.array([True, True, True, True])

    collapsed = utils.survival_table_from_events(durations, observed, collapse=True)

    expected_bins = [
        pd.Interval(0, 3.5089999999999999, closed='right'),
        pd.Interval(3.5089999999999999, 7.0179999999999998, closed='right'),
    ]
    assert collapsed.index.tolist() == expected_bins
class TestSeriesConstructors():
    """Tests for building a ``Series`` from scalars, containers and arrays."""

    def test_invalid_dtype(self):
        """Each dtype in ``invalid_list`` makes the constructor raise TypeError."""
        # GH15520
        msg = 'not understood'
        invalid_list = [pd.Timestamp, 'pd.Timestamp', list]
        for dtype in invalid_list:
            with pytest.raises(TypeError, match=msg):
                Series([], name='time', dtype=dtype)

    def test_scalar_conversion(self):
        """A scalar input still gives a Series; length-1 Series coerce to numbers."""

        # Pass in scalar is disabled
        scalar = Series(0.5)
        assert not isinstance(scalar, float)

        # Coercion
        assert float(Series([1.])) == 1.0
        assert int(Series([1.])) == 1
        assert long(Series([1.])) == 1

    def test_constructor(self, datetime_series, empty_series):
        """Basic construction: from a Series, mixed values, names, bad inputs."""
        assert datetime_series.index.is_all_dates

        # Pass in Series
        derived = Series(datetime_series)
        assert derived.index.is_all_dates

        assert tm.equalContents(derived.index, datetime_series.index)
        # Ensure new index is not created
        assert id(datetime_series.index) == id(derived.index)

        # Mixed type Series
        mixed = Series(['hello', np.NaN], index=[0, 1])
        assert mixed.dtype == np.object_
        assert mixed[1] is np.NaN

        assert not empty_series.index.is_all_dates
        assert not Series({}).index.is_all_dates
        pytest.raises(Exception, Series, np.random.randn(3, 3),
                      index=np.arange(3))

        mixed.name = 'Series'
        rs = Series(mixed).name
        xp = 'Series'
        assert rs == xp

        # raise on MultiIndex GH4187
        m = MultiIndex.from_arrays([[1, 2], [3, 4]])
        pytest.raises(NotImplementedError, Series, m)

    @pytest.mark.parametrize('input_class', [list, dict, OrderedDict])
    def test_constructor_empty(self, input_class):
        """An empty container builds the same Series as passing no data."""
        empty = Series()
        empty2 = Series(input_class())

        # these are Index() and RangeIndex() which don't compare type equal
        # but are just .equals
        assert_series_equal(empty, empty2, check_index_type=False)

        # With explicit dtype:
        empty = Series(dtype='float64')
        empty2 = Series(input_class(), dtype='float64')
        assert_series_equal(empty, empty2, check_index_type=False)

        # GH 18515 : with dtype=category:
        empty = Series(dtype='category')
        empty2 = Series(input_class(), dtype='category')
        assert_series_equal(empty, empty2, check_index_type=False)

        if input_class is not list:
            # With index:
            empty = Series(index=lrange(10))
            empty2 = Series(input_class(), index=lrange(10))
            assert_series_equal(empty, empty2)

            # With index and dtype float64:
            empty = Series(np.nan, index=lrange(10))
            empty2 = Series(input_class(), index=lrange(10), dtype='float64')
            assert_series_equal(empty, empty2)

            # GH 19853 : with empty string, index and dtype str
            empty = Series('', dtype=str, index=range(3))
            empty2 = Series('', index=range(3))
            assert_series_equal(empty, empty2)

    @pytest.mark.parametrize('input_arg', [np.nan, float('nan')])
    def test_constructor_nan(self, input_arg):
        """A NaN scalar with an index equals a data-less float64 Series."""
        empty = Series(dtype='float64', index=lrange(10))
        empty2 = Series(input_arg, index=lrange(10))

        assert_series_equal(empty, empty2, check_index_type=False)

    @pytest.mark.parametrize('dtype', [
        'f8', 'i8', 'M8[ns]', 'm8[ns]', 'category', 'object',
        'datetime64[ns, UTC]',
    ])
    @pytest.mark.parametrize('index', [None, pd.Index([])])
    def test_constructor_dtype_only(self, dtype, index):
        """Passing only a dtype gives an empty Series of that dtype."""
        # GH-20865
        result = pd.Series(dtype=dtype, index=index)
        assert result.dtype == dtype
        assert len(result) == 0

    def test_constructor_no_data_index_order(self):
        """Without data, the index keeps the order it was passed in."""
        result = pd.Series(index=['b', 'a', 'c'])
        assert result.index.tolist() == ['b', 'a', 'c']

    def test_constructor_no_data_string_type(self):
        """Without data, ``dtype=str`` still fills the value with NaN."""
        # GH 22477
        result = pd.Series(index=[1], dtype=str)
        assert np.isnan(result.iloc[0])

    @pytest.mark.parametrize('item', ['entry', 'ѐ', 13])
    def test_constructor_string_element_string_type(self, item):
        """A scalar with ``dtype=str`` is stored as ``str(item)``."""
        # GH 22477
        result = pd.Series(item, index=[1], dtype=str)
        assert result.iloc[0] == str(item)

    def test_constructor_dtype_str_na_values(self, string_dtype):
        """None and NaN stay missing values under a string dtype."""
        # https://github.com/pandas-dev/pandas/issues/21083
        ser = Series(['x', None], dtype=string_dtype)
        result = ser.isna()
        expected = Series([False, True])
        tm.assert_series_equal(result, expected)
        assert ser.iloc[1] is None

        ser = Series(['x', np.nan], dtype=string_dtype)
        assert np.isnan(ser.iloc[1])

    def test_constructor_series(self):
        """Re-indexing via the constructor matches ``sort_index`` here."""
        index1 = ['d', 'b', 'a', 'c']
        index2 = sorted(index1)
        s1 = Series([4, 7, -5, 3], index=index1)
        s2 = Series(s1, index=index2)

        assert_series_equal(s2, s1.sort_index())

    def test_constructor_iterable(self):
        """An object that only defines ``__iter__`` is accepted as data."""
        # GH 21987
        class Iter():
            def __iter__(self):
                for i in range(10):
                    yield i

        expected = Series(list(range(10)), dtype='int64')
        result = Series(Iter(), dtype='int64')
        assert_series_equal(result, expected)

    def test_constructor_sequence(self):
        """A ``range`` builds the same Series as the equivalent list."""
        # GH 21987
        expected = Series(list(range(10)), dtype='int64')
        result = Series(range(10), dtype='int64')
        assert_series_equal(result, expected)

    def test_constructor_single_str(self):
        """A bare string is one element, not a sequence of characters."""
        # GH 21987
        expected = Series(['abc'])
        result = Series('abc')
        assert_series_equal(result, expected)

    def test_constructor_list_like(self):
        """Lists, tuples and int64 arrays all produce an int64 Series."""

        # make sure that we are coercing different
        # list-likes to standard dtypes and not
        # platform specific
        expected = Series([1, 2, 3], dtype='int64')
        for obj in [[1, 2, 3], (1, 2, 3),
                    np.array([1, 2, 3], dtype='int64')]:
            result = Series(obj, index=[0, 1, 2])
            assert_series_equal(result, expected)

    @pytest.mark.parametrize('input_vals', [
        ([1, 2]),
        (['1', '2']),
        (list(pd.date_range('1/1/2011', periods=2, freq='H'))),
        (list(pd.date_range('1/1/2011', periods=2, freq='H',
                            tz='US/Eastern'))),
        ([pd.Interval(left=0, right=5)]),
    ])
    def test_constructor_list_str(self, input_vals, string_dtype):
        """Constructing with a string dtype equals ``astype`` afterwards."""
        # GH 16605
        # Ensure that data elements from a list are converted to strings
        # when dtype is str, 'str', or 'U'
        result = Series(input_vals, dtype=string_dtype)
        expected = Series(input_vals).astype(string_dtype)
        assert_series_equal(result, expected)

    def test_constructor_list_str_na(self, string_dtype):
        """Floats become strings under a string dtype, but NaN stays NaN."""
        result = Series([1.0, 2.0, np.nan], dtype=string_dtype)
        expected = Series(['1.0', '2.0', np.nan], dtype=object)
        assert_series_equal(result, expected)
        assert np.isnan(result[2])

    def test_constructor_generator(self):
        """A generator is accepted as data, with or without an index."""
        gen = (i for i in range(10))

        result = Series(gen)
        exp = Series(lrange(10))
        assert_series_equal(result, exp)

        # A generator is single-use, so build a fresh one for the second case.
        gen = (i for i in range(10))
        result = Series(gen, index=lrange(10, 20))
        exp.index = lrange(10, 20)
        assert_series_equal(result, exp)

    def test_constructor_map(self):
        """A ``map`` object is accepted as data, with or without an index."""
        # GH8909
        m = map(lambda x: x, range(10))

        result = Series(m)
        exp = Series(lrange(10))
        assert_series_equal(result, exp)

        m = map(lambda x: x, range(10))
        result = Series(m, index=lrange(10, 20))
        exp.index = lrange(10, 20)
        assert_series_equal(result, exp)

    def test_constructor_categorical(self):
        """Construction from a Categorical, and casting to or from category."""
        cat = pd.Categorical([0, 1, 2, 0, 1, 2], ['a', 'b', 'c'],
                             fastpath=True)
        res = Series(cat)
        tm.assert_categorical_equal(res.values, cat)

        # can cast to a new dtype
        result = Series(pd.Categorical([1, 2, 3]), dtype='int64')
        expected = pd.Series([1, 2, 3], dtype='int64')
        tm.assert_series_equal(result, expected)

        # GH12574
        cat = Series(pd.Categorical([1, 2, 3]), dtype='category')
        assert is_categorical_dtype(cat)
        assert is_categorical_dtype(cat.dtype)
        s = Series([1, 2, 3], dtype='category')
        assert is_categorical_dtype(s)
        assert is_categorical_dtype(s.dtype)

    def test_constructor_categorical_with_coercion(self):
        """Categorical data survives a round trip through Series and DataFrame."""
        factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
        # test basic creation / coercion of categoricals
        s = Series(factor, name='A')
        assert s.dtype == 'category'
        assert len(s) == len(factor)
        # The bare str() calls only check that rendering does not raise.
        str(s.values)
        str(s)

        # in a frame
        df = DataFrame({'A': factor})
        result = df['A']
        tm.assert_series_equal(result, s)
        result = df.iloc[:, 0]
        tm.assert_series_equal(result, s)
        assert len(df) == len(factor)
        str(df.values)
        str(df)

        df = DataFrame({'A': s})
        result = df['A']
        tm.assert_series_equal(result, s)
        assert len(df) == len(factor)
        str(df.values)
        str(df)

        # multiples
        df = DataFrame({'A': s, 'B': s, 'C': 1})
        result1 = df['A']
        result2 = df['B']
        tm.assert_series_equal(result1, s)
        tm.assert_series_equal(result2, s, check_names=False)
        assert result2.name == 'B'
        assert len(df) == len(factor)
        str(df.values)
        str(df)

        # GH8623
        x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'],
                       [1, 'John P. Doe']],
                      columns=['person_id', 'person_name'])
        x['person_name'] = Categorical(x.person_name
                                       )  # doing this breaks transform

        expected = x.iloc[0].person_name
        result = x.person_name.iloc[0]
        assert result == expected

        result = x.person_name[0]
        assert result == expected

        result = x.person_name.loc[0]
        assert result == expected

    def test_constructor_categorical_dtype(self):
        """A ``CategoricalDtype`` sets both categories and orderedness."""
        result = pd.Series(['a', 'b'],
                           dtype=CategoricalDtype(['a', 'b', 'c'],
                                                  ordered=True))
        assert is_categorical_dtype(result) is True
        tm.assert_index_equal(result.cat.categories, pd.Index(['a', 'b', 'c']))
        assert result.cat.ordered

        result = pd.Series(['a', 'b'], dtype=CategoricalDtype(['b', 'a']))
        assert is_categorical_dtype(result)
        tm.assert_index_equal(result.cat.categories, pd.Index(['b', 'a']))
        assert result.cat.ordered is False

        # GH 19565 - Check broadcasting of scalar with Categorical dtype
        result = Series('a', index=[0, 1],
                        dtype=CategoricalDtype(['a', 'b'], ordered=True))
        expected = Series(['a', 'a'], index=[0, 1],
                          dtype=CategoricalDtype(['a', 'b'], ordered=True))
        tm.assert_series_equal(result, expected, check_categorical=True)

    def test_categorical_sideeffects_free(self):
        """``copy=True`` isolates Series and Categorical; the default shares."""
        # Passing a categorical to a Series and then changing values in either
        # the series or the categorical should not change the values in the
        # other one, IF you specify copy!
        cat = Categorical(["a", "b", "c", "a"])
        s = Series(cat, copy=True)
        assert s.cat is not cat
        s.cat.categories = [1, 2, 3]
        exp_s = np.array([1, 2, 3, 1], dtype=np.int64)
        exp_cat = np.array(["a", "b", "c", "a"], dtype=np.object_)
        tm.assert_numpy_array_equal(s.__array__(), exp_s)
        tm.assert_numpy_array_equal(cat.__array__(), exp_cat)

        # setting
        s[0] = 2
        exp_s2 = np.array([2, 2, 3, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(s.__array__(), exp_s2)
        tm.assert_numpy_array_equal(cat.__array__(), exp_cat)

        # however, copy is False by default
        # so this WILL change values
        cat = Categorical(["a", "b", "c", "a"])
        s = Series(cat)
        assert s.values is cat
        s.cat.categories = [1, 2, 3]
        exp_s = np.array([1, 2, 3, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(s.__array__(), exp_s)
        tm.assert_numpy_array_equal(cat.__array__(), exp_s)

        s[0] = 2
        exp_s2 = np.array([2, 2, 3, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(s.__array__(), exp_s2)
        tm.assert_numpy_array_equal(cat.__array__(), exp_s2)

    def test_unordered_compare_equal(self):
        """A value outside the categories compares equal to an explicit NaN."""
        left = pd.Series(['a', 'b', 'c'],
                         dtype=CategoricalDtype(['a', 'b']))
        right = pd.Series(pd.Categorical(['a', 'b', np.nan],
                                         categories=['a', 'b']))
        tm.assert_series_equal(left, right)

    def test_constructor_maskedarray(self):
        """Masked entries become missing values for each tested dtype.

        The same three steps (all masked, partly filled, fully filled) are
        repeated for float, int, bool and ``M8[ns]`` masked arrays.
        """
        data = ma.masked_all((3, ), dtype=float)
        result = Series(data)
        expected = Series([nan, nan, nan])
        assert_series_equal(result, expected)

        data[0] = 0.0
        data[2] = 2.0
        index = ['a', 'b', 'c']
        result = Series(data, index=index)
        expected = Series([0.0, nan, 2.0], index=index)
        assert_series_equal(result, expected)

        data[1] = 1.0
        result = Series(data, index=index)
        expected = Series([0.0, 1.0, 2.0], index=index)
        assert_series_equal(result, expected)

        # int: results are float while anything is masked, int once filled
        data = ma.masked_all((3, ), dtype=int)
        result = Series(data)
        expected = Series([nan, nan, nan], dtype=float)
        assert_series_equal(result, expected)

        data[0] = 0
        data[2] = 2
        index = ['a', 'b', 'c']
        result = Series(data, index=index)
        expected = Series([0, nan, 2], index=index, dtype=float)
        assert_series_equal(result, expected)

        data[1] = 1
        result = Series(data, index=index)
        expected = Series([0, 1, 2], index=index, dtype=int)
        assert_series_equal(result, expected)

        # bool: results are object while anything is masked, bool once filled
        data = ma.masked_all((3, ), dtype=bool)
        result = Series(data)
        expected = Series([nan, nan, nan], dtype=object)
        assert_series_equal(result, expected)

        data[0] = True
        data[2] = False
        index = ['a', 'b', 'c']
        result = Series(data, index=index)
        expected = Series([True, nan, False], index=index, dtype=object)
        assert_series_equal(result, expected)

        data[1] = True
        result = Series(data, index=index)
        expected = Series([True, True, False], index=index, dtype=bool)
        assert_series_equal(result, expected)

        # datetime64[ns]: masked entries compare equal to iNaT
        data = ma.masked_all((3, ), dtype='M8[ns]')
        result = Series(data)
        expected = Series([iNaT, iNaT, iNaT], dtype='M8[ns]')
        assert_series_equal(result, expected)

        data[0] = datetime(2001, 1, 1)
        data[2] = datetime(2001, 1, 3)
        index = ['a', 'b', 'c']
        result = Series(data, index=index)
        expected = Series([datetime(2001, 1, 1), iNaT,
                           datetime(2001, 1, 3)], index=index, dtype='M8[ns]')
        assert_series_equal(result, expected)

        data[1] = datetime(2001, 1, 2)
        result = Series(data, index=index)
        expected = Series([datetime(2001, 1, 1), datetime(2001, 1, 2),
                           datetime(2001, 1, 3)], index=index, dtype='M8[ns]')
        assert_series_equal(result, expected)

    def test_series_ctor_plus_datetimeindex(self):
        """A dict plus a DatetimeIndex reuses that index object."""
        rng = date_range('20090415', '20090519', freq='B')
        data = {k: 1 for k in rng}

        result = Series(data, index=rng)
        assert result.index is rng

    def test_constructor_default_index(self):
        """Without an index, the default equals ``pd.Index(np.arange(n))``."""
        s = Series([0, 1, 2])
        tm.assert_index_equal(s.index, pd.Index(np.arange(3)))

    @pytest.mark.parametrize('input', [[1, 2, 3],
                                       (1, 2, 3),
                                       list(range(3)),
                                       pd.Categorical(['a', 'b', 'a']),
                                       (i for i in range(3)),
                                       map(lambda x: x, range(3))])
    def test_constructor_index_mismatch(self, input):
        """Three values with a length-4 index raise ``ValueError``."""
        # GH 19342
        # test that construction of a Series with an index of different length
        # raises an error
        msg = 'Length of passed values is 3, index implies 4'
        with pytest.raises(ValueError, match=msg):
            Series(input, index=np.arange(4))

    def test_constructor_numpy_scalar(self):
        """A zero-dimensional numpy array broadcasts like a Python scalar."""
        # GH 19342
        # construction with a numpy scalar
        # should not raise
        result = Series(np.array(100), index=np.arange(4), dtype='int64')
        expected = Series(100, index=np.arange(4), dtype='int64')
        tm.assert_series_equal(result, expected)

    def test_constructor_broadcast_list(self):
        """A one-element list is not broadcast over a longer index."""
        # GH 19342
        # construction with single-element container and index
        # should raise
        pytest.raises(ValueError, Series, ['foo'], index=['a', 'b', 'c'])

    def test_constructor_corner(self):
        """A list of DataFrames can be stored as Series values."""
        df = tm.makeTimeDataFrame()
        objs = [df, df]
        s = Series(objs, index=[0, 1])
        assert isinstance(s, Series)

    def test_constructor_sanitize(self):
        """Floats cast to 'i8' when whole; data containing NaN stays 'f8'."""
        s = Series(np.array([1., 1., 8.]), dtype='i8')
        assert s.dtype == np.dtype('i8')

        s = Series(np.array([1., 1., np.nan]), copy=True, dtype='i8')
        assert s.dtype == np.dtype('f8')

    def test_constructor_copy(self):
        """With ``copy=True`` and a dtype, the copy ignores later changes."""
        # GH15125
        # test dtype parameter has no side effects on copy=True
        for data in [[1.], np.array([1.])]:
            x = Series(data)
            y = pd.Series(x, copy=True, dtype=float)

            # copy=True maintains original data in Series
            tm.assert_series_equal(x, y)

            # changes to origin of copy does not affect the copy
            x[0] = 2.
            assert not x.equals(y)
            assert x[0] == 2.
            assert y[0] == 1.
@pytest.mark.parametrize( "index", [ pd.date_range('20170101', periods=3, tz='US/Eastern'), pd.date_range('20170101', periods=3), pd.timedelta_range('1 day', periods=3), pd.period_range('2012Q1', periods=3, freq='Q'), pd.Index(list('abc')), pd.Int64Index([1, 2, 3]), pd.RangeIndex(0, 3)], ids=lambda x: type(x).__name__) def test_constructor_limit_copies(self, index): # GH 17449 # limit copies of input s = pd.Series(index) # we make 1 copy; this is just a smoke test here assert s._data.blocks[0].values is not index def test_constructor_pass_none(self): s = Series(None, index=lrange(5)) assert s.dtype == np.float64 s = Series(None, index=lrange(5), dtype=object) assert s.dtype == np.object_ # GH 7431 # inference on the index s = Series(index=np.array([None])) expected = Series(index=Index([None])) assert_series_equal(s, expected) def test_constructor_pass_nan_nat(self): # GH 13467 exp = Series([np.nan, np.nan], dtype=np.float64) assert exp.dtype == np.float64 tm.assert_series_equal(Series([np.nan, np.nan]), exp) tm.assert_series_equal(Series(np.array([np.nan, np.nan])), exp) exp = Series([pd.NaT, pd.NaT]) assert exp.dtype == 'datetime64[ns]' tm.assert_series_equal(Series([pd.NaT, pd.NaT]), exp) tm.assert_series_equal(Series(np.array([pd.NaT, pd.NaT])), exp) tm.assert_series_equal(Series([pd.NaT, np.nan]), exp) tm.assert_series_equal(Series(np.array([pd.NaT, np.nan])), exp) tm.assert_series_equal(Series([np.nan, pd.NaT]), exp) tm.assert_series_equal(Series(np.array([np.nan, pd.NaT])), exp) def test_constructor_cast(self): msg = "could not convert string to float" with pytest.raises(ValueError, match=msg): Series(["a", "b", "c"], dtype=float) def test_constructor_unsigned_dtype_overflow(self, uint_dtype): # see gh-15832 msg = 'Trying to coerce negative values to unsigned integers' with pytest.raises(OverflowError, match=msg): Series([-1], dtype=uint_dtype) def test_constructor_coerce_float_fail(self, any_int_dtype): # see gh-15832 msg = "Trying to coerce float values to 
integers" with pytest.raises(ValueError, match=msg): Series([1, 2, 3.5], dtype=any_int_dtype) def test_constructor_coerce_float_valid(self, float_dtype): s = Series([1, 2, 3.5], dtype=float_dtype) expected = Series([1, 2, 3.5]).astype(float_dtype) assert_series_equal(s, expected) def test_constructor_dtype_no_cast(self): # see gh-1572 s = Series([1, 2, 3]) s2 = Series(s, dtype=np.int64) s2[1] = 5 assert s[1] == 5 def test_constructor_datelike_coercion(self): # GH 9477 # incorrectly inferring on dateimelike looking when object dtype is # specified s = Series([Timestamp('20130101'), 'NOV'], dtype=object) assert s.iloc[0] == Timestamp('20130101') assert s.iloc[1] == 'NOV' assert s.dtype == object # the dtype was being reset on the slicing and re-inferred to datetime # even thought the blocks are mixed belly = '216 3T19'.split() wing1 = '2T15 4H19'.split() wing2 = '416 4T20'.split() mat = pd.to_datetime('2016-01-22 2019-09-07'.split()) df = pd.DataFrame( {'wing1': wing1, 'wing2': wing2, 'mat': mat}, index=belly) result = df.loc['3T19'] assert result.dtype == object result = df.loc['216'] assert result.dtype == object def test_constructor_datetimes_with_nulls(self): # gh-15869 for arr in [np.array([None, None, None, None, datetime.now(), None]), np.array([None, None, datetime.now(), None])]: result = Series(arr) assert result.dtype == 'M8[ns]' def test_constructor_dtype_datetime64(self): s = Series(iNaT, dtype='M8[ns]', index=lrange(5)) assert isna(s).all() # in theory this should be all nulls, but since # we are not specifying a dtype is ambiguous s = Series(iNaT, index=lrange(5)) assert not isna(s).all() s = Series(nan, dtype='M8[ns]', index=lrange(5)) assert isna(s).all() s = Series([datetime(2001, 1, 2, 0, 0), iNaT], dtype='M8[ns]') assert isna(s[1]) assert s.dtype == 'M8[ns]' s = Series([datetime(2001, 1, 2, 0, 0), nan], dtype='M8[ns]') assert isna(s[1]) assert s.dtype == 'M8[ns]' # GH3416 dates = [ np.datetime64(datetime(2013, 1, 1)), np.datetime64(datetime(2013, 
1, 2)), np.datetime64(datetime(2013, 1, 3)), ] s = Series(dates) assert s.dtype == 'M8[ns]' s.iloc[0] = np.nan assert s.dtype == 'M8[ns]' # GH3414 related pytest.raises(TypeError, lambda x: Series( Series(dates).astype('int') / 1000000, dtype='M8[ms]')) pytest.raises(TypeError, lambda x: Series(dates, dtype='datetime64')) # invalid dates can be help as object result = Series([datetime(2, 1, 1)]) assert result[0] == datetime(2, 1, 1, 0, 0) result = Series([datetime(3000, 1, 1)]) assert result[0] == datetime(3000, 1, 1, 0, 0) # don't mix types result = Series([Timestamp('20130101'), 1], index=['a', 'b']) assert result['a'] == Timestamp('20130101') assert result['b'] == 1 # GH6529 # coerce datetime64 non-ns properly dates = date_range('01-Jan-2015', '01-Dec-2015', freq='M') values2 = dates.view(np.ndarray).astype('datetime64[ns]') expected = Series(values2, index=dates) for dtype in ['s', 'D', 'ms', 'us', 'ns']: values1 = dates.view(np.ndarray).astype('M8[{0}]'.format(dtype)) result = Series(values1, dates) assert_series_equal(result, expected) # GH 13876 # coerce to non-ns to object properly expected = Series(values2, index=dates, dtype=object) for dtype in ['s', 'D', 'ms', 'us', 'ns']: values1 = dates.view(np.ndarray).astype('M8[{0}]'.format(dtype)) result = Series(values1, index=dates, dtype=object) assert_series_equal(result, expected) # leave datetime.date alone dates2 = np.array([d.date() for d in dates.to_pydatetime()], dtype=object) series1 = Series(dates2, dates) tm.assert_numpy_array_equal(series1.values, dates2) assert series1.dtype == object # these will correctly infer a datetime s = Series([None, pd.NaT, '2013-08-05 15:30:00.000001']) assert s.dtype == 'datetime64[ns]' s = Series([np.nan, pd.NaT, '2013-08-05 15:30:00.000001']) assert s.dtype == 'datetime64[ns]' s = Series([pd.NaT, None, '2013-08-05 15:30:00.000001']) assert s.dtype == 'datetime64[ns]' s = Series([pd.NaT, np.nan, '2013-08-05 15:30:00.000001']) assert s.dtype == 'datetime64[ns]' # tz-aware 
(UTC and other tz's) # GH 8411 dr = date_range('20130101', periods=3) assert Series(dr).iloc[0].tz is None dr = date_range('20130101', periods=3, tz='UTC') assert str(Series(dr).iloc[0].tz) == 'UTC' dr = date_range('20130101', periods=3, tz='US/Eastern') assert str(Series(dr).iloc[0].tz) == 'US/Eastern' # non-convertible s = Series([1479596223000, -1479590, pd.NaT]) assert s.dtype == 'object' assert s[2] is pd.NaT assert 'NaT' in str(s) # if we passed a NaT it remains s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), pd.NaT]) assert s.dtype == 'object' assert s[2] is pd.NaT assert 'NaT' in str(s) # if we passed a nan it remains s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), np.nan]) assert s.dtype == 'object' assert s[2] is np.nan assert 'NaN' in str(s) def test_constructor_with_datetime_tz(self): # 8260 # support datetime64 with tz dr = date_range('20130101', periods=3, tz='US/Eastern') s = Series(dr) assert s.dtype.name == 'datetime64[ns, US/Eastern]' assert s.dtype == 'datetime64[ns, US/Eastern]' assert is_datetime64tz_dtype(s.dtype) assert 'datetime64[ns, US/Eastern]' in str(s) # export result = s.values assert isinstance(result, np.ndarray) assert result.dtype == 'datetime64[ns]' exp = pd.DatetimeIndex(result) exp = exp.tz_localize('UTC').tz_convert(tz=s.dt.tz) tm.assert_index_equal(dr, exp) # indexing result = s.iloc[0] assert result == Timestamp('2013-01-01 00:00:00-0500', tz='US/Eastern', freq='D') result = s[0] assert result == Timestamp('2013-01-01 00:00:00-0500', tz='US/Eastern', freq='D') result = s[Series([True, True, False], index=s.index)] assert_series_equal(result, s[0:2]) result = s.iloc[0:1] assert_series_equal(result, Series(dr[0:1])) # concat result = pd.concat([s.iloc[0:1], s.iloc[1:]]) assert_series_equal(result, s) # short str assert 'datetime64[ns, US/Eastern]' in str(s) # formatting with NaT result = s.shift() assert 'datetime64[ns, US/Eastern]' in str(result) assert 'NaT' in str(result) # long str t = 
Series(date_range('20130101', periods=1000, tz='US/Eastern')) assert 'datetime64[ns, US/Eastern]' in str(t) result = pd.DatetimeIndex(s, freq='infer') tm.assert_index_equal(result, dr) # inference s = Series([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'), pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')]) assert s.dtype == 'datetime64[ns, US/Pacific]' assert lib.infer_dtype(s, skipna=False) == 'datetime64' s = Series([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'), pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Eastern')]) assert s.dtype == 'object' assert lib.infer_dtype(s, skipna=False) == 'datetime' # with all NaT s = Series(pd.NaT, index=[0, 1], dtype='datetime64[ns, US/Eastern]') expected = Series(pd.DatetimeIndex(['NaT', 'NaT'], tz='US/Eastern')) assert_series_equal(s, expected) @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64]) @pytest.mark.parametrize("dtype", ["M8", "m8"]) @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D']) def test_construction_to_datetimelike_unit(self, arr_dtype, dtype, unit): # tests all units # gh-19223 dtype = "{}[{}]".format(dtype, unit) arr = np.array([1, 2, 3], dtype=arr_dtype) s = Series(arr) result = s.astype(dtype) expected = Series(arr.astype(dtype)) tm.assert_series_equal(result, expected) @pytest.mark.parametrize('arg', ['2013-01-01 00:00:00', pd.NaT, np.nan, None]) def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg): # GH 17415: With naive string result = Series([arg], dtype='datetime64[ns, CET]') expected = Series(pd.Timestamp(arg)).dt.tz_localize('CET') assert_series_equal(result, expected) def test_construction_interval(self): # construction from interval & array of intervals index = IntervalIndex.from_breaks(np.arange(3), closed='right') result = Series(index) repr(result) str(result) tm.assert_index_equal(Index(result.values), index) result = Series(index.values) tm.assert_index_equal(Index(result.values), index) def 
test_construction_consistency(self): # make sure that we are not re-localizing upon construction # GH 14928 s = Series(pd.date_range('20130101', periods=3, tz='US/Eastern')) result = Series(s, dtype=s.dtype) tm.assert_series_equal(result, s) result = Series(s.dt.tz_convert('UTC'), dtype=s.dtype) tm.assert_series_equal(result, s) result = Series(s.values, dtype=s.dtype) tm.assert_series_equal(result, s) def test_constructor_infer_period(self): data = [pd.Period('2000', 'D'), pd.Period('2001', 'D'), None] result = pd.Series(data) expected = pd.Series(period_array(data)) tm.assert_series_equal(result, expected) assert result.dtype == 'Period[D]' data = np.asarray(data, dtype=object) tm.assert_series_equal(result, expected) assert result.dtype == 'Period[D]' def test_constructor_period_incompatible_frequency(self): data = [pd.Period('2000', 'D'), pd.Period('2001', 'A')] result = pd.Series(data) assert result.dtype == object assert result.tolist() == data def test_constructor_periodindex(self): # GH7932 # converting a PeriodIndex when put in a Series pi = period_range('20130101', periods=5, freq='D') s = Series(pi) assert s.dtype == 'Period[D]' expected = Series(pi.astype(object)) assert_series_equal(s, expected) def test_constructor_dict(self): d = {'a': 0., 'b': 1., 'c': 2.} result = Series(d, index=['b', 'c', 'd', 'a']) expected = Series([1, 2, nan, 0], index=['b', 'c', 'd', 'a']) assert_series_equal(result, expected) pidx = tm.makePeriodIndex(100) d = {pidx[0]: 0, pidx[1]: 1} result = Series(d, index=pidx) expected = Series(np.nan, pidx) expected.iloc[0] = 0 expected.iloc[1] = 1 assert_series_equal(result, expected) def test_constructor_dict_order(self): # GH19018 # initialization ordering: by insertion order if python>= 3.6, else # order by value d = {'b': 1, 'a': 0, 'c': 2} result = Series(d) if PY36: expected = Series([1, 0, 2], index=list('bac')) else: expected = Series([0, 1, 2], index=list('abc')) tm.assert_series_equal(result, expected) 
@pytest.mark.parametrize("value", [2, np.nan, None, float('nan')])
def test_constructor_dict_nan_key(self, value):
    """Build a Series from dicts whose keys are, or contain, NaN-like values."""
    # GH 18480
    d = {1: 'a', value: 'b', float('nan'): 'c', 4: 'd'}
    result = Series(d).sort_values()
    expected = Series(['a', 'b', 'c', 'd'], index=[1, value, np.nan, 4])
    assert_series_equal(result, expected)

    # MultiIndex:
    d = {(1, 1): 'a', (2, np.nan): 'b', (3, value): 'c'}
    result = Series(d).sort_values()
    expected = Series(['a', 'b', 'c'],
                      index=Index([(1, 1), (2, np.nan), (3, value)]))
    assert_series_equal(result, expected)


def test_constructor_dict_datetime64_index(self):
    """Dicts keyed by datetime64, datetime or Timestamp give the same Series."""
    # GH 9456
    dates_as_str = ['1984-02-19', '1988-11-06', '1989-12-03', '1990-03-15']
    values = [42544017.198965244, 1234565, 40512335.181958228, -1]

    def create_data(constructor):
        # Key the shared values by the dates converted with ``constructor``.
        return dict(zip((constructor(x) for x in dates_as_str), values))

    data_datetime64 = create_data(np.datetime64)
    data_datetime = create_data(lambda x: datetime.strptime(x, '%Y-%m-%d'))
    data_Timestamp = create_data(Timestamp)

    expected = Series(values, (Timestamp(x) for x in dates_as_str))

    result_datetime64 = Series(data_datetime64)
    result_datetime = Series(data_datetime)
    result_Timestamp = Series(data_Timestamp)

    assert_series_equal(result_datetime64, expected)
    assert_series_equal(result_datetime, expected)
    assert_series_equal(result_Timestamp, expected)


def test_constructor_list_of_tuples(self):
    """A list of tuples round-trips through Series unchanged."""
    data = [(1, 1), (2, 2), (2, 3)]
    s = Series(data)
    assert list(s) == data


def test_constructor_tuple_of_tuples(self):
    """A tuple of tuples round-trips through Series unchanged."""
    data = ((1, 1), (2, 2), (2, 3))
    s = Series(data)
    assert tuple(s) == data


def test_constructor_dict_of_tuples(self):
    """Tuple dict keys (including a None element) become a MultiIndex."""
    data = {(1, 2): 3, (None, 5): 6}
    result = Series(data).sort_values()
    expected = Series([3, 6],
                      index=MultiIndex.from_tuples([(1, 2), (None, 5)]))
    tm.assert_series_equal(result, expected)


def test_constructor_set(self):
    """Constructing a Series from a set or frozenset raises TypeError."""
    values = {1, 2, 3, 4, 5}
    with pytest.raises(TypeError):
        Series(values)
    values = frozenset(values)
    with pytest.raises(TypeError):
        Series(values)


# https://github.com/pandas-dev/pandas/issues/22698
@pytest.mark.filterwarnings("ignore:elementwise comparison:FutureWarning")
def test_fromDict(self):
    """Check index ordering and dtype of Series built from dicts."""
    data = {'a': 0, 'b': 1, 'c': 2, 'd': 3}

    series = Series(data)
    assert tm.is_sorted(series.index)

    data = {'a': 0, 'b': '1', 'c': '2', 'd': datetime.now()}
    series = Series(data)
    assert series.dtype == np.object_

    data = {'a': 0, 'b': '1', 'c': '2', 'd': '3'}
    series = Series(data)
    assert series.dtype == np.object_

    data = {'a': '0', 'b': '1'}
    series = Series(data, dtype=float)
    assert series.dtype == np.float64


def test_fromValue(self, datetime_series):
    """Broadcast a scalar over an index and check the resulting dtype and length."""
    # np.nan / np.float64 are the spellings that survive NumPy 2.0
    # (np.NaN and np.float_ were removed there).
    nans = Series(np.nan, index=datetime_series.index)
    assert nans.dtype == np.float64
    assert len(nans) == len(datetime_series)

    strings = Series('foo', index=datetime_series.index)
    assert strings.dtype == np.object_
    assert len(strings) == len(datetime_series)

    d = datetime.now()
    dates = Series(d, index=datetime_series.index)
    assert dates.dtype == 'M8[ns]'
    assert len(dates) == len(datetime_series)

    # GH12336
    # Test construction of categorical series from value
    categorical = Series(0, index=datetime_series.index, dtype="category")
    expected = Series(0, index=datetime_series.index).astype("category")
    assert categorical.dtype == 'category'
    assert len(categorical) == len(datetime_series)
    tm.assert_series_equal(categorical, expected)


def test_constructor_dtype_timedelta64(self):
    """Check timedelta64[ns] inference and casting for various inputs."""
    # basic
    td = Series([timedelta(days=i) for i in range(3)])
    assert td.dtype == 'timedelta64[ns]'

    td = Series([timedelta(days=1)])
    assert td.dtype == 'timedelta64[ns]'

    td = Series([timedelta(days=1), timedelta(days=2),
                 np.timedelta64(1, 's')])
    assert td.dtype == 'timedelta64[ns]'

    # mixed with NaT
    td = Series([timedelta(days=1), NaT], dtype='m8[ns]')
    assert td.dtype == 'timedelta64[ns]'

    td = Series([timedelta(days=1), np.nan], dtype='m8[ns]')
    assert td.dtype == 'timedelta64[ns]'

    td = Series([np.timedelta64(300000000), pd.NaT], dtype='m8[ns]')
    assert td.dtype == 'timedelta64[ns]'

    # improved inference
    # GH5689
    td = Series([np.timedelta64(300000000), NaT])
    assert td.dtype == 'timedelta64[ns]'

    # because iNaT is int, not coerced to timedelta
    td = Series([np.timedelta64(300000000), iNaT])
    assert td.dtype == 'object'

    td = Series([np.timedelta64(300000000), np.nan])
    assert td.dtype == 'timedelta64[ns]'

    td = Series([pd.NaT, np.timedelta64(300000000)])
    assert td.dtype == 'timedelta64[ns]'

    td = Series([np.timedelta64(1, 's')])
    assert td.dtype == 'timedelta64[ns]'

    # these are frequency conversion astypes
    # for t in ['s', 'D', 'us', 'ms']:
    #    pytest.raises(TypeError, td.astype, 'm8[%s]' % t)

    # valid astype
    td.astype('int64')

    # invalid casting
    with pytest.raises(TypeError):
        td.astype('int32')

    # this is an invalid casting
    with pytest.raises(Exception):
        Series([timedelta(days=1), 'foo'], dtype='m8[ns]')

    # leave as object here
    td = Series([timedelta(days=i) for i in range(3)] + ['foo'])
    assert td.dtype == 'object'

    # these will correctly infer a timedelta
    s = Series([None, pd.NaT, '1 Day'])
    assert s.dtype == 'timedelta64[ns]'
    s = Series([np.nan, pd.NaT, '1 Day'])
    assert s.dtype == 'timedelta64[ns]'
    s = Series([pd.NaT, None, '1 Day'])
    assert s.dtype == 'timedelta64[ns]'
    s = Series([pd.NaT, np.nan, '1 Day'])
    assert s.dtype == 'timedelta64[ns]'


# GH 16406
def test_constructor_mixed_tz(self):
    """Mixing naive and tz-aware Timestamps yields an object Series."""
    s = Series([Timestamp('20130101'),
                Timestamp('20130101', tz='US/Eastern')])
    expected = Series([Timestamp('20130101'),
                       Timestamp('20130101', tz='US/Eastern')],
                      dtype='object')
    assert_series_equal(s, expected)


def test_NaT_scalar(self):
    """An iNaT entry reads back as missing and can be assigned elsewhere."""
    series = Series([0, 1000, 2000, iNaT], dtype='M8[ns]')

    val = series[3]
    assert isna(val)

    series[2] = val
    assert isna(series[2])


def test_NaT_cast(self):
    """Casting a NaN Series to M8[ns] gives NaT."""
    # GH10747
    result = Series([np.nan]).astype('M8[ns]')
    expected = Series([NaT])
    assert_series_equal(result, expected)


def test_constructor_name_hashable(self):
    """Any hashable object is accepted as the Series name."""
    for n in [777, 777., 'name', datetime(2001, 11, 11), (1, ), "\u05D0"]:
        for data in [[1, 2, 3], np.ones(3), {'a': 0, 'b': 1}]:
            s = Series(data, name=n)
            assert s.name == n


def test_constructor_name_unhashable(self):
    """Unhashable names are rejected with TypeError."""
    for n in [['name_list'], np.ones(2), {1: 2}]:
        for data in [['name_list'], np.ones(2), {1: 2}]:
            with pytest.raises(TypeError):
                Series(data, name=n)


def test_auto_conversion(self):
    """A list of Timestamps from date_range is inferred as M8[ns]."""
    series = Series(list(date_range('1/1/2000', periods=10)))
    assert series.dtype == 'M8[ns]'


def test_convert_non_ns(self):
    """Non-nanosecond timedelta64/datetime64 arrays are converted on input."""
    # convert from a numpy array of non-ns timedelta64
    arr = np.array([1, 2, 3], dtype='timedelta64[s]')
    s = Series(arr)
    expected = Series(pd.timedelta_range('00:00:01', periods=3, freq='s'))
    assert_series_equal(s, expected)

    # convert from a numpy array of non-ns datetime64
    # note that creating a numpy datetime64 is in LOCAL time!!!!
    # seems to work for M8[D], but not for M8[s]

    s = Series(np.array(['2013-01-01', '2013-01-02',
                         '2013-01-03'], dtype='datetime64[D]'))
    assert_series_equal(s, Series(date_range('20130101', periods=3,
                                             freq='D')))

    # s = Series(np.array(['2013-01-01 00:00:01','2013-01-01
    # 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]'))

    # assert_series_equal(s,date_range('20130101
    # 00:00:01',period=3,freq='s'))


@pytest.mark.parametrize(
    "index",
    [
        date_range('1/1/2000', periods=10),
        timedelta_range('1 day', periods=10),
        period_range('2000-Q1', periods=10, freq='Q')],
    ids=lambda x: type(x).__name__)
def test_constructor_cant_cast_datetimelike(self, index):
    """Datetime-like indexes cannot be cast to float but can be cast to int64."""
    # floats are not ok
    # Drop the literal "Index" suffix (PeriodIndex -> Period); we don't care
    # whether the error message says PeriodIndex or PeriodArray.
    # str.rstrip("Index") would strip a *character set* and also eat the
    # trailing "e"/"d" of "Datetime"/"Period", so slice the suffix instead.
    type_name = type(index).__name__
    if type_name.endswith("Index"):
        type_name = type_name[:-len("Index")]
    msg = "Cannot cast {}.*? to ".format(type_name)
    with pytest.raises(TypeError, match=msg):
        Series(index, dtype=float)

    # ints are ok
    # we test with np.int64 to get similar results on
    # windows / 32-bit platforms
    result = Series(index, dtype=np.int64)
    expected = Series(index.astype(np.int64))
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "index",
    [
        date_range('1/1/2000', periods=10),
        timedelta_range('1 day', periods=10),
        period_range('2000-Q1', periods=10, freq='Q')],
    ids=lambda x: type(x).__name__)
def test_constructor_cast_object(self, index):
    """dtype=object construction matches astype(object) for datetime-likes."""
    s = Series(index, dtype=object)
    exp = Series(index).astype(object)
    tm.assert_series_equal(s, exp)

    s = Series(pd.Index(index, dtype=object), dtype=object)
    exp = Series(index).astype(object)
    tm.assert_series_equal(s, exp)

    s = Series(index.astype(object), dtype=object)
    exp = Series(index).astype(object)
    tm.assert_series_equal(s, exp)


@pytest.mark.parametrize("dtype", [
    np.datetime64,
    np.timedelta64,
])
def test_constructor_generic_timestamp_no_frequency(self, dtype):
    """Unit-less datetime64/timedelta64 dtypes are rejected with ValueError."""
    # see gh-15524, gh-15987
    msg = "dtype has no unit. Please pass in"

    with pytest.raises(ValueError, match=msg):
        Series([], dtype=dtype)


@pytest.mark.parametrize("dtype,msg", [
    ("m8[ps]", "cannot convert timedeltalike"),
    ("M8[ps]", "cannot convert datetimelike"),
])
def test_constructor_generic_timestamp_bad_frequency(self, dtype, msg):
    """Picosecond-unit dtypes are rejected with TypeError."""
    # see gh-15524, gh-15987

    with pytest.raises(TypeError, match=msg):
        Series([], dtype=dtype)


@pytest.mark.parametrize('dtype', [None, 'uint8', 'category'])
def test_constructor_range_dtype(self, dtype):
    """A range input honours the requested dtype (int64 when none is given)."""
    # GH 16804
    expected = Series([0, 1, 2, 3, 4], dtype=dtype or 'int64')
    result = Series(range(5), dtype=dtype)
    tm.assert_series_equal(result, expected)


def test_constructor_tz_mixed_data(self):
    """Naive plus tz-aware Timestamps are kept as an object Series."""
    # GH 13051
    dt_list = [Timestamp('2016-05-01 02:03:37'),
               Timestamp('2016-04-30 19:03:37-0700', tz='US/Pacific')]
    result = Series(dt_list)
    expected = Series(dt_list, dtype=object)
    tm.assert_series_equal(result, expected)
class TestCategoricalOps:
    """Comparison, arithmetic and membership checks for ``Categorical``."""

    def test_compare_frame(self):
        """Categorical vs. transposed DataFrame comparison returns a frame."""
        # GH#24282 check that Categorical.__cmp__(DataFrame) defers to frame
        values = Categorical(["a", "b", 2, "a"])
        transposed = DataFrame(values).T

        tm.assert_frame_equal(
            values == transposed,
            DataFrame([[True, True, True, True]]),
        )
        tm.assert_frame_equal(
            values[::-1] != transposed,
            DataFrame([[False, True, True, False]]),
        )

    def test_compare_frame_raises(self, all_compare_operators):
        """Comparing against the untransposed frame raises ValueError."""
        # alignment raises unless we transpose
        values = Categorical(["a", "b", 2, "a"])
        frame = DataFrame(values)
        compare = getattr(operator, all_compare_operators)

        with pytest.raises(
            ValueError,
            match="Unable to coerce to Series, length must be 1: given 4",
        ):
            compare(values, frame)

    def test_datetime_categorical_comparison(self):
        """Ordered datetime categoricals compare against one of their elements."""
        dt_cat = Categorical(date_range("2014-01-01", periods=3), ordered=True)
        first = dt_cat[0]

        tm.assert_numpy_array_equal(
            dt_cat > first, np.array([False, True, True])
        )
        tm.assert_numpy_array_equal(
            first < dt_cat, np.array([False, True, True])
        )

    def test_reflected_comparison_with_scalars(self):
        """Scalar on either side of the operator gives the same mask."""
        # GH8658
        numbers = Categorical([1, 2, 3], ordered=True)
        first = numbers[0]

        tm.assert_numpy_array_equal(
            numbers > first, np.array([False, True, True])
        )
        tm.assert_numpy_array_equal(
            first < numbers, np.array([False, True, True])
        )

    def test_comparison_with_unknown_scalars(self):
        """Ordering against a non-category scalar raises; ==/!= do not."""
        # https://github.com/pandas-dev/pandas/issues/9836#issuecomment-92123057
        # and following comparisons with scalars not in categories should raise
        # for unequal comps, but not for equal/not equal
        cat = Categorical([1, 2, 3], ordered=True)
        expected_error = "Invalid comparison between dtype=category and int"

        ordering_checks = (
            lambda: cat < 4,
            lambda: cat > 4,
            lambda: 4 < cat,
            lambda: 4 > cat,
        )
        for check in ordering_checks:
            with pytest.raises(TypeError, match=expected_error):
                check()

        tm.assert_numpy_array_equal(
            cat == 4, np.array([False, False, False])
        )
        tm.assert_numpy_array_equal(
            cat != 4, np.array([True, True, True])
        )

    def test_comparison_of_ordered_categorical_with_nan_to_scalar(
        self, compare_operators_no_eq_ne
    ):
        """Ordered categorical with a missing value vs. scalar matches numpy."""
        # https://github.com/pandas-dev/pandas/issues/26504
        # BUG: fix ordered categorical comparison with missing values (#26504 )
        # and following comparisons with scalars in categories with missing
        # values should be evaluated as False
        op_name = compare_operators_no_eq_ne
        cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True)
        scalar = 2

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", RuntimeWarning)
            reference = np.array(cat)
            expected = getattr(reference, op_name)(scalar)

        actual = getattr(cat, op_name)(scalar)
        tm.assert_numpy_array_equal(actual, expected)

    def test_comparison_of_ordered_categorical_with_nan_to_listlike(
        self, compare_operators_no_eq_ne
    ):
        """Ordered categorical with a missing value vs. categorical matches numpy."""
        # https://github.com/pandas-dev/pandas/issues/26504
        # and following comparisons of missing values in ordered Categorical
        # with listlike should be evaluated as False
        op_name = compare_operators_no_eq_ne
        cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True)
        other = Categorical([2, 2, 2, 2], categories=[1, 2, 3], ordered=True)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", RuntimeWarning)
            reference = np.array(cat)
            expected = getattr(reference, op_name)(2)

        actual = getattr(cat, op_name)(other)
        tm.assert_numpy_array_equal(actual, expected)

    @pytest.mark.parametrize(
        "data,reverse,base",
        [(list("abc"), list("cba"), list("bbb")),
         ([1, 2, 3], [3, 2, 1], [2, 2, 2])],
    )
    def test_comparisons(self, data, reverse, base):
        """Series-of-categorical comparisons follow the category ordering."""
        cat_rev = Series(Categorical(data, categories=reverse, ordered=True))
        cat_rev_base = Series(
            Categorical(base, categories=reverse, ordered=True)
        )
        cat = Series(Categorical(data, ordered=True))
        cat_base = Series(
            Categorical(base, categories=cat.cat.categories, ordered=True)
        )
        plain_series = Series(base)
        plain_array = np.array(base)

        # comparisons need to take categories ordering into account
        tm.assert_series_equal(
            cat_rev > cat_rev_base, Series([True, False, False])
        )
        tm.assert_series_equal(
            cat_rev < cat_rev_base, Series([False, False, True])
        )
        tm.assert_series_equal(cat > cat_base, Series([False, False, True]))

        scalar = base[1]

        observed = cat > scalar
        from_values = cat.values > scalar
        tm.assert_series_equal(observed, Series([False, False, True]))
        tm.assert_numpy_array_equal(observed.values, from_values)

        observed_rev = cat_rev > scalar
        from_values_rev = cat_rev.values > scalar
        tm.assert_series_equal(observed_rev, Series([True, False, False]))
        tm.assert_numpy_array_equal(observed_rev.values, from_values_rev)

        # Only categories with same categories can be compared
        with pytest.raises(
            TypeError,
            match="Categoricals can only be compared if 'categories' are the same",
        ):
            cat > cat_rev

        # categorical cannot be compared to Series or numpy array, and also
        # not the other way around
        mismatch_msg = (
            "Cannot compare a Categorical for op __gt__ with type "
            r"<class 'numpy\.ndarray'>"
        )
        invalid_comparisons = (
            lambda: cat > plain_series,
            lambda: cat_rev > plain_series,
            lambda: cat > plain_array,
            lambda: cat_rev > plain_array,
            lambda: plain_series < cat,
            lambda: plain_series < cat_rev,
            lambda: plain_array < cat,
            lambda: plain_array < cat_rev,
        )
        for attempt in invalid_comparisons:
            with pytest.raises(TypeError, match=mismatch_msg):
                attempt()

    @pytest.mark.parametrize(
        "ctor",
        [
            lambda *args, **kwargs: Categorical(*args, **kwargs),
            lambda *args, **kwargs: Series(Categorical(*args, **kwargs)),
        ],
    )
    def test_unordered_different_order_equal(self, ctor):
        """Unordered categoricals compare by value regardless of category order."""
        # https://github.com/pandas-dev/pandas/issues/16014

        def unordered(values, categories):
            # Every object in this test is built unordered via ``ctor``.
            return ctor(values, categories=categories, ordered=False)

        left = unordered(["a", "b"], ["a", "b"])
        right = unordered(["a", "b"], ["b", "a"])
        assert (left == right).all()

        left = unordered(["a", "b"], ["a", "b"])
        right = unordered(["b", "a"], ["b", "a"])
        assert (left != right).all()

        left = unordered(["a", "a"], ["a", "b"])
        right = unordered(["b", "b"], ["b", "a"])
        assert (left != right).all()

        left = unordered(["a", "a"], ["a", "b"])
        right = unordered(["a", "b"], ["b", "a"])
        elementwise = left == right
        tm.assert_numpy_array_equal(
            np.array(elementwise), np.array([True, False])
        )

    def test_unordered_different_categories_raises(self):
        """Different category sets cannot be compared."""
        left = Categorical(["a", "b"], categories=["a", "b"], ordered=False)
        right = Categorical(["a", "c"], categories=["c", "a"], ordered=False)

        expected_error = "Categoricals can only be compared"
        with pytest.raises(TypeError, match=expected_error):
            left == right

    def test_compare_different_lengths(self):
        """Empty categoricals with different category lists cannot be compared."""
        left = Categorical([], categories=["a", "b"])
        right = Categorical([], categories=["a"])

        with pytest.raises(
            TypeError,
            match="Categoricals can only be compared if 'categories' are the same.",
        ):
            left == right

    def test_compare_unordered_different_order(self):
        """``equals`` is False when the single values differ."""
        # https://github.com/pandas-dev/pandas/issues/16603#issuecomment-
        # 349290078
        first = pd.Categorical(["a"], categories=["a", "b"])
        second = pd.Categorical(["b"], categories=["b", "a"])
        assert not first.equals(second)

    def test_numeric_like_ops(self):
        """Arithmetic, numeric reductions and ufuncs on categoricals raise."""
        frame = DataFrame({"value": np.random.randint(0, 10000, 100)})
        bin_names = [f"{i} - {i + 499}" for i in range(0, 10000, 500)]
        cat_labels = Categorical(bin_names, bin_names)

        frame = frame.sort_values(by=["value"], ascending=True)
        frame["value_group"] = pd.cut(
            frame.value, range(0, 10500, 500), right=False, labels=cat_labels
        )

        arithmetic_ops = [
            ("__add__", r"\+"),
            ("__sub__", "-"),
            ("__mul__", r"\*"),
            ("__truediv__", "/"),
        ]

        # numeric ops should not succeed
        for method_name, symbol in arithmetic_ops:
            msg = f"Series cannot perform the operation {symbol}|unsupported operand"
            with pytest.raises(TypeError, match=msg):
                getattr(frame, method_name)(frame)

        # reduction ops should not succeed (unless specifically defined, e.g.
        # min/max)
        grouped = frame["value_group"]
        for reduction in ["kurt", "skew", "var", "std", "mean", "sum", "median"]:
            msg = f"Categorical cannot perform the operation {reduction}"
            with pytest.raises(TypeError, match=msg):
                getattr(grouped, reduction)(numeric_only=False)

        # mad technically works because it takes always the numeric data

        # numpy ops
        cat_series = Series(Categorical([1, 2, 3, 4]))
        with pytest.raises(
            TypeError, match="Categorical cannot perform the operation sum"
        ):
            np.sum(cat_series)

        # numeric ops on a Series
        for method_name, symbol in arithmetic_ops:
            msg = f"Series cannot perform the operation {symbol}|unsupported operand"
            with pytest.raises(TypeError, match=msg):
                getattr(cat_series, method_name)(2)

        # invalid ufunc
        with pytest.raises(
            TypeError,
            match="Object with dtype category cannot perform the numpy op log",
        ):
            np.log(cat_series)

    def test_contains(self):
        """``in`` checks values (not codes) and NaN only when present."""
        # GH21508
        letters = pd.Categorical(list("aabbca"), categories=list("cab"))

        assert "b" in letters
        assert "z" not in letters
        assert np.nan not in letters
        with pytest.raises(TypeError, match="unhashable type: 'list'"):
            assert [1] in letters

        # assert codes NOT in index
        assert 0 not in letters
        assert 1 not in letters

        with_missing = pd.Categorical(
            list("aabbca") + [np.nan], categories=list("cab")
        )
        assert np.nan in with_missing

    @pytest.mark.parametrize(
        "item, expected",
        [
            (pd.Interval(0, 1), True),
            (1.5, True),
            (pd.Interval(0.5, 1.5), False),
            ("a", False),
            (pd.Timestamp(1), False),
            (pd.Timedelta(1), False),
        ],
        ids=str,
    )
    def test_contains_interval(self, item, expected):
        """Membership in an interval-valued categorical returns the exact bool."""
        # GH 23705
        intervals = Categorical(pd.IntervalIndex.from_breaks(range(3)))
        assert (item in intervals) is expected

    def test_contains_list(self):
        """List operands to ``in`` raise because they are unhashable."""
        # GH#21729
        numbers = Categorical([1, 2, 3])

        assert "a" not in numbers

        with pytest.raises(TypeError, match="unhashable type"):
            ["a"] in numbers

        with pytest.raises(TypeError, match="unhashable type"):
            ["a", "b"] in numbers
def transform(self, df):
    """Encode every retained column of ``df`` as integer codes.

    Columns listed in ``self.ranges`` are bucketed into fixed-width bins
    built from that column's ``min``/``max``/``gran`` entries; every other
    column is stringified, stripped and encoded by its sorted category
    position.  For each column the code -> label mapping is stored in
    ``self.kv_map[col]`` and its inverse in ``self.vk_map[col]``.

    ``df`` is modified in place, and ``self.columns`` / ``self.drop_column``
    are updated for the ``adult`` and ``bank`` user types.

    NOTE(review): the source's indentation was lost, so the nesting below is
    reconstructed.  The column pruning is assumed to belong to the
    ``adult``/``bank`` branch and both ``income`` replacements to the
    ``adult`` branch -- confirm against the original file.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw records; must contain the columns named in ``self.columns``.

    Returns
    -------
    numpy.ndarray
        ``df.values`` converted to the platform integer dtype.
    """
    # Perturbation schemes for which the ipums income columns are replaced
    # by uniformly random bin indices instead of being bucketed.
    random_bin_perturb_types = (
        'ord2cat2', 'ord2cat2q11', 'ord2cat2q20', 'ord2cat2q22',
        'ord2cat2q10', 'ord2cat2q12', 'ord4cat4q11', 'ord4cat4q12',
        'ord4cat4q10', 'ord4cat4q02', 'ord4cat4q01', 'ord4cat4q20',
        'ord4cat4q21',
    )
    random_bin_columns = ('INCTOT', 'FTOTINC')

    user_type = self.args.user_type
    if user_type in ('adult', 'bank'):
        if user_type == 'adult':
            # Drop the trailing-dot label variants, then map the labels
            # to the strings '0' / '1'.
            df.replace(
                {'income': {' <=50K.': ' <=50K', ' >50K.': ' >50K'}},
                inplace=True)
            df.replace({'income': {' <=50K': '0', ' >50K': '1'}},
                       inplace=True)
        # Keep only the perturbed columns and the aggregation column.
        keep = set(self.column_to_perturb) | {self.column_to_agg}
        self.drop_column = list(set(self.columns) - keep)
        df.drop(self.drop_column, axis=1, inplace=True)
        self.columns = [x for x in df.columns if x not in self.drop_column]

    for col in self.columns:
        if col not in self.ranges:
            # Categorical column: normalise to stripped strings and encode
            # each value by the position of its category.
            df[col] = df[col].apply(str)
            df[col] = df[col].str.strip()
            df[col] = pd.Categorical(df[col])
            logging.info('finish %s', col)

            self.kv_map[col] = dict(enumerate(df[col].cat.categories))
            self.vk_map[col] = {v: k for k, v in self.kv_map[col].items()}
            # rename_categories is the supported way to swap labels for
            # codes; DataFrame.replace on a categorical column is
            # deprecated when it changes the categories.
            df[col] = df[col].cat.rename_categories(self.vk_map[col])
            continue

        # Ordinal column: equal-width left-closed bins.
        spec = self.ranges[col]
        bins = np.round(
            np.arange(spec['min'], spec['max'], spec['gran']), 2)
        # pd.cut(df[col], bins, right=False) is too slow when there are
        # many bins, so the bin index is computed arithmetically.
        start = bins[0]
        step = bins[1] - bins[0]
        if (self.args.perturb_type in random_bin_perturb_types
                and col in random_bin_columns
                and user_type[0:5] == 'ipums'):
            df[col] = np.random.randint(0, len(bins), df.shape[0])
        else:
            df[col] = ((df[col].values - start) / step).astype(np.int16)
        logging.info('finish %s', col)

        intervals = [
            pd.Interval(x, np.round(x + spec['gran'], 2), closed='left')
            for x in bins
        ]
        self.kv_map[col] = dict(enumerate(intervals))
        self.vk_map[col] = {v: k for k, v in self.kv_map[col].items()}
        # The column already holds bin indices, so no Interval-keyed
        # replacement is needed (Interval keys never equal integers).

    # np.int was only an alias of the builtin int and was removed in
    # NumPy 1.24; the builtin gives the same dtype.
    return df.values.astype(int)