Beispiel #1
                           pd.Series(np.add(ser, Dummy(1))))

        pd.array([1, 3, 2], dtype=np.int64),
        pd.array([1, 3, 2], dtype="Int64"),
        pd.array([1, 3, 2], dtype="Float32"),
        pd.array([1, 10, 2], dtype="Sparse[int]"),
        pd.to_datetime(["2000", "2010", "2001"]),
        pd.to_datetime(["2000", "2010", "2001"]).tz_localize("CET"),
        pd.to_datetime(["2000", "2010", "2001"]).to_period(freq="D"),
        pd.to_timedelta(["1 Day", "3 Days", "2 Days"]),
            [pd.Interval(0, 1),
             pd.Interval(2, 3),
             pd.Interval(1, 2)]),
    ids=lambda x: str(x.dtype),
@pytest.mark.parametrize("box", [pd.array, pd.Index, pd.Series, pd.DataFrame])
def test_reduce(values, box, request):
    """Reduction test over ``values`` wrapped in ``box`` (only the setup is visible).

    The visible part initialises a ``same_type`` flag and clears it for
    integer/float data boxed in ``pd.Index``.  The code that consumes the
    flag is not part of this excerpt -- presumably it decides whether the
    reduced scalar must keep the input's type (TODO confirm upstream).
    """
    # TODO: cases with NAs

    same_type = True

    if box is pd.Index:
        # dtype.kind "i"/"f" are NumPy's kind codes for signed ints and floats
        if values.dtype.kind in ["i", "f"]:
            # ATM Index casts to object, so we get python ints/floats
            same_type = False
Beispiel #2
class TestDataFrameDataTypes:
    def test_empty_frame_dtypes(self):
        # Checks the ``dtypes`` Series reported for frames with no data, with
        # an index but no columns, with columns but no rows, and finally for a
        # small mixed-dtype frame and its empty slice.
        empty_df = DataFrame()
        tm.assert_series_equal(empty_df.dtypes, Series(dtype=object))

        nocols_df = DataFrame(index=[1, 2, 3])
        tm.assert_series_equal(nocols_df.dtypes, Series(dtype=object))

        norows_df = DataFrame(columns=list("abc"))
        # NOTE(review): the line that opens this call is missing from the
        # excerpt (the parenthesis below is unbalanced) -- presumably an
        # assert_series_equal on norows_df.dtypes; confirm against upstream.
                               Series(object, index=list("abc")))

        norows_int_df = DataFrame(columns=list("abc")).astype(np.int32)
        # NOTE(review): same problem here -- the call opener is missing.
                               Series(np.dtype("int32"), index=list("abc")))

        df = DataFrame(dict([("a", 1), ("b", True), ("c", 1.0)]),
                       index=[1, 2, 3])
        ex_dtypes = Series(
            dict([("a", np.int64), ("b", np.bool_), ("c", np.float64)]))
        tm.assert_series_equal(df.dtypes, ex_dtypes)

        # same but for empty slice of df
        tm.assert_series_equal(df[:0].dtypes, ex_dtypes)

    def test_datetime_with_tz_dtypes(self):
        # Builds a frame from three date ranges (naive, US/Eastern, CET),
        # blanks two cells with NaT and compares the sorted ``dtypes``.
        # NOTE(review): the dict keys / closing brackets of the literals below
        # are missing from this excerpt, so the block does not parse as shown;
        # the index labels ["A", "B", "C"] suggest the keys were "A"/"B"/"C"
        # (TODO confirm against upstream).
        tzframe = DataFrame({
            date_range("20130101", periods=3),
            date_range("20130101", periods=3, tz="US/Eastern"),
            date_range("20130101", periods=3, tz="CET"),
        # Insert a missing value into row 1 of the two tz-aware columns.
        tzframe.iloc[1, 1] = pd.NaT
        tzframe.iloc[1, 2] = pd.NaT
        result = tzframe.dtypes.sort_index()
        expected = Series(
                DatetimeTZDtype("ns", "US/Eastern"),
                DatetimeTZDtype("ns", "CET"),
            ["A", "B", "C"],

        tm.assert_series_equal(result, expected)

    def test_dtypes_are_correct_after_column_slice(self):
        # GH6525
        # Compares ``dtypes`` of an all-float frame and of a column slice of it.
        # NOTE(review): ``np.float_`` is an alias that NumPy 2.0 removed
        # (``np.float64`` is the replacement) -- verify the targeted NumPy.
        df = DataFrame(index=range(5), columns=list("abc"), dtype=np.float_)
        # NOTE(review): the assert_series_equal opener for the next two lines
        # is missing from this excerpt.
            Series(dict([("a", np.float_), ("b", np.float_),
                         ("c", np.float_)])),
        tm.assert_series_equal(df.iloc[:, 2:].dtypes,
                               Series(dict([("c", np.float_)])))
        # NOTE(review): call opener missing here as well; presumably re-checks
        # the full frame after slicing (TODO confirm).
            Series(dict([("a", np.float_), ("b", np.float_),
                         ("c", np.float_)])),

    def test_dtypes_gh8722(self, float_string_frame):
        # Adds a boolean column to the fixture frame and checks that
        # ``dtypes`` matches the per-column dtype collected via ``items()``.
        float_string_frame["bool"] = float_string_frame["A"] > 0
        result = float_string_frame.dtypes
        # NOTE(review): the ``Series(`` call below is never closed in this
        # excerpt -- at least one trailing argument line is missing.
        expected = Series({k: v.dtype
                           for k, v in float_string_frame.items()},
        tm.assert_series_equal(result, expected)

        # compat, GH 8722
        # NOTE(review): the "use_inf_as_na" option is reported deprecated in
        # recent pandas releases -- confirm it exists in the targeted version.
        with option_context("use_inf_as_na", True):
            df = DataFrame([[1]])
            result = df.dtypes
            tm.assert_series_equal(result, Series({0: np.dtype("int64")}))

    def test_singlerow_slice_categoricaldtype_gives_series(self):
        # GH29521
        df = DataFrame({"x": pd.Categorical("a b c d e".split())})
        result = df.iloc[0]
        raw_cat = pd.Categorical(["a"], categories=["a", "b", "c", "d", "e"])
        expected = Series(raw_cat, index=["x"], name=0, dtype="category")

        tm.assert_series_equal(result, expected)

    def test_timedeltas(self):
        # Tracks ``df.dtypes`` while columns are added: a datetime column "A",
        # a timedelta column "B", their sum "C" and finally an integer "D".
        # NOTE(review): the DataFrame(...) opener and the body of every
        # ``expected = Series(`` call are missing from this excerpt, so the
        # expected dtypes cannot be read from here.
        df = DataFrame(
                A=Series(date_range("2012-1-1", periods=3, freq="D")),
                B=Series([timedelta(days=i) for i in range(3)]),
        result = df.dtypes
        expected = Series(
        tm.assert_series_equal(result, expected)

        df["C"] = df["A"] + df["B"]
        result = df.dtypes
        expected = Series(
        tm.assert_series_equal(result, expected)

        # mixed int types
        df["D"] = 1
        result = df.dtypes
        expected = Series(
        tm.assert_series_equal(result, expected)

            ([1, 2]),
            (["1", "2"]),
            (list(pd.date_range("1/1/2011", periods=2, freq="H"))),
                pd.date_range("1/1/2011", periods=2, freq="H",
            ([pd.Interval(left=0, right=5)]),
    def test_constructor_list_str(self, input_vals, string_dtype):
        # GH 16605
        # Ensure that data elements are converted to strings when
        # dtype is str, 'str', or 'U'

        result = DataFrame({"A": input_vals}, dtype=string_dtype)
        expected = DataFrame({"A": input_vals}).astype({"A": string_dtype})
        tm.assert_frame_equal(result, expected)

    def test_constructor_list_str_na(self, string_dtype):

        result = DataFrame({"A": [1.0, 2.0, None]}, dtype=string_dtype)
        expected = DataFrame({"A": ["1.0", "2.0", None]}, dtype=object)
        tm.assert_frame_equal(result, expected)

        "data, expected",
            # empty
            (DataFrame(), True),
            # multi-same
                "A": [1, 2],
                "B": [1, 2]
            }), True),
            # multi-object
                    "A": np.array([1, 2], dtype=object),
                    "B": np.array(["a", "b"], dtype=object),
            # multi-extension
                    "A": pd.Categorical(["a", "b"]),
                    "B": pd.Categorical(["a", "b"])
            # differ types
                "A": [1, 2],
                "B": [1.0, 2.0]
            }), False),
            # differ sizes
                    "A": np.array([1, 2], dtype=np.int32),
                    "B": np.array([1, 2], dtype=np.int64),
            # multi-extension differ
                    "A": pd.Categorical(["a", "b"]),
                    "B": pd.Categorical(["b", "c"])
    def test_is_homogeneous_type(self, data, expected):
        assert data._is_homogeneous_type is expected

    def test_asarray_homogenous(self):
        # Converts a frame of two categorical columns with ``np.asarray`` and
        # expects a 2x2 object-dtype array.
        # NOTE(review): the dict / DataFrame call below is never closed in
        # this excerpt (a closing "})" line appears to be missing).
        df = DataFrame({
            "A": pd.Categorical([1, 2]),
            "B": pd.Categorical([1, 2])
        result = np.asarray(df)
        # may change from object in the future
        expected = np.array([[1, 1], [2, 2]], dtype="object")
        tm.assert_numpy_array_equal(result, expected)

    def test_str_to_small_float_conversion_type(self):
        # GH 20388
        col_data = [str(np.random.random() * 1e-12) for _ in range(5)]
        result = DataFrame(col_data, columns=["A"])
        expected = DataFrame(col_data, columns=["A"], dtype=object)
        tm.assert_frame_equal(result, expected)
        # change the dtype of the elements from object to float one by one
        result.loc[result.index, "A"] = [float(x) for x in col_data]
        expected = DataFrame(col_data, columns=["A"], dtype=float)
        tm.assert_frame_equal(result, expected)
Beispiel #3
def test_survival_table_from_events_will_collapse_to_desired_bins():
    """Collapsing with explicit ``intervals`` indexes the table by those bins."""
    durations = np.array([1, 3, 4, 5])
    observed = np.array([True, True, True, True])

    table = utils.survival_table_from_events(
        durations, observed, collapse=True, intervals=[0, 4, 8]
    )

    expected_bins = [
        pd.Interval(0, 4, closed='right'),
        pd.Interval(4, 8, closed='right'),
    ]
    assert table.index.tolist() == expected_bins
def catagorize_donation_amounts(donation_df):
    # Plots histograms of per-row donation deltas, bins the deltas into
    # donor categories, accumulates a running sum per bin and hands the
    # top-donor group to ``correlate_binned_data`` (defined elsewhere).
    #
    # ``donation_df`` must provide the columns 'date' and 'amount' (both are
    # indexed below).
    #
    # NOTE(review): this excerpt is missing lines -- see the stray text at
    # the second histogram and the unterminated ``sns.lineplot`` /
    # ``plt.legend`` calls near the end -- so it does not parse as shown.

    # pop the first 11 rows which are not per 10s
    SKIP = 11

    # donation categories
    bins = [0, 100, 5000, 50000, 9999999999]

    data_dates = donation_df['date'].iloc[SKIP:]
    # Row-to-row difference of every column; only 'amount' is kept afterwards.
    donation_data_delta = donation_df.diff(periods=1, axis=0)
    donation_data_delta = donation_data_delta.iloc[SKIP:]['amount']

    # NOTE(review): no ``axis`` argument is visible, so as written the two
    # Series are stacked row-wise under the given keys rather than placed
    # side by side as columns -- confirm whether an ``axis=1`` line was lost.
    merged = pd.concat([data_dates, donation_data_delta],
                       keys=['date', 'donated_amount'])
    plt.hist(np.abs(merged['donated_amount']), bins=500, log=True)
    plt.xlabel("Amount donated in USD", fontsize=30)
    plt.ylabel("Frequency", fontsize=30)
    # NOTE(review): nothing plotted so far carries a ``label`` -- verify that
    # this legend call has anything to show.
    plt.legend(prop={'size': 20})
    MEDIUM - HIGH - Histogram of donation amounts
    plt.hist(np.abs(merged['donated_amount']), bins=10000, log=True)
    plt.xlim(left=0, right=50000)
    plt.xlabel("Amount donated in USD", fontsize=30)
    plt.ylabel("Frequency", fontsize=30)
    # Dashed guides at the 5000 and 100 bin edges.
    plt.axvline(5000, color='k', linestyle='dashed', label="$5000", alpha=0.5)
    plt.axvline(100, color='r', linestyle='dashed', label="$100", alpha=0.5)
    plt.legend(prop={'size': 20})

    # Third histogram: only deltas below 300 (signed values, no np.abs here).
    merged = merged[merged['donated_amount'] < 300]
    plt.hist(merged['donated_amount'], bins=2000)
    plt.xlim(left=0, right=300)
    plt.xlabel("Amount donated in USD", fontsize=30)
    plt.ylabel("Frequency", fontsize=30)
    plt.axvline(100, color='r', linestyle='dashed', label="$100", alpha=0.5)
    plt.legend(prop={'size': 20})

    # Rebuild the unfiltered data before binning.
    merged = pd.concat([data_dates, donation_data_delta],
                       keys=['date', 'donated_amount'])
    # Bin the donations
    merged['bin'] = pd.cut(x=merged['donated_amount'], bins=bins)
    # cumsum the bins
    merged['cumsum'] = merged.groupby('bin')['donated_amount'].cumsum()

    # Group the tweets per category for v-lines
    binned = merged.groupby(['bin'])

    # Get the donors in highest interval
    # (edges equal the last two entries of ``bins`` above)
    TOP_DONORS_INTERVAL = pd.Interval(left=50000, right=9999999999)
    top_donor_data = binned.get_group(TOP_DONORS_INTERVAL)

    ax = sns.lineplot(x="date",
    ax.set(xlabel='Date', ylabel='Binned Cumulative Sum')
    plt.legend(loc='upper left',
                   'Biggest donors', 'Medium donors', 'Large donors',
                   'Smallest donors'
               prop={'size': 20})
    # plt.yscale('log')
    plt.ylabel("USD donated", fontsize=30)

    correlate_binned_data(top_donor_data, binned, bins)
Beispiel #5
 def test_is_all_dates(self):
     # GH 23576
     year_2017 = pd.Interval(Timestamp("2017-01-01 00:00:00"),
                             Timestamp("2018-01-01 00:00:00"))
     year_2017_index = pd.IntervalIndex([year_2017])
     assert not year_2017_index._is_all_dates
Beispiel #6
class TestSeriesConvertDtypes:
    # The answerdict has keys that have 4 tuples, corresponding to the arguments
    # infer_objects, convert_string, convert_integer, convert_boolean
    # This allows all 16 possible combinations to be tested.  Since common
    # combinations expect the same answer, this provides an easy way to list
    # all the possibilities
        "data, maindtype, answerdict",
                [1, 2, 3],
                    ((True, False), (True, False), (True, ), (True, False)):
                    ((True, False), (True, False), (False, ), (True, False)):
                [1, 2, 3],
                    ((True, False), (True, False), (True, ), (True, False)):
                    ((True, False), (True, False), (False, ), (True, False)):
                ["x", "y", "z"],
                        (True, False),
                        (True, ),
                        (True, False),
                        (True, False),
                    ((True, False), (False, ), (True, False), (True, False)):
                [True, False, np.nan],
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, ),
                    ((True, False), (True, False), (True, False), (False, )):
                ["h", "i", np.nan],
                        (True, False),
                        (True, ),
                        (True, False),
                        (True, False),
                    ((True, False), (False, ), (True, False), (True, False)):
                [10, np.nan, 20],
                    ((True, False), (True, False), (True, ), (True, False)):
                    ((True, False), (True, False), (False, ), (True, False)):
                [np.nan, 100.5, 200],
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                [3, 4, 5],
                    ((True, False), (True, False), (True, False), (True, False)):
                [[1, 2], [3, 4], [5]],
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                [4, 5, 6],
                    ((True, False), (True, False), (True, ), (True, False)):
                    ((True, False), (True, False), (False, ), (True, False)):
                [-10, 12, 13],
                    ((True, False), (True, False), (True, ), (True, False)):
                    ((True, False), (True, False), (False, ), (True, False)):
                [1, 2.0],
                    ((True, False), (True, False), (True, ), (True, False)):
                    ((True, ), (True, False), (False, ), (True, False)):
                    ((False, ), (True, False), (False, ), (True, False)):
                ["a", "b"],
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
                        (True, ),
                        (True, False),
                        (True, False),
                        (True, False),
                        (False, ),
                        (True, False),
                        (True, False),
                        (True, False),
                pd.period_range("1/1/2011", freq="M", periods=3),
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                pd.arrays.IntervalArray([pd.Interval(0, 1),
                                         pd.Interval(1, 5)]),
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
    @pytest.mark.parametrize("params", product(*[(True, False)] * 4))
    def test_convert_dtypes(self, data, maindtype, params, answerdict):
        # Runs ``Series.convert_dtypes`` with one combination of boolean flags
        # (``params``) and checks the resulting dtype against ``answerdict``,
        # then verifies that the result does not share data with the input.
        if maindtype is not None:
            series = pd.Series(data, dtype=maindtype)
            # NOTE(review): an ``else:`` appears to be missing from this
            # excerpt; as written, the next assignment always overrides the
            # dtype-specific Series built above.
            series = pd.Series(data)
        # Expand every key of ``answerdict`` -- a tuple of per-flag option
        # tuples -- into all concrete flag combinations it stands for.
        # NOTE(review): the closing "}" of this comprehension is missing.
        answers = {
            k: a
            for (kk, a) in answerdict.items() for k in product(*kk)

        ns = series.convert_dtypes(*params)
        expected_dtype = answers[tuple(params)]
        expected = pd.Series(series.values, dtype=expected_dtype)
        tm.assert_series_equal(ns, expected)

        # Test that it is a copy
        copy = series.copy(deep=True)
        ns[ns.notna()] = np.nan

        # Make sure original not changed
        tm.assert_series_equal(series, copy)
Beispiel #7
     [np.datetime64("2013-01-01"), np.nan,
    ("datetime", [pd.Timestamp("20130101"), np.nan,
    ("date", [date(2013, 1, 1), np.nan,
              date(2018, 1, 1)]),
    # The following two dtypes are commented out due to GH 23554
    # ('complex', [1 + 1j, np.nan, 2 + 2j]),
    # ('timedelta64', [np.timedelta64(1, 'D'),
    #                  np.nan, np.timedelta64(2, 'D')]),
    ("timedelta", [timedelta(1), np.nan, timedelta(2)]),
    ("time", [time(1), np.nan, time(2)]),
    ("period", [pd.Period(2013), pd.NaT,
    ("interval", [pd.Interval(0, 1), np.nan,
                  pd.Interval(0, 2)]),
ids, _ = zip(*_any_skipna_inferred_dtype)  # use inferred type as fixture-id

@pytest.fixture(params=_any_skipna_inferred_dtype, ids=ids)
def any_skipna_inferred_dtype(request):
    # NOTE(review): the lines below look like the text of a docstring whose
    # triple quotes were lost in extraction; the list of covered types and
    # the fixture body are cut off at the end of this excerpt.
    Fixture for all inferred dtypes from _libs.lib.infer_dtype

    The covered (inferred) types are:
    * 'string'
    * 'empty'
    * 'bytes'
    * 'mixed'
Beispiel #8
class TestContains:
    def test_contains(self):

        ci = CategoricalIndex(list("aabbca"), categories=list("cabdef"), ordered=False)

        assert "a" in ci
        assert "z" not in ci
        assert "e" not in ci
        assert np.nan not in ci

        # assert codes NOT in index
        assert 0 not in ci
        assert 1 not in ci

    def test_contains_nan(self):
        ci = CategoricalIndex(list("aabbca") + [np.nan], categories=list("cabdef"))
        assert np.nan in ci

    @pytest.mark.parametrize("unwrap", [True, False])
    def test_contains_na_dtype(self, unwrap):
        dti = pd.date_range("2016-01-01", periods=100).insert(0, pd.NaT)
        pi = dti.to_period("D")
        tdi = dti - dti[-1]
        ci = CategoricalIndex(dti)

        obj = ci
        if unwrap:
            obj = ci._data

        assert np.nan in obj
        assert None in obj
        assert pd.NaT in obj
        assert np.datetime64("NaT") in obj
        assert np.timedelta64("NaT") not in obj

        obj2 = CategoricalIndex(tdi)
        if unwrap:
            obj2 = obj2._data

        assert np.nan in obj2
        assert None in obj2
        assert pd.NaT in obj2
        assert np.datetime64("NaT") not in obj2
        assert np.timedelta64("NaT") in obj2

        obj3 = CategoricalIndex(pi)
        if unwrap:
            obj3 = obj3._data

        assert np.nan in obj3
        assert None in obj3
        assert pd.NaT in obj3
        assert np.datetime64("NaT") not in obj3
        assert np.timedelta64("NaT") not in obj3

        "item, expected",
            (pd.Interval(0, 1), True),
            (1.5, True),
            (pd.Interval(0.5, 1.5), False),
            ("a", False),
            (Timestamp(1), False),
            (pd.Timedelta(1), False),
    def test_contains_interval(self, item, expected):
        # GH 23705
        ci = CategoricalIndex(IntervalIndex.from_breaks(range(3)))
        result = item in ci
        assert result is expected

    def test_contains_list(self):
        """Unhashable probes raise instead of reporting non-membership (GH#21729)."""
        idx = CategoricalIndex([1, 2, 3])

        assert "a" not in idx

        for unhashable in (["a"], ["a", "b"]):
            with pytest.raises(TypeError, match="unhashable type"):
                unhashable in idx
Beispiel #9
    def _get(self,
        # Generator that pages through an HTTP API and yields each decoded
        # JSON payload (``r.json()``).
        #
        # NOTE(review): this excerpt is missing lines -- the signature is
        # unterminated, several calls are left open and some branches have no
        # body -- so parameters such as ``ep``, ``symbol``, ``start_date``,
        # ``end_date``, ``retry`` and ``retry_wait`` are used but never shown
        # being declared.  The comments describe only what is visible.

        # ``dates`` is the list of time windows to query; the default
        # ``[None]`` produces a single pass of the loop below without a window.
        dates = [None]
        if start_date:
            if not end_date:
                end_date = pd.Timestamp.utcnow()
            dates = pd.interval_range(API._timestamp(start_date),
            if len(dates) == 0:
            elif dates[-1].right < API._timestamp(end_date):
                    pd.Interval(dates[-1].right, API._timestamp(end_date)))

        @request_retry(self.ID, retry, retry_wait)
        def helper(start, start_date, end_date):
            # Builds the endpoint URL and issues one GET request.
            if start_date and end_date:
                endpoint = f'/api/v1/{ep}?symbol={symbol}&count={API_MAX}&reverse=false&start={start}&startTime={start_date}&endTime={end_date}'
                # NOTE(review): an ``else:`` appears to be missing here; as
                # written the windowed URL above is always overwritten.
                endpoint = f'/api/v1/{ep}?symbol={symbol}&reverse=true'
            header = {}
            # Sign the request only when both credentials are configured.
            if self.key_id and self.key_secret:
                header = self._generate_signature("GET", endpoint)
            header['Accept'] = 'application/json'
            return requests.get('{}{}'.format(self.api, endpoint),

        for interval in dates:
            # ``start`` is the offset sent as the ``start`` query parameter;
            # it restarts at 0 for every window.
            start = 0
            if interval is not None:
                end = interval.right
                # Pull the right edge back by 1ns -- presumably so adjacent
                # windows do not overlap (TODO confirm).
                end -= pd.Timedelta(nanoseconds=1)

                # Render both timestamps as "<date>T<time>Z" strings.
                start_date = str(interval.left).replace(" ", "T") + "Z"
                end_date = str(end).replace(" ", "T") + "Z"

            while True:
                r = helper(start, start_date, end_date)

                if r.status_code in {502, 504}:
                    LOG.warning("%s: %d for URL %s - %s", self.ID,
                                r.status_code, r.url, r.text)
                elif r.status_code == 429:
                elif r.status_code != 200:
                    self._handle_error(r, LOG)

                limit = int(r.headers['X-RateLimit-Remaining'])
                data = r.json()

                yield data

                # A payload with fewer than API_MAX items is tested for here;
                # the action taken is missing from this excerpt.
                if len(data) != API_MAX:

                # Likewise the action for an exhausted rate limit is missing.
                if limit < 1:

                # Advance the offset by the number of items just received.
                start += len(data)
Beispiel #10
class TestSeriesReplace:
    def test_replace_explicit_none(self):
        # GH#36984 if the user explicitly passes value=None, give it to them
        ser = pd.Series([0, 0, ""], dtype=object)
        result = ser.replace("", None)
        expected = pd.Series([0, 0, None], dtype=object)
        tm.assert_series_equal(result, expected)

        # Same check on a DataFrame: only the cell holding "" becomes None.
        df = pd.DataFrame(np.zeros((3, 3)))
        df.iloc[2, 2] = ""
        result = df.replace("", None)
        # NOTE(review): the dict / DataFrame call below is never closed in
        # this excerpt (a closing "})" line appears to be missing).
        expected = pd.DataFrame({
            0: np.zeros(3),
            1: np.zeros(3),
            2: np.array([0.0, 0.0, None], dtype=object),
        assert expected.iloc[2, 2] is None
        tm.assert_frame_equal(result, expected)

        # GH#19998 same thing with object dtype
        ser = pd.Series([10, 20, 30, "a", "a", "b", "a"])
        result = ser.replace("a", None)
        expected = pd.Series([10, 20, 30, None, None, "b", None])
        assert expected.iloc[-1] is None
        tm.assert_series_equal(result, expected)

    def test_replace_noop_doesnt_downcast(self):
        # GH#44498
        ser = pd.Series(
            [None, None, pd.Timestamp("2021-12-16 17:31")], dtype=object)
        res = ser.replace({np.nan: None})  # should be a no-op
        tm.assert_series_equal(res, ser)
        assert res.dtype == object

        # same thing but different calling convention
        res = ser.replace(np.nan, None)
        tm.assert_series_equal(res, ser)
        assert res.dtype == object

    def test_replace(self):
        # Exercises Series.replace with a single value, a list, a dict and two
        # parallel lists, both as a copy and in place.
        N = 100
        ser = pd.Series(np.random.randn(N))
        ser[0:4] = np.nan
        ser[6:10] = 0

        # replace list with a single value
        return_value = ser.replace([np.nan], -1, inplace=True)
        assert return_value is None

        # NaN was already replaced in place, so fillna must be a no-op here.
        exp = ser.fillna(-1)
        tm.assert_series_equal(ser, exp)

        rs = ser.replace(0.0, np.nan)
        ser[ser == 0.0] = np.nan
        tm.assert_series_equal(rs, ser)

        # NOTE(review): the call below is never closed in this excerpt; a
        # continuation line with further arguments appears to be missing.
        ser = pd.Series(np.fabs(np.random.randn(N)),
        ser[:5] = np.nan
        ser[6:10] = "foo"
        ser[20:30] = "bar"

        # replace list with a single value
        rs = ser.replace([np.nan, "foo", "bar"], -1)

        assert (rs[:5] == -1).all()
        assert (rs[6:10] == -1).all()
        assert (rs[20:30] == -1).all()
        # the source Series must be left untouched by the copying call
        assert (pd.isna(ser[:5])).all()

        # replace with different values
        rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3})

        assert (rs[:5] == -1).all()
        assert (rs[6:10] == -2).all()
        assert (rs[20:30] == -3).all()
        assert (pd.isna(ser[:5])).all()

        # replace with different values with 2 lists
        rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3])
        tm.assert_series_equal(rs, rs2)

        # replace inplace
        return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True)
        assert return_value is None

        assert (ser[:5] == -1).all()
        assert (ser[6:10] == -1).all()
        assert (ser[20:30] == -1).all()

    def test_replace_nan_with_inf(self):
        ser = pd.Series([np.nan, 0, np.inf])
        tm.assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0))

        ser = pd.Series([np.nan, 0, "foo", "bar", np.inf, None, pd.NaT])
        tm.assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0))
        filled = ser.copy()
        filled[4] = 0
        tm.assert_series_equal(ser.replace(np.inf, 0), filled)

    def test_replace_listlike_value_listlike_target(self, datetime_series):
        """List-to-list replace: length check, no-op case and element-wise mapping."""
        ser = pd.Series(datetime_series.index)
        tm.assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0))

        # malformed
        with pytest.raises(
            ValueError,
            match=r"Replacement lists must match in length\. Expecting 3 got 2",
        ):
            ser.replace([1, 2, 3], [np.nan, 0])

        # ser is dt64 so can't hold 1 or 2, so this replace is a no-op
        tm.assert_series_equal(ser.replace([1, 2], [np.nan, 0]), ser)

        # integer data: every value is mapped to its mirror image
        ints = pd.Series([0, 1, 2, 3, 4])
        mirrored = ints.replace([0, 1, 2, 3, 4], [4, 3, 2, 1, 0])
        tm.assert_series_equal(mirrored, pd.Series([4, 3, 2, 1, 0]))

    def test_replace_gh5319(self):
        # API change from 0.12?
        # GH 5319
        ser = pd.Series([0, np.nan, 2, 3, 4])
        expected = ser.ffill()
        result = ser.replace([np.nan])
        tm.assert_series_equal(result, expected)

        ser = pd.Series([0, np.nan, 2, 3, 4])
        expected = ser.ffill()
        result = ser.replace(np.nan)
        tm.assert_series_equal(result, expected)

    def test_replace_datetime64(self):
        # GH 5797
        # Replaces one Timestamp of a datetime Series, first via a dict and
        # then via positional arguments, expecting the same result.
        ser = pd.Series(pd.date_range("20130101", periods=5))
        expected = ser.copy()
        expected.loc[2] = pd.Timestamp("20120101")
        result = ser.replace(
            {pd.Timestamp("20130103"): pd.Timestamp("20120101")})
        tm.assert_series_equal(result, expected)
        # NOTE(review): the call below is never closed in this excerpt -- the
        # line carrying the replacement value appears to be missing.
        result = ser.replace(pd.Timestamp("20130103"),
        tm.assert_series_equal(result, expected)

    def test_replace_nat_with_tz(self):
        # GH 11792: Test with replacing NaT in a list with tz data
        ts = pd.Timestamp("2015/01/01", tz="UTC")
        s = pd.Series([pd.NaT, pd.Timestamp("2015/01/01", tz="UTC")])
        result = s.replace([np.nan, pd.NaT], pd.Timestamp.min)
        expected = pd.Series([pd.Timestamp.min, ts], dtype=object)
        tm.assert_series_equal(expected, result)

    def test_replace_timedelta_td64(self):
        tdi = pd.timedelta_range(0, periods=5)
        ser = pd.Series(tdi)

        # Using a single dict argument means we go through replace_list
        result = ser.replace({ser[1]: ser[3]})

        expected = pd.Series([ser[0], ser[3], ser[2], ser[3], ser[4]])
        tm.assert_series_equal(result, expected)

    def test_replace_with_single_list(self):
        # Calls replace with a list of targets and no replacement value; the
        # expected result [0, 0, 0, 0, 4] shows the targets taking the
        # preceding value.
        ser = pd.Series([0, 1, 2, 3, 4])
        result = ser.replace([1, 2, 3])
        tm.assert_series_equal(result, pd.Series([0, 0, 0, 0, 4]))

        s = ser.copy()
        return_value = s.replace([1, 2, 3], inplace=True)
        assert return_value is None
        tm.assert_series_equal(s, pd.Series([0, 0, 0, 0, 4]))

        # make sure things don't get corrupted when fillna call fails
        s = ser.copy()
        msg = (r"Invalid fill method\. Expecting pad \(ffill\) or backfill "
               r"\(bfill\)\. Got crash_cymbal")
        with pytest.raises(ValueError, match=msg):
            # NOTE(review): the call below is never closed in this excerpt;
            # judging by ``msg`` the lost line passed an invalid
            # method="crash_cymbal" (TODO confirm against upstream).
            return_value = s.replace([1, 2, 3],
            assert return_value is None
        tm.assert_series_equal(s, ser)

    def test_replace_mixed_types(self):
        ser = pd.Series(np.arange(5), dtype="int64")

        def check_replace(to_rep, val, expected):
            sc = ser.copy()
            result = ser.replace(to_rep, val)
            return_value = sc.replace(to_rep, val, inplace=True)
            assert return_value is None
            tm.assert_series_equal(expected, result)
            tm.assert_series_equal(expected, sc)

        # 3.0 can still be held in our int64 series, so we do not upcast GH#44940
        tr, v = [3], [3.0]
        check_replace(tr, v, ser)
        # Note this matches what we get with the scalars 3 and 3.0
        check_replace(tr[0], v[0], ser)

        # MUST upcast to float
        e = pd.Series([0, 1, 2, 3.5, 4])
        tr, v = [3], [3.5]
        check_replace(tr, v, e)

        # casts to object
        e = pd.Series([0, 1, 2, 3.5, "a"])
        tr, v = [3, 4], [3.5, "a"]
        check_replace(tr, v, e)

        # again casts to object
        e = pd.Series([0, 1, 2, 3.5, pd.Timestamp("20130101")])
        tr, v = [3, 4], [3.5, pd.Timestamp("20130101")]
        check_replace(tr, v, e)

        # casts to object
        e = pd.Series([0, 1, 2, 3.5, True], dtype="object")
        tr, v = [3, 4], [3.5, True]
        check_replace(tr, v, e)

        # test an object with dates + floats + integers + strings
        dr = pd.Series(pd.date_range("1/1/2001", "1/10/2001", freq="D"))
        result = dr.astype(object).replace([dr[0], dr[1], dr[2]],
                                           [1.0, 2, "a"])
        expected = pd.Series([1.0, 2, "a"] + dr[3:].tolist(), dtype=object)
        tm.assert_series_equal(result, expected)

    def test_replace_bool_with_string_no_op(self):
        s = pd.Series([True, False, True])
        result = s.replace("fun", "in-the-sun")
        tm.assert_series_equal(s, result)

    def test_replace_bool_with_string(self):
        # nonexistent elements
        s = pd.Series([True, False, True])
        result = s.replace(True, "2u")
        expected = pd.Series(["2u", False, "2u"])
        tm.assert_series_equal(expected, result)

    def test_replace_bool_with_bool(self):
        s = pd.Series([True, False, True])
        result = s.replace(True, False)
        expected = pd.Series([False] * len(s))
        tm.assert_series_equal(expected, result)

    def test_replace_with_dict_with_bool_keys(self):
        s = pd.Series([True, False, True])
        result = s.replace({"asdf": "asdb", True: "yes"})
        expected = pd.Series(["yes", False, "yes"])
        tm.assert_series_equal(result, expected)

    def test_replace_Int_with_na(self, any_int_ea_dtype):
        # GH 38267
        result = pd.Series([0, None], dtype=any_int_ea_dtype).replace(0, pd.NA)
        expected = pd.Series([pd.NA, pd.NA], dtype=any_int_ea_dtype)
        tm.assert_series_equal(result, expected)
        result = pd.Series([0, 1], dtype=any_int_ea_dtype).replace(0, pd.NA)
        result.replace(1, pd.NA, inplace=True)
        tm.assert_series_equal(result, expected)

    def test_replace2(self):
        # Repeats the mixed-type portion of ``test_replace`` in this class:
        # single value, dict, two parallel lists and in-place replacement.
        N = 100
        # NOTE(review): the call below is never closed in this excerpt; a
        # continuation line with further arguments appears to be missing.
        ser = pd.Series(np.fabs(np.random.randn(N)),
        ser[:5] = np.nan
        ser[6:10] = "foo"
        ser[20:30] = "bar"

        # replace list with a single value
        rs = ser.replace([np.nan, "foo", "bar"], -1)

        assert (rs[:5] == -1).all()
        assert (rs[6:10] == -1).all()
        assert (rs[20:30] == -1).all()
        # the source Series must be left untouched by the copying call
        assert (pd.isna(ser[:5])).all()

        # replace with different values
        rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3})

        assert (rs[:5] == -1).all()
        assert (rs[6:10] == -2).all()
        assert (rs[20:30] == -3).all()
        assert (pd.isna(ser[:5])).all()

        # replace with different values with 2 lists
        rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3])
        tm.assert_series_equal(rs, rs2)

        # replace inplace
        return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True)
        assert return_value is None
        assert (ser[:5] == -1).all()
        assert (ser[6:10] == -1).all()
        assert (ser[20:30] == -1).all()

    def test_replace_with_dictlike_and_string_dtype(self,
        # NOTE(review): the rest of the signature is missing from this
        # excerpt; the body uses ``nullable_string_dtype``, so that is
        # presumably the lost parameter (TODO confirm).
        # GH 32621, GH#44940
        # A dict replace on a nullable-string Series keeps the dtype and
        # leaves the missing value alone.
        ser = pd.Series(["one", "two", np.nan], dtype=nullable_string_dtype)
        expected = pd.Series(["1", "2", np.nan], dtype=nullable_string_dtype)
        result = ser.replace({"one": "1", "two": "2"})
        tm.assert_series_equal(expected, result)

    def test_replace_with_empty_dictlike(self):
        # GH 15289
        s = pd.Series(list("abcd"))
        tm.assert_series_equal(s, s.replace({}))

        with tm.assert_produces_warning(FutureWarning):
            empty_series = pd.Series([])
        tm.assert_series_equal(s, s.replace(empty_series))

    def test_replace_string_with_number(self):
        # GH 15743
        s = pd.Series([1, 2, 3])
        result = s.replace("2", np.nan)
        expected = pd.Series([1, 2, 3])
        tm.assert_series_equal(expected, result)

    def test_replace_replacer_equals_replacement(self):
        # GH 20656
        # make sure all replacers are matching against original values
        s = pd.Series(["a", "b"])
        expected = pd.Series(["b", "a"])
        result = s.replace({"a": "b", "b": "a"})
        tm.assert_series_equal(expected, result)

    def test_replace_unicode_with_number(self):
        # GH 15743
        s = pd.Series([1, 2, 3])
        result = s.replace("2", np.nan)
        expected = pd.Series([1, 2, 3])
        tm.assert_series_equal(expected, result)

    def test_replace_mixed_types_with_string(self):
        # Testing mixed
        s = pd.Series([1, 2, 3, "4", 4, 5])
        result = s.replace([2, "4"], np.nan)
        expected = pd.Series([1, np.nan, 3, np.nan, 4, 5])
        tm.assert_series_equal(expected, result)

        "categorical, numeric",
            (pd.Categorical(["A"], categories=["A", "B"]), [1]),
            (pd.Categorical(["A", "B"], categories=["A", "B"]), [1, 2]),
    def test_replace_categorical(self, categorical, numeric):
        # GH 24971, GH#23305
        ser = pd.Series(categorical)
        result = ser.replace({"A": 1, "B": 2})
        expected = pd.Series(numeric).astype("category")
        if 2 not in
            # i.e. categories should be [1, 2] even if there are no "B"s present
            # GH#44940
            expected =
        tm.assert_series_equal(expected, result)

    def test_replace_categorical_single(self):
        # GH 26988
        dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific")
        s = pd.Series(dti)
        c = s.astype("category")

        expected = c.copy()
        expected ="foo")
        expected[2] = "foo"
        expected =
        assert c[2] != "foo"

        result = c.replace(c[2], "foo")
        tm.assert_series_equal(expected, result)
        assert c[2] != "foo"  # ensure non-inplace call does not alter original

        return_value = c.replace(c[2], "foo", inplace=True)
        assert return_value is None
        tm.assert_series_equal(expected, c)

        first_value = c[0]
        return_value = c.replace(c[1], c[0], inplace=True)
        assert return_value is None
        assert c[0] == c[1] == first_value  # test replacing with existing value

    def test_replace_with_no_overflowerror(self):
        # GH 25616
        # casts to object without Exception from OverflowError
        s = pd.Series([0, 1, 2, 3, 4])
        result = s.replace([3], ["100000000000000000000"])
        expected = pd.Series([0, 1, 2, "100000000000000000000", 4])
        tm.assert_series_equal(result, expected)

        s = pd.Series([0, "100000000000000000000", "100000000000000000001"])
        result = s.replace(["100000000000000000000"], [1])
        expected = pd.Series([0, 1, "100000000000000000001"])
        tm.assert_series_equal(result, expected)

        "ser, to_replace, exp",
            ([1, 2, 3], {
                1: 2,
                2: 3,
                3: 4
            }, [2, 3, 4]),
            (["1", "2", "3"], {
                "1": "2",
                "2": "3",
                "3": "4"
            }, ["2", "3", "4"]),
    def test_replace_commutative(self, ser, to_replace, exp):
        # GH 16051
        # DataFrame.replace() overwrites when values are non-numeric

        series = pd.Series(ser)

        expected = pd.Series(exp)
        result = series.replace(to_replace)

        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("ser, exp", [([1, 2, 3], [1, True, 3]),
                                          (["x", 2, 3], ["x", True, 3])])
    def test_replace_no_cast(self, ser, exp):
        # GH 9113
        # BUG: replace int64 dtype with bool coerces to int64

        series = pd.Series(ser)
        result = series.replace(2, True)
        expected = pd.Series(exp)

        tm.assert_series_equal(result, expected)

    def test_replace_invalid_to_replace(self):
        # GH 18634
        # API: replace() should raise an exception if invalid argument is given
        series = pd.Series(["a", "b", "c "])
        msg = (r"Expecting 'to_replace' to be either a scalar, array-like, "
               r"dict or None, got invalid type.*")
        with pytest.raises(TypeError, match=msg):
            series.replace(lambda x: x.strip())

    @pytest.mark.parametrize("frame", [False, True])
    def test_replace_nonbool_regex(self, frame):
        obj = pd.Series(["a", "b", "c "])
        if frame:
            obj = obj.to_frame()

        msg = "'to_replace' must be 'None' if 'regex' is not a bool"
        with pytest.raises(ValueError, match=msg):
            obj.replace(to_replace=["a"], regex="foo")

    @pytest.mark.parametrize("frame", [False, True])
    def test_replace_empty_copy(self, frame):
        obj = pd.Series([], dtype=np.float64)
        if frame:
            obj = obj.to_frame()

        res = obj.replace(4, 5, inplace=True)
        assert res is None

        res = obj.replace(4, 5, inplace=False)
        tm.assert_equal(res, obj)
        assert res is not obj

    def test_replace_only_one_dictlike_arg(self, fixed_now_ts):
        # GH#33340

        ser = pd.Series([1, 2, "A", fixed_now_ts, True])
        to_replace = {0: 1, 2: "A"}
        value = "foo"
        msg = "Series.replace cannot use dict-like to_replace and non-None value"
        with pytest.raises(ValueError, match=msg):
            ser.replace(to_replace, value)

        to_replace = 1
        value = {0: "foo", 2: "bar"}
        msg = "Series.replace cannot use dict-value and non-None to_replace"
        with pytest.raises(ValueError, match=msg):
            ser.replace(to_replace, value)

    def test_replace_extension_other(self, frame_or_series):
        obj = frame_or_series(pd.array([1, 2, 3], dtype="Int64"))
        result = obj.replace("", "")  # no exception
        # should not have changed dtype
        tm.assert_equal(obj, result)

    def _check_replace_with_method(self, ser: pd.Series):
        df = ser.to_frame()

        res = ser.replace(ser[1], method="pad")
        expected = pd.Series([ser[0], ser[0]] + list(ser[2:]), dtype=ser.dtype)
        tm.assert_series_equal(res, expected)

        res_df = df.replace(ser[1], method="pad")
        tm.assert_frame_equal(res_df, expected.to_frame())

        ser2 = ser.copy()
        res2 = ser2.replace(ser[1], method="pad", inplace=True)
        assert res2 is None
        tm.assert_series_equal(ser2, expected)

        res_df2 = df.replace(ser[1], method="pad", inplace=True)
        assert res_df2 is None
        tm.assert_frame_equal(df, expected.to_frame())

    def test_replace_ea_dtype_with_method(self, any_numeric_ea_dtype):
        arr = pd.array([1, 2, pd.NA, 4], dtype=any_numeric_ea_dtype)
        ser = pd.Series(arr)


    @pytest.mark.parametrize("as_categorical", [True, False])
    def test_replace_interval_with_method(self, as_categorical):
        # in particular interval that can't hold NA

        idx = pd.IntervalIndex.from_breaks(range(4))
        ser = pd.Series(idx)
        if as_categorical:
            ser = ser.astype("category")


    @pytest.mark.parametrize("as_period", [True, False])
    @pytest.mark.parametrize("as_categorical", [True, False])
    def test_replace_datetimelike_with_method(self, as_period, as_categorical):
        idx = pd.date_range("2016-01-01", periods=5, tz="US/Pacific")
        if as_period:
            idx = idx.tz_localize(None).to_period("D")

        ser = pd.Series(idx)
        ser.iloc[-2] = pd.NaT
        if as_categorical:
            ser = ser.astype("category")


    def test_replace_with_compiled_regex(self):
        s = pd.Series(["a", "b", "c"])
        regex = re.compile("^a$")
        result = s.replace({regex: "z"}, regex=True)
        expected = pd.Series(["z", "b", "c"])
        tm.assert_series_equal(result, expected)

    def test_pandas_replace_na(self):
        # GH#43344
        ser = pd.Series(["AA", "BB", "CC", "DD", "EE", "", pd.NA],
        regex_mapping = {
            "AA": "CC",
            "BB": "CC",
            "EE": "CC",
            "CC": "CC-REPL",
        result = ser.replace(regex_mapping, regex=True)
        exp = pd.Series(["CC", "CC", "CC-REPL", "DD", "CC", "", pd.NA],
        tm.assert_series_equal(result, exp)

        # NOTE(review): the parametrize wrapper around this argument list looks
        # truncated: the opening `@pytest.mark.parametrize(` / `[` lines, the
        # dtype element and the brackets of the IntervalArray and Period cases,
        # and the closing `]` / `)` are not present -- restore before running.
        "dtype, input_data, to_replace, expected_data",
            ("bool", [True, False], {
                True: False
            }, [False, False]),
            ("int64", [1, 2], {
                1: 10,
                2: 20
            }, [10, 20]),
            ("Int64", [1, 2], {
                1: 10,
                2: 20
            }, [10, 20]),
            ("float64", [1.1, 2.2], {
                1.1: 10.1,
                2.2: 20.5
            }, [10.1, 20.5]),
            ("Float64", [1.1, 2.2], {
                1.1: 10.1,
                2.2: 20.5
            }, [10.1, 20.5]),
            ("string", ["one", "two"], {
                "one": "1",
                "two": "2"
            }, ["1", "2"]),
                IntervalArray([pd.Interval(1, 2),
                               pd.Interval(2, 3)]),
                    pd.Interval(1, 2): pd.Interval(10, 20)
                IntervalArray([pd.Interval(10, 20),
                               pd.Interval(2, 3)]),
                IntervalArray([pd.Interval(1.0, 2.7),
                               pd.Interval(2.8, 3.1)]),
                    pd.Interval(1.0, 2.7): pd.Interval(10.6, 20.8)
                IntervalArray([pd.Interval(10.6, 20.8),
                               pd.Interval(2.8, 3.1)]),
                [pd.Period("2020-05", freq="M")],
                    pd.Period("2020-05", freq="M"): pd.Period("2020-06",
                [pd.Period("2020-06", freq="M")],
    def test_replace_dtype(self, dtype, input_data, to_replace, expected_data):
        """Dict replace on a Series of ``dtype`` yields ``expected_data`` in the same dtype."""
        # GH#33484
        ser = pd.Series(input_data, dtype=dtype)
        result = ser.replace(to_replace)
        # the expectation is built with the same dtype as the input
        expected = pd.Series(expected_data, dtype=dtype)
        tm.assert_series_equal(result, expected)

    def test_replace_string_dtype(self):
        # GH#40732, GH#44940
        ser = pd.Series(["one", "two", np.nan], dtype="string")
        res = ser.replace({"one": "1", "two": "2"})
        expected = pd.Series(["1", "2", np.nan], dtype="string")
        tm.assert_series_equal(res, expected)

        # GH#31644
        ser2 = pd.Series(["A", np.nan], dtype="string")
        res2 = ser2.replace("A", "B")
        expected2 = pd.Series(["B", np.nan], dtype="string")
        tm.assert_series_equal(res2, expected2)

        ser3 = pd.Series(["A", "B"], dtype="string")
        res3 = ser3.replace("A", pd.NA)
        expected3 = pd.Series([pd.NA, "B"], dtype="string")
        tm.assert_series_equal(res3, expected3)

    def test_replace_string_dtype_list_to_replace(self):
        # GH#41215, GH#44940
        ser = pd.Series(["abc", "def"], dtype="string")
        res = ser.replace(["abc", "any other string"], "xyz")
        expected = pd.Series(["xyz", "def"], dtype="string")
        tm.assert_series_equal(res, expected)

    def test_replace_string_dtype_regex(self):
        # GH#31644
        ser = pd.Series(["A", "B"], dtype="string")
        res = ser.replace(r".", "C", regex=True)
        expected = pd.Series(["C", "C"], dtype="string")
        tm.assert_series_equal(res, expected)

    def test_replace_nullable_numeric(self):
        # GH#40732, GH#44940

        floats = pd.Series([1.0, 2.0, 3.999, 4.4], dtype=pd.Float64Dtype())
        assert floats.replace({1.0: 9}).dtype == floats.dtype
        assert floats.replace(1.0, 9).dtype == floats.dtype
        assert floats.replace({1.0: 9.0}).dtype == floats.dtype
        assert floats.replace(1.0, 9.0).dtype == floats.dtype

        res = floats.replace(to_replace=[1.0, 2.0], value=[9.0, 10.0])
        assert res.dtype == floats.dtype

        ints = pd.Series([1, 2, 3, 4], dtype=pd.Int64Dtype())
        assert ints.replace({1: 9}).dtype == ints.dtype
        assert ints.replace(1, 9).dtype == ints.dtype
        assert ints.replace({1: 9.0}).dtype == ints.dtype
        assert ints.replace(1, 9.0).dtype == ints.dtype

        # nullable (for now) raises instead of casting
        with pytest.raises(TypeError, match="Invalid value"):
            ints.replace({1: 9.5})
        with pytest.raises(TypeError, match="Invalid value"):
            ints.replace(1, 9.5)

    @pytest.mark.parametrize("regex", [False, True])
    def test_replace_regex_dtype_series(self, regex):
        # GH-48644
        series = pd.Series(["0"])
        expected = pd.Series([1])
        result = series.replace(to_replace="0", value=1, regex=regex)
        tm.assert_series_equal(result, expected)

    def test_replace_different_int_types(self, any_int_numpy_dtype):
        # GH#45311
        labs = pd.Series([1, 1, 1, 0, 0, 2, 2, 2], dtype=any_int_numpy_dtype)

        maps = pd.Series([0, 2, 1], dtype=any_int_numpy_dtype)
        map_dict = {old: new for (old, new) in zip(maps.values, maps.index)}

        result = labs.replace(map_dict)
        expected = labs.replace({0: 0, 2: 1, 1: 2})
        tm.assert_series_equal(result, expected)
def test_slicing_agg_min_max(s1_fix):
    # Slices the `s1_fix` fixture at the points of range(-4, 11, 2) and
    # aggregates with "min" and "max".
    # NOTE(review): everything after `result = ...` looks truncated.  The two
    # runs of interval -> value pairs below are presumably the expected "min"
    # and "max" columns, but the statements that wrap them and the final
    # comparison are not present -- restore before running.
    result = s1_fix.slice(range(-4, 11, 2)).agg(["min", "max"])
            pd.Interval(-4, -2, closed="left"): -1.75,
            pd.Interval(-2, 0, closed="left"): -1.75,
            pd.Interval(0, 2, closed="left"): -1.75,
            pd.Interval(2, 4, closed="left"): 0.25,
            pd.Interval(4, 6, closed="left"): 2.0,
            pd.Interval(6, 8, closed="left"): -0.5,
            pd.Interval(8, 10, closed="left"): -0.5,
            pd.Interval(-4, -2, closed="left"): -1.75,
            pd.Interval(-2, 0, closed="left"): -1.75,
            pd.Interval(0, 2, closed="left"): 0.25,
            pd.Interval(2, 4, closed="left"): 2.75,
            pd.Interval(4, 6, closed="left"): 2.75,
            pd.Interval(6, 8, closed="left"): -0.5,
            pd.Interval(8, 10, closed="left"): -0.5,
# Example #12
class TestHashing(object):
        Series([1, 2, 3] * 3, dtype='int32'),
        Series([None, 2.5, 3.5] * 3, dtype='float32'),
        Series(['a', 'b', 'c'] * 3, dtype='category'),
        Series(['d', 'e', 'f'] * 3),
        Series([True, False, True] * 3),
        Series(pd.date_range('20130101', periods=9)),
        Series(pd.date_range('20130101', periods=9, tz='US/Eastern')),
        Series(pd.timedelta_range('2000', periods=9))
    def series(self, request):
        return request.param

    def test_consistency(self):
        # check that our hash doesn't change because of a mistake
        # in the actual code; this is the ground truth
        result = hash_pandas_object(Index(['foo', 'bar', 'baz']))
        expected = Series(np.array(
            [3600424527151052760, 1374399572096150070, 477881037637427054],
                          index=['foo', 'bar', 'baz'])
        tm.assert_series_equal(result, expected)

    def test_hash_array(self, series):
        a = series.values
        tm.assert_numpy_array_equal(hash_array(a), hash_array(a))

    def test_hash_array_mixed(self):
        result1 = hash_array(np.array([3, 4, 'All']))
        result2 = hash_array(np.array(['3', '4', 'All']))
        result3 = hash_array(np.array([3, 4, 'All'], dtype=object))
        tm.assert_numpy_array_equal(result1, result2)
        tm.assert_numpy_array_equal(result1, result3)

    @pytest.mark.parametrize('val', [5, 'foo', pd.Timestamp('20130101')])
    def test_hash_array_errors(self, val):
        msg = 'must pass a ndarray-like'
        with tm.assert_raises_regex(TypeError, msg):

    def check_equal(self, obj, **kwargs):
        """Assert that hashing ``obj`` twice gives equal results.

        The check runs once with ``kwargs`` as given and once more after any
        ``index`` keyword has been dropped.
        """
        for drop_index in (False, True):
            if drop_index:
                kwargs.pop('index', None)
            first = hash_pandas_object(obj, **kwargs)
            second = hash_pandas_object(obj, **kwargs)
            tm.assert_series_equal(first, second)

    def check_not_equal_with_index(self, obj):
        """Assert that including the index changes the hash of ``obj``.

        Index objects are skipped, and empty objects are hashed but not
        compared.
        """
        # check that we are not hashing the same if
        # we include the index
        if isinstance(obj, Index):
            return

        with_index = hash_pandas_object(obj, index=True)
        without_index = hash_pandas_object(obj, index=False)
        if len(obj):
            assert not (with_index == without_index).all()

    def test_hash_tuples(self):
        """``hash_tuples`` agrees with hashing the equivalent MultiIndex."""
        pairs = [(1, 'one'), (1, 'two'), (2, 'one')]
        reference = hash_pandas_object(MultiIndex.from_tuples(pairs)).values

        # a list of tuples hashes element-wise
        tm.assert_numpy_array_equal(hash_tuples(pairs), reference)

        # a single tuple hashes to the first element's value
        single = hash_tuples(pairs[0])
        assert single == reference[0]

    @pytest.mark.parametrize(
        'tup',
        [
            (1, 'one'),
            (1, np.nan),
            (1.0, pd.NaT, 'A'),
            ('A', pd.Timestamp("2012-01-01")),
        ],
    )
    def test_hash_tuple(self, tup):
        """``hash_tuple`` matches the first element of ``hash_tuples([tup])``."""
        # test equivalence between hash_tuples and hash_tuple
        from_list = hash_tuples([tup])[0]
        direct = hash_tuple(tup)
        assert direct == from_list

    @pytest.mark.parametrize('val', [
        1, 1.4, 'A', b'A', u'A',
        pd.Timestamp("2012-01-01", tz='Europe/Brussels'),
        datetime.datetime(2012, 1, 1),
        pd.Timestamp("2012-01-01", tz='EST').to_pydatetime(),
        pd.Timedelta('1 days'),
        pd.Period('2012-01-01', freq='D'),
        pd.Interval(0, 1), np.nan, pd.NaT, None
    ])
    def test_hash_scalar(self, val):
        """``_hash_scalar`` matches ``hash_array`` on a one-element object array."""
        result = _hash_scalar(val)
        expected = hash_array(np.array([val], dtype=object), categorize=True)
        assert result[0] == expected[0]

    @pytest.mark.parametrize('val', [5, 'foo', pd.Timestamp('20130101')])
    def test_hash_tuples_err(self, val):
        msg = 'must be convertible to a list-of-tuples'
        with tm.assert_raises_regex(TypeError, msg):

    def test_multiindex_unique(self):
        mi = MultiIndex.from_tuples([(118, 472), (236, 118), (51, 204),
                                     (102, 51)])
        assert mi.is_unique
        result = hash_pandas_object(mi)
        assert result.is_unique

    def test_multiindex_objects(self):
        """Hashes of a MultiIndex and its level-sorted form agree as a set.

        Also checks that ``_hashed_values`` equals
        ``hash_pandas_object(..., index=False)`` for both objects.
        """
        original = MultiIndex(levels=[['b', 'd', 'a'], [1, 2, 3]],
                              labels=[[0, 1, 0, 2], [2, 0, 0, 1]],
                              names=['col1', 'col2'])
        resorted = original._sort_levels_monotonic()

        # these are equal
        assert original.equals(resorted)
        assert Index(original.values).equals(Index(resorted.values))

        # _hashed_values and hash_pandas_object(..., index=False)
        # equivalency
        for mi in (original, resorted):
            via_function = hash_pandas_object(mi, index=False).values
            tm.assert_numpy_array_equal(mi._hashed_values, via_function)

        # values should match, but in different order
        tm.assert_numpy_array_equal(np.sort(resorted._hashed_values),
                                    np.sort(original._hashed_values))

    # NOTE(review): this parameter list looks truncated.  The `'x'`/`'y'`
    # entries and the `range(5), [...]` lines have lost the calls that wrapped
    # them, the list is never closed, and the test body is missing -- restore
    # before running.
    @pytest.mark.parametrize('obj', [
        Series([1, 2, 3]),
        Series([1.0, 1.5, 3.2]),
        Series([1.0, 1.5, np.nan]),
        Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
        Series(['a', 'b', 'c']),
        Series(['a', np.nan, 'c']),
        Series(['a', None, 'c']),
        Series([True, False, True]),
        Index([1, 2, 3]),
        Index([True, False, True]),
            'x': ['a', 'b', 'c'],
            'y': [1, 2, 3]
        Series(pd.date_range('20130101', periods=3, tz='US/Eastern')),
            range(5), ['foo', 'bar', 'baz'],
            pd.date_range('20130101', periods=2)
    def test_hash_pandas_object(self, obj):

    def test_hash_pandas_object2(self, series):

        [Series([], dtype='float64'),
         Series([], dtype='object'),
    def test_hash_pandas_empty_object(self, obj):
        # these are by-definition the same with
        # or w/o the index as the data is empty

    @pytest.mark.parametrize('s1', [
        Series(['a', 'b', 'c', 'd']),
        Series([1000, 2000, 3000, 4000]),
        Series(pd.date_range(0, periods=4))
    @pytest.mark.parametrize('categorize', [True, False])
    def test_categorical_consistency(self, s1, categorize):
        # GH15143
        # Check that categoricals hash consistent with their values, not codes
        # This should work for categoricals of any dtype
        s2 = s1.astype('category').cat.set_categories(s1)
        s3 =

        # These should all hash identically
        h1 = hash_pandas_object(s1, categorize=categorize)
        h2 = hash_pandas_object(s2, categorize=categorize)
        h3 = hash_pandas_object(s3, categorize=categorize)
        tm.assert_series_equal(h1, h2)
        tm.assert_series_equal(h1, h3)

    def test_categorical_with_nan_consistency(self):
        # Hashes a Categorical built from codes that include -1, then checks
        # that both hashes of a shorter Categorical appear among them.
        # NOTE(review): both `from_codes(` calls look truncated -- their
        # `categories=` arguments and closing parentheses are missing.
        c = pd.Categorical.from_codes([-1, 0, 1, 2, 3, 4],
        expected = hash_array(c, categorize=False)
        c = pd.Categorical.from_codes([-1, 0],
        result = hash_array(c, categorize=False)
        assert result[0] in expected
        assert result[1] in expected

    def test_pandas_errors(self):
        # Expects TypeError in two places, the second after building a Panel.
        # NOTE(review): both `with pytest.raises(TypeError):` blocks have lost
        # their bodies, so the calls under test are unknown -- restore before
        # running.
        with pytest.raises(TypeError):

        obj = tm.makePanel()

        with pytest.raises(TypeError):
    def test_hash_keys(self):
        # using different hash keys, should have different hashes
        # for the same data

        # this only matters for object dtypes
        obj = Series(list('abc'))
        a = hash_pandas_object(obj, hash_key='9876543210123456')
        b = hash_pandas_object(obj, hash_key='9876543210123465')
        assert (a != b).all()

    def test_invalid_key(self):
        # this only matters for object dtypes
        msg = 'key should be a 16-byte string encoded'
        with tm.assert_raises_regex(ValueError, msg):
            hash_pandas_object(Series(list('abc')), hash_key='foo')

    def test_alread_encoded(self):
        # if already encoded then ok

        obj = Series(list('abc')).str.encode('utf8')

    def test_alternate_encoding(self):

        obj = Series(list('abc'))
        self.check_equal(obj, encoding='ascii')

    @pytest.mark.parametrize('l_exp', range(8))
    @pytest.mark.parametrize('l_add', [0, 1])
    def test_same_len_hash_collisions(self, l_exp, l_add):
        """Two random strings of equal length must not hash to the same value.

        The string length is ``2 ** (l_exp + 8) + l_add``.
        """
        length = 2**(l_exp + 8) + l_add
        strings = tm.rands_array(length, 2)
        first, second = hash_array(strings, 'utf8')[:2]
        assert not first == second

    def test_hash_collisions(self):
        # Pins the hashes of long strings, individually and hashed together.
        # NOTE(review): `L` holds a single string but the test reads `L[1:2]`,
        # so a second entry appears to be missing; the opening of the final
        # assertion call (before `result, np.concatenate(...)`) is also
        # missing -- restore before running.

        # hash collisions are bad
        L = [
            'Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9',  # noqa
        ]  # noqa

        # these should be different!
        result1 = hash_array(np.asarray(L[0:1], dtype=object), 'utf8')
        expected1 = np.array([14963968704024874985], dtype=np.uint64)
        tm.assert_numpy_array_equal(result1, expected1)

        result2 = hash_array(np.asarray(L[1:2], dtype=object), 'utf8')
        expected2 = np.array([16428432627716348016], dtype=np.uint64)
        tm.assert_numpy_array_equal(result2, expected2)

        # hashing both strings in one call should give the two hashes in order
        result = hash_array(np.asarray(L, dtype=object), 'utf8')
            result, np.concatenate([expected1, expected2], axis=0))
# Example #13
class TestDataFrameDataTypes(TestData):

    def test_concat_empty_dataframe_dtypes(self):
        df = DataFrame(columns=list("abc"))
        df['a'] = df['a'].astype(np.bool_)
        df['b'] = df['b'].astype(np.int32)
        df['c'] = df['c'].astype(np.float64)

        result = pd.concat([df, df])
        assert result['a'].dtype == np.bool_
        assert result['b'].dtype == np.int32
        assert result['c'].dtype == np.float64

        result = pd.concat([df, df.astype(np.float64)])
        assert result['a'].dtype == np.object_
        assert result['b'].dtype == np.float64
        assert result['c'].dtype == np.float64

    def test_empty_frame_dtypes_ftypes(self):
        """``dtypes`` and ``ftypes`` of empty, column-less, row-less and sliced frames."""
        abc = list("abc")

        # no rows and no columns, then index only
        for frame in (pd.DataFrame(), pd.DataFrame(index=[1, 2, 3])):
            assert_series_equal(frame.dtypes, pd.Series(dtype=np.object))
            assert_series_equal(frame.ftypes, pd.Series(dtype=np.object))

        # columns only
        norows_df = pd.DataFrame(columns=abc)
        assert_series_equal(norows_df.dtypes,
                            pd.Series(np.object, index=abc))
        assert_series_equal(norows_df.ftypes,
                            pd.Series('object:dense', index=abc))

        # columns only, cast to int32
        norows_int_df = pd.DataFrame(columns=abc).astype(np.int32)
        assert_series_equal(norows_int_df.dtypes,
                            pd.Series(np.dtype('int32'), index=abc))
        assert_series_equal(norows_int_df.ftypes,
                            pd.Series('int32:dense', index=abc))

        ordered = compat.OrderedDict
        df = pd.DataFrame(ordered([('a', 1), ('b', True), ('c', 1.0)]),
                          index=[1, 2, 3])
        ex_dtypes = pd.Series(ordered([('a', np.int64),
                                       ('b', np.bool),
                                       ('c', np.float64)]))
        ex_ftypes = pd.Series(ordered([('a', 'int64:dense'),
                                       ('b', 'bool:dense'),
                                       ('c', 'float64:dense')]))

        # the full frame, then the same but for an empty slice of df
        for frame in (df, df[:0]):
            assert_series_equal(frame.dtypes, ex_dtypes)
            assert_series_equal(frame.ftypes, ex_ftypes)

    def test_datetime_with_tz_dtypes(self):
        tzframe = DataFrame({'A': date_range('20130101', periods=3),
                             'B': date_range('20130101', periods=3,
                             'C': date_range('20130101', periods=3, tz='CET')})
        tzframe.iloc[1, 1] = pd.NaT
        tzframe.iloc[1, 2] = pd.NaT
        result = tzframe.dtypes.sort_index()
        expected = Series([np.dtype('datetime64[ns]'),
                           DatetimeTZDtype('datetime64[ns, US/Eastern]'),
                           DatetimeTZDtype('datetime64[ns, CET]')],
                          ['A', 'B', 'C'])

        assert_series_equal(result, expected)

    def test_dtypes_are_correct_after_column_slice(self):
        """``dtypes`` is right before a column slice, on it, and after it."""
        # GH6525
        df = pd.DataFrame(index=range(5), columns=list("abc"), dtype=np.float_)
        odict = compat.OrderedDict
        # The opening `assert_series_equal(df.dtypes,` of the first and last
        # checks was missing; both compare the full frame's dtypes.
        assert_series_equal(df.dtypes,
                            pd.Series(odict([('a', np.float_),
                                             ('b', np.float_),
                                             ('c', np.float_)])))
        assert_series_equal(df.iloc[:, 2:].dtypes,
                            pd.Series(odict([('c', np.float_)])))
        # taking the slice must not have disturbed the parent frame
        assert_series_equal(df.dtypes,
                            pd.Series(odict([('a', np.float_),
                                             ('b', np.float_),
                                             ('c', np.float_)])))

    def test_select_dtypes_include_using_list_like(self):
        # Exercises DataFrame.select_dtypes with list-like include/exclude
        # arguments and compares each result with an explicit column subset.
        # NOTE(review): several lines look truncated -- the continuation lines
        # of the 'h', 'i' and 'j' columns, the second line of the
        # `include=[np.number, 'category'],` call, and the opening of the
        # final `lambda:` call are missing.  Restore before running.
        df = DataFrame({'a': list('abc'),
                        'b': list(range(1, 4)),
                        'c': np.arange(3, 6).astype('u1'),
                        'd': np.arange(4.0, 7.0, dtype='float64'),
                        'e': [True, False, True],
                        'f': pd.Categorical(list('abc')),
                        'g': pd.date_range('20130101', periods=3),
                        'h': pd.date_range('20130101', periods=3,
                        'i': pd.date_range('20130101', periods=3,
                        'j': pd.period_range('2013-01', periods=3,
                        'k': pd.timedelta_range('1 day', periods=3)})

        # np.number is expected to select the timedelta column 'k' as well
        ri = df.select_dtypes(include=[np.number])
        ei = df[['b', 'c', 'd', 'k']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=[np.number], exclude=['timedelta'])
        ei = df[['b', 'c', 'd']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=[np.number, 'category'],
        ei = df[['b', 'c', 'd', 'f']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=['datetime'])
        ei = df[['g']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=['datetime64'])
        ei = df[['g']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=['datetimetz'])
        ei = df[['h', 'i']]
        assert_frame_equal(ri, ei)

                      lambda: df.select_dtypes(include=['period']))

    def test_select_dtypes_exclude_using_list_like(self):
        df = DataFrame({'a': list('abc'),
                        'b': list(range(1, 4)),
                        'c': np.arange(3, 6).astype('u1'),
                        'd': np.arange(4.0, 7.0, dtype='float64'),
                        'e': [True, False, True]})
        re = df.select_dtypes(exclude=[np.number])
        ee = df[['a', 'e']]
        assert_frame_equal(re, ee)

    def test_select_dtypes_exclude_include_using_list_like(self):
        df = DataFrame({'a': list('abc'),
                        'b': list(range(1, 4)),
                        'c': np.arange(3, 6).astype('u1'),
                        'd': np.arange(4.0, 7.0, dtype='float64'),
                        'e': [True, False, True],
                        'f': pd.date_range('now', periods=3).values})
        exclude = np.datetime64,
        include = np.bool_, 'integer'
        r = df.select_dtypes(include=include, exclude=exclude)
        e = df[['b', 'c', 'e']]
        assert_frame_equal(r, e)

        exclude = 'datetime',
        include = 'bool', 'int64', 'int32'
        r = df.select_dtypes(include=include, exclude=exclude)
        e = df[['b', 'e']]
        assert_frame_equal(r, e)

    def test_select_dtypes_include_using_scalars(self):
        # Exercises DataFrame.select_dtypes with a scalar `include` argument
        # and compares each result with an explicit column subset.
        # NOTE(review): the continuation lines of the 'h', 'i' and 'j' columns
        # and the opening of the final `lambda:` call look truncated --
        # restore before running.
        df = DataFrame({'a': list('abc'),
                        'b': list(range(1, 4)),
                        'c': np.arange(3, 6).astype('u1'),
                        'd': np.arange(4.0, 7.0, dtype='float64'),
                        'e': [True, False, True],
                        'f': pd.Categorical(list('abc')),
                        'g': pd.date_range('20130101', periods=3),
                        'h': pd.date_range('20130101', periods=3,
                        'i': pd.date_range('20130101', periods=3,
                        'j': pd.period_range('2013-01', periods=3,
                        'k': pd.timedelta_range('1 day', periods=3)})

        # np.number is expected to select the timedelta column 'k' as well
        ri = df.select_dtypes(include=np.number)
        ei = df[['b', 'c', 'd', 'k']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include='datetime')
        ei = df[['g']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include='datetime64')
        ei = df[['g']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include='category')
        ei = df[['f']]
        assert_frame_equal(ri, ei)

                      lambda: df.select_dtypes(include='period'))

    def test_select_dtypes_exclude_using_scalars(self):
        # Exercises DataFrame.select_dtypes with a scalar `exclude` argument
        # and compares each result with an explicit column subset.
        # NOTE(review): the continuation lines of the 'h', 'i' and 'j' columns
        # and the opening of the final `lambda:` call look truncated --
        # restore before running.
        df = DataFrame({'a': list('abc'),
                        'b': list(range(1, 4)),
                        'c': np.arange(3, 6).astype('u1'),
                        'd': np.arange(4.0, 7.0, dtype='float64'),
                        'e': [True, False, True],
                        'f': pd.Categorical(list('abc')),
                        'g': pd.date_range('20130101', periods=3),
                        'h': pd.date_range('20130101', periods=3,
                        'i': pd.date_range('20130101', periods=3,
                        'j': pd.period_range('2013-01', periods=3,
                        'k': pd.timedelta_range('1 day', periods=3)})

        # excluding np.number is expected to drop 'b', 'c', 'd' and 'k'
        ri = df.select_dtypes(exclude=np.number)
        ei = df[['a', 'e', 'f', 'g', 'h', 'i', 'j']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(exclude='category')
        ei = df[['a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'j', 'k']]
        assert_frame_equal(ri, ei)

                      lambda: df.select_dtypes(exclude='period'))

    def test_select_dtypes_include_exclude_using_scalars(self):
        # Exercises DataFrame.select_dtypes with both ``include`` and
        # ``exclude`` given as scalars.
        # NOTE(review): the 'h', 'i' and 'j' entries of the frame literal never
        # close their calls in this copy -- continuation lines appear to be
        # missing; confirm against upstream.
        df = DataFrame({'a': list('abc'),
                        'b': list(range(1, 4)),
                        'c': np.arange(3, 6).astype('u1'),
                        'd': np.arange(4.0, 7.0, dtype='float64'),
                        'e': [True, False, True],
                        'f': pd.Categorical(list('abc')),
                        'g': pd.date_range('20130101', periods=3),
                        'h': pd.date_range('20130101', periods=3,
                        'i': pd.date_range('20130101', periods=3,
                        'j': pd.period_range('2013-01', periods=3,
                        'k': pd.timedelta_range('1 day', periods=3)})

        # np.number minus 'floating': the float column 'd' is expected to be
        # removed while the timedelta column 'k' stays.
        ri = df.select_dtypes(include=np.number, exclude='floating')
        ei = df[['b', 'c', 'k']]
        assert_frame_equal(ri, ei)

    def test_select_dtypes_include_exclude_mixed_scalars_lists(self):
        # Exercises DataFrame.select_dtypes when one of ``include`` /
        # ``exclude`` is a scalar and the other a list.
        # NOTE(review): the 'h', 'i' and 'j' entries of the frame literal never
        # close their calls in this copy -- continuation lines appear to be
        # missing; confirm against upstream.
        df = DataFrame({'a': list('abc'),
                        'b': list(range(1, 4)),
                        'c': np.arange(3, 6).astype('u1'),
                        'd': np.arange(4.0, 7.0, dtype='float64'),
                        'e': [True, False, True],
                        'f': pd.Categorical(list('abc')),
                        'g': pd.date_range('20130101', periods=3),
                        'h': pd.date_range('20130101', periods=3,
                        'i': pd.date_range('20130101', periods=3,
                        'j': pd.period_range('2013-01', periods=3,
                        'k': pd.timedelta_range('1 day', periods=3)})

        # Scalar include, list exclude: only the integer columns are expected.
        ri = df.select_dtypes(include=np.number,
                              exclude=['floating', 'timedelta'])
        ei = df[['b', 'c']]
        assert_frame_equal(ri, ei)

        # NOTE(review): the call below is cut off after ``include=...,`` -- the
        # ``exclude`` argument line appears to be missing; confirm upstream.
        ri = df.select_dtypes(include=[np.number, 'category'],
        ei = df[['b', 'c', 'f', 'k']]
        assert_frame_equal(ri, ei)

    def test_select_dtypes_duplicate_columns(self):
        """select_dtypes must cope with repeated column labels (GH20839)."""
        column_items = [
            ('a', list('abc')),
            ('b', list(range(1, 4))),
            ('c', np.arange(3, 6).astype('u1')),
            ('d', np.arange(4.0, 7.0, dtype='float64')),
            ('e', [True, False, True]),
            ('f', pd.date_range('now', periods=3).values),
        ]
        frame = DataFrame(compat.OrderedDict(column_items))
        # Relabel so that differently-typed columns end up sharing a name.
        frame.columns = ['a', 'a', 'b', 'b', 'b', 'c']

        # Only the integer column (second 'a') and the uint8 column
        # (first 'b') are numeric without being floating.
        wanted = DataFrame({'a': list(range(1, 4)),
                            'b': np.arange(3, 6).astype('u1')})

        selected = frame.select_dtypes(include=[np.number],
                                       exclude=['floating'])
        assert_frame_equal(selected, wanted)

    def test_select_dtypes_not_an_attr_but_still_valid_dtype(self):
        df = DataFrame({'a': list('abc'),
                        'b': list(range(1, 4)),
                        'c': np.arange(3, 6).astype('u1'),
                        'd': np.arange(4.0, 7.0, dtype='float64'),
                        'e': [True, False, True],
                        'f': pd.date_range('now', periods=3).values})
        df['g'] = df.f.diff()
        assert not hasattr(np, 'u8')
        r = df.select_dtypes(include=['i8', 'O'], exclude=['timedelta'])
        e = df[['a', 'b']]
        assert_frame_equal(r, e)

        r = df.select_dtypes(include=['i8', 'O', 'timedelta64[ns]'])
        e = df[['a', 'b', 'g']]
        assert_frame_equal(r, e)

    def test_select_dtypes_empty(self):
        # Expects a ValueError whose message says that at least one of
        # include/exclude must be nonempty.
        # NOTE(review): the body of the ``with`` block is missing in this
        # copy -- presumably a bare ``df.select_dtypes()`` call; confirm
        # against upstream.
        df = DataFrame({'a': list('abc'), 'b': list(range(1, 4))})
        with tm.assert_raises_regex(ValueError, 'at least one of '
                                    'include or exclude '
                                    'must be nonempty'):

    def test_select_dtypes_bad_datetime64(self):
        # Expects ValueError matching '.+ is too specific' in two separate
        # checks on a frame that contains a datetime64 column ('f').
        # NOTE(review): both ``with`` blocks have lost their bodies in this
        # copy, so the exact select_dtypes arguments are not visible; confirm
        # against upstream.
        df = DataFrame({'a': list('abc'),
                        'b': list(range(1, 4)),
                        'c': np.arange(3, 6).astype('u1'),
                        'd': np.arange(4.0, 7.0, dtype='float64'),
                        'e': [True, False, True],
                        'f': pd.date_range('now', periods=3).values})
        with tm.assert_raises_regex(ValueError, '.+ is too specific'):

        with tm.assert_raises_regex(ValueError, '.+ is too specific'):

    def test_select_dtypes_datetime_with_tz(self):
        # Selecting 'datetime64[ns]' from a frame built only from tz-aware
        # Timestamps is expected to return a frame with no columns.
        # NOTE(review): the DataFrame(...) call below is never closed in this
        # copy -- an argument line (likely the index) appears to be missing;
        # confirm against upstream.

        df2 = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'),
                             B=Timestamp('20130603', tz='CET')),
        df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1)
        result = df3.select_dtypes(include=['datetime64[ns]'])
        # reindex(columns=[]) keeps the index but drops every column.
        expected = df3.reindex(columns=[])
        assert_frame_equal(result, expected)

        "dtype", [str, "str", np.string_, "S1",
                  "unicode", np.unicode_, "U1"] + ([unicode] if PY2 else []))
    @pytest.mark.parametrize("arg", ["include", "exclude"])
    def test_select_dtypes_str_raises(self, dtype, arg):
        # Expects TypeError("string dtypes are not allowed") when a string-like
        # dtype is passed through either the ``include`` or the ``exclude``
        # keyword (``arg`` names the keyword, ``dtype`` is the offending value).
        # NOTE(review): the ``with`` body is missing in this copy -- presumably
        # ``df.select_dtypes(**kwargs)``; the first line of the ``dtype``
        # parametrize decorator above is missing too. Confirm upstream.
        df = DataFrame({"a": list("abc"),
                        "g": list(u("abc")),
                        "b": list(range(1, 4)),
                        "c": np.arange(3, 6).astype("u1"),
                        "d": np.arange(4.0, 7.0, dtype="float64"),
                        "e": [True, False, True],
                        "f": pd.date_range("now", periods=3).values})
        msg = "string dtypes are not allowed"
        # Build the keyword dynamically so one body serves both parameters.
        kwargs = {arg: [dtype]}

        with tm.assert_raises_regex(TypeError, msg):

    def test_select_dtypes_bad_arg_raises(self):
        """An unparseable dtype string passed to select_dtypes is a TypeError."""
        column_data = {'a': list('abc'),
                       'g': list(u('abc')),
                       'b': list(range(1, 4)),
                       'c': np.arange(3, 6).astype('u1'),
                       'd': np.arange(4.0, 7.0, dtype='float64'),
                       'e': [True, False, True],
                       'f': pd.date_range('now', periods=3).values}
        frame = DataFrame(column_data)
        expected_msg = ('data type.'
                        '*not understood')
        with tm.assert_raises_regex(TypeError, expected_msg):
            # A single string containing commas, not three separate dtypes.
            frame.select_dtypes(['blargy, blarg, blarg'])

    def test_select_dtypes_typecodes(self):
        """select_dtypes accepts numpy type-code characters (GH 11990)."""
        frame = mkdf(30, 3, data_gen_f=lambda x, y: np.random.random())
        float_codes = list(np.typecodes['AllFloat'])
        # Every column holds random floats, so nothing should be filtered out.
        selected = frame.select_dtypes(float_codes)
        assert_frame_equal(selected, frame)

    def test_dtypes_gh8722(self):
        # Checks DataFrame.dtypes on the mixed-type fixture after adding a
        # boolean column, then again under the 'use_inf_as_na' option.
        self.mixed_frame['bool'] = self.mixed_frame['A'] > 0
        result = self.mixed_frame.dtypes
        # NOTE(review): the Series(...) call below is never closed in this
        # copy -- a trailing argument line (likely ``index=``) appears to be
        # missing; confirm against upstream.
        expected = Series(dict((k, v.dtype)
                               for k, v in compat.iteritems(self.mixed_frame)),
        assert_series_equal(result, expected)

        # compat, GH 8722
        with option_context('use_inf_as_na', True):
            df = DataFrame([[1]])
            result = df.dtypes
            assert_series_equal(result, Series({0: np.dtype('int64')}))

    def test_ftypes(self):
        # Compares the sorted ``ftypes`` of the mixed-float fixture with an
        # expected Series of '<dtype>:dense' strings.
        # NOTE(review): the expected Series(dict(...)) literal is cut off after
        # its first entry in this copy; confirm the remaining entries upstream.
        frame = self.mixed_float
        expected = Series(dict(A='float32:dense',
        result = frame.ftypes.sort_values()
        assert_series_equal(result, expected)

    def test_astype(self):
        # Casts the float fixture and a mixed numeric frame to several dtypes
        # and checks the resulting column dtypes.
        # NOTE(review): each ``expected = DataFrame(...`` call below is never
        # closed in this copy (the index/columns argument lines appear to be
        # missing), and the generator inside ``_check_cast`` has lost its
        # element expression; confirm against upstream.
        casted = self.frame.astype(int)
        expected = DataFrame(self.frame.values.astype(int),
        assert_frame_equal(casted, expected)

        casted = self.frame.astype(np.int32)
        expected = DataFrame(self.frame.values.astype(np.int32),
        assert_frame_equal(casted, expected)

        # Adds a string column holding '5' to the shared fixture before
        # casting again; note that this mutates self.frame in place.
        self.frame['foo'] = '5'
        casted = self.frame.astype(int)
        expected = DataFrame(self.frame.values.astype(int),
        assert_frame_equal(casted, expected)

        # mixed casting
        def _check_cast(df, v):
            # Asserts against ``v`` using a set built over the frame's
            # columns (element expression missing here, see note above).
            assert (list(set( for
                             _, s in compat.iteritems(df)))[0] == v)

        mn = self.all_mixed._get_numeric_data().copy()
        mn['little_float'] = np.array(12345., dtype='float16')
        mn['big_float'] = np.array(123456789101112., dtype='float64')

        casted = mn.astype('float64')
        _check_cast(casted, 'float64')

        casted = mn.astype('int64')
        _check_cast(casted, 'int64')

        casted = self.mixed_float.reindex(columns=['A', 'B']).astype('float32')
        _check_cast(casted, 'float32')

        casted = mn.reindex(columns=['little_float']).astype('float16')
        _check_cast(casted, 'float16')

        casted = self.mixed_float.reindex(columns=['A', 'B']).astype('float16')
        _check_cast(casted, 'float16')

        casted = mn.astype('float32')
        _check_cast(casted, 'float32')

        casted = mn.astype('int32')
        _check_cast(casted, 'int32')

        # to object
        casted = mn.astype('O')
        _check_cast(casted, 'object')

    def test_astype_with_exclude_string(self):
        df = self.frame.copy()
        expected = self.frame.astype(int)
        df['string'] = 'foo'
        casted = df.astype(int, errors='ignore')

        expected['string'] = 'foo'
        assert_frame_equal(casted, expected)

        df = self.frame.copy()
        expected = self.frame.astype(np.int32)
        df['string'] = 'foo'
        casted = df.astype(np.int32, errors='ignore')

        expected['string'] = 'foo'
        assert_frame_equal(casted, expected)

    def test_astype_with_view(self):

        tf = self.mixed_float.reindex(columns=['A', 'B', 'C'])

        casted = tf.astype(np.int64)

        casted = tf.astype(np.float32)

        # this is the only real reason to do it this way
        tf = np.round(self.frame).astype(np.int32)
        casted = tf.astype(np.float32, copy=False)

        # TODO(wesm): verification?
        tf = self.frame.astype(np.float64)
        casted = tf.astype(np.int64, copy=False)  # noqa

    @pytest.mark.parametrize("dtype", [np.int32, np.int64])
    @pytest.mark.parametrize("val", [np.nan, np.inf])
    def test_astype_cast_nan_inf_int(self, val, dtype):
        # see gh-14265
        # Check NaN and inf --> raise error when converting to int.
        # ``val`` is the non-finite value placed in the frame and ``dtype``
        # the integer type it is meant to be cast to.
        # NOTE(review): the ``with`` body is missing in this copy --
        # presumably ``df.astype(dtype)``; confirm against upstream.
        msg = "Cannot convert non-finite values \\(NA or inf\\) to integer"
        df = DataFrame([val])

        with tm.assert_raises_regex(ValueError, msg):

    def test_astype_str(self, text_dtype):
        # see gh-9757
        # Casts a frame holding datetime, tz-aware datetime, timedelta, int and
        # float columns to ``text_dtype`` and compares it with per-column text
        # built by mapping ``text_dtype`` over the underlying values.
        # NOTE(review): the 'c' entry and the closing of the expected
        # DataFrame({...}) literal are cut off in this copy; confirm upstream.
        a = Series(date_range("2010-01-04", periods=5))
        b = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern"))
        c = Series([Timedelta(x, unit="d") for x in range(5)])
        d = Series(range(5))
        e = Series([0.0, 0.2, 0.4, 0.6, 0.8])

        df = DataFrame({"a": a, "b": b, "c": c, "d": d, "e": e})

        # Datetime-like
        # Test str and unicode on Python 2.x and just str on Python 3.x
        result = df.astype(text_dtype)

        expected = DataFrame({
            "a": list(map(text_dtype,
                          map(lambda x: Timestamp(x)._date_repr, a._values))),
            "b": list(map(text_dtype, map(Timestamp, b._values))),
            "c": list(map(text_dtype,
                          map(lambda x: Timedelta(x)._repr_base(format="all"),
            "d": list(map(text_dtype, d._values)),
            "e": list(map(text_dtype, e._values)),

        assert_frame_equal(result, expected)

    def test_astype_str_float(self, text_dtype):
        """Cast floats, NaN included, to a text dtype (gh-11302)."""
        nan_as_text = DataFrame([np.NaN]).astype(text_dtype)
        assert_frame_equal(nan_as_text, DataFrame(["nan"]))

        long_as_text = DataFrame([1.12345678901234567890]).astype(text_dtype)

        # < 1.14 truncates
        # >= 1.14 preserves the full repr
        if _np_version_under1p14:
            digits = "1.12345678901"
        else:
            digits = "1.1234567890123457"
        assert_frame_equal(long_as_text, DataFrame([digits]))

    @pytest.mark.parametrize("dtype_class", [dict, Series])
    def test_astype_dict_like(self, dtype_class):
        # GH7271 & GH16717
        a = Series(date_range('2010-01-04', periods=5))
        b = Series(range(5))
        c = Series([0.0, 0.2, 0.4, 0.6, 0.8])
        d = Series(['1.0', '2', '3.14', '4', '5.4'])
        df = DataFrame({'a': a, 'b': b, 'c': c, 'd': d})
        original = df.copy(deep=True)

        # change type of a subset of columns
        dt1 = dtype_class({'b': 'str', 'd': 'float32'})
        result = df.astype(dt1)
        expected = DataFrame({
            'a': a,
            'b': Series(['0', '1', '2', '3', '4']),
            'c': c,
            'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float32')})
        assert_frame_equal(result, expected)
        assert_frame_equal(df, original)

        dt2 = dtype_class({'b': np.float32, 'c': 'float32', 'd': np.float64})
        result = df.astype(dt2)
        expected = DataFrame({
            'a': a,
            'b': Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype='float32'),
            'c': Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype='float32'),
            'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float64')})
        assert_frame_equal(result, expected)
        assert_frame_equal(df, original)

        # change all columns
        dt3 = dtype_class({'a': str, 'b': str, 'c': str, 'd': str})
        assert_frame_equal(df, original)

        # error should be raised when using something other than column labels
        # in the keys of the dtype dict
        dt4 = dtype_class({'b': str, 2: str})
        dt5 = dtype_class({'e': str})
        pytest.raises(KeyError, df.astype, dt4)
        pytest.raises(KeyError, df.astype, dt5)
        assert_frame_equal(df, original)

        # if the dtypes provided are the same as the original dtypes, the
        # resulting DataFrame should be the same as the original DataFrame
        dt6 = dtype_class({col: df[col].dtype for col in df.columns})
        equiv = df.astype(dt6)
        assert_frame_equal(df, equiv)
        assert_frame_equal(df, original)

        # GH 16717
        # if dtypes provided is empty, the resulting DataFrame
        # should be the same as the original DataFrame
        dt7 = dtype_class({})
        result = df.astype(dt7)
        assert_frame_equal(df, equiv)
        assert_frame_equal(df, original)

    def test_astype_duplicate_col(self):
        # Checks astype on a frame in which two columns share the label 'a',
        # first casting everything to str and then casting by label.
        a1 = Series([1, 2, 3, 4, 5], name='a')
        b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name='b')
        a2 = Series([0, 1, 2, 3, 4], name='a')
        df = concat([a1, b, a2], axis=1)

        result = df.astype(str)
        a1_str = Series(['1', '2', '3', '4', '5'], dtype='str', name='a')
        # NOTE(review): the Series(...) call for ``b_str`` is never closed in
        # this copy -- its ``name=`` line appears to be missing; confirm
        # against upstream.
        b_str = Series(['0.1', '0.2', '0.4', '0.6', '0.8'], dtype=str,
        a2_str = Series(['0', '1', '2', '3', '4'], dtype='str', name='a')
        expected = concat([a1_str, b_str, a2_str], axis=1)
        assert_frame_equal(result, expected)

        # Casting by the duplicated label is expected to convert both 'a'
        # columns and leave 'b' alone.
        result = df.astype({'a': 'str'})
        expected = concat([a1_str, b, a2_str], axis=1)
        assert_frame_equal(result, expected)

    @pytest.mark.parametrize('dtype', [
        CategoricalDtype(categories=list('edba'), ordered=False),
        CategoricalDtype(categories=list('edcb'), ordered=True)], ids=repr)
    def test_astype_categorical(self, dtype):
        # GH 18099
        d = {'A': list('abbc'), 'B': list('bccd'), 'C': list('cdde')}
        df = DataFrame(d)
        result = df.astype(dtype)
        expected = DataFrame({k: Categorical(d[k], dtype=dtype) for k in d})
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("cls", [
    def test_astype_categoricaldtype_class_raises(self, cls):
        # Passing a dtype *class* (``cls``) instead of an instance to astype
        # is expected to raise TypeError naming the class.
        # NOTE(review): the second ``with`` block has lost its body in this
        # copy and the parametrize decorator above is cut off; confirm both
        # against upstream.
        df = DataFrame({"A": ['a', 'a', 'b', 'c']})
        xpr = "Expected an instance of {}".format(cls.__name__)
        with tm.assert_raises_regex(TypeError, xpr):
            df.astype({"A": cls})

        with tm.assert_raises_regex(TypeError, xpr):

    @pytest.mark.parametrize('dtype', [
        {100: 'float64', 200: 'uint64'}, 'category', 'float64'])
    def test_astype_column_metadata(self, dtype):
        """astype must not alter the columns index of the frame (GH 19920)."""
        labels = pd.UInt64Index([100, 200, 300], name='foo')
        values = np.arange(15).reshape(5, 3)
        converted = DataFrame(values, columns=labels).astype(dtype)
        # Compared against the named UInt64Index the frame was built with.
        tm.assert_index_equal(converted.columns, labels)

    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
    def test_astype_from_datetimelike_to_objectt(self, dtype, unit):
        # tests astype to object dtype
        # gh-19223 / gh-12425
        dtype = "{}[{}]".format(dtype, unit)
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(object)
        assert (result.dtypes == object).all()

        if dtype.startswith('M8'):
            assert result.iloc[0, 0] == pd.to_datetime(1, unit=unit)
            assert result.iloc[0, 0] == pd.to_timedelta(1, unit=unit)

    @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64])
    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
    def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit):
        # tests all units from numeric origination
        # gh-19223 / gh-12425
        dtype = "{}[{}]".format(dtype, unit)
        arr = np.array([[1, 2, 3]], dtype=arr_dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(arr.astype(dtype))

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
    def test_astype_to_datetime_unit(self, unit):
        # tests all units from datetime origination
        # gh-19223
        dtype = "M8[{}]".format(unit)
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(arr.astype(dtype))

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ['ns'])
    def test_astype_to_timedelta_unit_ns(self, unit):
        # preserver the timedelta conversion
        # gh-19223
        dtype = "m8[{}]".format(unit)
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(arr.astype(dtype))

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ['us', 'ms', 's', 'h', 'm', 'D'])
    def test_astype_to_timedelta_unit(self, unit):
        """timedelta64 frame cast to a non-ns unit (gh-19223).

        The expectation is built by casting the frame's values to the unit
        and then to float, i.e. the result is expected to be coerced to
        float.
        """
        target = "m8[{}]".format(unit)
        raw = np.array([[1, 2, 3]], dtype=target)
        frame = DataFrame(raw)
        converted = frame.astype(target)
        as_float = frame.values.astype(target).astype(float)

        tm.assert_frame_equal(converted, DataFrame(as_float))

    @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
    def test_astype_to_incorrect_datetimelike(self, unit):
        # trying to astype a m to a M, or vice-versa
        # gh-19224
        # Builds one frame per kind and expects TypeError in each direction.
        # NOTE(review): both ``with`` blocks have lost their bodies in this
        # copy -- presumably ``df.astype(other)`` and ``df.astype(dtype)``;
        # confirm against upstream.
        dtype = "M8[{}]".format(unit)
        other = "m8[{}]".format(unit)

        df = DataFrame(np.array([[1, 2, 3]], dtype=dtype))
        with pytest.raises(TypeError):

        df = DataFrame(np.array([[1, 2, 3]], dtype=other))
        with pytest.raises(TypeError):

    def test_timedeltas(self):
        # Tracks get_dtype_counts() as datetime, timedelta and int columns
        # are added to a frame.
        # NOTE(review): the date_range(...) call for column A is never closed
        # in this copy -- a continuation line appears to be missing; confirm
        # against upstream.
        df = DataFrame(dict(A=Series(date_range('2012-1-1', periods=3,
                            B=Series([timedelta(days=i) for i in range(3)])))
        result = df.get_dtype_counts().sort_index()
        expected = Series(
            {'datetime64[ns]': 1, 'timedelta64[ns]': 1}).sort_index()
        assert_series_equal(result, expected)

        # datetime + timedelta is expected to add a second datetime column.
        df['C'] = df['A'] + df['B']
        expected = Series(
            {'datetime64[ns]': 2, 'timedelta64[ns]': 1}).sort_values()
        result = df.get_dtype_counts().sort_values()
        assert_series_equal(result, expected)

        # mixed int types
        df['D'] = 1
        expected = Series({'datetime64[ns]': 2,
                           'timedelta64[ns]': 1,
                           'int64': 1}).sort_values()
        result = df.get_dtype_counts().sort_values()
        assert_series_equal(result, expected)

    def test_arg_for_errors_in_astype(self):
        """Validate the ``errors`` argument of astype (issue #14878)."""
        frame = DataFrame([1, 2, 3])

        # A non-string value for ``errors`` is rejected outright.
        pytest.raises(ValueError, frame.astype, np.float64, errors=True)

        # The legacy keyword is still accepted but must warn.
        with tm.assert_produces_warning(FutureWarning):
            frame.astype(np.int8, raise_on_error=False)

        # The supported spelling runs without raising.
        frame.astype(np.int8, errors='ignore')

    @pytest.mark.parametrize('input_vals', [
        ([1, 2]),
        (['1', '2']),
        (list(pd.date_range('1/1/2011', periods=2, freq='H'))),
        (list(pd.date_range('1/1/2011', periods=2, freq='H',
        ([pd.Interval(left=0, right=5)]),
    def test_constructor_list_str(self, input_vals, string_dtype):
        # GH 16605
        # Ensure that data elements are converted to strings when
        # dtype is str, 'str', or 'U'

        result = DataFrame({'A': input_vals}, dtype=string_dtype)
        expected = DataFrame({'A': input_vals}).astype({'A': string_dtype})
        assert_frame_equal(result, expected)

    def test_constructor_list_str_na(self, string_dtype):

        result = DataFrame({"A": [1.0, 2.0, None]}, dtype=string_dtype)
        expected = DataFrame({"A": ['1.0', '2.0', None]}, dtype=object)
        assert_frame_equal(result, expected)
Beispiel #14
    result = hash_tuple(tup)
    expected = hash_tuples([tup])[0]

    assert result == expected

@pytest.mark.parametrize("val", [
    1, 1.4, "A", b"A", u"A",
    pd.Timestamp("2012-01-01", tz="Europe/Brussels"),
    datetime.datetime(2012, 1, 1),
    pd.Timestamp("2012-01-01", tz="EST").to_pydatetime(),
    pd.Timedelta("1 days"),
    pd.Period("2012-01-01", freq="D"),
    pd.Interval(0, 1), np.nan, pd.NaT, None
def test_hash_scalar(val):
    """Hashing a scalar must agree with hashing it as a 1-element array."""
    boxed = np.array([val], dtype=object)
    via_array = hash_array(boxed, categorize=True)
    via_scalar = _hash_scalar(val)

    assert via_scalar[0] == via_array[0]

@pytest.mark.parametrize("val", [5, "foo", pd.Timestamp("20130101")])
def test_hash_tuples_err(val):
    # Expects TypeError with the message below for the parametrized ``val``.
    # NOTE(review): the ``with`` body is missing in this copy -- presumably a
    # ``hash_tuples(val)`` call; confirm against upstream.
    msg = "must be convertible to a list-of-tuples"
    with pytest.raises(TypeError, match=msg):

Beispiel #15
class TestDataFrameDataTypes:
    def test_concat_empty_dataframe_dtypes(self):
        df = DataFrame(columns=list("abc"))
        df["a"] = df["a"].astype(np.bool_)
        df["b"] = df["b"].astype(np.int32)
        df["c"] = df["c"].astype(np.float64)

        result = pd.concat([df, df])
        assert result["a"].dtype == np.bool_
        assert result["b"].dtype == np.int32
        assert result["c"].dtype == np.float64

        result = pd.concat([df, df.astype(np.float64)])
        assert result["a"].dtype == np.object_
        assert result["b"].dtype == np.float64
        assert result["c"].dtype == np.float64

    def test_empty_frame_dtypes_ftypes(self):
        # Checks ``dtypes`` (and the deprecated ``ftypes``) on frames with no
        # rows and/or no columns, then on a small mixed frame and an empty
        # slice of it.
        # NOTE(review): several assert_series_equal(...) calls and Series(...)
        # literals below have lost their opening or closing lines in this
        # copy; confirm against upstream.
        empty_df = pd.DataFrame()
        assert_series_equal(empty_df.dtypes, pd.Series(dtype=np.object))

        # GH 26705 - Assert .ftypes is deprecated
        with tm.assert_produces_warning(FutureWarning):
            assert_series_equal(empty_df.ftypes, pd.Series(dtype=np.object))

        nocols_df = pd.DataFrame(index=[1, 2, 3])
        assert_series_equal(nocols_df.dtypes, pd.Series(dtype=np.object))

        # GH 26705 - Assert .ftypes is deprecated
        with tm.assert_produces_warning(FutureWarning):
            assert_series_equal(nocols_df.ftypes, pd.Series(dtype=np.object))

        norows_df = pd.DataFrame(columns=list("abc"))
        assert_series_equal(norows_df.dtypes, pd.Series(np.object, index=list("abc")))

        # GH 26705 - Assert .ftypes is deprecated
        with tm.assert_produces_warning(FutureWarning):
                norows_df.ftypes, pd.Series("object:dense", index=list("abc"))

        norows_int_df = pd.DataFrame(columns=list("abc")).astype(np.int32)
            norows_int_df.dtypes, pd.Series(np.dtype("int32"), index=list("abc"))
        # GH 26705 - Assert .ftypes is deprecated
        with tm.assert_produces_warning(FutureWarning):
                norows_int_df.ftypes, pd.Series("int32:dense", index=list("abc"))

        odict = OrderedDict
        df = pd.DataFrame(odict([("a", 1), ("b", True), ("c", 1.0)]), index=[1, 2, 3])
        ex_dtypes = pd.Series(
            odict([("a", np.int64), ("b", np.bool), ("c", np.float64)])
        ex_ftypes = pd.Series(
            odict([("a", "int64:dense"), ("b", "bool:dense"), ("c", "float64:dense")])
        assert_series_equal(df.dtypes, ex_dtypes)

        # GH 26705 - Assert .ftypes is deprecated
        with tm.assert_produces_warning(FutureWarning):
            assert_series_equal(df.ftypes, ex_ftypes)

        # same but for empty slice of df
        assert_series_equal(df[:0].dtypes, ex_dtypes)

        # GH 26705 - Assert .ftypes is deprecated
        with tm.assert_produces_warning(FutureWarning):
            assert_series_equal(df[:0].ftypes, ex_ftypes)

    def test_datetime_with_tz_dtypes(self):
        # Checks ``dtypes`` on a frame with one naive and two tz-aware
        # datetime columns after NaT has been written into the tz-aware ones.
        # NOTE(review): the braces/brackets of the frame literal and of the
        # expected Series (including its first entry) are missing in this
        # copy; confirm against upstream.
        tzframe = DataFrame(
                "A": date_range("20130101", periods=3),
                "B": date_range("20130101", periods=3, tz="US/Eastern"),
                "C": date_range("20130101", periods=3, tz="CET"),
        # Put a missing value into each tz-aware column (B and C).
        tzframe.iloc[1, 1] = pd.NaT
        tzframe.iloc[1, 2] = pd.NaT
        result = tzframe.dtypes.sort_index()
        expected = Series(
                DatetimeTZDtype("ns", "US/Eastern"),
                DatetimeTZDtype("ns", "CET"),
            ["A", "B", "C"],

        assert_series_equal(result, expected)

    def test_dtypes_are_correct_after_column_slice(self):
        # GH6525
        # Checks ``dtypes`` of an all-float frame and of a column slice of it.
        # NOTE(review): the first and last assertions have lost their opening
        # and closing lines in this copy, leaving only the expected Series
        # argument; confirm against upstream.
        df = pd.DataFrame(index=range(5), columns=list("abc"), dtype=np.float_)
        odict = OrderedDict
            pd.Series(odict([("a", np.float_), ("b", np.float_), ("c", np.float_)])),
        assert_series_equal(df.iloc[:, 2:].dtypes, pd.Series(odict([("c", np.float_)])))
            pd.Series(odict([("a", np.float_), ("b", np.float_), ("c", np.float_)])),

    def test_select_dtypes_include_using_list_like(self):
        # Exercises DataFrame.select_dtypes with ``include`` (and sometimes
        # ``exclude``) given as lists.
        # NOTE(review): the braces of the frame literal are missing in this
        # copy and the final ``with`` block has no body; confirm upstream.
        df = DataFrame(
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.Categorical(list("abc")),
                "g": pd.date_range("20130101", periods=3),
                "h": pd.date_range("20130101", periods=3, tz="US/Eastern"),
                "i": pd.date_range("20130101", periods=3, tz="CET"),
                "j": pd.period_range("2013-01", periods=3, freq="M"),
                "k": pd.timedelta_range("1 day", periods=3),

        # The expected np.number selection includes the timedelta column 'k'.
        ri = df.select_dtypes(include=[np.number])
        ei = df[["b", "c", "d", "k"]]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=[np.number], exclude=["timedelta"])
        ei = df[["b", "c", "d"]]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=[np.number, "category"], exclude=["timedelta"])
        ei = df[["b", "c", "d", "f"]]
        assert_frame_equal(ri, ei)

        # 'datetime' and 'datetime64' are expected to pick only the naive
        # datetime column 'g'.
        ri = df.select_dtypes(include=["datetime"])
        ei = df[["g"]]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=["datetime64"])
        ei = df[["g"]]
        assert_frame_equal(ri, ei)

        # 'datetimetz' is expected to pick the two tz-aware columns.
        ri = df.select_dtypes(include=["datetimetz"])
        ei = df[["h", "i"]]
        assert_frame_equal(ri, ei)

        # The match pattern r"^$" only accepts an empty exception message.
        with pytest.raises(NotImplementedError, match=r"^$"):

    def test_select_dtypes_exclude_using_list_like(self):
        # Excluding [np.number] is expected to leave only the string column
        # 'a' and the boolean column 'e'.
        # NOTE(review): the braces and closing of the frame literal are
        # missing in this copy; confirm against upstream.
        df = DataFrame(
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
        re = df.select_dtypes(exclude=[np.number])
        ee = df[["a", "e"]]
        assert_frame_equal(re, ee)

    def test_select_dtypes_exclude_include_using_list_like(self):
        # Exercises DataFrame.select_dtypes with ``include`` and ``exclude``
        # both given as tuples.
        # NOTE(review): the braces and closing of the frame literal are
        # missing in this copy; confirm against upstream.
        df = DataFrame(
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.date_range("now", periods=3).values,
        exclude = (np.datetime64,)
        # A bare comma-separated pair is a tuple as well.
        include = np.bool_, "integer"
        r = df.select_dtypes(include=include, exclude=exclude)
        e = df[["b", "c", "e"]]
        assert_frame_equal(r, e)

        # With explicit widths, the uint8 column 'c' is no longer expected.
        exclude = ("datetime",)
        include = "bool", "int64", "int32"
        r = df.select_dtypes(include=include, exclude=exclude)
        e = df[["b", "e"]]
        assert_frame_equal(r, e)

    def test_select_dtypes_include_using_scalars(self):
        # Exercises DataFrame.select_dtypes with ``include`` given as a single
        # scalar instead of a list.
        # NOTE(review): the braces of the frame literal are missing in this
        # copy and the final ``with`` block has no body; confirm upstream.
        df = DataFrame(
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.Categorical(list("abc")),
                "g": pd.date_range("20130101", periods=3),
                "h": pd.date_range("20130101", periods=3, tz="US/Eastern"),
                "i": pd.date_range("20130101", periods=3, tz="CET"),
                "j": pd.period_range("2013-01", periods=3, freq="M"),
                "k": pd.timedelta_range("1 day", periods=3),

        # The expected np.number selection includes the timedelta column 'k'.
        ri = df.select_dtypes(include=np.number)
        ei = df[["b", "c", "d", "k"]]
        assert_frame_equal(ri, ei)

        # Both spellings are expected to pick only the naive column 'g'.
        ri = df.select_dtypes(include="datetime")
        ei = df[["g"]]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include="datetime64")
        ei = df[["g"]]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include="category")
        ei = df[["f"]]
        assert_frame_equal(ri, ei)

        # The match pattern r"^$" only accepts an empty exception message.
        with pytest.raises(NotImplementedError, match=r"^$"):

    def test_select_dtypes_exclude_using_scalars(self):
        # Exercises DataFrame.select_dtypes with ``exclude`` given as a single
        # scalar instead of a list.
        # NOTE(review): the braces of the frame literal are missing in this
        # copy and the final ``with`` block has no body; confirm upstream.
        df = DataFrame(
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.Categorical(list("abc")),
                "g": pd.date_range("20130101", periods=3),
                "h": pd.date_range("20130101", periods=3, tz="US/Eastern"),
                "i": pd.date_range("20130101", periods=3, tz="CET"),
                "j": pd.period_range("2013-01", periods=3, freq="M"),
                "k": pd.timedelta_range("1 day", periods=3),

        # Excluding np.number is expected to drop the timedelta column 'k'
        # together with 'b', 'c' and 'd'.
        ri = df.select_dtypes(exclude=np.number)
        ei = df[["a", "e", "f", "g", "h", "i", "j"]]
        assert_frame_equal(ri, ei)

        # Excluding 'category' is expected to drop only 'f'.
        ri = df.select_dtypes(exclude="category")
        ei = df[["a", "b", "c", "d", "e", "g", "h", "i", "j", "k"]]
        assert_frame_equal(ri, ei)

        # The match pattern r"^$" only accepts an empty exception message.
        with pytest.raises(NotImplementedError, match=r"^$"):

    def test_select_dtypes_include_exclude_using_scalars(self):
        # Exercises DataFrame.select_dtypes with both ``include`` and
        # ``exclude`` given as scalars.
        # NOTE(review): the braces and closing of the frame literal are
        # missing in this copy; confirm against upstream.
        df = DataFrame(
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.Categorical(list("abc")),
                "g": pd.date_range("20130101", periods=3),
                "h": pd.date_range("20130101", periods=3, tz="US/Eastern"),
                "i": pd.date_range("20130101", periods=3, tz="CET"),
                "j": pd.period_range("2013-01", periods=3, freq="M"),
                "k": pd.timedelta_range("1 day", periods=3),

        # np.number minus 'floating': the float column 'd' is expected to be
        # removed while the timedelta column 'k' stays.
        ri = df.select_dtypes(include=np.number, exclude="floating")
        ei = df[["b", "c", "k"]]
        assert_frame_equal(ri, ei)

    def test_select_dtypes_include_exclude_mixed_scalars_lists(self):
        """Mix scalar and list arguments between ``include`` and ``exclude``."""
        df = DataFrame(
            {
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.Categorical(list("abc")),
                "g": pd.date_range("20130101", periods=3),
                "h": pd.date_range("20130101", periods=3, tz="US/Eastern"),
                "i": pd.date_range("20130101", periods=3, tz="CET"),
                "j": pd.period_range("2013-01", periods=3, freq="M"),
                "k": pd.timedelta_range("1 day", periods=3),
            }
        )

        # Scalar include, list exclude.
        ri = df.select_dtypes(include=np.number, exclude=["floating", "timedelta"])
        ei = df[["b", "c"]]
        assert_frame_equal(ri, ei)

        # List include, scalar exclude.
        ri = df.select_dtypes(include=[np.number, "category"], exclude="floating")
        ei = df[["b", "c", "f", "k"]]
        assert_frame_equal(ri, ei)

    def test_select_dtypes_duplicate_columns(self):
        """``select_dtypes`` works when column labels are duplicated (GH20839)."""
        odict = OrderedDict
        # An ordered mapping fixes the column order so the positional relabel
        # below lines up with the intended dtypes.
        df = DataFrame(
            odict(
                [
                    ("a", list("abc")),
                    ("b", list(range(1, 4))),
                    ("c", np.arange(3, 6).astype("u1")),
                    ("d", np.arange(4.0, 7.0, dtype="float64")),
                    ("e", [True, False, True]),
                    ("f", pd.date_range("now", periods=3).values),
                ]
            )
        )
        df.columns = ["a", "a", "b", "b", "b", "c"]

        expected = DataFrame(
            {"a": list(range(1, 4)), "b": np.arange(3, 6).astype("u1")}
        )

        result = df.select_dtypes(include=[np.number], exclude=["floating"])
        assert_frame_equal(result, expected)

    def test_select_dtypes_not_an_attr_but_still_valid_dtype(self):
        """Dtype strings that are not numpy attributes are still accepted."""
        df = DataFrame(
            {
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.date_range("now", periods=3).values,
            }
        )
        # Differencing the datetime column produces a timedelta column.
        df["g"] = df.f.diff()
        # Guard for the premise of the test: "u8" is a dtype code, not an
        # attribute of the numpy module.
        assert not hasattr(np, "u8")
        r = df.select_dtypes(include=["i8", "O"], exclude=["timedelta"])
        e = df[["a", "b"]]
        assert_frame_equal(r, e)

        r = df.select_dtypes(include=["i8", "O", "timedelta64[ns]"])
        e = df[["a", "b", "g"]]
        assert_frame_equal(r, e)

    def test_select_dtypes_empty(self):
        """Calling ``select_dtypes`` with neither argument raises ValueError."""
        df = DataFrame({"a": list("abc"), "b": list(range(1, 4))})
        msg = "at least one of include or exclude must be nonempty"
        # The ``with`` body was missing; a call with no arguments is the case
        # the expected message describes.
        with pytest.raises(ValueError, match=msg):
            df.select_dtypes()

    def test_select_dtypes_bad_datetime64(self):
        """Unit-specific datetime64 dtypes are rejected as too specific."""
        df = DataFrame(
            {
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.date_range("now", periods=3).values,
            }
        )
        # NOTE(review): both ``with`` bodies were missing from the file. The
        # calls below are reconstructions (one include, one exclude, each with
        # a non-nanosecond unit) -- confirm the exact dtypes.
        with pytest.raises(ValueError, match=".+ is too specific"):
            df.select_dtypes(include=["datetime64[D]"])

        with pytest.raises(ValueError, match=".+ is too specific"):
            df.select_dtypes(exclude=["datetime64[as]"])

    def test_select_dtypes_datetime_with_tz(self):
        """Naive ``datetime64[ns]`` selects no tz-aware datetime columns."""
        # NOTE(review): the ``dict(...)`` wrapper and the index argument were
        # lost from the file. Some index is needed because every value is a
        # scalar; ``range(5)`` is a reconstruction -- confirm.
        df2 = DataFrame(
            dict(
                A=Timestamp("20130102", tz="US/Eastern"),
                B=Timestamp("20130603", tz="CET"),
            ),
            index=range(5),
        )
        df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1)
        result = df3.select_dtypes(include=["datetime64[ns]"])
        # Same index as df3 but with no columns at all.
        expected = df3.reindex(columns=[])
        assert_frame_equal(result, expected)

        "dtype", [str, "str", np.string_, "S1", "unicode", np.unicode_, "U1"]
    @pytest.mark.parametrize("arg", ["include", "exclude"])
    def test_select_dtypes_str_raises(self, dtype, arg):
        df = DataFrame(
                "a": list("abc"),
                "g": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.date_range("now", periods=3).values,
        msg = "string dtypes are not allowed"
        kwargs = {arg: [dtype]}

        with pytest.raises(TypeError, match=msg):

    def test_select_dtypes_bad_arg_raises(self):
        """An unparseable dtype string raises TypeError."""
        df = DataFrame(
            {
                "a": list("abc"),
                "g": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.date_range("now", periods=3).values,
            }
        )

        msg = "data type.*not understood"
        with pytest.raises(TypeError, match=msg):
            # A single string containing commas, not three separate dtypes.
            df.select_dtypes(["blargy, blarg, blarg"])

    def test_select_dtypes_typecodes(self):
        """Selecting by every float typecode keeps the whole frame (GH 11990)."""
        frame = mkdf(30, 3, data_gen_f=lambda x, y: np.random.random())
        # Each character of the "AllFloat" typecode string is one dtype code.
        selected = frame.select_dtypes(list(np.typecodes["AllFloat"]))
        assert_frame_equal(selected, frame)

    def test_dtypes_gh8722(self, float_string_frame):
        """``DataFrame.dtypes`` matches the per-column dtypes (GH 8722)."""
        float_string_frame["bool"] = float_string_frame["A"] > 0
        result = float_string_frame.dtypes
        # Build the expectation column by column, aligned to result's index.
        expected = Series(
            {k: v.dtype for k, v in float_string_frame.items()}, index=result.index
        )
        assert_series_equal(result, expected)

        # compat, GH 8722
        with option_context("use_inf_as_na", True):
            df = DataFrame([[1]])
            result = df.dtypes
            assert_series_equal(result, Series({0: np.dtype("int64")}))

    def test_ftypes(self, mixed_float_frame):
        # Compares the sorted ``ftypes`` of the fixture frame with an expected
        # Series while asserting that accessing ``ftypes`` warns.
        frame = mixed_float_frame
        # NOTE(review): the arguments and closing bracket of this ``Series(``
        # call are missing from this copy of the file; restore them before use.
        expected = Series(

        # GH 26705 - Assert .ftypes is deprecated
        with tm.assert_produces_warning(FutureWarning):
            result = frame.ftypes.sort_values()
        assert_series_equal(result, expected)

    def test_astype_float(self, float_frame):
        # Casts the float fixture frame to integer dtypes and compares each
        # result against an expected frame.
        # NOTE(review): every ``DataFrame(`` call below has lost its arguments
        # and closing bracket in this copy of the file; restore before use.
        casted = float_frame.astype(int)
        expected = DataFrame(
        assert_frame_equal(casted, expected)

        casted = float_frame.astype(np.int32)
        expected = DataFrame(
        assert_frame_equal(casted, expected)

        # Adds a column holding the string "5" and casts the frame again.
        float_frame["foo"] = "5"
        casted = float_frame.astype(int)
        expected = DataFrame(
        assert_frame_equal(casted, expected)

    def test_astype_mixed_float(self, mixed_float_frame):
        """Casting the A/B columns yields the requested float dtype."""
        for target in ("float32", "float16"):
            subset = mixed_float_frame.reindex(columns=["A", "B"])
            _check_cast(subset.astype(target), target)

    def test_astype_mixed_type(self, mixed_type_frame):
        """Cast the numeric part of a mixed frame to several target dtypes."""
        numeric = mixed_type_frame._get_numeric_data().copy()
        numeric["little_float"] = np.array(12345.0, dtype="float16")
        numeric["big_float"] = np.array(123456789101112.0, dtype="float64")

        for target in ("float64", "int64"):
            _check_cast(numeric.astype(target), target)

        # float16 is only checked on the column that was created as float16.
        little_only = numeric.reindex(columns=["little_float"])
        _check_cast(little_only.astype("float16"), "float16")

        for target in ("float32", "int32"):
            _check_cast(numeric.astype(target), target)

        # "O" is the alias passed in; the check is made against "object".
        _check_cast(numeric.astype("O"), "object")

    def test_astype_with_exclude_string(self, float_frame):
        """With ``errors="ignore"`` a string column is left as-is by astype."""
        for target in (int, np.int32):
            frame = float_frame.copy()
            wanted = float_frame.astype(target)
            frame["string"] = "foo"
            converted = frame.astype(target, errors="ignore")

            # The string column is expected to come through unchanged.
            wanted["string"] = "foo"
            assert_frame_equal(converted, wanted)

    def test_astype_with_view_float(self, float_frame):
        """Smoke-test ``astype(..., copy=False)`` between int and float."""
        # this is the only real reason to do it this way
        rounded_ints = np.round(float_frame).astype(np.int32)
        rounded_ints.astype(np.float32, copy=False)

        # TODO(wesm): verification?
        as_float64 = float_frame.astype(np.float64)
        as_float64.astype(np.int64, copy=False)

    def test_astype_with_view_mixed_float(self, mixed_float_frame):
        """Smoke-test casting the A/B/C columns; results are not inspected."""
        subset = mixed_float_frame.reindex(columns=["A", "B", "C"])

        for target in (np.int64, np.float32):
            subset.astype(target)

    @pytest.mark.parametrize("dtype", [np.int32, np.int64])
    @pytest.mark.parametrize("val", [np.nan, np.inf])
    def test_astype_cast_nan_inf_int(self, val, dtype):
        """Casting NaN or inf to an integer dtype raises ValueError (gh-14265)."""
        msg = "Cannot convert non-finite values \\(NA or inf\\) to integer"
        df = DataFrame([val])

        # The ``with`` body was missing; the cast to the parametrized integer
        # dtype is the operation the message describes.
        with pytest.raises(ValueError, match=msg):
            df.astype(dtype)

    def test_astype_str(self):
        """``astype(str)`` stringifies datetime-like and numeric columns (gh-9757)."""
        a = Series(date_range("2010-01-04", periods=5))
        b = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern"))
        c = Series([Timedelta(x, unit="d") for x in range(5)])
        d = Series(range(5))
        e = Series([0.0, 0.2, 0.4, 0.6, 0.8])

        df = DataFrame({"a": a, "b": b, "c": c, "d": d, "e": e})

        # Datetime-like
        result = df.astype(str)

        # NOTE(review): the brackets of this literal, and the outer call that
        # wrapped the timedelta map, were missing from the file; the
        # ``map(str, ...)`` wrapper mirrors the neighbouring entries -- confirm.
        expected = DataFrame(
            {
                "a": list(map(str, map(lambda x: Timestamp(x)._date_repr, a._values))),
                "b": list(map(str, map(Timestamp, b._values))),
                "c": list(
                    map(
                        str,
                        map(lambda x: Timedelta(x)._repr_base(format="all"), c._values),
                    )
                ),
                "d": list(map(str, d._values)),
                "e": list(map(str, e._values)),
            }
        )

        assert_frame_equal(result, expected)

    def test_astype_str_float(self):
        """Float-to-string casting of NaN and a long decimal (gh-11302)."""
        nan_text = DataFrame([np.NaN]).astype(str)
        assert_frame_equal(nan_text, DataFrame(["nan"]))

        long_text = DataFrame([1.12345678901234567890]).astype(str)

        # < 1.14 truncates
        # >= 1.14 preserves the full repr
        if _np_version_under1p14:
            shown = "1.12345678901"
        else:
            shown = "1.1234567890123457"
        assert_frame_equal(long_text, DataFrame([shown]))

    @pytest.mark.parametrize("dtype_class", [dict, Series])
    def test_astype_dict_like(self, dtype_class):
        """``astype`` accepts a column-to-dtype mapping (GH7271, GH16717).

        Runs once with a ``dict`` mapping and once with a ``Series`` mapping,
        and checks after every step that the source frame was not mutated.
        """
        a = Series(date_range("2010-01-04", periods=5))
        b = Series(range(5))
        c = Series([0.0, 0.2, 0.4, 0.6, 0.8])
        d = Series(["1.0", "2", "3.14", "4", "5.4"])
        df = DataFrame({"a": a, "b": b, "c": c, "d": d})
        original = df.copy(deep=True)

        # change type of a subset of columns
        dt1 = dtype_class({"b": "str", "d": "float32"})
        result = df.astype(dt1)
        expected = DataFrame(
            {
                "a": a,
                "b": Series(["0", "1", "2", "3", "4"]),
                "c": c,
                "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"),
            }
        )
        assert_frame_equal(result, expected)
        assert_frame_equal(df, original)

        dt2 = dtype_class({"b": np.float32, "c": "float32", "d": np.float64})
        result = df.astype(dt2)
        expected = DataFrame(
            {
                "a": a,
                "b": Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype="float32"),
                "c": Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype="float32"),
                "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float64"),
            }
        )
        assert_frame_equal(result, expected)
        assert_frame_equal(df, original)

        # change all columns
        dt3 = dtype_class({"a": str, "b": str, "c": str, "d": str})
        assert_frame_equal(df.astype(dt3), df.astype(str))
        assert_frame_equal(df, original)

        # error should be raised when using something other than column labels
        # in the keys of the dtype dict
        dt4 = dtype_class({"b": str, 2: str})
        dt5 = dtype_class({"e": str})
        msg = "Only a column name can be used for the key in a dtype mappings argument"
        # The two ``with`` bodies were missing; dt4 and dt5 are the mappings
        # built above for exactly this error case.
        with pytest.raises(KeyError, match=msg):
            df.astype(dt4)
        with pytest.raises(KeyError, match=msg):
            df.astype(dt5)
        assert_frame_equal(df, original)

        # if the dtypes provided are the same as the original dtypes, the
        # resulting DataFrame should be the same as the original DataFrame
        dt6 = dtype_class({col: df[col].dtype for col in df.columns})
        equiv = df.astype(dt6)
        assert_frame_equal(df, equiv)
        assert_frame_equal(df, original)

        # GH 16717
        # if dtypes provided is empty, the resulting DataFrame
        # should be the same as the original DataFrame
        dt7 = dtype_class({})
        result = df.astype(dt7)
        # Compare the empty-mapping result itself; the previous code re-checked
        # ``equiv`` from the dt6 step, leaving this result unverified.
        assert_frame_equal(df, result)
        assert_frame_equal(df, original)

    def test_astype_duplicate_col(self):
        """``astype`` handles a frame whose label "a" appears twice."""
        first_a = Series([1, 2, 3, 4, 5], name="a")
        middle_b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name="b")
        second_a = Series([0, 1, 2, 3, 4], name="a")
        frame = concat([first_a, middle_b, second_a], axis=1)

        # Whole-frame cast: every column becomes strings.
        everything_str = frame.astype(str)
        first_a_str = Series(["1", "2", "3", "4", "5"], dtype="str", name="a")
        middle_b_str = Series(["0.1", "0.2", "0.4", "0.6", "0.8"], dtype=str, name="b")
        second_a_str = Series(["0", "1", "2", "3", "4"], dtype="str", name="a")
        assert_frame_equal(
            everything_str, concat([first_a_str, middle_b_str, second_a_str], axis=1)
        )

        # Mapping cast: both "a" columns change, "b" keeps its dtype.
        only_a_str = frame.astype({"a": "str"})
        assert_frame_equal(
            only_a_str, concat([first_a_str, middle_b, second_a_str], axis=1)
        )

            CategoricalDtype(categories=list("edba"), ordered=False),
            CategoricalDtype(categories=list("edcb"), ordered=True),
    def test_astype_categorical(self, dtype):
        # GH 18099
        d = {"A": list("abbc"), "B": list("bccd"), "C": list("cdde")}
        df = DataFrame(d)
        result = df.astype(dtype)
        expected = DataFrame({k: Categorical(d[k], dtype=dtype) for k in d})
        tm.assert_frame_equal(result, expected)

    # NOTE(review): the decorator supplying ``cls`` was missing from this copy
    # of the file. CategoricalDtype is taken from the test name; the original
    # may have listed further dtype classes -- confirm.
    @pytest.mark.parametrize("cls", [CategoricalDtype])
    def test_astype_categoricaldtype_class_raises(self, cls):
        """Passing a dtype class instead of an instance raises TypeError."""
        df = DataFrame({"A": ["a", "a", "b", "c"]})
        xpr = "Expected an instance of {}".format(cls.__name__)
        with pytest.raises(TypeError, match=xpr):
            df.astype({"A": cls})

        # NOTE(review): this ``with`` body was missing; the column-level cast
        # is a reconstruction -- confirm.
        with pytest.raises(TypeError, match=xpr):
            df["A"].astype(cls)

    @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"])
    def test_astype_extension_dtypes(self, dtype):
        """Cast a two-column float frame to integer extension dtypes (GH 22578)."""
        df = pd.DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], columns=["a", "b"])

        expected1 = pd.DataFrame(
            {
                "a": integer_array([1, 3, 5], dtype=dtype),
                "b": integer_array([2, 4, 6], dtype=dtype),
            }
        )
        tm.assert_frame_equal(df.astype(dtype), expected1)
        tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1)
        # Round trip back to float64 returns the starting frame.
        tm.assert_frame_equal(df.astype(dtype).astype("float64"), df)

        # Same checks starting from a frame that mixes float and extension
        # columns.
        df = pd.DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], columns=["a", "b"])
        df["b"] = df["b"].astype(dtype)
        expected2 = pd.DataFrame(
            {"a": [1.0, 3.0, 5.0], "b": integer_array([2, 4, 6], dtype=dtype)}
        )
        tm.assert_frame_equal(df, expected2)

        tm.assert_frame_equal(df.astype(dtype), expected1)
        tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1)

    @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"])
    def test_astype_extension_dtypes_1d(self, dtype):
        """Single-column variant of the integer extension dtype cast (GH 22578)."""
        floats = pd.DataFrame({"a": [1.0, 2.0, 3.0]})

        as_extension = pd.DataFrame({"a": integer_array([1, 2, 3], dtype=dtype)})
        tm.assert_frame_equal(floats.astype(dtype), as_extension)
        tm.assert_frame_equal(floats.astype("int64").astype(dtype), as_extension)

        # Repeat with a frame whose column was converted via column assignment.
        assigned = pd.DataFrame({"a": [1.0, 2.0, 3.0]})
        assigned["a"] = assigned["a"].astype(dtype)
        after_assignment = pd.DataFrame({"a": integer_array([1, 2, 3], dtype=dtype)})
        tm.assert_frame_equal(assigned, after_assignment)

        tm.assert_frame_equal(assigned.astype(dtype), as_extension)
        tm.assert_frame_equal(assigned.astype("int64").astype(dtype), as_extension)

    @pytest.mark.parametrize("dtype", ["category", "Int64"])
    def test_astype_extension_dtypes_duplicate_col(self, dtype):
        """Extension casts work when both columns share one label (GH 24704)."""
        left = Series([0, np.nan, 4], name="a")
        right = Series([np.nan, 3, 5], name="a")
        frame = concat([left, right], axis=1)

        converted = frame.astype(dtype)
        # The frame-level cast should equal casting each column on its own.
        wanted = concat([left.astype(dtype), right.astype(dtype)], axis=1)
        assert_frame_equal(converted, wanted)

        "dtype", [{100: "float64", 200: "uint64"}, "category", "float64"]
    def test_astype_column_metadata(self, dtype):
        # GH 19920
        columns = pd.UInt64Index([100, 200, 300], name="foo")
        df = DataFrame(np.arange(15).reshape(5, 3), columns=columns)
        df = df.astype(dtype)
        tm.assert_index_equal(df.columns, columns)

    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_from_datetimelike_to_objectt(self, dtype, unit):
        """Datetime-like frames cast to object dtype for every unit.

        See gh-19223 / gh-12425.
        """
        dtype = "{}[{}]".format(dtype, unit)
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(object)
        assert (result.dtypes == object).all()

        # The ``else`` was missing, so both comparisons ran for M8 (where they
        # contradict each other) and nothing was checked for m8.
        if dtype.startswith("M8"):
            assert result.iloc[0, 0] == pd.to_datetime(1, unit=unit)
        else:
            assert result.iloc[0, 0] == pd.to_timedelta(1, unit=unit)

    @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64])
    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit):
        """Numeric frames cast to every datetime/timedelta unit.

        See gh-19223 / gh-12425.
        """
        target = "{}[{}]".format(dtype, unit)
        values = np.array([[1, 2, 3]], dtype=arr_dtype)

        converted = DataFrame(values).astype(target)
        # The frame cast should agree with casting the raw array first.
        wanted = DataFrame(values.astype(target))

        tm.assert_frame_equal(converted, wanted)

    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_to_datetime_unit(self, unit):
        """Datetime frames cast to their own unit, for every unit (gh-19223)."""
        target = "M8[{}]".format(unit)
        values = np.array([[1, 2, 3]], dtype=target)

        converted = DataFrame(values).astype(target)
        wanted = DataFrame(values.astype(target))

        tm.assert_frame_equal(converted, wanted)

    @pytest.mark.parametrize("unit", ["ns"])
    def test_astype_to_timedelta_unit_ns(self, unit):
        """Nanosecond timedelta frames keep their dtype through astype (gh-19223)."""
        target = "m8[{}]".format(unit)
        values = np.array([[1, 2, 3]], dtype=target)

        converted = DataFrame(values).astype(target)
        wanted = DataFrame(values.astype(target))

        tm.assert_frame_equal(converted, wanted)

    @pytest.mark.parametrize("unit", ["us", "ms", "s", "h", "m", "D"])
    def test_astype_to_timedelta_unit(self, unit):
        """Non-nanosecond timedelta casts are compared against floats (gh-19223)."""
        target = "m8[{}]".format(unit)
        frame = DataFrame(np.array([[1, 2, 3]], dtype=target))

        converted = frame.astype(target)
        # The expectation is built from the frame's values cast to the unit
        # and then to float.
        wanted = DataFrame(frame.values.astype(target).astype(float))

        tm.assert_frame_equal(converted, wanted)

    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_to_incorrect_datetimelike(self, unit):
        """Casting datetime to timedelta, or the reverse, raises TypeError (gh-19224)."""
        dtype = "M8[{}]".format(unit)
        other = "m8[{}]".format(unit)

        # The closing ``).format(unit)`` of both messages and both ``with``
        # bodies were missing. The ``{}`` placeholder needs the unit filled in,
        # and each cast targets the opposite kind named in its message.
        df = DataFrame(np.array([[1, 2, 3]], dtype=dtype))
        msg = (
            r"cannot astype a datetimelike from \[datetime64\[ns\]\] to"
            r" \[timedelta64\[{}\]\]"
        ).format(unit)
        with pytest.raises(TypeError, match=msg):
            df.astype(other)

        msg = (
            r"cannot astype a timedelta from \[timedelta64\[ns\]\] to"
            r" \[datetime64\[{}\]\]"
        ).format(unit)
        df = DataFrame(np.array([[1, 2, 3]], dtype=other))
        with pytest.raises(TypeError, match=msg):
            df.astype(dtype)

    def test_timedeltas(self):
        """Check ``dtypes`` as datetime, timedelta and integer columns are added."""
        df = DataFrame(
            dict(
                A=Series(date_range("2012-1-1", periods=3, freq="D")),
                B=Series([timedelta(days=i) for i in range(3)]),
            )
        )
        result = df.dtypes
        expected = Series(
            [np.dtype("datetime64[ns]"), np.dtype("timedelta64[ns]")], index=list("AB")
        )
        assert_series_equal(result, expected)

        # NOTE(review): the contents of the next two ``Series(...)`` calls were
        # missing from the file. They are rebuilt from the columns created
        # here (datetime + timedelta, then an integer scalar) -- confirm.
        df["C"] = df["A"] + df["B"]
        result = df.dtypes
        expected = Series(
            [
                np.dtype("datetime64[ns]"),
                np.dtype("timedelta64[ns]"),
                np.dtype("datetime64[ns]"),
            ],
            index=list("ABC"),
        )
        assert_series_equal(result, expected)

        # mixed int types
        df["D"] = 1
        result = df.dtypes
        expected = Series(
            [
                np.dtype("datetime64[ns]"),
                np.dtype("timedelta64[ns]"),
                np.dtype("datetime64[ns]"),
                np.dtype("int64"),
            ],
            index=list("ABCD"),
        )
        assert_series_equal(result, expected)

    def test_arg_for_errors_in_astype(self):
        """Validate the ``errors`` argument of ``astype`` (issue #14878)."""
        frame = DataFrame([1, 2, 3])

        # A boolean is not an accepted value for ``errors``.
        with pytest.raises(ValueError):
            frame.astype(np.float64, errors=True)

        # The string form is accepted and the call completes.
        frame.astype(np.int8, errors="ignore")

    def test_arg_for_errors_in_astype_dictlist(self):
        """``errors="ignore"`` applies per column for a dtype mapping (GH-25905)."""
        df = pd.DataFrame(
            [
                {"a": "1", "b": "16.5%", "c": "test"},
                {"a": "2.2", "b": "15.3", "c": "another_test"},
            ]
        )
        # Column "a" converts; "b" contains "16.5%" and is expected unchanged.
        expected = pd.DataFrame(
            [
                {"a": 1.0, "b": "16.5%", "c": "test"},
                {"a": 2.2, "b": "15.3", "c": "another_test"},
            ]
        )
        type_dict = {"a": "float64", "b": "float64", "c": "object"}

        result = df.astype(dtype=type_dict, errors="ignore")

        tm.assert_frame_equal(result, expected)

            ([1, 2]),
            (["1", "2"]),
            (list(pd.date_range("1/1/2011", periods=2, freq="H"))),
            (list(pd.date_range("1/1/2011", periods=2, freq="H", tz="US/Eastern"))),
            ([pd.Interval(left=0, right=5)]),
    def test_constructor_list_str(self, input_vals, string_dtype):
        # GH 16605
        # Ensure that data elements are converted to strings when
        # dtype is str, 'str', or 'U'

        result = DataFrame({"A": input_vals}, dtype=string_dtype)
        expected = DataFrame({"A": input_vals}).astype({"A": string_dtype})
        assert_frame_equal(result, expected)

    def test_constructor_list_str_na(self, string_dtype):
        """A ``None`` entry stays ``None`` when constructing with a string dtype."""
        built = DataFrame({"A": [1.0, 2.0, None]}, dtype=string_dtype)
        wanted = DataFrame({"A": ["1.0", "2.0", None]}, dtype=object)
        assert_frame_equal(built, wanted)

        "data, expected",
            # empty
            (DataFrame(), True),
            # multi-same
            (DataFrame({"A": [1, 2], "B": [1, 2]}), True),
            # multi-object
                        "A": np.array([1, 2], dtype=object),
                        "B": np.array(["a", "b"], dtype=object),
            # multi-extension
                    {"A": pd.Categorical(["a", "b"]), "B": pd.Categorical(["a", "b"])}
            # differ types
            (DataFrame({"A": [1, 2], "B": [1.0, 2.0]}), False),
            # differ sizes
                        "A": np.array([1, 2], dtype=np.int32),
                        "B": np.array([1, 2], dtype=np.int64),
            # multi-extension differ
                    {"A": pd.Categorical(["a", "b"]), "B": pd.Categorical(["b", "c"])}
    def test_is_homogeneous_type(self, data, expected):
        assert data._is_homogeneous_type is expected

    def test_asarray_homogenous(self):
        """``np.asarray`` on an all-categorical frame gives an object array."""
        frame = pd.DataFrame(
            {"A": pd.Categorical([1, 2]), "B": pd.Categorical([1, 2])}
        )
        as_array = np.asarray(frame)
        # may change from object in the future
        wanted = np.array([[1, 1], [2, 2]], dtype="object")
        tm.assert_numpy_array_equal(as_array, wanted)
Beispiel #16
 def test_is_all_dates(self):
     """An IntervalIndex with Timestamp bounds is not all-dates (GH 23576)."""
     start = pd.Timestamp('2017-01-01 00:00:00')
     stop = pd.Timestamp('2018-01-01 00:00:00')
     index = pd.IntervalIndex([pd.Interval(start, stop)])
     assert not index.is_all_dates
Beispiel #17
    def testJSONSerialize(self):
        # For every serial type: build nodes, serialize them with a JSON
        # provider, round-trip the result through json.dumps/json.loads,
        # deserialize, and compare the restored nodes field by field.
        # NOTE(review): many lines of this method (closing brackets, keyword
        # names, several arguments) are missing from this copy of the file, so
        # the statements below are not syntactically complete as shown.
        for serial_type in self._get_serial_types():
            provider = JsonSerializeProvider(

            node2 = Node2(a=[['ss'], ['dd']], data=[3, 7, 212])
            node1 = Node1(
                cl1=1 + 2j,
                cl2=2.5 + 3.1j,
                g=Node2(a=[['1', '2'], ['3', '4']]),
                h=[[2, 3], node2, True, {
                    1: node2
                   np.timedelta64(1, 'D'),
                   np.complex64(1 + 2j),
                   np.complex128(2 + 3j), lambda x: x + 2,
                       [pd.Interval(0, 1),
                        pd.Interval(1, 5)]),
                   nt(1, 2)],
                i=[Node8(b1=111), Node8(b1=222)],
                j=Node2(a=[['u'], ['v']]),
                   Node8(b1=222, j=Node5(a='xyz')), None],
                l=lambda x: x + 1,
                    [pd.Interval(0, 1), pd.Interval(1, 5)]),
                o=nt(3, 4))
            node3 = Node3(value=node1)

            serials = serializes(provider, [node2, node3])
            # Round-trip each serialized item through JSON text, loading
            # objects as OrderedDict.
            serials = [
                json.loads(json.dumps(s), object_hook=OrderedDict)
                for s in serials

            # pickle.loads is patched during deserialization; which loader is
            # used depends on whether the serial type is PICKLE.
            loads_fun = _loads_with_check if serial_type == dataserializer.SerialType.PICKLE \
                else original_pickle_loads
            with unittest.mock.patch('pickle.loads', new=loads_fun):
                d_node2, d_node3 = deserializes(provider, [Node2, Node3],

            # Restored nodes must be new objects carrying equal field values.
            self.assertIsNot(node2, d_node2)
            self.assertEqual(node2.a, d_node2.a)

            self.assertIsNot(node3, d_node3)
            self.assertIsInstance(d_node3.value, Node8)
            self.assertIsNot(node3.value, d_node3.value)
            self.assertEqual(node3.value.a, d_node3.value.a)
            self.assertEqual(node3.value.b1, d_node3.value.b1)
            self.assertEqual(node3.value.b2, d_node3.value.b2)
            self.assertEqual(node3.value.b3, d_node3.value.b3)
            self.assertEqual(node3.value.b4, d_node3.value.b4)
            self.assertEqual(node3.value.c1, d_node3.value.c1)
            self.assertEqual(node3.value.c2, d_node3.value.c2)
            self.assertEqual(node3.value.c3, d_node3.value.c3)
            self.assertEqual(node3.value.c4, d_node3.value.c4)
            # The d1/d2/d3 and cl1/cl2 fields are compared approximately.
            self.assertAlmostEqual(node3.value.d1, d_node3.value.d1, places=2)
            self.assertAlmostEqual(node3.value.d2, d_node3.value.d2, places=4)
            self.assertAlmostEqual(node3.value.d3, d_node3.value.d3)
            self.assertAlmostEqual(node3.value.cl1, d_node3.value.cl1)
            self.assertAlmostEqual(node3.value.cl2, d_node3.value.cl2)
            self.assertEqual(node3.value.e, d_node3.value.e)
            self.assertIsNot(node3.value.f1, d_node3.value.f1)
            self.assertEqual(node3.value.f1.a, d_node3.value.f1.a)
            self.assertIsNot(node3.value.f2, d_node3.value.f2)
            self.assertEqual(node3.value.f2.a, d_node3.value.f2.a)
            self.assertIsNot(node3.value.g, d_node3.value.g)
            self.assertEqual(node3.value.g.a, d_node3.value.g.a)
            self.assertEqual(node3.value.h[0], d_node3.value.h[0])
            self.assertNotIsInstance(d_node3.value.h[1], str)
            # The restored h[1] must be the same object as the restored f1.
            self.assertIs(d_node3.value.h[1], d_node3.value.f1)
            self.assertEqual(node3.value.h[2], True)
            self.assertAlmostEqual(node3.value.h[6], d_node3.value.h[6])
            self.assertAlmostEqual(node3.value.h[7], d_node3.value.h[7])
            self.assertEqual(node3.value.h[8](2), 4)
            self.assertEqual(node3.value.h[9], d_node3.value.h[9])
            self.assertEqual(node3.value.h[11], d_node3.value.h[11])
            self.assertEqual([n.b1 for n in node3.value.i],
                             [n.b1 for n in d_node3.value.i])
            self.assertIsInstance(d_node3.value.i[0], Node8)
            self.assertIsInstance(d_node3.value.j, Node2)
            self.assertEqual(node3.value.j.a, d_node3.value.j.a)
            self.assertIsInstance(d_node3.value.k[0], Node5)
            self.assertEqual(node3.value.k[0].a, d_node3.value.k[0].a)
            self.assertIsInstance(d_node3.value.k[1], Node8)
            self.assertEqual(node3.value.k[1].b1, d_node3.value.k[1].b1)
            self.assertIsInstance(d_node3.value.k[1].j, Node5)
            self.assertEqual(node3.value.k[1].j.a, d_node3.value.k[1].j.a)
            # The restored callable is invoked to check it still works.
            self.assertEqual(d_node3.value.l(1), 2)
            self.assertEqual(d_node3.value.m, node3.value.m)
            np.testing.assert_array_equal(d_node3.value.n, node3.value.n)
            self.assertEqual(d_node3.value.o, node3.value.o)

            # Serializing a Node3 whose value is a plain string must raise.
            with self.assertRaises(ValueError):
                serializes(provider, [Node3(value='sth else')])
Beispiel #18
def create_file_index_for_climate_observations(
    parameter_set: DwdObservationDataset,
    resolution: Resolution,
    period: Period,
) -> pd.DataFrame:
    # NOTE(review): the docstring delimiters and many statement continuations
    # are missing from this copy of the file, so the text and code below are
    # not syntactically complete as shown.
    Function (cached) to create a file index of the DWD station data. The file index
    is created for an individual set of parameters.
        parameter_set: parameter of Parameter enumeration
        resolution: time resolution of TimeResolution enumeration
        period: period type of PeriodType enumeration
        file index in a pandas.DataFrame with sets of parameters and station id
    # Start from the generic server file index for climate observations.
    file_index = _create_file_index_for_dwd_server(
        parameter_set, resolution, period, DWDCDCBase.CLIMATE_OBSERVATIONS

    file_index = file_index[

    file_index[DwdColumns.STATION_ID.value] = (

    # Drop rows with missing values and renumber the index.
    file_index = file_index.dropna().reset_index(drop=True)

    file_index.loc[:, DwdColumns.STATION_ID.value] = file_index[

    # Only historical files at a high resolution get date-range columns.
    if resolution in HIGH_RESOLUTIONS and period == Period.HISTORICAL:
        # Date range string for additional filtering of historical files
        file_index[DwdColumns.DATE_RANGE.value] = (

        # The date-range string is split on "_" into from/to date columns.
        file_index[[DwdColumns.FROM_DATE.value, DwdColumns.TO_DATE.value]] = file_index[
        ].str.split("_", expand=True)

        file_index[DwdColumns.FROM_DATE.value] = pd.to_datetime(

        file_index[DwdColumns.TO_DATE.value] = pd.to_datetime(

        # Temporary fix for filenames with wrong ordered/faulty dates
        # Fill those cases with minimum/maximum date to ensure that they are loaded as
        # we don't know what exact date range the included data has
        wrong_date_order_index = (
            > file_index[DwdColumns.TO_DATE.value]

        file_index.loc[wrong_date_order_index, DwdColumns.FROM_DATE.value] = file_index[
        file_index.loc[wrong_date_order_index, DwdColumns.TO_DATE.value] = file_index[

        # A pd.Interval is built per row and stored in the interval column.
        file_index[DwdColumns.INTERVAL.value] = file_index.apply(
            lambda x: pd.Interval(

    # Sort by station id, then by filename.
    file_index = file_index.sort_values(
        by=[DwdColumns.STATION_ID.value, DwdColumns.FILENAME.value]

    return file_index
Beispiel #19
        pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
            ("infer_objects", False): np.dtype("object")
    (pd.period_range("1/1/2011", freq="M",
                     periods=3), None, pd.PeriodDtype("M"), {}),
        pd.arrays.IntervalArray([pd.Interval(0, 1),
                                 pd.Interval(1, 5)]),

class TestSeriesConvertDtypes:
        "data, maindtype, expected_default, expected_other",
    @pytest.mark.parametrize("params", product(*[(True, False)] * 5))
    def test_convert_dtypes(self, data, maindtype, params, expected_default,
Beispiel #20
     [np.datetime64('2013-01-01'), np.nan,
    ('datetime', [pd.Timestamp('20130101'), np.nan,
    ('date', [date(2013, 1, 1), np.nan,
              date(2018, 1, 1)]),
    # The following two dtypes are commented out due to GH 23554
    # ('complex', [1 + 1j, np.nan, 2 + 2j]),
    # ('timedelta64', [np.timedelta64(1, 'D'),
    #                  np.nan, np.timedelta64(2, 'D')]),
    ('timedelta', [timedelta(1), np.nan, timedelta(2)]),
    ('time', [time(1), np.nan, time(2)]),
    ('period', [pd.Period(2013), pd.NaT,
    ('interval', [pd.Interval(0, 1), np.nan,
                  pd.Interval(0, 2)])
ids, _ = zip(*_any_skipna_inferred_dtype)  # use inferred type as fixture-id

@pytest.fixture(params=_any_skipna_inferred_dtype, ids=ids)
def any_skipna_inferred_dtype(request):
    Fixture for all inferred dtypes from _libs.lib.infer_dtype

    The covered (inferred) types are:
    * 'string'
    * 'unicode' (if PY2)
    * 'empty'
    * 'bytes' (if PY3)
def correlate_binned_data(top_donor_data, binned, bins):
    # Correlate each binned donor group against the top-donor series over a
    # window of shifts, print the peak per interval and plot the curves.
    #
    # top_donor_data: object with a ``drop(label, axis=1)`` method that has
    #     "donated_amount" and "bin" columns (both are dropped below).
    # binned: grouped object supporting ``get_group(pd.Interval)``.
    # bins: sequence of bin edges; consecutive edges form the intervals.
    #
    # NOTE(review): several lines of this function are missing in this copy
    # (unclosed calls below), so the comments describe only what is visible.
    plot_hourly_runned_summed_data(binned, bins)

    print("TOP DONATION DF")

    # Strip the amount/bin columns before handing the frame to the helper.
    top_donor_data = top_donor_data.drop("donated_amount", axis=1)
    top_donor_data = top_donor_data.drop("bin", axis=1)
    top_donor_data_per_hour = fix_intervals_for_data(top_donor_data)

    corrs = []
    # Shifts from -hours_to_shift up to hours_to_shift - 1 are tried below.
    hours_to_shift = 300

    # get and print correlations for binned groups
    # NOTE(review): range(1, len(bins) - 1) never reaches the interval formed
    # by the last two edges -- confirm whether that is intended.
    for i in range(1, len(bins) - 1):
        left = bins[i - 1]
        right = bins[i]
        interval = pd.Interval(left=left, right=right)
        print("Interval to compare with top-donors", interval)

        data_to_comp = binned.get_group(interval)

        # Same column clean-up as for the top-donor frame above.
        data_to_comp = data_to_comp.drop("donated_amount", axis=1)
        data_to_comp = data_to_comp.drop("bin", axis=1)
        data_to_comp = fix_intervals_for_data(data_to_comp)

        # Assuming top donor data is the most influential

        corr_per_range = []
        # NOTE(review): this loop reuses the name ``i`` and so shadows the
        # outer bin index; harmless only because the outer ``i`` is not read
        # again after this point in the visible code.
        for i in range(-hours_to_shift, hours_to_shift):
            # hours are actually reversed
            # Work on copies so the slicing below never touches the originals.
            data_to_comp_mod = copy.deepcopy(data_to_comp)
            top_donor_data_per_hour_mod = copy.deepcopy(
            if i > 0:
                # Positive shift: trim the tail of one series and the head of
                # the other so both keep the same length.
                top_donor_data_per_hour_mod = top_donor_data_per_hour_mod[:-i]
                data_to_comp_mod = data_to_comp_mod[i:]
            elif i < 0:
                # Negative shift: the mirror image of the branch above.
                top_donor_data_per_hour_mod = top_donor_data_per_hour_mod[-i:]
                data_to_comp_mod = data_to_comp_mod[:i]
                pass  # its 0
            corr = np.corrcoef(
                data_to_comp_mod)[0, 1]  # grab the compared correlation

             corr_per_range))  # append tuple-> interval, corrs over the hours

    # init new plot
    fig, ax = plt.subplots()
    # Each entry of ``corrs`` is indexed as (interval, correlations).
    for data_brick in corrs:
        interval = data_brick[0]
        corrs_shifted = np.asarray(data_brick[1])

        # reverse the hour amount as this is logical for the graph
        x = np.asarray(range(-hours_to_shift, hours_to_shift)) * -1
                label="Interval: " + str(interval),
        # Locate the shift with the highest correlation for this interval.
        xmax = x[np.argmax(corrs_shifted)]
        ymax = corrs_shifted.max()

        print("Max correlation coefficients")
        print(xmax, ymax, interval)

        # Mark the peak on the plot.
        ax.plot(xmax, ymax, marker="o", ls="", ms=3)

    plt.ylabel("Correlation coefficient", fontsize=30)
    plt.xlabel("Hours shifted", fontsize=30)
    plt.legend(loc='upper left',
                   'Low donors', 'Peak correlation: Low donors',
                   'Medium donors', 'Peak correlation: medium donors',
                   'Large donors', 'Peak correlation: Large donors'
               prop={'size': 20})
Beispiel #22
class TestFillnaSeriesCoercion(CoercionBase):

    # not indexing, but place here for consistency

    method = "fillna"

    @pytest.mark.xfail(reason="Test not implemented")
    def test_has_comprehensive_tests(self):
        raise NotImplementedError

    def _assert_fillna_conversion(self, original, value, expected,
        """test coercion triggered by fillna"""
        target = original.copy()
        res = target.fillna(value)
        tm.assert_equal(res, expected)
        assert res.dtype == expected_dtype

        "fill_val, fill_dtype",
        [(1, object), (1.1, object), (1 + 1j, object), (True, object)],
    def test_fillna_object(self, index_or_series, fill_val, fill_dtype):
        """Run the fillna coercion check on an object container with one NaN."""
        box = index_or_series
        original = box(["a", np.nan, "c", "d"])
        assert original.dtype == object

        # Only the NaN slot is expected to change.
        expected = box(["a", fill_val, "c", "d"])
        self._assert_fillna_conversion(
            original, fill_val, expected, fill_dtype
        )

        [(1, np.float64), (1.1, np.float64), (1 + 1j, np.complex128),
         (True, object)],
    def test_fillna_float64(self, index_or_series, fill_val, fill_dtype):
        """Run the fillna coercion check on a float64 container with one NaN."""
        box = index_or_series
        original = box([1.1, np.nan, 3.3, 4.4])
        assert original.dtype == np.float64

        # Only the NaN slot is expected to change.
        expected = box([1.1, fill_val, 3.3, 4.4])
        self._assert_fillna_conversion(
            original, fill_val, expected, fill_dtype
        )

            (1, np.complex128),
            (1.1, np.complex128),
            (1 + 1j, np.complex128),
            (True, object),
    def test_fillna_complex128(self, index_or_series, fill_val, fill_dtype):
        """Run the fillna coercion check on a complex128 container with one NaN."""
        box = index_or_series
        original = box(
            [1 + 1j, np.nan, 3 + 3j, 4 + 4j], dtype=np.complex128
        )
        assert original.dtype == np.complex128

        # The expected container is built without an explicit dtype, so its
        # dtype follows from the fill value.
        expected = box([1 + 1j, fill_val, 3 + 3j, 4 + 4j])
        self._assert_fillna_conversion(
            original, fill_val, expected, fill_dtype
        )

            (pd.Timestamp("2012-01-01"), "datetime64[ns]"),
            (pd.Timestamp("2012-01-01", tz="US/Eastern"), object),
            (1, object),
            ("x", object),
        ids=["datetime64", "datetime64tz", "object", "object"],
    def test_fillna_datetime(self, index_or_series, fill_val, fill_dtype):
        klass = index_or_series
        obj = klass([
        assert obj.dtype == "datetime64[ns]"

        exp = klass([
        self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype)

                          tz="US/Eastern"), "datetime64[ns, US/Eastern]"),
            (pd.Timestamp("2012-01-01"), object),
            (pd.Timestamp("2012-01-01", tz="Asia/Tokyo"), object),
            (1, object),
            ("x", object),
    def test_fillna_datetime64tz(self, index_or_series, fill_val, fill_dtype):
        klass = index_or_series
        tz = "US/Eastern"

        obj = klass([
            pd.Timestamp("2011-01-01", tz=tz),
            pd.Timestamp("2011-01-03", tz=tz),
            pd.Timestamp("2011-01-04", tz=tz),
        assert obj.dtype == "datetime64[ns, US/Eastern]"

        exp = klass([
            pd.Timestamp("2011-01-01", tz=tz),
            # Once deprecation is enforced, this becomes:
            # fill_val.tz_convert(tz) if getattr(fill_val, "tz", None)
            #  is not None else fill_val,
            pd.Timestamp("2011-01-03", tz=tz),
            pd.Timestamp("2011-01-04", tz=tz),
        warn = None
        if getattr(fill_val, "tz",
                   None) is not None and != obj[0].tz:
            warn = FutureWarning
        with tm.assert_produces_warning(warn, match="mismatched timezone"):
            self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype)

            1 + 1j,
            pd.Interval(1, 2, inclusive="left"),
            pd.Timestamp("2012-01-01", tz="US/Eastern"),
            pd.Period("2016-01-01", "D"),
    def test_fillna_interval(self, index_or_series, fill_val):
        ii = pd.interval_range(1.0, 5.0, inclusive="right").insert(1, np.nan)
        assert isinstance(ii.dtype, pd.IntervalDtype)
        obj = index_or_series(ii)

        exp = index_or_series([ii[0], fill_val, ii[2], ii[3], ii[4]],

        fill_dtype = object
        self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype)

    @pytest.mark.xfail(reason="Test not implemented")
    def test_fillna_series_int64(self):
        raise NotImplementedError

    @pytest.mark.xfail(reason="Test not implemented")
    def test_fillna_index_int64(self):
        raise NotImplementedError

    @pytest.mark.xfail(reason="Test not implemented")
    def test_fillna_series_bool(self):
        raise NotImplementedError

    @pytest.mark.xfail(reason="Test not implemented")
    def test_fillna_index_bool(self):
        raise NotImplementedError

    @pytest.mark.xfail(reason="Test not implemented")
    def test_fillna_series_timedelta64(self):
        raise NotImplementedError

            1 + 1j,
            pd.Interval(1, 2, inclusive="left"),
            pd.Timestamp("2012-01-01", tz="US/Eastern"),
            pd.Period("2016-01-01", "W"),
    def test_fillna_series_period(self, index_or_series, fill_val):

        pi = pd.period_range("2016-01-01", periods=4,
                             freq="D").insert(1, pd.NaT)
        assert isinstance(pi.dtype, pd.PeriodDtype)
        obj = index_or_series(pi)

        exp = index_or_series([pi[0], fill_val, pi[2], pi[3], pi[4]],

        fill_dtype = object
        self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype)

    @pytest.mark.xfail(reason="Test not implemented")
    def test_fillna_index_timedelta64(self):
        raise NotImplementedError

    @pytest.mark.xfail(reason="Test not implemented")
    def test_fillna_index_period(self):
        raise NotImplementedError
Beispiel #23
     pd.TimedeltaIndex(["1H", "2H"]),
     TimedeltaArray._from_sequence(["1H", "2H"]),
 # Category
 (["a", "b"], "category", pd.Categorical(["a", "b"])),
     ["a", "b"],
     pd.CategoricalDtype(None, ordered=True),
     pd.Categorical(["a", "b"], ordered=True),
 # Interval
     [pd.Interval(1, 2), pd.Interval(3, 4)],
     IntervalArray.from_tuples([(1, 2), (3, 4)]),
 # Sparse
 ([0, 1], "Sparse[int64]", SparseArray([0, 1], dtype="int64")),
 # IntegerNA
 ([1, None], "Int16", integer_array([1, None], dtype="Int16")),
 (pd.Series([1, 2]), None, PandasArray(np.array([1, 2],
 # String
 (["a", None], "string", StringArray._from_sequence(["a", None])),
     ["a", None],
     StringArray._from_sequence(["a", None]),
Beispiel #24
    "array, expected",
        (np.array([1, 2], dtype=np.int64), np.array([1, 2], dtype=np.int64)),
        (pd.Categorical(["a", "b"]), np.array(["a", "b"], dtype=object)),
            pd.core.arrays.period_array(["2000", "2001"], freq="D"),
                [pd.Period("2000", freq="D"),
                 pd.Period("2001", freq="D")]),
        (pd.array([0, np.nan],
                  dtype="Int64"), np.array([0, pd.NA], dtype=object)),
            IntervalArray.from_breaks([0, 1, 2]),
            np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object),
        (SparseArray([0, 1]), np.array([0, 1], dtype=np.int64)),
        # tz-naive datetime
            DatetimeArray(np.array(["2000", "2001"], dtype="M8[ns]")),
            np.array(["2000", "2001"], dtype="M8[ns]"),
        # tz-aware stays tz-aware
                np.array(["2000-01-01T06:00:00", "2000-01-02T06:00:00"],
Beispiel #25
class TestCategoricalIndex(Base):
    _holder = CategoricalIndex

    def indices(self, request):
        """Return the index produced by ``tm.makeCategoricalIndex(100)``.

        ``request`` is accepted but not used in this body.
        """
        return tm.makeCategoricalIndex(100)

    def create_index(self, categories=None, ordered=False):
        if categories is None:
            categories = list("cab")
        return CategoricalIndex(list("aabbca"),

    def test_can_hold_identifiers(self):
        """Check the identifier predicate returns True for the first label."""
        index = self.create_index(categories=list("abcd"))
        first_label = index[0]
        verdict = index._can_hold_identifiers_and_holds_name(first_label)
        assert verdict is True

            (lambda idx: idx - idx, "__sub__"),
            (lambda idx: idx + idx, "__add__"),
            (lambda idx: idx - ["a", "b"], "__sub__"),
            (lambda idx: idx + ["a", "b"], "__add__"),
            (lambda idx: ["a", "b"] - idx, "__rsub__"),
            (lambda idx: ["a", "b"] + idx, "__radd__"),
    def test_disallow_addsub_ops(self, func, op_name):
        # GH 10039
        # set ops (+/-) raise TypeError
        idx = pd.Index(pd.Categorical(["a", "b"]))
        msg = f"cannot perform {op_name} with this index type: CategoricalIndex"
        with pytest.raises(TypeError, match=msg):

    def test_method_delegation(self):

        ci = CategoricalIndex(list("aabbca"), categories=list("cabdef"))
        result = ci.set_categories(list("cab"))
            result, CategoricalIndex(list("aabbca"), categories=list("cab")))

        ci = CategoricalIndex(list("aabbca"), categories=list("cab"))
        result = ci.rename_categories(list("efg"))
            result, CategoricalIndex(list("ffggef"), categories=list("efg")))

        # GH18862 (let rename_categories take callables)
        result = ci.rename_categories(lambda x: x.upper())
            result, CategoricalIndex(list("AABBCA"), categories=list("CAB")))

        ci = CategoricalIndex(list("aabbca"), categories=list("cab"))
        result = ci.add_categories(["d"])
            result, CategoricalIndex(list("aabbca"), categories=list("cabd")))

        ci = CategoricalIndex(list("aabbca"), categories=list("cab"))
        result = ci.remove_categories(["c"])
            CategoricalIndex(list("aabb") + [np.nan] + ["a"],

        ci = CategoricalIndex(list("aabbca"), categories=list("cabdef"))
        result = ci.as_unordered()
        tm.assert_index_equal(result, ci)

        ci = CategoricalIndex(list("aabbca"), categories=list("cabdef"))
        result = ci.as_ordered()

        # invalid
        msg = "cannot use inplace with CategoricalIndex"
        with pytest.raises(ValueError, match=msg):
            ci.set_categories(list("cab"), inplace=True)

    def test_contains(self):

        ci = self.create_index(categories=list("cabdef"))

        assert "a" in ci
        assert "z" not in ci
        assert "e" not in ci
        assert np.nan not in ci

        # assert codes NOT in index
        assert 0 not in ci
        assert 1 not in ci

        ci = CategoricalIndex(list("aabbca") + [np.nan],
        assert np.nan in ci

        "item, expected",
            (pd.Interval(0, 1), True),
            (1.5, True),
            (pd.Interval(0.5, 1.5), False),
            ("a", False),
            (pd.Timestamp(1), False),
            (pd.Timedelta(1), False),
    def test_contains_interval(self, item, expected):
        """Check ``in`` on a CategoricalIndex built from intervals (GH 23705)."""
        interval_values = IntervalIndex.from_breaks(range(3))
        index = CategoricalIndex(interval_values)
        # Identity comparison: the membership test must yield the exact bool.
        is_member = item in index
        assert is_member is expected

    def test_contains_list(self):
        """Check that list keys raise in a membership test (GH#21729)."""
        index = pd.CategoricalIndex([1, 2, 3])

        # A hashable key that is simply absent answers False.
        assert "a" not in index

        # Lists are unhashable, so the lookup must raise instead of answering.
        for unhashable_key in (["a"], ["a", "b"]):
            with pytest.raises(TypeError, match="unhashable type"):
                unhashable_key in index

    def test_map(self):
        ci = pd.CategoricalIndex(list("ABABC"),
        result = x: x.lower())
        exp = pd.CategoricalIndex(list("ababc"),
        tm.assert_index_equal(result, exp)

        ci = pd.CategoricalIndex(list("ABABC"),
        result = x: x.lower())
        exp = pd.CategoricalIndex(list("ababc"),
        tm.assert_index_equal(result, exp)

        # GH 12766: Return an index not an array
   x: 1),
            Index(np.array([1] * 5, dtype=np.int64), name="XXX"))

        # change categories dtype
        ci = pd.CategoricalIndex(list("ABABC"),

        def f(x):
            return {"A": 10, "B": 20, "C": 30}.get(x)

        result =
        exp = pd.CategoricalIndex([10, 20, 10, 20, 30],
                                  categories=[20, 10, 30],
        tm.assert_index_equal(result, exp)

        result =[10, 20, 30], index=["A", "B", "C"]))
        tm.assert_index_equal(result, exp)

        result ={"A": 10, "B": 20, "C": 30})
        tm.assert_index_equal(result, exp)

    def test_map_with_categorical_series(self):
        # GH 12756
        a = pd.Index([1, 2, 3, 4])
        b = pd.Series(["even", "odd", "even", "odd"], dtype="category")
        c = pd.Series(["even", "odd", "even", "odd"])

        exp = CategoricalIndex(["odd", "even", "odd", np.nan])
        tm.assert_index_equal(, exp)
        exp = pd.Index(["odd", "even", "odd", np.nan])
        tm.assert_index_equal(, exp)

        ("data", "f"),
            ([1, 1, np.nan], pd.isna),
            ([1, 2, np.nan], pd.isna),
            ([1, 1, np.nan], {
                1: False
            ([1, 2, np.nan], {
                1: False,
                2: False
            ([1, 1, np.nan], pd.Series([False, False])),
            ([1, 2, np.nan], pd.Series([False, False, False])),
    def test_map_with_nan(self, data, f):  # GH 24241
        values = pd.Categorical(data)
        result =
        if data[1] == 1:
            expected = pd.Categorical([False, False, np.nan])
            tm.assert_categorical_equal(result, expected)
            expected = pd.Index([False, False, np.nan])
            tm.assert_index_equal(result, expected)

    @pytest.mark.parametrize("klass", [list, tuple, np.array, pd.Series])
    def test_where(self, klass):
        i = self.create_index()
        cond = [True] * len(i)
        expected = i
        result = i.where(klass(cond))
        tm.assert_index_equal(result, expected)

        cond = [False] + [True] * (len(i) - 1)
        expected = CategoricalIndex([np.nan] + i[1:].tolist(),
        result = i.where(klass(cond))
        tm.assert_index_equal(result, expected)

    def test_append(self):

        ci = self.create_index()
        categories = ci.categories

        # append cats with the same categories
        result = ci[:3].append(ci[3:])
        tm.assert_index_equal(result, ci, exact=True)

        foos = [ci[:1], ci[1:3], ci[3:]]
        result = foos[0].append(foos[1:])
        tm.assert_index_equal(result, ci, exact=True)

        # empty
        result = ci.append([])
        tm.assert_index_equal(result, ci, exact=True)

        # appending with different categories or reordered is not ok
        msg = "all inputs must be Index"
        with pytest.raises(TypeError, match=msg):
        with pytest.raises(TypeError, match=msg):

        # with objects
        result = ci.append(Index(["c", "a"]))
        expected = CategoricalIndex(list("aabbcaca"), categories=categories)
        tm.assert_index_equal(result, expected, exact=True)

        # invalid objects
        msg = "cannot append a non-category item to a CategoricalIndex"
        with pytest.raises(TypeError, match=msg):
            ci.append(Index(["a", "d"]))

        # GH14298 - if base object is not categorical -> coerce to object
        result = Index(["c", "a"]).append(ci)
        expected = Index(list("caaabbca"))
        tm.assert_index_equal(result, expected, exact=True)

    def test_append_to_another(self):
        """Appending a CategoricalIndex to a plain Index gives a plain Index."""
        # hits Index._concat_same_dtype
        plain = Index(["a", "b"])
        categorical = CategoricalIndex(["d", "e"])
        combined = plain.append(categorical)
        tm.assert_index_equal(combined, Index(["a", "b", "d", "e"]))

    def test_insert(self):
        """Check ``insert`` with existing categories, new labels and NA."""
        base = self.create_index()
        cats = base.categories

        # test 0th element, then the Nth element that follows Python list
        # behavior
        for position, letters in ((0, "aaabbca"), (-1, "aabbcaa")):
            inserted = base.insert(position, "a")
            wanted = CategoricalIndex(list(letters), categories=cats)
            tm.assert_index_equal(inserted, wanted, exact=True)

        # test empty
        inserted = CategoricalIndex(categories=cats).insert(0, "a")
        wanted = CategoricalIndex(["a"], categories=cats)
        tm.assert_index_equal(inserted, wanted, exact=True)

        # invalid: "d" is not among the categories
        msg = ("cannot insert an item into a CategoricalIndex that is not"
               " already an existing category")
        with pytest.raises(TypeError, match=msg):
            base.insert(0, "d")

        # GH 18295 (test missing)
        wanted = CategoricalIndex(["a", np.nan, "a", "b", "c", "b"])
        for missing in (np.nan, pd.NaT, None):
            inserted = CategoricalIndex(list("aabcb")).insert(1, missing)
            tm.assert_index_equal(inserted, wanted)

    def test_delete(self):

        ci = self.create_index()
        categories = ci.categories

        result = ci.delete(0)
        expected = CategoricalIndex(list("abbca"), categories=categories)
        tm.assert_index_equal(result, expected, exact=True)

        result = ci.delete(-1)
        expected = CategoricalIndex(list("aabbc"), categories=categories)
        tm.assert_index_equal(result, expected, exact=True)

        with pytest.raises((IndexError, ValueError)):
            # Either depending on NumPy version

    def test_astype(self):

        ci = self.create_index()
        result = ci.astype(object)
        tm.assert_index_equal(result, Index(np.array(ci)))

        # this IS equal, but not the same class
        assert result.equals(ci)
        assert isinstance(result, Index)
        assert not isinstance(result, CategoricalIndex)

        # interval
        ii = IntervalIndex.from_arrays(left=[-0.001, 2.0],
                                       right=[2, 4],

        ci = CategoricalIndex(
            Categorical.from_codes([0, 1, -1], categories=ii, ordered=True))

        result = ci.astype("interval")
        expected = ii.take([0, 1, -1])
        tm.assert_index_equal(result, expected)

        result = IntervalIndex(result.values)
        tm.assert_index_equal(result, expected)

    @pytest.mark.parametrize("name", [None, "foo"])
    @pytest.mark.parametrize("dtype_ordered", [True, False])
    @pytest.mark.parametrize("index_ordered", [True, False])
    def test_astype_category(self, name, dtype_ordered, index_ordered):
        # GH 18630
        index = self.create_index(ordered=index_ordered)
        if name:
            index = index.rename(name)

        # standard categories
        dtype = CategoricalDtype(ordered=dtype_ordered)
        result = index.astype(dtype)
        expected = CategoricalIndex(
        tm.assert_index_equal(result, expected)

        # non-standard categories
        dtype = CategoricalDtype(index.unique().tolist()[:-1], dtype_ordered)
        result = index.astype(dtype)
        expected = CategoricalIndex(index.tolist(), name=name, dtype=dtype)
        tm.assert_index_equal(result, expected)

        if dtype_ordered is False:
            # dtype='category' can't specify ordered, so only test once
            result = index.astype("category")
            expected = index
            tm.assert_index_equal(result, expected)

    def test_reindex_base(self):
        """Check self-lookup positions and rejection of a bad fill method."""
        # Determined by cat ordering.
        index = CategoricalIndex(list("cab"), categories=list("cab"))
        wanted = np.arange(len(index), dtype=np.intp)

        tm.assert_numpy_array_equal(wanted, index.get_indexer(index))

        with pytest.raises(ValueError, match="Invalid fill method"):
            index.get_indexer(index, method="invalid")

    def test_reindexing(self):
        """Compare ``get_indexer`` with the object-dtype index's lookup."""
        cat_index = self.create_index()
        obj_index = Index(np.array(cat_index))
        size = len(cat_index)

        # Random targets of several lengths drawn from the index itself.
        for sample_size in [1, 2, 5, size]:
            picks = np.random.randint(0, size, size=sample_size)
            finder = obj_index[picks]
            wanted = obj_index.get_indexer_non_unique(finder)[0]
            tm.assert_numpy_array_equal(wanted, cat_index.get_indexer(finder))

        # see gh-17323
        # Even when indexer is equal to the
        # members in the index, we should
        # respect duplicates instead of taking
        # the fast-track path.
        duplicate_targets = [list("aabbca"), list("aababca")]
        for finder in duplicate_targets:
            wanted = obj_index.get_indexer_non_unique(finder)[0]
            tm.assert_numpy_array_equal(wanted, cat_index.get_indexer(finder))

    def test_reindex_dtype(self):
        c = CategoricalIndex(["a", "b", "c", "a"])
        res, indexer = c.reindex(["a", "c"])
        tm.assert_index_equal(res, Index(["a", "a", "c"]), exact=True)
        tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2],

        c = CategoricalIndex(["a", "b", "c", "a"])
        res, indexer = c.reindex(Categorical(["a", "c"]))

        exp = CategoricalIndex(["a", "a", "c"], categories=["a", "c"])
        tm.assert_index_equal(res, exp, exact=True)
        tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2],

        c = CategoricalIndex(["a", "b", "c", "a"],
                             categories=["a", "b", "c", "d"])
        res, indexer = c.reindex(["a", "c"])
        exp = Index(["a", "a", "c"], dtype="object")
        tm.assert_index_equal(res, exp, exact=True)
        tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2],

        c = CategoricalIndex(["a", "b", "c", "a"],
                             categories=["a", "b", "c", "d"])
        res, indexer = c.reindex(Categorical(["a", "c"]))
        exp = CategoricalIndex(["a", "a", "c"], categories=["a", "c"])
        tm.assert_index_equal(res, exp, exact=True)
        tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2],

    def test_reindex_duplicate_target(self):
        # See GH25459
        cat = CategoricalIndex(["a", "b", "c"],
                               categories=["a", "b", "c", "d"])
        res, indexer = cat.reindex(["a", "c", "c"])
        exp = Index(["a", "c", "c"], dtype="object")
        tm.assert_index_equal(res, exp, exact=True)
        tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2],

        res, indexer = cat.reindex(
            CategoricalIndex(["a", "c", "c"], categories=["a", "b", "c", "d"]))
        exp = CategoricalIndex(["a", "c", "c"],
                               categories=["a", "b", "c", "d"])
        tm.assert_index_equal(res, exp, exact=True)
        tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2],

    def test_reindex_empty_index(self):
        """Reindexing an empty CategoricalIndex finds nothing (GH16770)."""
        empty = CategoricalIndex([])
        new_index, positions = empty.reindex(["a", "b"])

        tm.assert_index_equal(new_index, Index(["a", "b"]), exact=True)
        # Neither requested label exists, so both positions are -1.
        not_found = np.array([-1, -1], dtype=np.intp)
        tm.assert_numpy_array_equal(positions, not_found)

        "data, non_lexsorted_data",
        [[[1, 2, 3], [9, 0, 1, 2, 3]], [list("abc"),
    def test_is_monotonic(self, data, non_lexsorted_data):
        """Check the monotonic flags for several category orders."""

        def check(index, increasing, decreasing):
            # Identity checks: both flags must be real booleans.
            assert index.is_monotonic_increasing is increasing
            assert index.is_monotonic_decreasing is decreasing

        # default category order, unordered then ordered
        check(CategoricalIndex(data), True, False)
        check(CategoricalIndex(data, ordered=True), True, False)

        # reversed category order flips both flags
        check(CategoricalIndex(data, categories=reversed(data)), False, True)
        check(
            CategoricalIndex(data, categories=reversed(data), ordered=True),
            False,
            True,
        )

        # test when data is neither monotonic increasing nor decreasing
        shuffled = [data[0], data[2], data[1]]
        check(
            CategoricalIndex(shuffled, categories=reversed(data)),
            False,
            False,
        )

        # non lexsorted categories
        cats = non_lexsorted_data
        check(CategoricalIndex(cats[:2], categories=cats), True, False)
        check(CategoricalIndex(cats[1:3], categories=cats), True, False)

    def test_has_duplicates(self):
        """An index of repeated values is non-unique and has duplicates."""
        repeated = CategoricalIndex([0, 0, 0], name="foo")
        unique_flag = repeated.is_unique
        assert unique_flag is False
        duplicate_flag = repeated.has_duplicates
        assert duplicate_flag is True

    def test_drop_duplicates(self):
        """``drop_duplicates`` and ``unique`` both collapse repeats to one entry."""
        repeated = CategoricalIndex([0, 0, 0], name="foo")
        collapsed = CategoricalIndex([0], name="foo")

        deduplicated = repeated.drop_duplicates()
        tm.assert_index_equal(deduplicated, collapsed)

        uniques = repeated.unique()
        tm.assert_index_equal(uniques, collapsed)

    def test_get_indexer(self):
        """Check ``get_indexer`` results and the unsupported fill methods.

        The same labels are looked up as a CategoricalIndex, a list and a
        plain Index; all three must give the same positions. The ``pad``,
        ``backfill`` and ``nearest`` methods must raise NotImplementedError.
        """
        idx1 = CategoricalIndex(list("aabcde"), categories=list("edabc"))
        idx2 = CategoricalIndex(list("abf"))

        expected = np.array([0, 1, 2, -1], dtype=np.intp)
        for indexer in [idx2, list("abf"), Index(list("abf"))]:
            # Look up the loop variable itself; previously ``idx2`` was passed
            # on every iteration, so the list and Index targets were never
            # exercised.
            r1 = idx1.get_indexer(indexer)
            tm.assert_almost_equal(r1, expected)

        msg = ("method='pad' and method='backfill' not implemented yet for"
               " CategoricalIndex")
        for method in ("pad", "backfill"):
            with pytest.raises(NotImplementedError, match=msg):
                idx2.get_indexer(idx1, method=method)

        msg = "method='nearest' not implemented yet for CategoricalIndex"
        with pytest.raises(NotImplementedError, match=msg):
            idx2.get_indexer(idx1, method="nearest")

    def test_get_loc(self):
        # GH 12531
        cidx1 = CategoricalIndex(list("abcde"), categories=list("edabc"))
        idx1 = Index(list("abcde"))
        assert cidx1.get_loc("a") == idx1.get_loc("a")
        assert cidx1.get_loc("e") == idx1.get_loc("e")

        for i in [cidx1, idx1]:
            with pytest.raises(KeyError, match="'NOT-EXIST'"):

        # non-unique
        cidx2 = CategoricalIndex(list("aacded"), categories=list("edabc"))
        idx2 = Index(list("aacded"))

        # results in bool array
        res = cidx2.get_loc("d")
        tm.assert_numpy_array_equal(res, idx2.get_loc("d"))
            res, np.array([False, False, False, True, False, True]))
        # unique element results in scalar
        res = cidx2.get_loc("e")
        assert res == idx2.get_loc("e")
        assert res == 4

        for i in [cidx2, idx2]:
            with pytest.raises(KeyError, match="'NOT-EXIST'"):

        # non-unique, sliceable
        cidx3 = CategoricalIndex(list("aabbb"), categories=list("abc"))
        idx3 = Index(list("aabbb"))

        # results in slice
        res = cidx3.get_loc("a")
        assert res == idx3.get_loc("a")
        assert res == slice(0, 2, None)

        res = cidx3.get_loc("b")
        assert res == idx3.get_loc("b")
        assert res == slice(2, 5, None)

        for i in [cidx3, idx3]:
            with pytest.raises(KeyError, match="'c'"):

    def test_repr_roundtrip(self):
        """Check that evaluating ``repr`` of an ordered index rebuilds it."""

        ci = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True)
        # eval() is applied to the repr of the index built on the line above,
        # not to external input.
        tm.assert_index_equal(eval(repr(ci)), ci, exact=True)

        # formatting

        # long format
        # this is not reprable
        # NOTE(review): this index is built with size=100 but nothing below
        # inspects it -- presumably a formatting call was lost; confirm.
        ci = CategoricalIndex(np.random.randint(0, 5, size=100))

    def test_isin(self):

        ci = CategoricalIndex(list("aabca") + [np.nan],
                              categories=["c", "a", "b"])
            ci.isin(["c"]), np.array([False, False, False, True, False,
        tm.assert_numpy_array_equal(ci.isin(["c", "a", "b"]),
                                    np.array([True] * 5 + [False]))
        tm.assert_numpy_array_equal(ci.isin(["c", "a", "b", np.nan]),
                                    np.array([True] * 6))

        # mismatched categorical -> coerced to ndarray so doesn't matter
        result = ci.isin(ci.set_categories(list("abcdefghi")))
        expected = np.array([True] * 6)
        tm.assert_numpy_array_equal(result, expected)

        result = ci.isin(ci.set_categories(list("defghi")))
        expected = np.array([False] * 5 + [True])
        tm.assert_numpy_array_equal(result, expected)

    def test_identical(self):

        ci1 = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True)
        ci2 = CategoricalIndex(["a", "b"],
                               categories=["a", "b", "c"],
        assert ci1.identical(ci1)
        assert ci1.identical(ci1.copy())
        assert not ci1.identical(ci2)

    def test_ensure_copied_data(self, indices):
        """Check that ``CategoricalIndex(values, copy=...)`` honours ``copy``."""
        # gh-12309: Check the "copy" argument of each
        # Index.__new__ is honored.
        # Must be tested separately from other indexes because
        # self.values is not an ndarray.
        # GH#29918 Index.base has been removed
        # FIXME: is this test still meaningful?

        def _base(ar):
            # Fall back to the object itself when it has no (non-None) base.
            if getattr(ar, "base", None) is None:
                return ar
            return ar.base

        source_values = indices.values

        copied = CategoricalIndex(source_values, copy=True)
        tm.assert_index_equal(indices, copied)
        assert _base(indices.values) is not _base(copied.values)

        shared = CategoricalIndex(source_values, copy=False)
        assert _base(indices.values) is _base(shared.values)

    def test_equals_categorical(self):
        """Exercise ``equals`` and the comparison operators.

        Covers self-comparison, comparison against plain indexes and arrays,
        the errors raised for mismatched lengths, categories or orderedness,
        and ``equals`` with reordered categories and missing values.
        """
        # NOTE(review): the closing ``ordered=True)`` of ``ci2`` was missing
        # (unterminated call); restored to mirror ``ci1`` -- confirm upstream.
        ci1 = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True)
        ci2 = CategoricalIndex(["a", "b"],
                               categories=["a", "b", "c"],
                               ordered=True)

        assert ci1.equals(ci1)
        assert not ci1.equals(ci2)
        assert ci1.equals(ci1.astype(object))
        assert ci1.astype(object).equals(ci1)

        assert (ci1 == ci1).all()
        assert not (ci1 != ci1).all()
        assert not (ci1 > ci1).all()
        assert not (ci1 < ci1).all()
        assert (ci1 <= ci1).all()
        assert (ci1 >= ci1).all()

        assert not (ci1 == 1).all()
        assert (ci1 == Index(["a", "b"])).all()
        assert (ci1 == ci1.values).all()

        # invalid comparisons
        with pytest.raises(ValueError, match="Lengths must match"):
            ci1 == Index(["a", "b", "c"])

        # The three messages are alternatives, so they are joined with "|".
        # Plain concatenation would build one long pattern that none of the
        # raised errors can match.
        msg = (
            "categorical index comparisons must have the same categories"
            " and ordered attributes"
            "|"
            "Categoricals can only be compared if 'categories' are the same. "
            "Categories are different lengths"
            "|"
            "Categoricals can only be compared if 'ordered' is the same")
        with pytest.raises(TypeError, match=msg):
            ci1 == ci2
        with pytest.raises(TypeError, match=msg):
            ci1 == Categorical(ci1.values, ordered=False)
        with pytest.raises(TypeError, match=msg):
            ci1 == Categorical(ci1.values, categories=list("abc"))

        # tests
        # make sure that we are testing for category inclusion properly
        ci = CategoricalIndex(list("aabca"), categories=["c", "a", "b"])
        assert not ci.equals(list("aabca"))
        # Same categories, but different order
        # Unordered
        assert ci.equals(CategoricalIndex(list("aabca")))
        # Ordered
        assert not ci.equals(CategoricalIndex(list("aabca"), ordered=True))
        assert ci.equals(ci.copy())

        ci = CategoricalIndex(list("aabca") + [np.nan],
                              categories=["c", "a", "b"])
        assert not ci.equals(list("aabca"))
        assert not ci.equals(CategoricalIndex(list("aabca")))
        assert ci.equals(ci.copy())

        ci = CategoricalIndex(list("aabca") + [np.nan],
                              categories=["c", "a", "b"])
        assert not ci.equals(list("aabca") + [np.nan])
        assert ci.equals(CategoricalIndex(list("aabca") + [np.nan]))
        assert not ci.equals(
            CategoricalIndex(list("aabca") + [np.nan], ordered=True))
        assert ci.equals(ci.copy())

    def test_equals_categoridcal_unordered(self):
        a = pd.CategoricalIndex(["A"], categories=["A", "B"])
        b = pd.CategoricalIndex(["A"], categories=["B", "A"])
        c = pd.CategoricalIndex(["C"], categories=["B", "A"])
        assert a.equals(b)
        assert not a.equals(c)
        assert not b.equals(c)

    def test_frame_repr(self):
        df = pd.DataFrame({"A": [1, 2, 3]},
                          index=pd.CategoricalIndex(["a", "b", "c"]))
        result = repr(df)
        expected = "   A\na  1\nb  2\nc  3"
        assert result == expected

    def test_string_categorical_index_repr(self):
        """Pin the ``repr`` of string-valued ``CategoricalIndex`` objects.

        ASCII and East Asian values are checked for short, multi-line,
        truncated and many-category indexes, first with default options and
        then with ``display.unicode.east_asian_width`` enabled.
        """
        # NOTE(review): the three "truncated" expectations had lost their
        # elision line ("..."), so the head and tail rows ran together.
        # The line is restored below -- confirm against upstream.

        # short
        idx = pd.CategoricalIndex(["a", "bb", "ccc"])
        expected = """CategoricalIndex(['a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')"""  # noqa
        assert repr(idx) == expected

        # multiple lines
        idx = pd.CategoricalIndex(["a", "bb", "ccc"] * 10)
        expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a',
                  'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb',
                  'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'],
                 categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')"""  # noqa

        assert repr(idx) == expected

        # truncated
        idx = pd.CategoricalIndex(["a", "bb", "ccc"] * 100)
        expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a',
                  ...
                  'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'],
                 categories=['a', 'bb', 'ccc'], ordered=False, dtype='category', length=300)"""  # noqa

        assert repr(idx) == expected

        # larger categories
        idx = pd.CategoricalIndex(list("abcdefghijklmmo"))
        expected = """CategoricalIndex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                  'm', 'm', 'o'],
                 categories=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', ...], ordered=False, dtype='category')"""  # noqa

        assert repr(idx) == expected

        # short
        idx = pd.CategoricalIndex(["あ", "いい", "ううう"])
        expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')"""  # noqa
        assert repr(idx) == expected

        # multiple lines
        idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 10)
        expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ',
                  'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい',
                  'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'],
                 categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')"""  # noqa

        assert repr(idx) == expected

        # truncated
        idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 100)
        expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ',
                  ...
                  'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'],
                 categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)"""  # noqa

        assert repr(idx) == expected

        # larger categories
        idx = pd.CategoricalIndex(list("あいうえおかきくけこさしすせそ"))
        expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し',
                  'す', 'せ', 'そ'],
                 categories=['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', ...], ordered=False, dtype='category')"""  # noqa

        assert repr(idx) == expected

        # Enable Unicode option -----------------------------------------
        with cf.option_context("display.unicode.east_asian_width", True):

            # short
            idx = pd.CategoricalIndex(["あ", "いい", "ううう"])
            expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')"""  # noqa
            assert repr(idx) == expected

            # multiple lines
            idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 10)
            expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい',
                  'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう',
                  'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい',
                  'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'],
                 categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')"""  # noqa

            assert repr(idx) == expected

            # truncated
            idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 100)
            expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい',
                  'ううう', 'あ',
                  ...
                  'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう',
                  'あ', 'いい', 'ううう'],
                 categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)"""  # noqa

            assert repr(idx) == expected

            # larger categories
            idx = pd.CategoricalIndex(list("あいうえおかきくけこさしすせそ"))
            expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ',
                  'さ', 'し', 'す', 'せ', 'そ'],
                 categories=['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', ...], ordered=False, dtype='category')"""  # noqa

            assert repr(idx) == expected

    def test_fillna_categorical(self):
        """GH 11343: ``fillna`` only accepts values that are categories."""
        idx = CategoricalIndex([1.0, np.nan, 3.0, 1.0], name="x")
        # fill by value in categories
        exp = CategoricalIndex([1.0, 1.0, 3.0, 1.0], name="x")
        tm.assert_index_equal(idx.fillna(1.0), exp)

        # fill by value not in categories raises ValueError
        msg = "fill value must be in categories"
        with pytest.raises(ValueError, match=msg):
            # NOTE(review): the body of this ``with`` was missing. 2.0 is not
            # among the categories (1.0, 3.0) -- confirm the upstream value.
            idx.fillna(2.0)

    def test_take_fill_value(self):
        """GH 12631: ``take`` with ``fill_value`` / ``allow_fill``.

        Checked for a numeric and an ordered object-valued index: a plain
        take, a take where -1 is filled with NaN, a take with
        ``allow_fill=False`` where -1 addresses the last element, and the
        errors raised for out-of-range positions.
        """
        # NOTE(review): several calls in this test had lost their trailing
        # keyword arguments (unterminated calls). ``name="xxx"``, the
        # ``categories``/``ordered`` arguments and
        # ``allow_fill=False, fill_value=True`` are reconstructed from the
        # surrounding comments and intact lines -- confirm against upstream.

        # numeric category
        idx = pd.CategoricalIndex([1, 2, 3], name="xxx")
        result = idx.take(np.array([1, 0, -1]))
        expected = pd.CategoricalIndex([2, 1, 3], name="xxx")
        tm.assert_index_equal(result, expected)
        tm.assert_categorical_equal(result.values, expected.values)

        # fill_value
        result = idx.take(np.array([1, 0, -1]), fill_value=True)
        expected = pd.CategoricalIndex([2, 1, np.nan],
                                       categories=[1, 2, 3],
                                       name="xxx")
        tm.assert_index_equal(result, expected)
        tm.assert_categorical_equal(result.values, expected.values)

        # allow_fill=False
        result = idx.take(np.array([1, 0, -1]),
                          allow_fill=False,
                          fill_value=True)
        expected = pd.CategoricalIndex([2, 1, 3], name="xxx")
        tm.assert_index_equal(result, expected)
        tm.assert_categorical_equal(result.values, expected.values)

        # object category
        idx = pd.CategoricalIndex(list("CBA"),
                                  categories=list("ABC"),
                                  ordered=True,
                                  name="xxx")
        result = idx.take(np.array([1, 0, -1]))
        expected = pd.CategoricalIndex(list("BCA"),
                                       categories=list("ABC"),
                                       ordered=True,
                                       name="xxx")
        tm.assert_index_equal(result, expected)
        tm.assert_categorical_equal(result.values, expected.values)

        # fill_value
        result = idx.take(np.array([1, 0, -1]), fill_value=True)
        expected = pd.CategoricalIndex(["B", "C", np.nan],
                                       categories=list("ABC"),
                                       ordered=True,
                                       name="xxx")
        tm.assert_index_equal(result, expected)
        tm.assert_categorical_equal(result.values, expected.values)

        # allow_fill=False
        result = idx.take(np.array([1, 0, -1]),
                          allow_fill=False,
                          fill_value=True)
        expected = pd.CategoricalIndex(list("BCA"),
                                       categories=list("ABC"),
                                       ordered=True,
                                       name="xxx")
        tm.assert_index_equal(result, expected)
        tm.assert_categorical_equal(result.values, expected.values)

        msg = ("When allow_fill=True and fill_value is not None, "
               "all indices must be >= -1")
        with pytest.raises(ValueError, match=msg):
            idx.take(np.array([1, 0, -2]), fill_value=True)
        with pytest.raises(ValueError, match=msg):
            idx.take(np.array([1, 0, -5]), fill_value=True)

        with pytest.raises(IndexError):
            idx.take(np.array([1, -5]))

    def test_take_fill_value_datetime(self):
        """``take`` with ``fill_value`` on datetime categories.

        Same scenarios as the numeric/object variant: a plain take, a take
        where -1 is filled with NaT, ``allow_fill=False``, and the errors
        raised for out-of-range positions.
        """
        # NOTE(review): the ``DatetimeIndex``/``take`` calls had lost their
        # trailing arguments (unterminated calls). ``name="xxx"`` and
        # ``allow_fill=False, fill_value=True`` are reconstructed from the
        # comments and the sibling take test -- confirm against upstream.

        # datetime category
        idx = pd.DatetimeIndex(["2011-01-01", "2011-02-01", "2011-03-01"],
                               name="xxx")
        idx = pd.CategoricalIndex(idx)
        result = idx.take(np.array([1, 0, -1]))
        expected = pd.DatetimeIndex(["2011-02-01", "2011-01-01", "2011-03-01"],
                                    name="xxx")
        expected = pd.CategoricalIndex(expected)
        tm.assert_index_equal(result, expected)

        # fill_value
        result = idx.take(np.array([1, 0, -1]), fill_value=True)
        expected = pd.DatetimeIndex(["2011-02-01", "2011-01-01", "NaT"],
                                    name="xxx")
        exp_cats = pd.DatetimeIndex(["2011-01-01", "2011-02-01", "2011-03-01"])
        expected = pd.CategoricalIndex(expected, categories=exp_cats)
        tm.assert_index_equal(result, expected)

        # allow_fill=False
        result = idx.take(np.array([1, 0, -1]),
                          allow_fill=False,
                          fill_value=True)
        expected = pd.DatetimeIndex(["2011-02-01", "2011-01-01", "2011-03-01"],
                                    name="xxx")
        expected = pd.CategoricalIndex(expected)
        tm.assert_index_equal(result, expected)

        msg = ("When allow_fill=True and fill_value is not None, "
               "all indices must be >= -1")
        with pytest.raises(ValueError, match=msg):
            idx.take(np.array([1, 0, -2]), fill_value=True)
        with pytest.raises(ValueError, match=msg):
            idx.take(np.array([1, 0, -5]), fill_value=True)

        with pytest.raises(IndexError):
            idx.take(np.array([1, -5]))

    def test_take_invalid_kwargs(self):
        """``take`` rejects unknown keywords and the numpy-only ones."""
        cat_idx = pd.CategoricalIndex([1, 2, 3], name="foo")
        positions = [1, 0, -1]

        # (expected exception, message pattern, offending keyword arguments)
        cases = [
            (TypeError,
             r"take\(\) got an unexpected keyword argument 'foo'",
             {"foo": 2}),
            (ValueError,
             "the 'out' parameter is not supported",
             {"out": positions}),
            (ValueError,
             "the 'mode' parameter is not supported",
             {"mode": "clip"}),
        ]
        for exc_type, pattern, bad_kwargs in cases:
            with pytest.raises(exc_type, match=pattern):
                cat_idx.take(positions, **bad_kwargs)

        "dtype, engine_type",
            (np.int8, libindex.Int8Engine),
            (np.int16, libindex.Int16Engine),
            (np.int32, libindex.Int32Engine),
            (np.int64, libindex.Int64Engine),
    def test_engine_type(self, dtype, engine_type):
        if dtype != np.int64:
            # num. of uniques required to push to a
            # dtype (128 categories required for .codes dtype to be int16 etc.)
            num_uniques = {np.int8: 1, np.int16: 128, np.int32: 32768}[dtype]
            ci = pd.CategoricalIndex(range(num_uniques))
            # having 2**32 - 2**31 categories would be very memory-intensive,
            # so we cheat a bit with the dtype
            ci = pd.CategoricalIndex(range(32768))  # == 2**16 - 2**(16 - 1)
            ci.values._codes = ci.values._codes.astype("int64")
        assert np.issubdtype(, dtype)
        assert isinstance(ci._engine, engine_type)
# Example #26
class TestSeriesConstructors:
            # NOTE: some overlap with test_constructor_empty but that test does not
            # test for None or an empty generator.
            # test_constructor_pass_none tests None but only with the index also
            # passed.
            (lambda: Series(), True),
            (lambda: Series(None), True),
            (lambda: Series({}), True),
            (lambda: Series(()), False),  # creates a RangeIndex
            (lambda: Series([]), False),  # creates a RangeIndex
            (lambda: Series((x for x in [])), False),  # creates a RangeIndex
            (lambda: Series(data=None), True),
            (lambda: Series(data={}), True),
            (lambda: Series(data=()), False),  # creates a RangeIndex
            (lambda: Series(data=[]), False),  # creates a RangeIndex
            (lambda: Series(data=(x for x in [])),
             False),  # creates a RangeIndex
    def test_empty_constructor(self, constructor, check_index_type):
        expected = Series()
        result = constructor()
        assert len(result.index) == 0

    def test_invalid_dtype(self):
        """GH15520: dtypes that cannot be interpreted raise ``TypeError``."""
        pattern = "not understood"
        for bad_dtype in (pd.Timestamp, "pd.Timestamp", list):
            with pytest.raises(TypeError, match=pattern):
                Series([], name="time", dtype=bad_dtype)

    def test_scalar_conversion(self):
        """A scalar input yields a Series; one-element Series coerce."""
        # Pass in scalar is disabled
        from_scalar = Series(0.5)
        assert not isinstance(from_scalar, float)

        # Coercion
        for caster, wanted in ((float, 1.0), (int, 1)):
            assert caster(Series([1.0])) == wanted

    def test_constructor(self, datetime_series):
        """Basic constructor behaviour.

        Checks construction from another Series (the index is shared),
        mixed-type data, the empty Series, rejection of 2-D data, name
        propagation, and rejection of a ``MultiIndex`` passed as data.

        Parameters
        ----------
        datetime_series : Series
            Fixture; the assertions expect its index to hold dates.
        """
        empty_series = Series()

        assert datetime_series.index.is_all_dates

        # Pass in Series
        derived = Series(datetime_series)
        assert derived.index.is_all_dates

        assert tm.equalContents(derived.index, datetime_series.index)
        # Ensure new index is not created
        assert id(datetime_series.index) == id(derived.index)

        # Mixed type Series
        mixed = Series(["hello", np.NaN], index=[0, 1])
        assert mixed.dtype == np.object_
        assert mixed[1] is np.NaN

        assert not empty_series.index.is_all_dates
        assert not Series().index.is_all_dates

        # exception raised is of type Exception
        with pytest.raises(Exception, match="Data must be 1-dimensional"):
            Series(np.random.randn(3, 3), index=np.arange(3))

        # NOTE(review): this assignment had been fused onto the call above
        # with its target missing. ``mixed.name`` is inferred from the
        # assertion that follows -- confirm against upstream.
        mixed.name = "Series"
        rs = Series(mixed).name
        xp = "Series"
        assert rs == xp

        # raise on MultiIndex GH4187
        m = MultiIndex.from_arrays([[1, 2], [3, 4]])
        msg = "initializing a Series from a MultiIndex is not supported"
        with pytest.raises(NotImplementedError, match=msg):
            # NOTE(review): the body of this ``with`` was missing. Passing
            # ``m`` as data is inferred from the message -- confirm upstream.
            Series(m)

    @pytest.mark.parametrize("input_class", [list, dict, OrderedDict])
    def test_constructor_empty(self, input_class):
        """An empty container builds the same Series as passing no data."""
        # these are Index() and RangeIndex() which don't compare type equal
        # but are just .equals
        no_data = Series()
        from_container = Series(input_class())
        assert_series_equal(no_data, from_container, check_index_type=False)

        # With explicit dtype (GH 18515 : with dtype=category as well):
        for dtype in ("float64", "category"):
            no_data = Series(dtype=dtype)
            from_container = Series(input_class(), dtype=dtype)
            assert_series_equal(no_data,
                                from_container,
                                check_index_type=False)

        if input_class is list:
            return

        labels = range(10)

        # With index:
        assert_series_equal(Series(index=labels),
                            Series(input_class(), index=labels))

        # With index and dtype float64:
        assert_series_equal(
            Series(np.nan, index=labels),
            Series(input_class(), index=labels, dtype="float64"))

        # GH 19853 : with empty string, index and dtype str
        assert_series_equal(Series("", dtype=str, index=range(3)),
                            Series("", index=range(3)))

    @pytest.mark.parametrize("input_arg", [np.nan, float("nan")])
    def test_constructor_nan(self, input_arg):
        """Broadcasting a NaN scalar matches a data-less float64 Series."""
        labels = range(10)
        all_missing = Series(dtype="float64", index=labels)
        broadcast = Series(input_arg, index=labels)

        assert_series_equal(all_missing, broadcast, check_index_type=False)

            "f8", "i8", "M8[ns]", "m8[ns]", "category", "object",
            "datetime64[ns, UTC]"
    @pytest.mark.parametrize("index", [None, pd.Index([])])
    def test_constructor_dtype_only(self, dtype, index):
        """GH-20865: a dtype alone yields an empty Series of that dtype."""
        empty = pd.Series(dtype=dtype, index=index)
        assert empty.dtype == dtype
        assert len(empty) == 0

    def test_constructor_no_data_index_order(self):
        result = pd.Series(index=["b", "a", "c"])
        assert result.index.tolist() == ["b", "a", "c"]

    def test_constructor_no_data_string_type(self):
        # GH 22477
        result = pd.Series(index=[1], dtype=str)
        assert np.isnan(result.iloc[0])

    @pytest.mark.parametrize("item", ["entry", "ѐ", 13])
    def test_constructor_string_element_string_type(self, item):
        """GH 22477: a scalar is stored as its ``str`` form for ``dtype=str``."""
        ser = pd.Series(item, index=[1], dtype=str)
        stored = ser.iloc[0]
        assert stored == str(item)

    def test_constructor_dtype_str_na_values(self, string_dtype):
        """Missing values survive construction with a string dtype."""
        with_none = Series(["x", None], dtype=string_dtype)
        tm.assert_series_equal(with_none.isna(), Series([False, True]))
        assert with_none.iloc[1] is None

        with_nan = Series(["x", np.nan], dtype=string_dtype)
        assert np.isnan(with_nan.iloc[1])

    def test_constructor_series(self):
        index1 = ["d", "b", "a", "c"]
        index2 = sorted(index1)
        s1 = Series([4, 7, -5, 3], index=index1)
        s2 = Series(s1, index=index2)

        assert_series_equal(s2, s1.sort_index())

    def test_constructor_iterable(self):
        # GH 21987
        class Iter:
            def __iter__(self):
                for i in range(10):
                    yield i

        expected = Series(list(range(10)), dtype="int64")
        result = Series(Iter(), dtype="int64")
        assert_series_equal(result, expected)

    def test_constructor_sequence(self):
        # GH 21987
        expected = Series(list(range(10)), dtype="int64")
        result = Series(range(10), dtype="int64")
        assert_series_equal(result, expected)

    def test_constructor_single_str(self):
        # GH 21987
        expected = Series(["abc"])
        result = Series("abc")
        assert_series_equal(result, expected)

    def test_constructor_list_like(self):

        # make sure that we are coercing different
        # list-likes to standard dtypes and not
        # platform specific
        expected = Series([1, 2, 3], dtype="int64")
        for obj in [[1, 2, 3], (1, 2, 3), np.array([1, 2, 3], dtype="int64")]:
            result = Series(obj, index=[0, 1, 2])
            assert_series_equal(result, expected)

    @pytest.mark.parametrize("dtype", ["bool", "int32", "int64", "float64"])
    def test_constructor_index_dtype(self, dtype):
        """GH 17088: the requested dtype is applied when data is an Index."""
        source = Index([0, 2, 4])
        ser = Series(source, dtype=dtype)
        assert ser.dtype == dtype

            ([1, 2]),
            (["1", "2"]),
            (list(pd.date_range("1/1/2011", periods=2, freq="H"))),
                pd.date_range("1/1/2011", periods=2, freq="H",
            ([pd.Interval(left=0, right=5)]),
    def test_constructor_list_str(self, input_vals, string_dtype):
        # GH 16605
        # Ensure that data elements from a list are converted to strings
        # when dtype is str, 'str', or 'U'
        result = Series(input_vals, dtype=string_dtype)
        expected = Series(input_vals).astype(string_dtype)
        assert_series_equal(result, expected)

    def test_constructor_list_str_na(self, string_dtype):
        """Floats become strings under a string dtype while NaN stays NaN."""
        converted = Series([1.0, 2.0, np.nan], dtype=string_dtype)
        wanted = Series(["1.0", "2.0", np.nan], dtype=object)
        assert_series_equal(converted, wanted)
        assert np.isnan(converted[2])

    def test_constructor_generator(self):
        gen = (i for i in range(10))

        result = Series(gen)
        exp = Series(range(10))
        assert_series_equal(result, exp)

        gen = (i for i in range(10))
        result = Series(gen, index=range(10, 20))
        exp.index = range(10, 20)
        assert_series_equal(result, exp)

    def test_constructor_map(self):
        # GH8909
        m = map(lambda x: x, range(10))

        result = Series(m)
        exp = Series(range(10))
        assert_series_equal(result, exp)

        m = map(lambda x: x, range(10))
        result = Series(m, index=range(10, 20))
        exp.index = range(10, 20)
        assert_series_equal(result, exp)

    def test_constructor_categorical(self):
        """A Series built from a Categorical keeps it, or casts on request."""
        # NOTE(review): the original ``pd.Categorical(...)`` call was cut off
        # after the categories argument (unterminated call). The integers
        # are presumably codes into ["a", "b", "c"], so the public
        # ``from_codes`` constructor is used here. Upstream may have passed
        # ``fastpath=True`` to the constructor instead -- confirm.
        cat = pd.Categorical.from_codes([0, 1, 2, 0, 1, 2], ["a", "b", "c"])
        res = Series(cat)
        tm.assert_categorical_equal(res.values, cat)

        # can cast to a new dtype
        result = Series(pd.Categorical([1, 2, 3]), dtype="int64")
        expected = pd.Series([1, 2, 3], dtype="int64")
        tm.assert_series_equal(result, expected)

        # GH12574
        cat = Series(pd.Categorical([1, 2, 3]), dtype="category")
        assert is_categorical_dtype(cat)
        assert is_categorical_dtype(cat.dtype)
        s = Series([1, 2, 3], dtype="category")
        assert is_categorical_dtype(s)
        assert is_categorical_dtype(s.dtype)

    def test_constructor_categorical_with_coercion(self):
        factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])
        # test basic creation / coercion of categoricals
        s = Series(factor, name="A")
        assert s.dtype == "category"
        assert len(s) == len(factor)

        # in a frame
        df = DataFrame({"A": factor})
        result = df["A"]
        tm.assert_series_equal(result, s)
        result = df.iloc[:, 0]
        tm.assert_series_equal(result, s)
        assert len(df) == len(factor)

        df = DataFrame({"A": s})
        result = df["A"]
        tm.assert_series_equal(result, s)
        assert len(df) == len(factor)

        # multiples
        df = DataFrame({"A": s, "B": s, "C": 1})
        result1 = df["A"]
        result2 = df["B"]
        tm.assert_series_equal(result1, s)
        tm.assert_series_equal(result2, s, check_names=False)
        assert == "B"
        assert len(df) == len(factor)

        # GH8623
        x = DataFrame(
            [[1, "John P. Doe"], [2, "Jane Dove"], [1, "John P. Doe"]],
            columns=["person_id", "person_name"],
        x["person_name"] = Categorical(
            x.person_name)  # doing this breaks transform

        expected = x.iloc[0].person_name
        result = x.person_name.iloc[0]
        assert result == expected

        result = x.person_name[0]
        assert result == expected

        result = x.person_name.loc[0]
        assert result == expected

    def test_constructor_categorical_dtype(self):
        """A ``CategoricalDtype`` passed as ``dtype`` is applied as given.

        Checks categories and orderedness of the result, and (GH 19565)
        broadcasting of a scalar with a categorical dtype.
        """
        # NOTE(review): this test had lost the ``result.cat.*`` accessors and
        # the end of the first call (``ordered=True))``). They are
        # reconstructed from the expected values; the ``assert
        # result.cat.ordered`` line is assumed -- confirm against upstream.
        result = pd.Series(["a", "b"],
                           dtype=CategoricalDtype(["a", "b", "c"],
                                                  ordered=True))
        assert is_categorical_dtype(result) is True
        tm.assert_index_equal(result.cat.categories,
                              pd.Index(["a", "b", "c"]))
        assert result.cat.ordered

        result = pd.Series(["a", "b"], dtype=CategoricalDtype(["b", "a"]))
        assert is_categorical_dtype(result)
        tm.assert_index_equal(result.cat.categories, pd.Index(["b", "a"]))
        assert result.cat.ordered is False

        # GH 19565 - Check broadcasting of scalar with Categorical dtype
        result = Series("a",
                        index=[0, 1],
                        dtype=CategoricalDtype(["a", "b"], ordered=True))
        expected = Series(["a", "a"],
                          index=[0, 1],
                          dtype=CategoricalDtype(["a", "b"], ordered=True))
        tm.assert_series_equal(result, expected, check_categorical=True)

    def test_constructor_categorical_string(self):
        # GH 26336: the string 'category' maintains existing CategoricalDtype
        cdt = CategoricalDtype(categories=list("dabc"), ordered=True)
        expected = Series(list("abcabc"), dtype=cdt)

        # Series(Categorical, dtype='category') keeps existing dtype
        cat = Categorical(list("abcabc"), dtype=cdt)
        result = Series(cat, dtype="category")
        tm.assert_series_equal(result, expected)

        # Series(Series[Categorical], dtype='category') keeps existing dtype
        result = Series(result, dtype="category")
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("none, warning",
                             [(None, None), (ordered_sentinel, FutureWarning)])
    def test_categorical_ordered_none_deprecated(self, none, warning):
        """GH 26336: only warn if None is not explicitly passed."""
        source_dtype = CategoricalDtype(categories=list("cdab"), ordered=True)
        target_dtype = CategoricalDtype(categories=list("cedafb"),
                                        ordered=none)
        cat = Categorical(list("abcdaba"), dtype=source_dtype)

        # Input given as the raw Categorical ...
        with tm.assert_produces_warning(warning, check_stacklevel=False):
            Series(cat, dtype=target_dtype)

        # ... and as a Series wrapping it.
        wrapped = Series(cat)
        with tm.assert_produces_warning(warning, check_stacklevel=False):
            Series(wrapped, dtype=target_dtype)

    def test_categorical_sideeffects_free(self):
        """Changes propagate between Series and Categorical only without copy.

        Passing a categorical to a Series and then changing values in either
        the series or the categorical should not change the values in the
        other one, IF you specify copy!
        """
        # NOTE(review): in both halves the identity assertion and the
        # following category assignment had been fused into one garbled
        # line. ``s.cat is not cat`` and ``s.cat.categories = [1, 2, 3]``
        # are reconstructed from the expected arrays -- confirm upstream.
        cat = Categorical(["a", "b", "c", "a"])
        s = Series(cat, copy=True)
        assert s.cat is not cat
        s.cat.categories = [1, 2, 3]
        exp_s = np.array([1, 2, 3, 1], dtype=np.int64)
        exp_cat = np.array(["a", "b", "c", "a"], dtype=np.object_)
        tm.assert_numpy_array_equal(s.__array__(), exp_s)
        tm.assert_numpy_array_equal(cat.__array__(), exp_cat)

        # setting
        s[0] = 2
        exp_s2 = np.array([2, 2, 3, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(s.__array__(), exp_s2)
        tm.assert_numpy_array_equal(cat.__array__(), exp_cat)

        # however, copy is False by default
        # so this WILL change values
        cat = Categorical(["a", "b", "c", "a"])
        s = Series(cat)
        assert s.values is cat
        s.cat.categories = [1, 2, 3]
        exp_s = np.array([1, 2, 3, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(s.__array__(), exp_s)
        tm.assert_numpy_array_equal(cat.__array__(), exp_s)

        s[0] = 2
        exp_s2 = np.array([2, 2, 3, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(s.__array__(), exp_s2)
        tm.assert_numpy_array_equal(cat.__array__(), exp_s2)

    def test_unordered_compare_equal(self):
        left = pd.Series(["a", "b", "c"], dtype=CategoricalDtype(["a", "b"]))
        right = pd.Series(
            pd.Categorical(["a", "b", np.nan], categories=["a", "b"]))
        tm.assert_series_equal(left, right)

    def test_constructor_maskedarray(self):
        """Build Series from numpy masked arrays of several dtypes.

        For float, int, bool and datetime64[ns] data the array is checked
        fully masked, partly masked, and fully unmasked.
        """
        # float
        data = ma.masked_all((3, ), dtype=float)
        result = Series(data)
        expected = Series([nan, nan, nan])
        assert_series_equal(result, expected)

        data[0] = 0.0
        data[2] = 2.0
        index = ["a", "b", "c"]
        result = Series(data, index=index)
        expected = Series([0.0, nan, 2.0], index=index)
        assert_series_equal(result, expected)

        data[1] = 1.0
        result = Series(data, index=index)
        expected = Series([0.0, 1.0, 2.0], index=index)
        assert_series_equal(result, expected)

        # int
        data = ma.masked_all((3, ), dtype=int)
        result = Series(data)
        expected = Series([nan, nan, nan], dtype=float)
        assert_series_equal(result, expected)

        data[0] = 0
        data[2] = 2
        index = ["a", "b", "c"]
        result = Series(data, index=index)
        expected = Series([0, nan, 2], index=index, dtype=float)
        assert_series_equal(result, expected)

        data[1] = 1
        result = Series(data, index=index)
        expected = Series([0, 1, 2], index=index, dtype=int)
        assert_series_equal(result, expected)

        # bool
        data = ma.masked_all((3, ), dtype=bool)
        result = Series(data)
        expected = Series([nan, nan, nan], dtype=object)
        assert_series_equal(result, expected)

        data[0] = True
        data[2] = False
        index = ["a", "b", "c"]
        result = Series(data, index=index)
        expected = Series([True, nan, False], index=index, dtype=object)
        assert_series_equal(result, expected)

        data[1] = True
        result = Series(data, index=index)
        expected = Series([True, True, False], index=index, dtype=bool)
        assert_series_equal(result, expected)

        # datetime64[ns]
        data = ma.masked_all((3, ), dtype="M8[ns]")
        result = Series(data)
        expected = Series([iNaT, iNaT, iNaT], dtype="M8[ns]")
        assert_series_equal(result, expected)

        # NOTE(review): the two ``Series(`` calls below were unterminated.
        # ``index=index, dtype="M8[ns]"`` is reconstructed to match the
        # ``result`` they are compared with -- confirm against upstream.
        data[0] = datetime(2001, 1, 1)
        data[2] = datetime(2001, 1, 3)
        index = ["a", "b", "c"]
        result = Series(data, index=index)
        expected = Series(
            [datetime(2001, 1, 1), iNaT,
             datetime(2001, 1, 3)],
            index=index,
            dtype="M8[ns]",
        )
        assert_series_equal(result, expected)

        data[1] = datetime(2001, 1, 2)
        result = Series(data, index=index)
        expected = Series(
            [datetime(2001, 1, 1),
             datetime(2001, 1, 2),
             datetime(2001, 1, 3)],
            index=index,
            dtype="M8[ns]",
        )
        assert_series_equal(result, expected)

    def test_constructor_maskedarray_hardened(self):
        # Check numpy masked arrays with hard masks -- from GH24574
        data = ma.masked_all((3, ), dtype=float).harden_mask()
        result = pd.Series(data)
        expected = pd.Series([nan, nan, nan])
        tm.assert_series_equal(result, expected)

    def test_series_ctor_plus_datetimeindex(self):
        """The index object passed with dict data is reused as-is."""
        rng = date_range("20090415", "20090519", freq="B")
        data = dict.fromkeys(rng, 1)

        ser = Series(data, index=rng)
        assert ser.index is rng

    def test_constructor_default_index(self):
        s = Series([0, 1, 2])
        tm.assert_index_equal(s.index, pd.Index(np.arange(3)))

            [1, 2, 3],
            (1, 2, 3),
            pd.Categorical(["a", "b", "a"]),
            (i for i in range(3)),
            map(lambda x: x, range(3)),
    def test_constructor_index_mismatch(self, input):
        """GH 19342: data and index of different lengths raise an error."""
        too_long = np.arange(4)
        pattern = "Length of passed values is 3, index implies 4"
        with pytest.raises(ValueError, match=pattern):
            Series(input, index=too_long)

    def test_constructor_numpy_scalar(self):
        # GH 19342
        # construction with a numpy scalar
        # should not raise
        result = Series(np.array(100), index=np.arange(4), dtype="int64")
        expected = Series(100, index=np.arange(4), dtype="int64")
        tm.assert_series_equal(result, expected)

    def test_constructor_broadcast_list(self):
        """A one-element list is not broadcast over a longer index."""
        # GH 19342
        # construction with single-element container and index
        # should raise
        labels = ["a", "b", "c"]
        expected_error = "Length of passed values is 1, index implies 3"
        with pytest.raises(ValueError, match=expected_error):
            Series(["foo"], index=labels)

    def test_constructor_corner(self):
        """A list of DataFrames is accepted as Series values."""
        frame = tm.makeTimeDataFrame()
        ser = Series([frame, frame], index=[0, 1])
        assert isinstance(ser, Series)

    def test_constructor_sanitize(self):
        """Check the resulting dtype when float data is requested as "i8"."""
        # whole-number floats: the requested integer dtype is honoured
        whole = np.array([1.0, 1.0, 8.0])
        assert Series(whole, dtype="i8").dtype == np.dtype("i8")

        # data containing NaN: the result is expected to stay float64
        with_nan = np.array([1.0, 1.0, np.nan])
        converted = Series(with_nan, copy=True, dtype="i8")
        assert converted.dtype == np.dtype("f8")

    def test_constructor_copy(self):
        # GH15125
        # test dtype parameter has no side effects on copy=True
        for data in [[1.0], np.array([1.0])]:
            x = Series(data)
            y = pd.Series(x, copy=True, dtype=float)

            # copy=True maintains original data in Series
            tm.assert_series_equal(x, y)

            # changes to origin of copy does not affect the copy
            x[0] = 2.0
            assert not x.equals(y)
            assert x[0] == 2.0
            assert y[0] == 1.0

            pd.date_range("20170101", periods=3, tz="US/Eastern"),
            pd.date_range("20170101", periods=3),
            pd.timedelta_range("1 day", periods=3),
            pd.period_range("2012Q1", periods=3, freq="Q"),
            pd.Int64Index([1, 2, 3]),
            pd.RangeIndex(0, 3),
        ids=lambda x: type(x).__name__,
    def test_constructor_limit_copies(self, index):
        # GH 17449
        # limit copies of input
        s = pd.Series(index)

        # we make 1 copy; this is just a smoke test here
        assert s._data.blocks[0].values is not index

    def test_constructor_pass_none(self):
        """None data gives float64 by default and object when requested."""
        default_dtype = Series(None, index=range(5)).dtype
        assert default_dtype == np.float64

        object_dtype = Series(None, index=range(5), dtype=object).dtype
        assert object_dtype == np.object_

        # GH 7431
        # inference on the index
        from_ndarray = Series(index=np.array([None]))
        from_index = Series(index=Index([None]))
        assert_series_equal(from_ndarray, from_index)

    def test_constructor_pass_nan_nat(self):
        """All-NaN input infers float64; any NaT in the mix infers datetime64."""
        # GH 13467
        exp = Series([np.nan, np.nan], dtype=np.float64)
        assert exp.dtype == np.float64
        all_nan = [np.nan, np.nan]
        tm.assert_series_equal(Series(all_nan), exp)
        tm.assert_series_equal(Series(np.array(all_nan)), exp)

        exp = Series([pd.NaT, pd.NaT])
        assert exp.dtype == "datetime64[ns]"
        # each combination is checked both as a list and as an ndarray
        nat_mixes = ([pd.NaT, pd.NaT], [pd.NaT, np.nan], [np.nan, pd.NaT])
        for values in nat_mixes:
            tm.assert_series_equal(Series(values), exp)
            tm.assert_series_equal(Series(np.array(values)), exp)

    def test_constructor_cast(self):
        """Non-numeric strings cannot be constructed with dtype=float."""
        letters = ["a", "b", "c"]
        expected_error = "could not convert string to float"
        with pytest.raises(ValueError, match=expected_error):
            Series(letters, dtype=float)

    def test_constructor_unsigned_dtype_overflow(self, uint_dtype):
        """A negative value with an unsigned dtype raises OverflowError."""
        # see gh-15832
        expected_error = "Trying to coerce negative values to unsigned integers"
        negative = [-1]
        with pytest.raises(OverflowError, match=expected_error):
            Series(negative, dtype=uint_dtype)

    def test_constructor_coerce_float_fail(self, any_int_dtype):
        """A fractional float with an integer dtype raises ValueError."""
        # see gh-15832
        expected_error = "Trying to coerce float values to integers"
        fractional = [1, 2, 3.5]
        with pytest.raises(ValueError, match=expected_error):
            Series(fractional, dtype=any_int_dtype)

    def test_constructor_coerce_float_valid(self, float_dtype):
        """Constructing with a float dtype matches constructing then astype."""
        built_with_dtype = Series([1, 2, 3.5], dtype=float_dtype)
        cast_afterwards = Series([1, 2, 3.5]).astype(float_dtype)
        assert_series_equal(built_with_dtype, cast_afterwards)

    def test_constructor_dtype_no_cast(self):
        """Writes to a Series rebuilt with dtype=np.int64 show up in the source."""
        # see gh-1572
        source = Series([1, 2, 3])
        rebuilt = Series(source, dtype=np.int64)

        rebuilt[1] = 5
        assert source[1] == 5

    def test_constructor_datelike_coercion(self):

        # GH 9477
        # incorrectly inferring on dateimelike looking when object dtype is
        # specified
        s = Series([Timestamp("20130101"), "NOV"], dtype=object)
        assert s.iloc[0] == Timestamp("20130101")
        assert s.iloc[1] == "NOV"
        assert s.dtype == object

        # the dtype was being reset on the slicing and re-inferred to datetime
        # even thought the blocks are mixed
        belly = "216 3T19".split()
        wing1 = "2T15 4H19".split()
        wing2 = "416 4T20".split()
        mat = pd.to_datetime("2016-01-22 2019-09-07".split())
        df = pd.DataFrame({
            "wing1": wing1,
            "wing2": wing2,
            "mat": mat

        result = df.loc["3T19"]
        assert result.dtype == object
        result = df.loc["216"]
        assert result.dtype == object

    def test_constructor_datetimes_with_nulls(self):
        """Object arrays of None plus one datetime infer datetime64[ns]."""
        # gh-15869
        # NOTE(review): the datetime elements and the closing bracket were
        # missing from this copy; datetime.now() is restored as the non-null
        # value -- confirm against upstream.
        for arr in [
                np.array([None, None, None, None,
                          datetime.now(), None]),
                np.array([None, None, datetime.now(), None]),
        ]:
            result = Series(arr)
            assert result.dtype == "M8[ns]"

    def test_constructor_dtype_datetime64(self):
        """Exercise datetime64 construction: nulls, units, casting, inference.

        NOTE(review): several closing brackets, dtype arguments and one-line
        statements were missing from this copy and are reconstructed below;
        each restored spot is marked -- confirm against upstream.
        """

        s = Series(iNaT, dtype="M8[ns]", index=range(5))
        assert isna(s).all()

        # in theory this should be all nulls, but since
        # we are not specifying a dtype is ambiguous
        s = Series(iNaT, index=range(5))
        assert not isna(s).all()

        s = Series(nan, dtype="M8[ns]", index=range(5))
        assert isna(s).all()

        s = Series([datetime(2001, 1, 2, 0, 0), iNaT], dtype="M8[ns]")
        assert isna(s[1])
        assert s.dtype == "M8[ns]"

        s = Series([datetime(2001, 1, 2, 0, 0), nan], dtype="M8[ns]")
        assert isna(s[1])
        assert s.dtype == "M8[ns]"

        # GH3416
        dates = [
            np.datetime64(datetime(2013, 1, 1)),
            np.datetime64(datetime(2013, 1, 2)),
            np.datetime64(datetime(2013, 1, 3)),
        ]  # restored: closing bracket

        s = Series(dates)
        assert s.dtype == "M8[ns]"

        s.iloc[0] = np.nan
        assert s.dtype == "M8[ns]"

        # GH3414 related
        expected = Series(
            [datetime(2013, 1, 1),
             datetime(2013, 1, 2),
             datetime(2013, 1, 3)],
            dtype="datetime64[ns]")  # restored: dtype argument

        # nanoseconds / 1e6 are milliseconds, hence the M8[ms] dtype
        result = Series(Series(dates).astype(np.int64) / 1000000,
                        dtype="M8[ms]")  # restored: dtype argument
        tm.assert_series_equal(result, expected)

        result = Series(dates, dtype="datetime64[ns]")
        tm.assert_series_equal(result, expected)

        expected = Series(
            [pd.NaT, datetime(2013, 1, 2),
             datetime(2013, 1, 3)],
            dtype="datetime64[ns]")  # restored: dtype argument
        result = Series([np.nan] + dates[1:], dtype="datetime64[ns]")
        tm.assert_series_equal(result, expected)

        dts = Series(dates, dtype="datetime64[ns]")

        # valid astype
        dts.astype("int64")  # restored statement

        # invalid casting
        msg = r"cannot astype a datetimelike from \[datetime64\[ns\]\] to" r" \[int32\]"
        with pytest.raises(TypeError, match=msg):
            dts.astype("int32")  # restored statement

        # ints are ok
        # we test with np.int64 to get similar results on
        # windows / 32-bit platforms
        result = Series(dts, dtype=np.int64)
        expected = Series(dts.astype(np.int64))
        tm.assert_series_equal(result, expected)

        # invalid dates can be held as object
        result = Series([datetime(2, 1, 1)])
        assert result[0] == datetime(2, 1, 1, 0, 0)

        result = Series([datetime(3000, 1, 1)])
        assert result[0] == datetime(3000, 1, 1, 0, 0)

        # don't mix types
        result = Series([Timestamp("20130101"), 1], index=["a", "b"])
        assert result["a"] == Timestamp("20130101")
        assert result["b"] == 1

        # GH6529
        # coerce datetime64 non-ns properly
        dates = date_range("01-Jan-2015", "01-Dec-2015", freq="M")
        values2 = dates.view(np.ndarray).astype("datetime64[ns]")
        expected = Series(values2, index=dates)

        for dtype in ["s", "D", "ms", "us", "ns"]:
            values1 = dates.view(np.ndarray).astype("M8[{0}]".format(dtype))
            result = Series(values1, dates)
            assert_series_equal(result, expected)

        # GH 13876
        # coerce to non-ns to object properly
        expected = Series(values2, index=dates, dtype=object)
        for dtype in ["s", "D", "ms", "us", "ns"]:
            values1 = dates.view(np.ndarray).astype("M8[{0}]".format(dtype))
            result = Series(values1, index=dates, dtype=object)
            assert_series_equal(result, expected)

        # leave alone
        # restored: the d.date() element and the dtype=object argument
        dates2 = np.array([d.date() for d in dates.to_pydatetime()],
                          dtype=object)
        series1 = Series(dates2, dates)
        tm.assert_numpy_array_equal(series1.values, dates2)
        assert series1.dtype == object

        # these will correctly infer a datetime
        s = Series([None, pd.NaT, "2013-08-05 15:30:00.000001"])
        assert s.dtype == "datetime64[ns]"
        s = Series([np.nan, pd.NaT, "2013-08-05 15:30:00.000001"])
        assert s.dtype == "datetime64[ns]"
        s = Series([pd.NaT, None, "2013-08-05 15:30:00.000001"])
        assert s.dtype == "datetime64[ns]"
        s = Series([pd.NaT, np.nan, "2013-08-05 15:30:00.000001"])
        assert s.dtype == "datetime64[ns]"

        # tz-aware (UTC and other tz's)
        # GH 8411
        dr = date_range("20130101", periods=3)
        assert Series(dr).iloc[0].tz is None
        dr = date_range("20130101", periods=3, tz="UTC")
        assert str(Series(dr).iloc[0].tz) == "UTC"
        dr = date_range("20130101", periods=3, tz="US/Eastern")
        assert str(Series(dr).iloc[0].tz) == "US/Eastern"

        # non-convertible
        s = Series([1479596223000, -1479590, pd.NaT])
        assert s.dtype == "object"
        assert s[2] is pd.NaT
        assert "NaT" in str(s)

        # if we passed a NaT it remains
        s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), pd.NaT])
        assert s.dtype == "object"
        assert s[2] is pd.NaT
        assert "NaT" in str(s)

        # if we passed a nan it remains
        s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), np.nan])
        assert s.dtype == "object"
        assert s[2] is np.nan
        assert "NaN" in str(s)

    def test_constructor_with_datetime_tz(self):
        """Check construction, export, indexing, repr and inference with tz.

        NOTE(review): several truncated expressions in this copy are
        reconstructed below; each restored spot is marked -- confirm
        against upstream.
        """

        # 8260
        # support datetime64 with tz

        dr = date_range("20130101", periods=3, tz="US/Eastern")
        s = Series(dr)
        # restored: left-hand side of the comparison
        assert s.dtype.name == "datetime64[ns, US/Eastern]"
        assert s.dtype == "datetime64[ns, US/Eastern]"
        assert is_datetime64tz_dtype(s.dtype)
        assert "datetime64[ns, US/Eastern]" in str(s)

        # export
        result = s.values
        assert isinstance(result, np.ndarray)
        assert result.dtype == "datetime64[ns]"

        exp = pd.DatetimeIndex(result)
        # restored: tz_convert argument
        exp = exp.tz_localize("UTC").tz_convert(tz=s.dt.tz)
        tm.assert_index_equal(dr, exp)

        # indexing
        # restored: tz argument and closing parenthesis of both Timestamps
        result = s.iloc[0]
        assert result == Timestamp("2013-01-01 00:00:00-0500",
                                   tz="US/Eastern")
        result = s[0]
        assert result == Timestamp("2013-01-01 00:00:00-0500",
                                   tz="US/Eastern")

        result = s[Series([True, True, False], index=s.index)]
        assert_series_equal(result, s[0:2])

        result = s.iloc[0:1]
        assert_series_equal(result, Series(dr[0:1]))

        # concat
        result = pd.concat([s.iloc[0:1], s.iloc[1:]])
        assert_series_equal(result, s)

        # short str
        assert "datetime64[ns, US/Eastern]" in str(s)

        # formatting with NaT
        result = s.shift()
        assert "datetime64[ns, US/Eastern]" in str(result)
        assert "NaT" in str(result)

        # long str
        t = Series(date_range("20130101", periods=1000, tz="US/Eastern"))
        assert "datetime64[ns, US/Eastern]" in str(t)

        result = pd.DatetimeIndex(s, freq="infer")
        tm.assert_index_equal(result, dr)

        # inference
        s = Series([
            pd.Timestamp("2013-01-01 13:00:00-0800", tz="US/Pacific"),
            pd.Timestamp("2013-01-02 14:00:00-0800", tz="US/Pacific"),
        ])  # restored: closing brackets
        assert s.dtype == "datetime64[ns, US/Pacific]"
        assert lib.infer_dtype(s, skipna=True) == "datetime64"

        s = Series([
            pd.Timestamp("2013-01-01 13:00:00-0800", tz="US/Pacific"),
            pd.Timestamp("2013-01-02 14:00:00-0800", tz="US/Eastern"),
        ])  # restored: closing brackets
        assert s.dtype == "object"
        assert lib.infer_dtype(s, skipna=True) == "datetime"

        # with all NaT
        s = Series(pd.NaT, index=[0, 1], dtype="datetime64[ns, US/Eastern]")
        expected = Series(pd.DatetimeIndex(["NaT", "NaT"], tz="US/Eastern"))
        assert_series_equal(s, expected)

    @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64])
    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_construction_to_datetimelike_unit(self, arr_dtype, dtype, unit):
        """Series.astype to a unit dtype matches wrapping ndarray.astype."""
        # tests all units
        # gh-19223
        target_dtype = "{}[{}]".format(dtype, unit)
        raw = np.array([1, 2, 3], dtype=arr_dtype)

        via_series = Series(raw).astype(target_dtype)
        via_numpy = Series(raw.astype(target_dtype))
        tm.assert_series_equal(via_series, via_numpy)

                             ["2013-01-01 00:00:00", pd.NaT, np.nan, None])
    def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg):
        # GH 17415: With naive string
        result = Series([arg], dtype="datetime64[ns, CET]")
        expected = Series(pd.Timestamp(arg)).dt.tz_localize("CET")
        assert_series_equal(result, expected)

    def test_construction_interval(self):
        # construction from interval & array of intervals
        index = IntervalIndex.from_breaks(np.arange(3), closed="right")
        result = Series(index)
        tm.assert_index_equal(Index(result.values), index)

        result = Series(index.values)
        tm.assert_index_equal(Index(result.values), index)

    def test_construction_consistency(self):
        """Re-wrapping tz-aware data with its own dtype reproduces the Series."""

        # make sure that we are not re-localizing upon construction
        # GH 14928
        tz_series = Series(
            pd.date_range("20130101", periods=3, tz="US/Eastern"))

        rebuilt = Series(tz_series, dtype=tz_series.dtype)
        tm.assert_series_equal(rebuilt, tz_series)

        rebuilt = Series(tz_series.dt.tz_convert("UTC"),
                         dtype=tz_series.dtype)
        tm.assert_series_equal(rebuilt, tz_series)

        rebuilt = Series(tz_series.values, dtype=tz_series.dtype)
        tm.assert_series_equal(rebuilt, tz_series)

    def test_constructor_infer_period(self):
        data = [pd.Period("2000", "D"), pd.Period("2001", "D"), None]
        result = pd.Series(data)
        expected = pd.Series(period_array(data))
        tm.assert_series_equal(result, expected)
        assert result.dtype == "Period[D]"

        data = np.asarray(data, dtype=object)
        tm.assert_series_equal(result, expected)
        assert result.dtype == "Period[D]"

    def test_constructor_period_incompatible_frequency(self):
        """Periods of differing frequency are kept unchanged as object dtype."""
        periods = [pd.Period("2000", "D"), pd.Period("2001", "A")]
        ser = pd.Series(periods)
        assert ser.dtype == object
        assert ser.tolist() == periods

    def test_constructor_periodindex(self):
        # GH7932
        # converting a PeriodIndex when put in a Series

        pi = period_range("20130101", periods=5, freq="D")
        s = Series(pi)
        assert s.dtype == "Period[D]"
        expected = Series(pi.astype(object))
        assert_series_equal(s, expected)

    def test_constructor_dict(self):
        """Dict data is aligned to the passed index, with NaN for missing keys."""
        labels = ["b", "c", "d", "a"]
        aligned = Series({"a": 0.0, "b": 1.0, "c": 2.0}, index=labels)
        assert_series_equal(aligned, Series([1, 2, nan, 0], index=labels))

        # same alignment with Period keys against a longer PeriodIndex
        pidx = tm.makePeriodIndex(100)
        aligned = Series({pidx[0]: 0, pidx[1]: 1}, index=pidx)
        expected = Series(np.nan, pidx)
        expected.iloc[0] = 0
        expected.iloc[1] = 1
        assert_series_equal(aligned, expected)

    def test_constructor_dict_order(self):
        """Dict construction keeps insertion order on PY36, else sorts keys."""
        # GH19018
        # initialization ordering: by insertion order if python>= 3.6, else
        # order by value
        d = {"b": 1, "a": 0, "c": 2}
        result = Series(d)
        if PY36:
            expected = Series([1, 0, 2], index=list("bac"))
        else:
            # The ``else`` was missing, so this assignment always overwrote
            # the insertion-order expectation above.
            expected = Series([0, 1, 2], index=list("abc"))
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("value", [2, np.nan, None, float("nan")])
    def test_constructor_dict_nan_key(self, value):
        """NaN-like dict keys survive as index labels, flat or inside tuples."""
        # GH 18480
        flat = {1: "a", value: "b", float("nan"): "c", 4: "d"}
        expected = Series(["a", "b", "c", "d"], index=[1, value, np.nan, 4])
        assert_series_equal(Series(flat).sort_values(), expected)

        # MultiIndex:
        nested = {(1, 1): "a", (2, np.nan): "b", (3, value): "c"}
        tuple_labels = Index([(1, 1), (2, np.nan), (3, value)])
        expected = Series(["a", "b", "c"], index=tuple_labels)
        assert_series_equal(Series(nested).sort_values(), expected)

    def test_constructor_dict_datetime64_index(self):
        """Dicts keyed by datetime64, datetime or Timestamp build the same Series."""
        # GH 9456

        dates_as_str = ["1984-02-19", "1988-11-06", "1989-12-03", "1990-03-15"]
        values = [42544017.198965244, 1234565, 40512335.181958228, -1]

        def parse_datetime(text):
            return datetime.strptime(text, "%Y-%m-%d")

        expected = Series(values, (Timestamp(x) for x in dates_as_str))

        # one dict per key type, each compared against the Timestamp index
        for make_key in (np.datetime64, parse_datetime, Timestamp):
            keyed = dict(zip([make_key(x) for x in dates_as_str], values))
            assert_series_equal(Series(keyed), expected)

    def test_constructor_list_of_tuples(self):
        data = [(1, 1), (2, 2), (2, 3)]
        s = Series(data)
        assert list(s) == data

    def test_constructor_tuple_of_tuples(self):
        data = ((1, 1), (2, 2), (2, 3))
        s = Series(data)
        assert tuple(s) == data

    def test_constructor_dict_of_tuples(self):
        data = {(1, 2): 3, (None, 5): 6}
        result = Series(data).sort_values()
        expected = Series([3, 6],
                          index=MultiIndex.from_tuples([(1, 2), (None, 5)]))
        tm.assert_series_equal(result, expected)

    def test_constructor_set(self):
        """Sets and frozensets are rejected as Series data."""
        # NOTE(review): both ``with`` bodies were empty in this copy;
        # Series(values) is restored as the call under test.
        values = {1, 2, 3, 4, 5}
        with pytest.raises(TypeError, match="'set' type is unordered"):
            Series(values)
        values = frozenset(values)
        with pytest.raises(TypeError, match="'frozenset' type is unordered"):
            Series(values)

    @pytest.mark.filterwarnings("ignore:elementwise comparison:FutureWarning")
    def test_fromDict(self):
        """Check the dtype chosen when building a Series from various dicts."""
        data = {"a": 0, "b": 1, "c": 2, "d": 3}

        series = Series(data)

        # NOTE(review): the value for "d" was missing from this copy;
        # datetime.now() is restored to make the values mixed-type --
        # confirm against upstream.
        data = {"a": 0, "b": "1", "c": "2", "d": datetime.now()}
        series = Series(data)
        assert series.dtype == np.object_

        data = {"a": 0, "b": "1", "c": "2", "d": "3"}
        series = Series(data)
        assert series.dtype == np.object_

        # numeric strings are converted when a float dtype is requested
        data = {"a": "0", "b": "1"}
        series = Series(data, dtype=float)
        assert series.dtype == np.float64

    def test_fromValue(self, datetime_series):
        """A scalar is broadcast over the index with the matching dtype."""

        nans = Series(np.NaN, index=datetime_series.index)
        assert nans.dtype == np.float_
        assert len(nans) == len(datetime_series)

        strings = Series("foo", index=datetime_series.index)
        assert strings.dtype == np.object_
        assert len(strings) == len(datetime_series)

        # NOTE(review): the right-hand side was missing from this copy;
        # datetime.now() is restored, matching the M8[ns] assertion below.
        d = datetime.now()
        dates = Series(d, index=datetime_series.index)
        assert dates.dtype == "M8[ns]"
        assert len(dates) == len(datetime_series)

        # GH12336
        # Test construction of categorical series from value
        categorical = Series(0, index=datetime_series.index, dtype="category")
        expected = Series(0, index=datetime_series.index).astype("category")
        assert categorical.dtype == "category"
        assert len(categorical) == len(datetime_series)
        tm.assert_series_equal(categorical, expected)

    def test_constructor_dtype_timedelta64(self):
        """Exercise timedelta64 construction: inference, nulls and casting.

        NOTE(review): a truncated list and two one-line statements were
        missing from this copy and are reconstructed below; each restored
        spot is marked -- confirm against upstream.
        """

        # basic
        td = Series([timedelta(days=i) for i in range(3)])
        assert td.dtype == "timedelta64[ns]"

        td = Series([timedelta(days=1)])
        assert td.dtype == "timedelta64[ns]"

        # restored: leading timedelta elements of the list
        td = Series(
            [timedelta(days=1),
             timedelta(days=2),
             np.timedelta64(1, "s")])

        assert td.dtype == "timedelta64[ns]"

        # mixed with NaT
        td = Series([timedelta(days=1), NaT], dtype="m8[ns]")
        assert td.dtype == "timedelta64[ns]"

        td = Series([timedelta(days=1), np.nan], dtype="m8[ns]")
        assert td.dtype == "timedelta64[ns]"

        td = Series([np.timedelta64(300000000), pd.NaT], dtype="m8[ns]")
        assert td.dtype == "timedelta64[ns]"

        # improved inference
        # GH5689
        td = Series([np.timedelta64(300000000), NaT])
        assert td.dtype == "timedelta64[ns]"

        # because iNaT is int, not coerced to timedelta
        td = Series([np.timedelta64(300000000), iNaT])
        assert td.dtype == "object"

        td = Series([np.timedelta64(300000000), np.nan])
        assert td.dtype == "timedelta64[ns]"

        td = Series([pd.NaT, np.timedelta64(300000000)])
        assert td.dtype == "timedelta64[ns]"

        td = Series([np.timedelta64(1, "s")])
        assert td.dtype == "timedelta64[ns]"

        # these are frequency conversion astypes
        # for t in ['s', 'D', 'us', 'ms']:
        #    with pytest.raises(TypeError):
        #        td.astype('m8[%s]' % t)

        # valid astype
        td.astype("int64")  # restored statement

        # invalid casting
        msg = r"cannot astype a timedelta from \[timedelta64\[ns\]\] to" r" \[int32\]"
        with pytest.raises(TypeError, match=msg):
            td.astype("int32")  # restored statement

        # this is an invalid casting
        msg = "Could not convert object to NumPy timedelta"
        with pytest.raises(ValueError, match=msg):
            Series([timedelta(days=1), "foo"], dtype="m8[ns]")

        # leave as object here
        td = Series([timedelta(days=i) for i in range(3)] + ["foo"])
        assert td.dtype == "object"

        # these will correctly infer a timedelta
        s = Series([None, pd.NaT, "1 Day"])
        assert s.dtype == "timedelta64[ns]"
        s = Series([np.nan, pd.NaT, "1 Day"])
        assert s.dtype == "timedelta64[ns]"
        s = Series([pd.NaT, None, "1 Day"])
        assert s.dtype == "timedelta64[ns]"
        s = Series([pd.NaT, np.nan, "1 Day"])
        assert s.dtype == "timedelta64[ns]"

    # GH 16406
    def test_constructor_mixed_tz(self):
        s = Series(
             Timestamp("20130101", tz="US/Eastern")])
        expected = Series(
             Timestamp("20130101", tz="US/Eastern")],
        assert_series_equal(s, expected)

    def test_NaT_scalar(self):
        """A null read from a datetime Series can be written back as a null."""
        ser = Series([0, 1000, 2000, iNaT], dtype="M8[ns]")

        missing = ser[3]
        assert isna(missing)

        ser[2] = missing
        assert isna(ser[2])

    def test_NaT_cast(self):
        # GH10747
        result = Series([np.nan]).astype("M8[ns]")
        expected = Series([NaT])
        assert_series_equal(result, expected)

    def test_constructor_name_hashable(self):
        for n in [777, 777.0, "name", datetime(2001, 11, 11), (1, ), "\u05D0"]:
            for data in [[1, 2, 3], np.ones(3), {"a": 0, "b": 1}]:
                s = Series(data, name=n)
                assert == n

    def test_constructor_name_unhashable(self):
        """Unhashable names are rejected with a TypeError for every data kind."""
        expected_error = r"Series\.name must be a hashable type"
        for bad_name in (["name_list"], np.ones(2), {1: 2}):
            for data in (["name_list"], np.ones(2), {1: 2}):
                with pytest.raises(TypeError, match=expected_error):
                    Series(data, name=bad_name)

    def test_auto_conversion(self):
        series = Series(list(date_range("1/1/2000", periods=10)))
        assert series.dtype == "M8[ns]"

    def test_convert_non_ns(self):
        """Non-nanosecond timedelta64/datetime64 arrays match ns-based ranges."""
        # convert from a numpy array of non-ns timedelta64
        arr = np.array([1, 2, 3], dtype="timedelta64[s]")
        s = Series(arr)
        expected = Series(pd.timedelta_range("00:00:01", periods=3, freq="s"))
        assert_series_equal(s, expected)

        # convert from a numpy array of non-ns datetime64
        # note that creating a numpy datetime64 is in LOCAL time!!!!
        # seems to work for M8[D], but not for M8[s]

        # NOTE(review): the dtype argument and the head of the assert call
        # were missing from this copy; datetime64[D] is restored per the
        # comment above -- confirm against upstream.
        s = Series(
            np.array(["2013-01-01", "2013-01-02", "2013-01-03"],
                     dtype="datetime64[D]"))
        assert_series_equal(
            s, Series(date_range("20130101", periods=3, freq="D")))

        # s = Series(np.array(['2013-01-01 00:00:01','2013-01-01
        # 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]'))

        # assert_series_equal(s,date_range('20130101
        # 00:00:01',period=3,freq='s'))

            date_range("1/1/2000", periods=10),
            timedelta_range("1 day", periods=10),
            period_range("2000-Q1", periods=10, freq="Q"),
        ids=lambda x: type(x).__name__,
    def test_constructor_cant_cast_datetimelike(self, index):

        # floats are not ok
        msg = "Cannot cast {}.*? to ".format(
            # strip Index to convert PeriodIndex -> Period
            # We don't care whether the error message says
            # PeriodIndex or PeriodArray
        with pytest.raises(TypeError, match=msg):
            Series(index, dtype=float)

        # ints are ok
        # we test with np.int64 to get similar results on
        # windows / 32-bit platforms
        result = Series(index, dtype=np.int64)
        expected = Series(index.astype(np.int64))
        tm.assert_series_equal(result, expected)

            date_range("1/1/2000", periods=10),
            timedelta_range("1 day", periods=10),
            period_range("2000-Q1", periods=10, freq="Q"),
        ids=lambda x: type(x).__name__,
    def test_constructor_cast_object(self, index):
        s = Series(index, dtype=object)
        exp = Series(index).astype(object)
        tm.assert_series_equal(s, exp)

        s = Series(pd.Index(index, dtype=object), dtype=object)
        exp = Series(index).astype(object)
        tm.assert_series_equal(s, exp)

        s = Series(index.astype(object), dtype=object)
        exp = Series(index).astype(object)
        tm.assert_series_equal(s, exp)

    @pytest.mark.parametrize("dtype", [np.datetime64, np.timedelta64])
    def test_constructor_generic_timestamp_no_frequency(self, dtype):
        """Unit-less datetime64/timedelta64 dtypes raise ValueError."""
        # see gh-15524, gh-15987
        expected_error = "dtype has no unit. Please pass in"
        empty = []

        with pytest.raises(ValueError, match=expected_error):
            Series(empty, dtype=dtype)

            ("m8[ps]", "cannot convert timedeltalike"),
            ("M8[ps]", "cannot convert datetimelike"),
    def test_constructor_generic_timestamp_bad_frequency(self, dtype, msg):
        # see gh-15524, gh-15987

        with pytest.raises(TypeError, match=msg):
            Series([], dtype=dtype)

    @pytest.mark.parametrize("dtype", [None, "uint8", "category"])
    def test_constructor_range_dtype(self, dtype):
        """A range behaves like the equivalent list; int64 is the default dtype."""
        # GH 16804
        from_list = Series([0, 1, 2, 3, 4], dtype=dtype or "int64")
        from_range = Series(range(5), dtype=dtype)
        tm.assert_series_equal(from_range, from_list)

    def test_constructor_tz_mixed_data(self):
        # GH 13051
        dt_list = [
            Timestamp("2016-05-01 02:03:37"),
            Timestamp("2016-04-30 19:03:37-0700", tz="US/Pacific"),
        result = Series(dt_list)
        expected = Series(dt_list, dtype=object)
        tm.assert_series_equal(result, expected)
Beispiel #27
def test_survival_table_from_events_will_collapse_if_asked():
    """With collapse=True the table index is the two expected intervals."""
    durations = np.array([1, 3, 4, 5])
    observed = np.array([True, True, True, True])
    table = utils.survival_table_from_events(durations, observed, collapse=True)

    expected_bins = [
        pd.Interval(0, 3.5089999999999999, closed='right'),
        pd.Interval(3.5089999999999999, 7.0179999999999998, closed='right'),
    ]
    assert table.index.tolist() == expected_bins
Beispiel #28
class TestSeriesConstructors():

    def test_invalid_dtype(self):
        """Each unsupported ``dtype`` argument must raise TypeError (GH15520)."""
        pattern = 'not understood'
        for bad_dtype in (pd.Timestamp, 'pd.Timestamp', list):
            with pytest.raises(TypeError, match=pattern):
                Series([], name='time', dtype=bad_dtype)

    def test_scalar_conversion(self):
        """Scalar input is not returned as a float; 1-element Series coerce."""
        # Pass in scalar is disabled
        from_scalar = Series(0.5)
        assert not isinstance(from_scalar, float)

        # Coercion of a one-element Series through the numeric built-ins.
        single = Series([1.])
        assert float(single) == 1.0
        assert int(single) == 1
        assert long(single) == 1

    def test_constructor(self, datetime_series, empty_series):
        """Basic constructor checks: index reuse, mixed dtype, name, errors.

        ``datetime_series`` and ``empty_series`` are fixtures supplied by the
        test suite.
        """
        assert datetime_series.index.is_all_dates

        # Pass in Series
        derived = Series(datetime_series)
        assert derived.index.is_all_dates

        assert tm.equalContents(derived.index, datetime_series.index)
        # Ensure new index is not created
        assert datetime_series.index is derived.index

        # Mixed type Series
        mixed = Series(['hello', np.nan], index=[0, 1])
        assert mixed.dtype == np.object_
        assert mixed[1] is np.nan

        assert not empty_series.index.is_all_dates
        assert not Series({}).index.is_all_dates
        pytest.raises(Exception, Series, np.random.randn(3, 3),
                      index=np.arange(3))

        # The name set on the source Series must carry over to the new one.
        mixed.name = 'Series'
        rs = Series(mixed).name
        xp = 'Series'
        assert rs == xp

        # raise on MultiIndex GH4187
        m = MultiIndex.from_arrays([[1, 2], [3, 4]])
        pytest.raises(NotImplementedError, Series, m)

    @pytest.mark.parametrize('input_class', [list, dict, OrderedDict])
    def test_constructor_empty(self, input_class):
        """Compare no-data construction against an empty ``input_class()``."""
        bare = Series()
        from_container = Series(input_class())

        # The two indexes are Index() and RangeIndex(): not type-equal, but
        # they satisfy .equals, so index type checking is switched off.
        assert_series_equal(bare, from_container, check_index_type=False)

        # Same comparison with an explicit dtype ('category' is GH 18515).
        for explicit_dtype in ('float64', 'category'):
            bare = Series(dtype=explicit_dtype)
            from_container = Series(input_class(), dtype=explicit_dtype)
            assert_series_equal(bare, from_container, check_index_type=False)

        if input_class is list:
            # The index-based comparisons below are skipped for list input.
            return

        # With index:
        bare = Series(index=lrange(10))
        from_container = Series(input_class(), index=lrange(10))
        assert_series_equal(bare, from_container)

        # With index and dtype float64:
        bare = Series(np.nan, index=lrange(10))
        from_container = Series(input_class(), index=lrange(10),
                                dtype='float64')
        assert_series_equal(bare, from_container)

        # GH 19853 : with empty string, index and dtype str
        bare = Series('', dtype=str, index=range(3))
        from_container = Series('', index=range(3))
        assert_series_equal(bare, from_container)

    @pytest.mark.parametrize('input_arg', [np.nan, float('nan')])
    def test_constructor_nan(self, input_arg):
        """A NaN scalar over an index equals the float64 no-data Series."""
        without_data = Series(dtype='float64', index=lrange(10))
        from_nan = Series(input_arg, index=lrange(10))
        assert_series_equal(without_data, from_nan, check_index_type=False)

    @pytest.mark.parametrize('dtype', [
        'f8', 'i8', 'M8[ns]', 'm8[ns]', 'category', 'object',
        'datetime64[ns, UTC]',
    ])
    @pytest.mark.parametrize('index', [None, pd.Index([])])
    def test_constructor_dtype_only(self, dtype, index):
        """A Series built from dtype alone is empty with that dtype (GH-20865)."""
        result = pd.Series(dtype=dtype, index=index)
        assert result.dtype == dtype
        assert len(result) == 0

    def test_constructor_no_data_index_order(self):
        result = pd.Series(index=['b', 'a', 'c'])
        assert result.index.tolist() == ['b', 'a', 'c']

    def test_constructor_no_data_string_type(self):
        """With ``dtype=str`` and no data the lone element is NaN (GH 22477)."""
        lone_value = pd.Series(index=[1], dtype=str).iloc[0]
        assert np.isnan(lone_value)

    @pytest.mark.parametrize('item', ['entry', 'ѐ', 13])
    def test_constructor_string_element_string_type(self, item):
        """A scalar with ``dtype=str`` is stored as ``str(item)`` (GH 22477)."""
        stored = pd.Series(item, index=[1], dtype=str).iloc[0]
        assert stored == str(item)

    def test_constructor_dtype_str_na_values(self, string_dtype):
        """``None`` and NaN entries are kept when building with a string dtype."""
        with_none = Series(['x', None], dtype=string_dtype)
        tm.assert_series_equal(with_none.isna(), Series([False, True]))
        assert with_none.iloc[1] is None

        with_nan = Series(['x', np.nan], dtype=string_dtype)
        assert np.isnan(with_nan.iloc[1])

    def test_constructor_series(self):
        """Passing a Series plus a sorted index equals ``sort_index()``."""
        original_labels = ['d', 'b', 'a', 'c']
        source = Series([4, 7, -5, 3], index=original_labels)
        reindexed = Series(source, index=sorted(original_labels))

        assert_series_equal(reindexed, source.sort_index())

    def test_constructor_iterable(self):
        """An object defining only ``__iter__`` is accepted as data (GH 21987)."""
        class Iter():
            # Minimal iterable: yields 0..9 and provides nothing else.
            def __iter__(self):
                for number in range(10):
                    yield number

        from_iterable = Series(Iter(), dtype='int64')
        from_list = Series(list(range(10)), dtype='int64')
        assert_series_equal(from_iterable, from_list)

    def test_constructor_sequence(self):
        """A ``range`` input equals the matching list input (GH 21987)."""
        from_range = Series(range(10), dtype='int64')
        from_list = Series(list(range(10)), dtype='int64')
        assert_series_equal(from_range, from_list)

    def test_constructor_single_str(self):
        """A bare string is treated as one element, not split up (GH 21987)."""
        from_scalar = Series('abc')
        from_list = Series(['abc'])
        assert_series_equal(from_scalar, from_list)

    def test_constructor_list_like(self):
        """List, tuple and int64 ndarray inputs must equal an int64 Series."""
        # make sure that we are coercing different list-likes to standard
        # dtypes and not platform specific ones
        reference = Series([1, 2, 3], dtype='int64')
        candidates = ([1, 2, 3], (1, 2, 3),
                      np.array([1, 2, 3], dtype='int64'))
        for values in candidates:
            built = Series(values, index=[0, 1, 2])
            assert_series_equal(built, reference)

    @pytest.mark.parametrize('input_vals', [
        ([1, 2]),
        (['1', '2']),
        (list(pd.date_range('1/1/2011', periods=2, freq='H'))),
        (list(pd.date_range('1/1/2011', periods=2, freq='H',
                            tz='US/Eastern'))),
        ([pd.Interval(left=0, right=5)]),
    ])
    def test_constructor_list_str(self, input_vals, string_dtype):
        """Construction with a string dtype must match ``astype`` (GH 16605).

        ``string_dtype`` is a fixture supplied by the test suite.
        """
        # Ensure that data elements from a list are converted to strings
        # when dtype is str, 'str', or 'U'
        result = Series(input_vals, dtype=string_dtype)
        expected = Series(input_vals).astype(string_dtype)
        assert_series_equal(result, expected)

    def test_constructor_list_str_na(self, string_dtype):
        """Floats become strings under a string dtype while NaN stays NaN."""
        converted = Series([1.0, 2.0, np.nan], dtype=string_dtype)
        wanted = Series(['1.0', '2.0', np.nan], dtype=object)
        assert_series_equal(converted, wanted)
        assert np.isnan(converted[2])

    def test_constructor_generator(self):
        """Generator input is consumed as data, with and without an index."""
        numbers = (value for value in range(10))
        built = Series(numbers)
        reference = Series(lrange(10))
        assert_series_equal(built, reference)

        # A fresh generator is needed: the first one is already exhausted.
        numbers = (value for value in range(10))
        built = Series(numbers, index=lrange(10, 20))
        reference.index = lrange(10, 20)
        assert_series_equal(built, reference)

    def test_constructor_map(self):
        """``map`` output is accepted as data, with and without index (GH8909)."""
        mapped = map(lambda x: x, range(10))
        built = Series(mapped)
        reference = Series(lrange(10))
        assert_series_equal(built, reference)

        # Rebuild the map object before the second construction.
        mapped = map(lambda x: x, range(10))
        built = Series(mapped, index=lrange(10, 20))
        reference.index = lrange(10, 20)
        assert_series_equal(built, reference)

    def test_constructor_categorical(self):
        """Categorical input: values kept, castable, and dtype detected."""
        cat = pd.Categorical([0, 1, 2, 0, 1, 2], ['a', 'b', 'c'],
                             fastpath=True)
        res = Series(cat)
        tm.assert_categorical_equal(res.values, cat)

        # can cast to a new dtype
        result = Series(pd.Categorical([1, 2, 3]),
                        dtype='int64')
        expected = pd.Series([1, 2, 3], dtype='int64')
        tm.assert_series_equal(result, expected)

        # GH12574
        cat = Series(pd.Categorical([1, 2, 3]), dtype='category')
        assert is_categorical_dtype(cat)
        assert is_categorical_dtype(cat.dtype)
        s = Series([1, 2, 3], dtype='category')
        assert is_categorical_dtype(s)
        assert is_categorical_dtype(s.dtype)

    def test_constructor_categorical_with_coercion(self):
        """Categorical data round-trips through Series and DataFrame columns."""
        factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
        # test basic creation / coercion of categoricals
        s = Series(factor, name='A')
        assert s.dtype == 'category'
        assert len(s) == len(factor)

        # in a frame
        df = DataFrame({'A': factor})
        result = df['A']
        tm.assert_series_equal(result, s)
        result = df.iloc[:, 0]
        tm.assert_series_equal(result, s)
        assert len(df) == len(factor)

        df = DataFrame({'A': s})
        result = df['A']
        tm.assert_series_equal(result, s)
        assert len(df) == len(factor)

        # multiples
        df = DataFrame({'A': s, 'B': s, 'C': 1})
        result1 = df['A']
        result2 = df['B']
        tm.assert_series_equal(result1, s)
        # Column 'B' holds the same data as ``s`` under a different name, so
        # names are excluded from the comparison and checked separately.
        tm.assert_series_equal(result2, s, check_names=False)
        assert result2.name == 'B'
        assert len(df) == len(factor)

        # GH8623
        x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'],
                       [1, 'John P. Doe']],
                      columns=['person_id', 'person_name'])
        x['person_name'] = Categorical(x.person_name
                                       )  # doing this breaks transform

        expected = x.iloc[0].person_name
        result = x.person_name.iloc[0]
        assert result == expected

        result = x.person_name[0]
        assert result == expected

        result = x.person_name.loc[0]
        assert result == expected

    def test_constructor_categorical_dtype(self):
        """A ``CategoricalDtype`` passed as ``dtype`` sets categories and order."""
        result = pd.Series(['a', 'b'],
                           dtype=CategoricalDtype(['a', 'b', 'c'],
                                                  ordered=True))
        assert is_categorical_dtype(result) is True
        tm.assert_index_equal(result.cat.categories,
                              pd.Index(['a', 'b', 'c']))
        assert result.cat.ordered

        result = pd.Series(['a', 'b'], dtype=CategoricalDtype(['b', 'a']))
        assert is_categorical_dtype(result)
        tm.assert_index_equal(result.cat.categories, pd.Index(['b', 'a']))
        assert result.cat.ordered is False

        # GH 19565 - Check broadcasting of scalar with Categorical dtype
        result = Series('a', index=[0, 1],
                        dtype=CategoricalDtype(['a', 'b'], ordered=True))
        expected = Series(['a', 'a'], index=[0, 1],
                          dtype=CategoricalDtype(['a', 'b'], ordered=True))
        tm.assert_series_equal(result, expected, check_categorical=True)

    def test_categorical_sideeffects_free(self):
        """Check when a Series shares state with the Categorical it wraps."""
        # Passing a categorical to a Series and then changing values in either
        # the series or the categorical should not change the values in the
        # other one, IF you specify copy!
        cat = Categorical(["a", "b", "c", "a"])
        s = Series(cat, copy=True)
        assert s.cat is not cat
        s.cat.categories = [1, 2, 3]
        exp_s = np.array([1, 2, 3, 1], dtype=np.int64)
        exp_cat = np.array(["a", "b", "c", "a"], dtype=np.object_)
        tm.assert_numpy_array_equal(s.__array__(), exp_s)
        tm.assert_numpy_array_equal(cat.__array__(), exp_cat)

        # setting
        s[0] = 2
        exp_s2 = np.array([2, 2, 3, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(s.__array__(), exp_s2)
        tm.assert_numpy_array_equal(cat.__array__(), exp_cat)

        # however, copy is False by default
        # so this WILL change values
        cat = Categorical(["a", "b", "c", "a"])
        s = Series(cat)
        assert s.values is cat
        s.cat.categories = [1, 2, 3]
        exp_s = np.array([1, 2, 3, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(s.__array__(), exp_s)
        tm.assert_numpy_array_equal(cat.__array__(), exp_s)

        s[0] = 2
        exp_s2 = np.array([2, 2, 3, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(s.__array__(), exp_s2)
        tm.assert_numpy_array_equal(cat.__array__(), exp_s2)

    def test_unordered_compare_equal(self):
        """A value outside the categories compares equal to an explicit NaN."""
        two_categories = CategoricalDtype(['a', 'b'])
        via_dtype = pd.Series(['a', 'b', 'c'], dtype=two_categories)

        explicit = pd.Categorical(['a', 'b', np.nan], categories=['a', 'b'])
        via_categorical = pd.Series(explicit)

        tm.assert_series_equal(via_dtype, via_categorical)

    def test_constructor_maskedarray(self):
        """Build Series from numpy masked arrays of several dtypes.

        For each dtype (float, int, bool, 'M8[ns]') the same three stages are
        checked: a fully masked array, an array with only the middle element
        still masked, and an array with every element assigned.  ``data`` is
        mutated in place between stages, so statement order matters.
        """
        # --- float ---
        data = ma.masked_all((3, ), dtype=float)
        result = Series(data)
        expected = Series([nan, nan, nan])
        assert_series_equal(result, expected)

        # Assigning elements 0 and 2 leaves only element 1 masked.
        data[0] = 0.0
        data[2] = 2.0
        index = ['a', 'b', 'c']
        result = Series(data, index=index)
        expected = Series([0.0, nan, 2.0], index=index)
        assert_series_equal(result, expected)

        data[1] = 1.0
        result = Series(data, index=index)
        expected = Series([0.0, 1.0, 2.0], index=index)
        assert_series_equal(result, expected)

        # --- int: expected dtype is float while any element is masked ---
        data = ma.masked_all((3, ), dtype=int)
        result = Series(data)
        expected = Series([nan, nan, nan], dtype=float)
        assert_series_equal(result, expected)

        data[0] = 0
        data[2] = 2
        index = ['a', 'b', 'c']
        result = Series(data, index=index)
        expected = Series([0, nan, 2], index=index, dtype=float)
        assert_series_equal(result, expected)

        # Fully assigned: the expected dtype is int again.
        data[1] = 1
        result = Series(data, index=index)
        expected = Series([0, 1, 2], index=index, dtype=int)
        assert_series_equal(result, expected)

        # --- bool: expected dtype is object while any element is masked ---
        data = ma.masked_all((3, ), dtype=bool)
        result = Series(data)
        expected = Series([nan, nan, nan], dtype=object)
        assert_series_equal(result, expected)

        data[0] = True
        data[2] = False
        index = ['a', 'b', 'c']
        result = Series(data, index=index)
        expected = Series([True, nan, False], index=index, dtype=object)
        assert_series_equal(result, expected)

        # Fully assigned: the expected dtype is bool again.
        data[1] = True
        result = Series(data, index=index)
        expected = Series([True, True, False], index=index, dtype=bool)
        assert_series_equal(result, expected)

        # --- datetime64[ns]: masked elements are expected as iNaT ---
        data = ma.masked_all((3, ), dtype='M8[ns]')
        result = Series(data)
        expected = Series([iNaT, iNaT, iNaT], dtype='M8[ns]')
        assert_series_equal(result, expected)

        data[0] = datetime(2001, 1, 1)
        data[2] = datetime(2001, 1, 3)
        index = ['a', 'b', 'c']
        result = Series(data, index=index)
        expected = Series([datetime(2001, 1, 1), iNaT,
                           datetime(2001, 1, 3)], index=index, dtype='M8[ns]')
        assert_series_equal(result, expected)

        data[1] = datetime(2001, 1, 2)
        result = Series(data, index=index)
        expected = Series([datetime(2001, 1, 1), datetime(2001, 1, 2),
                           datetime(2001, 1, 3)], index=index, dtype='M8[ns]')
        assert_series_equal(result, expected)

    def test_series_ctor_plus_datetimeindex(self):
        """The DatetimeIndex passed as ``index`` is reused, not copied."""
        business_days = date_range('20090415', '20090519', freq='B')
        ones_by_day = dict.fromkeys(business_days, 1)

        built = Series(ones_by_day, index=business_days)
        assert built.index is business_days

    def test_constructor_default_index(self):
        """Without an explicit index the labels are 0..n-1."""
        default_labels = Series([0, 1, 2]).index
        tm.assert_index_equal(default_labels, pd.Index(np.arange(3)))

    @pytest.mark.parametrize('input', [
        [1, 2, 3],
        (1, 2, 3),
        pd.Categorical(['a', 'b', 'a']),
        (i for i in range(3)),
        map(lambda x: x, range(3)),
    ])
    def test_constructor_index_mismatch(self, input):
        """Three values with a four-label index raise ValueError (GH 19342)."""
        four_labels = np.arange(4)
        expected_error = 'Length of passed values is 3, index implies 4'
        with pytest.raises(ValueError, match=expected_error):
            Series(input, index=four_labels)

    def test_constructor_numpy_scalar(self):
        """A 0-d numpy array is broadcast like a plain scalar (GH 19342)."""
        labels = np.arange(4)
        from_zero_dim = Series(np.array(100), index=labels, dtype='int64')
        from_python_int = Series(100, index=np.arange(4), dtype='int64')
        tm.assert_series_equal(from_zero_dim, from_python_int)

    def test_constructor_broadcast_list(self):
        """A one-element list is not broadcast over a longer index (GH 19342)."""
        # construction with single-element container and index should raise
        with pytest.raises(ValueError):
            Series(['foo'], index=['a', 'b', 'c'])

    def test_constructor_corner(self):
        """A list of DataFrames is accepted as Series data."""
        frame = tm.makeTimeDataFrame()
        holder = Series([frame, frame], index=[0, 1])
        assert isinstance(holder, Series)

    def test_constructor_sanitize(self):
        """Check the resulting dtype when float data is requested as 'i8'."""
        # Whole-number floats: the requested integer dtype is expected.
        whole_numbers = Series(np.array([1., 1., 8.]), dtype='i8')
        assert whole_numbers.dtype == np.dtype('i8')

        # Data containing NaN: float64 is expected instead.
        with_nan = Series(np.array([1., 1., np.nan]), copy=True, dtype='i8')
        assert with_nan.dtype == np.dtype('f8')

    def test_constructor_copy(self):
        """``copy=True`` together with ``dtype`` yields an independent copy.

        See GH15125.
        """
        for raw in ([1.], np.array([1.])):
            source = Series(raw)
            duplicate = pd.Series(source, copy=True, dtype=float)

            # copy=True maintains original data in Series
            tm.assert_series_equal(source, duplicate)

            # changes to origin of copy does not affect the copy
            source[0] = 2.
            assert not source.equals(duplicate)
            assert source[0] == 2.
            assert duplicate[0] == 1.

            pd.date_range('20170101', periods=3, tz='US/Eastern'),
            pd.date_range('20170101', periods=3),
            pd.timedelta_range('1 day', periods=3),
            pd.period_range('2012Q1', periods=3, freq='Q'),
            pd.Int64Index([1, 2, 3]),
            pd.RangeIndex(0, 3)],
        ids=lambda x: type(x).__name__)
    def test_constructor_limit_copies(self, index):
        # GH 17449
        # limit copies of input
        s = pd.Series(index)

        # we make 1 copy; this is just a smoke test here
        assert s._data.blocks[0].values is not index

    def test_constructor_pass_none(self):
        """``None`` data gives float64 by default, or the requested dtype."""
        default_dtype = Series(None, index=lrange(5)).dtype
        assert default_dtype == np.float64

        requested_dtype = Series(None, index=lrange(5), dtype=object).dtype
        assert requested_dtype == np.object_

        # GH 7431
        # inference on the index
        from_ndarray = Series(index=np.array([None]))
        from_index = Series(index=Index([None]))
        assert_series_equal(from_ndarray, from_index)

    def test_constructor_pass_nan_nat(self):
        """All-NaN input is float64; any NaT makes it datetime64 (GH 13467)."""
        all_nan = Series([np.nan, np.nan], dtype=np.float64)
        assert all_nan.dtype == np.float64
        tm.assert_series_equal(Series([np.nan, np.nan]), all_nan)
        tm.assert_series_equal(Series(np.array([np.nan, np.nan])), all_nan)

        all_nat = Series([pd.NaT, pd.NaT])
        assert all_nat.dtype == 'datetime64[ns]'
        # Every NaT/NaN mix is checked both as a list and as an ndarray.
        mixes = ([pd.NaT, pd.NaT], [pd.NaT, np.nan], [np.nan, pd.NaT])
        for pair in mixes:
            tm.assert_series_equal(Series(pair), all_nat)
            tm.assert_series_equal(Series(np.array(pair)), all_nat)

    def test_constructor_cast(self):
        """Non-numeric strings with ``dtype=float`` raise ValueError."""
        expected_error = "could not convert string to float"
        with pytest.raises(ValueError, match=expected_error):
            Series(["a", "b", "c"], dtype=float)

    def test_constructor_unsigned_dtype_overflow(self, uint_dtype):
        """A negative value with an unsigned dtype raises (gh-15832)."""
        expected_error = 'Trying to coerce negative values to unsigned integers'
        with pytest.raises(OverflowError, match=expected_error):
            Series([-1], dtype=uint_dtype)

    def test_constructor_coerce_float_fail(self, any_int_dtype):
        """A fractional float with an integer dtype raises (gh-15832)."""
        expected_error = "Trying to coerce float values to integers"
        with pytest.raises(ValueError, match=expected_error):
            Series([1, 2, 3.5], dtype=any_int_dtype)

    def test_constructor_coerce_float_valid(self, float_dtype):
        """Construction with a float dtype matches construct-then-astype."""
        values = [1, 2, 3.5]
        direct = Series(values, dtype=float_dtype)
        via_astype = Series([1, 2, 3.5]).astype(float_dtype)
        assert_series_equal(direct, via_astype)

    def test_constructor_dtype_no_cast(self):
        """Re-wrapping with the same dtype shares data with the source.

        See gh-1572.
        """
        source = Series([1, 2, 3])
        rewrapped = Series(source, dtype=np.int64)

        # A write through the new Series must be visible in the source.
        rewrapped[1] = 5
        assert source[1] == 5

    def test_constructor_datelike_coercion(self):
        """Object dtype must stay object for datetime-like mixes (GH 9477)."""
        # Datetime-like inference must not kick in when object dtype is
        # explicitly requested.
        mixed = Series([Timestamp('20130101'), 'NOV'], dtype=object)
        assert mixed.iloc[0] == Timestamp('20130101')
        assert mixed.iloc[1] == 'NOV'
        assert mixed.dtype == object

        # the dtype was being reset on the slicing and re-inferred to datetime
        # even though the blocks are mixed
        row_labels = '216 3T19'.split()
        columns = {
            'wing1': '2T15 4H19'.split(),
            'wing2': '416 4T20'.split(),
            'mat': pd.to_datetime('2016-01-22 2019-09-07'.split()),
        }
        frame = pd.DataFrame(columns, index=row_labels)

        for label in ('3T19', '216'):
            row = frame.loc[label]
            assert row.dtype == object

    def test_constructor_datetimes_with_nulls(self):
        """Arrays of None plus one datetime are expected as 'M8[ns]'.

        See gh-15869.
        """
        for arr in [np.array([None, None, None, None,
                              datetime.now(), None]),
                    np.array([None, None, datetime.now(), None])]:
            result = Series(arr)
            assert result.dtype == 'M8[ns]'

    def test_constructor_dtype_datetime64(self):
        """Exercise datetime64 construction: nulls, units, inference, tz."""
        s = Series(iNaT, dtype='M8[ns]', index=lrange(5))
        assert isna(s).all()

        # in theory this should be all nulls, but since
        # we are not specifying a dtype is ambiguous
        s = Series(iNaT, index=lrange(5))
        assert not isna(s).all()

        s = Series(nan, dtype='M8[ns]', index=lrange(5))
        assert isna(s).all()

        s = Series([datetime(2001, 1, 2, 0, 0), iNaT], dtype='M8[ns]')
        assert isna(s[1])
        assert s.dtype == 'M8[ns]'

        s = Series([datetime(2001, 1, 2, 0, 0), nan], dtype='M8[ns]')
        assert isna(s[1])
        assert s.dtype == 'M8[ns]'

        # GH3416
        dates = [
            np.datetime64(datetime(2013, 1, 1)),
            np.datetime64(datetime(2013, 1, 2)),
            np.datetime64(datetime(2013, 1, 3)),
        ]

        s = Series(dates)
        assert s.dtype == 'M8[ns]'

        s.iloc[0] = np.nan
        assert s.dtype == 'M8[ns]'

        # GH3414 related
        # NOTE(review): both lambdas declare a parameter ``x`` but
        # pytest.raises calls them without arguments, so the TypeError may
        # come from the call itself rather than from Series(...) -- confirm
        # the intended behavior before changing.
        pytest.raises(TypeError, lambda x: Series(
            Series(dates).astype('int') / 1000000, dtype='M8[ms]'))
        pytest.raises(TypeError,
                      lambda x: Series(dates, dtype='datetime64'))

        # invalid dates can be held as object
        result = Series([datetime(2, 1, 1)])
        assert result[0] == datetime(2, 1, 1, 0, 0)

        result = Series([datetime(3000, 1, 1)])
        assert result[0] == datetime(3000, 1, 1, 0, 0)

        # don't mix types
        result = Series([Timestamp('20130101'), 1], index=['a', 'b'])
        assert result['a'] == Timestamp('20130101')
        assert result['b'] == 1

        # GH6529
        # coerce datetime64 non-ns properly
        dates = date_range('01-Jan-2015', '01-Dec-2015', freq='M')
        values2 = dates.view(np.ndarray).astype('datetime64[ns]')
        expected = Series(values2, index=dates)

        for dtype in ['s', 'D', 'ms', 'us', 'ns']:
            values1 = dates.view(np.ndarray).astype('M8[{0}]'.format(dtype))
            result = Series(values1, dates)
            assert_series_equal(result, expected)

        # GH 13876
        # coerce to non-ns to object properly
        expected = Series(values2, index=dates, dtype=object)
        for dtype in ['s', 'D', 'ms', 'us', 'ns']:
            values1 = dates.view(np.ndarray).astype('M8[{0}]'.format(dtype))
            result = Series(values1, index=dates, dtype=object)
            assert_series_equal(result, expected)

        # leave alone
        dates2 = np.array([d.date() for d in dates.to_pydatetime()],
                          dtype=object)
        series1 = Series(dates2, dates)
        tm.assert_numpy_array_equal(series1.values, dates2)
        assert series1.dtype == object

        # these will correctly infer a datetime
        s = Series([None, pd.NaT, '2013-08-05 15:30:00.000001'])
        assert s.dtype == 'datetime64[ns]'
        s = Series([np.nan, pd.NaT, '2013-08-05 15:30:00.000001'])
        assert s.dtype == 'datetime64[ns]'
        s = Series([pd.NaT, None, '2013-08-05 15:30:00.000001'])
        assert s.dtype == 'datetime64[ns]'
        s = Series([pd.NaT, np.nan, '2013-08-05 15:30:00.000001'])
        assert s.dtype == 'datetime64[ns]'

        # tz-aware (UTC and other tz's)
        # GH 8411
        dr = date_range('20130101', periods=3)
        assert Series(dr).iloc[0].tz is None
        dr = date_range('20130101', periods=3, tz='UTC')
        assert str(Series(dr).iloc[0].tz) == 'UTC'
        dr = date_range('20130101', periods=3, tz='US/Eastern')
        assert str(Series(dr).iloc[0].tz) == 'US/Eastern'

        # non-convertible
        s = Series([1479596223000, -1479590, pd.NaT])
        assert s.dtype == 'object'
        assert s[2] is pd.NaT
        assert 'NaT' in str(s)

        # if we passed a NaT it remains
        s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), pd.NaT])
        assert s.dtype == 'object'
        assert s[2] is pd.NaT
        assert 'NaT' in str(s)

        # if we passed a nan it remains
        s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), np.nan])
        assert s.dtype == 'object'
        assert s[2] is np.nan
        assert 'NaN' in str(s)

    def test_constructor_with_datetime_tz(self):
        """Exercise a tz-aware datetime64 Series: dtype, export, indexing."""
        # 8260
        # support datetime64 with tz

        dr = date_range('20130101', periods=3, tz='US/Eastern')
        s = Series(dr)
        assert s.dtype.name == 'datetime64[ns, US/Eastern]'
        assert s.dtype == 'datetime64[ns, US/Eastern]'
        assert is_datetime64tz_dtype(s.dtype)
        assert 'datetime64[ns, US/Eastern]' in str(s)

        # export
        result = s.values
        assert isinstance(result, np.ndarray)
        assert result.dtype == 'datetime64[ns]'

        # Localize the exported values as UTC, then convert to the Series'
        # own zone, to get back the original index.
        exp = pd.DatetimeIndex(result)
        exp = exp.tz_localize('UTC').tz_convert(tz=s.dt.tz)
        tm.assert_index_equal(dr, exp)

        # indexing
        result = s.iloc[0]
        assert result == Timestamp('2013-01-01 00:00:00-0500',
                                   tz='US/Eastern', freq='D')
        result = s[0]
        assert result == Timestamp('2013-01-01 00:00:00-0500',
                                   tz='US/Eastern', freq='D')

        result = s[Series([True, True, False], index=s.index)]
        assert_series_equal(result, s[0:2])

        result = s.iloc[0:1]
        assert_series_equal(result, Series(dr[0:1]))

        # concat
        result = pd.concat([s.iloc[0:1], s.iloc[1:]])
        assert_series_equal(result, s)

        # short str
        assert 'datetime64[ns, US/Eastern]' in str(s)

        # formatting with NaT
        result = s.shift()
        assert 'datetime64[ns, US/Eastern]' in str(result)
        assert 'NaT' in str(result)

        # long str
        t = Series(date_range('20130101', periods=1000, tz='US/Eastern'))
        assert 'datetime64[ns, US/Eastern]' in str(t)

        result = pd.DatetimeIndex(s, freq='infer')
        tm.assert_index_equal(result, dr)

        # inference
        s = Series([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),
                    pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')])
        assert s.dtype == 'datetime64[ns, US/Pacific]'
        assert lib.infer_dtype(s, skipna=False) == 'datetime64'

        s = Series([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),
                    pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Eastern')])
        assert s.dtype == 'object'
        assert lib.infer_dtype(s, skipna=False) == 'datetime'

        # with all NaT
        s = Series(pd.NaT, index=[0, 1], dtype='datetime64[ns, US/Eastern]')
        expected = Series(pd.DatetimeIndex(['NaT', 'NaT'], tz='US/Eastern'))
        assert_series_equal(s, expected)

    @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64])
    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
    def test_construction_to_datetimelike_unit(self, arr_dtype, dtype, unit):
        """``Series.astype`` must match ``ndarray.astype`` for every unit.

        See gh-19223.
        """
        unit_dtype = "{}[{}]".format(dtype, unit)
        raw = np.array([1, 2, 3], dtype=arr_dtype)

        via_series = Series(raw).astype(unit_dtype)
        via_numpy = Series(raw.astype(unit_dtype))

        tm.assert_series_equal(via_series, via_numpy)

                             ['2013-01-01 00:00:00', pd.NaT, np.nan, None])
    def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg):
        # GH 17415: With naive string
        result = Series([arg], dtype='datetime64[ns, CET]')
        expected = Series(pd.Timestamp(arg)).dt.tz_localize('CET')
        assert_series_equal(result, expected)

    def test_construction_interval(self):
        """An IntervalIndex and its ``.values`` both round-trip via Series."""
        intervals = IntervalIndex.from_breaks(np.arange(3), closed='right')

        # construction from interval & array of intervals
        for source in (intervals, intervals.values):
            built = Series(source)
            tm.assert_index_equal(Index(built.values), intervals)

    def test_construction_consistency(self):
        """Reconstruction with the tz dtype must not re-localize (GH 14928)."""
        aware = Series(pd.date_range('20130101', periods=3, tz='US/Eastern'))

        # The Series itself, its UTC-converted form and its raw values must
        # all rebuild to the original when the tz-aware dtype is passed.
        sources = (aware, aware.dt.tz_convert('UTC'), aware.values)
        for source in sources:
            rebuilt = Series(source, dtype=aware.dtype)
            tm.assert_series_equal(rebuilt, aware)

    def test_constructor_infer_period(self):
        """Period dtype is inferred from a list and from an object ndarray."""
        data = [pd.Period('2000', 'D'), pd.Period('2001', 'D'), None]
        result = pd.Series(data)
        expected = pd.Series(period_array(data))
        tm.assert_series_equal(result, expected)
        assert result.dtype == 'Period[D]'

        # Rebuild the Series from the object ndarray; without this the
        # assertions below would only re-check the list-based result.
        data = np.asarray(data, dtype=object)
        result = pd.Series(data)
        tm.assert_series_equal(result, expected)
        assert result.dtype == 'Period[D]'

    def test_constructor_period_incompatible_frequency(self):
        """Periods with different frequencies are kept as object dtype."""
        periods = [pd.Period('2000', 'D'), pd.Period('2001', 'A')]
        built = pd.Series(periods)
        assert built.dtype == object
        assert built.tolist() == periods

    def test_constructor_periodindex(self):
        """A PeriodIndex placed in a Series has dtype 'Period[D]' (GH7932)."""
        daily = period_range('20130101', periods=5, freq='D')
        built = Series(daily)
        assert built.dtype == 'Period[D]'

        as_objects = Series(daily.astype(object))
        assert_series_equal(built, as_objects)

    def test_constructor_dict(self):
        """Dict data is aligned to ``index``; absent keys become NaN."""
        labels = ['b', 'c', 'd', 'a']
        mapping = {'a': 0., 'b': 1., 'c': 2.}
        built = Series(mapping, index=labels)
        wanted = Series([1, 2, nan, 0], index=['b', 'c', 'd', 'a'])
        assert_series_equal(built, wanted)

        # Same alignment with Period keys: only the first two are present.
        periods = tm.makePeriodIndex(100)
        mapping = {periods[0]: 0, periods[1]: 1}
        built = Series(mapping, index=periods)
        wanted = Series(np.nan, periods)
        wanted.iloc[0] = 0
        wanted.iloc[1] = 1
        assert_series_equal(built, wanted)

    def test_constructor_dict_order(self):
        """Dict input is ordered by insertion on PY36, else sorted (GH19018)."""
        # initialization ordering: by insertion order if python>= 3.6, else
        # order by value
        d = {'b': 1, 'a': 0, 'c': 2}
        result = Series(d)
        if PY36:
            expected = Series([1, 0, 2], index=list('bac'))
        else:
            expected = Series([0, 1, 2], index=list('abc'))
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("value", [2, np.nan, None, float('nan')])
    def test_constructor_dict_nan_key(self, value):
        """NaN-like dict keys are kept as index labels (GH 18480)."""
        flat = {1: 'a', value: 'b', float('nan'): 'c', 4: 'd'}
        built = Series(flat).sort_values()
        wanted = Series(['a', 'b', 'c', 'd'], index=[1, value, np.nan, 4])
        assert_series_equal(built, wanted)

        # MultiIndex:
        nested = {(1, 1): 'a', (2, np.nan): 'b', (3, value): 'c'}
        built = Series(nested).sort_values()
        tuple_labels = Index([(1, 1), (2, np.nan), (3, value)])
        wanted = Series(['a', 'b', 'c'], index=tuple_labels)
        assert_series_equal(built, wanted)

    def test_constructor_dict_datetime64_index(self):
        """Dict keys given as datetime64, datetime or Timestamp agree.

        Each key flavour must produce a Series equal to one indexed by
        Timestamps.
        """
        # GH 9456

        dates_as_str = ['1984-02-19', '1988-11-06', '1989-12-03', '1990-03-15']
        values = [42544017.198965244, 1234565, 40512335.181958228, -1]

        def keyed_by(make_key):
            # pair every converted date with the value at the same position
            return {make_key(text): number
                    for text, number in zip(dates_as_str, values)}

        key_makers = (
            np.datetime64,
            lambda x: datetime.strptime(x, '%Y-%m-%d'),
            Timestamp,
        )
        mappings = [keyed_by(make_key) for make_key in key_makers]

        expected = Series(values, (Timestamp(x) for x in dates_as_str))

        results = [Series(mapping) for mapping in mappings]
        for result in results:
            assert_series_equal(result, expected)

    def test_constructor_list_of_tuples(self):
        data = [(1, 1), (2, 2), (2, 3)]
        s = Series(data)
        assert list(s) == data

    def test_constructor_tuple_of_tuples(self):
        data = ((1, 1), (2, 2), (2, 3))
        s = Series(data)
        assert tuple(s) == data

    def test_constructor_dict_of_tuples(self):
        """Tuple dict keys, including one holding None, form a MultiIndex."""
        keys = [(1, 2), (None, 5)]
        mapping = {(1, 2): 3,
                   (None, 5): 6}
        sorted_result = Series(mapping).sort_values()
        wanted = Series([3, 6], index=MultiIndex.from_tuples(keys))
        tm.assert_series_equal(sorted_result, wanted)

    def test_constructor_set(self):
        """Constructing a Series from a set or frozenset raises TypeError."""
        members = {1, 2, 3, 4, 5}
        with pytest.raises(TypeError):
            Series(members)

        frozen = frozenset(members)
        with pytest.raises(TypeError):
            Series(frozen)

    @pytest.mark.filterwarnings("ignore:elementwise comparison:FutureWarning")
    def test_fromDict(self):
        """Check index ordering and dtype inference for dict input."""
        data = {'a': 0, 'b': 1, 'c': 2, 'd': 3}

        series = Series(data)
        assert tm.is_sorted(series.index)

        # int, str and datetime values mixed together -> object dtype
        data = {'a': 0, 'b': '1', 'c': '2', 'd': datetime.now()}
        series = Series(data)
        assert series.dtype == np.object_

        # int and str values mixed together -> object dtype
        data = {'a': 0, 'b': '1', 'c': '2', 'd': '3'}
        series = Series(data)
        assert series.dtype == np.object_

        # an explicit dtype converts the numeric strings
        data = {'a': '0', 'b': '1'}
        series = Series(data, dtype=float)
        assert series.dtype == np.float64

    def test_fromValue(self, datetime_series):
        """A scalar is broadcast over the given index with an inferred dtype.

        ``datetime_series`` is a fixture; only its index is used here.
        """
        index = datetime_series.index

        # np.nan / np.float64 rather than the np.NaN / np.float_ aliases,
        # which NumPy 2.0 removed
        nans = Series(np.nan, index=index)
        assert nans.dtype == np.float64
        assert len(nans) == len(datetime_series)

        strings = Series('foo', index=index)
        assert strings.dtype == np.object_
        assert len(strings) == len(datetime_series)

        d = datetime.now()
        dates = Series(d, index=index)
        assert dates.dtype == 'M8[ns]'
        assert len(dates) == len(datetime_series)

        # GH12336
        # Test construction of categorical series from value
        categorical = Series(0, index=index, dtype="category")
        expected = Series(0, index=index).astype("category")
        assert categorical.dtype == 'category'
        assert len(categorical) == len(datetime_series)
        tm.assert_series_equal(categorical, expected)

    def test_constructor_dtype_timedelta64(self):
        """Check timedelta64[ns] inference and casts in the constructor."""
        ns_dtype = 'timedelta64[ns]'

        # basic
        basic_inputs = (
            [timedelta(days=i) for i in range(3)],
            [timedelta(days=1)],
            [timedelta(days=1), timedelta(days=2), np.timedelta64(1, 's')],
        )
        for values in basic_inputs:
            assert Series(values).dtype == ns_dtype

        # mixed with NaT
        explicit_inputs = (
            [timedelta(days=1), NaT],
            [timedelta(days=1), np.nan],
            [np.timedelta64(300000000), pd.NaT],
        )
        for values in explicit_inputs:
            assert Series(values, dtype='m8[ns]').dtype == ns_dtype

        # improved inference
        # GH5689
        assert Series([np.timedelta64(300000000), NaT]).dtype == ns_dtype

        # because iNaT is int, not coerced to timedelta
        assert Series([np.timedelta64(300000000), iNaT]).dtype == 'object'

        inferred_inputs = (
            [np.timedelta64(300000000), np.nan],
            [pd.NaT, np.timedelta64(300000000)],
        )
        for values in inferred_inputs:
            assert Series(values).dtype == ns_dtype

        td = Series([np.timedelta64(1, 's')])
        assert td.dtype == ns_dtype

        # these are frequency conversion astypes
        # for t in ['s', 'D', 'us', 'ms']:
        #    pytest.raises(TypeError, td.astype, 'm8[%s]' % t)

        # valid astype

        # invalid casting
        with pytest.raises(TypeError):
            td.astype('int32')

        # this is an invalid casting
        with pytest.raises(Exception):
            Series([timedelta(days=1), 'foo'], dtype='m8[ns]')

        # leave as object here
        mixed = Series([timedelta(days=i) for i in range(3)] + ['foo'])
        assert mixed.dtype == 'object'

        # these will correctly infer a timedelta
        string_inputs = (
            [None, pd.NaT, '1 Day'],
            [np.nan, pd.NaT, '1 Day'],
            [pd.NaT, None, '1 Day'],
            [pd.NaT, np.nan, '1 Day'],
        )
        for values in string_inputs:
            assert Series(values).dtype == ns_dtype

    # GH 16406
    def test_constructor_mixed_tz(self):
        """Naive and tz-aware Timestamps together give an object Series."""
        s = Series([Timestamp('20130101'),
                    Timestamp('20130101', tz='US/Eastern')])
        # the expectation is spelled out with an explicit object dtype
        expected = Series([Timestamp('20130101'),
                           Timestamp('20130101', tz='US/Eastern')],
                          dtype='object')
        assert_series_equal(s, expected)

    def test_NaT_scalar(self):
        """An iNaT entry reads back as missing and can be re-assigned."""
        stamps = Series([0, 1000, 2000, iNaT], dtype='M8[ns]')

        missing = stamps[3]
        assert isna(missing)

        # writing the missing scalar into another slot keeps it missing
        stamps[2] = missing
        assert isna(stamps[2])

    def test_NaT_cast(self):
        """Casting a NaN-only Series to M8[ns] yields NaT."""
        # GH10747
        casted = Series([np.nan]).astype('M8[ns]')
        assert_series_equal(casted, Series([NaT]))

    def test_constructor_name_hashable(self):
        for n in [777, 777., 'name', datetime(2001, 11, 11), (1, ), u"\u05D0"]:
            for data in [[1, 2, 3], np.ones(3), {'a': 0, 'b': 1}]:
                s = Series(data, name=n)
                assert == n

    def test_constructor_name_unhashable(self):
        """Unhashable objects passed as ``name`` raise TypeError."""
        for bad_name in [['name_list'], np.ones(2), {1: 2}]:
            for payload in [['name_list'], np.ones(2), {1: 2}]:
                with pytest.raises(TypeError):
                    Series(payload, name=bad_name)

    def test_auto_conversion(self):
        series = Series(list(date_range('1/1/2000', periods=10)))
        assert series.dtype == 'M8[ns]'

    def test_convert_non_ns(self):
        """Series from non-ns numpy time arrays equal range-built Series."""
        # convert from a numpy array of non-ns timedelta64
        arr = np.array([1, 2, 3], dtype='timedelta64[s]')
        s = Series(arr)
        expected = Series(pd.timedelta_range('00:00:01', periods=3, freq='s'))
        assert_series_equal(s, expected)

        # convert from a numpy array of non-ns datetime64
        # note that creating a numpy datetime64 is in LOCAL time!!!!
        # seems to work for M8[D], but not for M8[s]

        s = Series(np.array(['2013-01-01', '2013-01-02',
                             '2013-01-03'], dtype='datetime64[D]'))
        assert_series_equal(s, Series(date_range('20130101', periods=3,
                                                 freq='D')))

        # s = Series(np.array(['2013-01-01 00:00:01','2013-01-01
        # 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]'))

        # assert_series_equal(s,date_range('20130101
        # 00:00:01',period=3,freq='s'))

            date_range('1/1/2000', periods=10),
            timedelta_range('1 day', periods=10),
            period_range('2000-Q1', periods=10, freq='Q')],
        ids=lambda x: type(x).__name__)
    def test_constructor_cant_cast_datetimelike(self, index):

        # floats are not ok
        msg = "Cannot cast {}.*? to ".format(
            # strip Index to convert PeriodIndex -> Period
            # We don't care whether the error message says
            # PeriodIndex or PeriodArray
        with pytest.raises(TypeError, match=msg):
            Series(index, dtype=float)

        # ints are ok
        # we test with np.int64 to get similar results on
        # windows / 32-bit platforms
        result = Series(index, dtype=np.int64)
        expected = Series(index.astype(np.int64))
        tm.assert_series_equal(result, expected)

            date_range('1/1/2000', periods=10),
            timedelta_range('1 day', periods=10),
            period_range('2000-Q1', periods=10, freq='Q')],
        ids=lambda x: type(x).__name__)
    def test_constructor_cast_object(self, index):
        s = Series(index, dtype=object)
        exp = Series(index).astype(object)
        tm.assert_series_equal(s, exp)

        s = Series(pd.Index(index, dtype=object), dtype=object)
        exp = Series(index).astype(object)
        tm.assert_series_equal(s, exp)

        s = Series(index.astype(object), dtype=object)
        exp = Series(index).astype(object)
        tm.assert_series_equal(s, exp)

    @pytest.mark.parametrize("dtype", [
        np.datetime64,
        np.timedelta64,
    ])
    def test_constructor_generic_timestamp_no_frequency(self, dtype):
        """A datetime/timedelta dtype given without a unit is rejected."""
        # see gh-15524, gh-15987
        msg = "dtype has no unit. Please pass in"

        with pytest.raises(ValueError, match=msg):
            Series([], dtype=dtype)

    @pytest.mark.parametrize("dtype,msg", [
        ("m8[ps]", "cannot convert timedeltalike"),
        ("M8[ps]", "cannot convert datetimelike"),
    ])
    def test_constructor_generic_timestamp_bad_frequency(self, dtype, msg):
        """Picosecond datetime/timedelta dtypes raise TypeError.

        ``msg`` is the message fragment expected for the given ``dtype``.
        """
        # see gh-15524, gh-15987

        with pytest.raises(TypeError, match=msg):
            Series([], dtype=dtype)

    @pytest.mark.parametrize('dtype', [None, 'uint8', 'category'])
    def test_constructor_range_dtype(self, dtype):
        """A ``range`` input honours ``dtype`` and defaults to int64."""
        # GH 16804
        # a falsy dtype (None) falls back to 'int64' for the expectation
        wanted_dtype = dtype or 'int64'
        wanted = Series([0, 1, 2, 3, 4], dtype=wanted_dtype)
        from_range = Series(range(5), dtype=dtype)
        tm.assert_series_equal(from_range, wanted)

    def test_constructor_tz_mixed_data(self):
        """Naive and tz-aware Timestamps together equal an object Series."""
        # GH 13051
        naive = Timestamp('2016-05-01 02:03:37')
        aware = Timestamp('2016-04-30 19:03:37-0700', tz='US/Pacific')
        stamps = [naive, aware]

        inferred = Series(stamps)
        as_object = Series(stamps, dtype=object)
        tm.assert_series_equal(inferred, as_object)
Beispiel #29
class TestCategoricalOps:
    def test_compare_frame(self):
        """Comparing a Categorical with a one-row frame gives a frame."""
        # GH#24282 check that Categorical.__cmp__(DataFrame) defers to frame
        cat = Categorical(["a", "b", 2, "a"])
        frame = DataFrame(cat)

        equal = cat == frame.T
        tm.assert_frame_equal(equal, DataFrame([[True, True, True, True]]))

        # reversed categorical: only the two outer positions still match
        unequal = cat[::-1] != frame.T
        tm.assert_frame_equal(unequal, DataFrame([[False, True, True, False]]))

    def test_compare_frame_raises(self, all_compare_operators):
        """Comparing with an untransposed frame raises ValueError.

        ``all_compare_operators`` is a fixture naming a function in
        ``operator``.
        """
        # alignment raises unless we transpose
        compare = getattr(operator, all_compare_operators)
        cat = Categorical(["a", "b", 2, "a"])
        frame = DataFrame(cat)
        with pytest.raises(
            ValueError,
            match="Unable to coerce to Series, length must be 1: given 4",
        ):
            compare(cat, frame)

    def test_datetime_categorical_comparison(self):
        """An ordered datetime Categorical compares against its own scalar."""
        dt_cat = Categorical(date_range("2014-01-01", periods=3), ordered=True)

        greater = dt_cat > dt_cat[0]
        tm.assert_numpy_array_equal(greater, np.array([False, True, True]))

        # same comparison written with the scalar on the left
        reflected = dt_cat[0] < dt_cat
        tm.assert_numpy_array_equal(reflected, np.array([False, True, True]))

    def test_reflected_comparison_with_scalars(self):
        """Scalar comparison gives the same mask with either operand order."""
        # GH8658
        cat = Categorical([1, 2, 3], ordered=True)
        tm.assert_numpy_array_equal(cat > cat[0],
                                    np.array([False, True, True]))
        tm.assert_numpy_array_equal(cat[0] < cat,
                                    np.array([False, True, True]))

    def test_comparison_with_unknown_scalars(self):
        """Ordering against a non-category scalar raises; ==/!= do not."""
        # and following comparisons with scalars not in categories should raise
        # for unequal comps, but not for equal/not equal
        cat = Categorical([1, 2, 3], ordered=True)

        msg = "Invalid comparison between dtype=category and int"
        ordering_checks = [
            lambda: cat < 4,
            lambda: cat > 4,
            lambda: 4 < cat,
            lambda: 4 > cat,
        ]
        for check in ordering_checks:
            with pytest.raises(TypeError, match=msg):
                check()

        all_false = np.array([False, False, False])
        tm.assert_numpy_array_equal(cat == 4, all_false)
        all_true = np.array([True, True, True])
        tm.assert_numpy_array_equal(cat != 4, all_true)

    def test_comparison_of_ordered_categorical_with_nan_to_scalar(
            self, compare_operators_no_eq_ne):
        """Ordered comparison with a scalar matches the ndarray result.

        ``compare_operators_no_eq_ne`` is a fixture holding the name of a
        comparison dunder; it is looked up on both the ndarray and the
        Categorical.
        """
        # BUG: fix ordered categorical comparison with missing values (#26504 )
        # and following comparisons with scalars in categories with missing
        # values should be evaluated as False

        cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True)
        scalar = 2
        with warnings.catch_warnings():
            # the ndarray holds a missing value; silence RuntimeWarning
            warnings.simplefilter("ignore", RuntimeWarning)
            expected = getattr(np.array(cat),
                               compare_operators_no_eq_ne)(scalar)
        actual = getattr(cat, compare_operators_no_eq_ne)(scalar)
        tm.assert_numpy_array_equal(actual, expected)

    def test_comparison_of_ordered_categorical_with_nan_to_listlike(
            self, compare_operators_no_eq_ne):
        """Ordered comparison with a Categorical matches the ndarray result.

        ``compare_operators_no_eq_ne`` is a fixture holding the name of a
        comparison dunder.
        """
        # and following comparisons of missing values in ordered Categorical
        # with listlike should be evaluated as False
        op_name = compare_operators_no_eq_ne

        left = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True)
        right = Categorical([2, 2, 2, 2], categories=[1, 2, 3], ordered=True)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", RuntimeWarning)
            as_array = np.array(left)
            expected = getattr(as_array, op_name)(2)

        actual = getattr(left, op_name)(right)
        tm.assert_numpy_array_equal(actual, expected)

        [(list("abc"), list("cba"), list("bbb")),
         ([1, 2, 3], [3, 2, 1], [2, 2, 2])],
    def test_comparisons(self, data, reverse, base):
        cat_rev = Series(Categorical(data, categories=reverse, ordered=True))
        cat_rev_base = Series(
            Categorical(base, categories=reverse, ordered=True))
        cat = Series(Categorical(data, ordered=True))
        cat_base = Series(
            Categorical(base,, ordered=True))
        s = Series(base)
        a = np.array(base)

        # comparisons need to take categories ordering into account
        res_rev = cat_rev > cat_rev_base
        exp_rev = Series([True, False, False])
        tm.assert_series_equal(res_rev, exp_rev)

        res_rev = cat_rev < cat_rev_base
        exp_rev = Series([False, False, True])
        tm.assert_series_equal(res_rev, exp_rev)

        res = cat > cat_base
        exp = Series([False, False, True])
        tm.assert_series_equal(res, exp)

        scalar = base[1]
        res = cat > scalar
        exp = Series([False, False, True])
        exp2 = cat.values > scalar
        tm.assert_series_equal(res, exp)
        tm.assert_numpy_array_equal(res.values, exp2)
        res_rev = cat_rev > scalar
        exp_rev = Series([True, False, False])
        exp_rev2 = cat_rev.values > scalar
        tm.assert_series_equal(res_rev, exp_rev)
        tm.assert_numpy_array_equal(res_rev.values, exp_rev2)

        # Only categories with same categories can be compared
        msg = "Categoricals can only be compared if 'categories' are the same"
        with pytest.raises(TypeError, match=msg):
            cat > cat_rev

        # categorical cannot be compared to Series or numpy array, and also
        # not the other way around
        msg = ("Cannot compare a Categorical for op __gt__ with type "
               r"<class 'numpy\.ndarray'>")
        with pytest.raises(TypeError, match=msg):
            cat > s
        with pytest.raises(TypeError, match=msg):
            cat_rev > s
        with pytest.raises(TypeError, match=msg):
            cat > a
        with pytest.raises(TypeError, match=msg):
            cat_rev > a

        with pytest.raises(TypeError, match=msg):
            s < cat
        with pytest.raises(TypeError, match=msg):
            s < cat_rev

        with pytest.raises(TypeError, match=msg):
            a < cat
        with pytest.raises(TypeError, match=msg):
            a < cat_rev

            lambda *args, **kwargs: Categorical(*args, **kwargs),
            lambda *args, **kwargs: Series(Categorical(*args, **kwargs)),
    def test_unordered_different_order_equal(self, ctor):
        c1 = ctor(["a", "b"], categories=["a", "b"], ordered=False)
        c2 = ctor(["a", "b"], categories=["b", "a"], ordered=False)
        assert (c1 == c2).all()

        c1 = ctor(["a", "b"], categories=["a", "b"], ordered=False)
        c2 = ctor(["b", "a"], categories=["b", "a"], ordered=False)
        assert (c1 != c2).all()

        c1 = ctor(["a", "a"], categories=["a", "b"], ordered=False)
        c2 = ctor(["b", "b"], categories=["b", "a"], ordered=False)
        assert (c1 != c2).all()

        c1 = ctor(["a", "a"], categories=["a", "b"], ordered=False)
        c2 = ctor(["a", "b"], categories=["b", "a"], ordered=False)
        result = c1 == c2
        tm.assert_numpy_array_equal(np.array(result), np.array([True, False]))

    def test_unordered_different_categories_raises(self):
        """Equality across differing category sets raises TypeError."""
        left = Categorical(["a", "b"], categories=["a", "b"], ordered=False)
        right = Categorical(["a", "c"], categories=["c", "a"], ordered=False)

        expected_msg = "Categoricals can only be compared"
        with pytest.raises(TypeError, match=expected_msg):
            left == right

    def test_compare_different_lengths(self):
        """Empty categoricals with unequal category lists cannot be compared."""
        two_categories = Categorical([], categories=["a", "b"])
        one_category = Categorical([], categories=["a"])

        with pytest.raises(
            TypeError,
            match="Categoricals can only be compared if 'categories' are the same.",
        ):
            two_categories == one_category

    def test_compare_unordered_different_order(self):
        # 349290078
        a = pd.Categorical(["a"], categories=["a", "b"])
        b = pd.Categorical(["b"], categories=["b", "a"])
        assert not a.equals(b)

    def test_numeric_like_ops(self):
        """Arithmetic, reductions and numpy ops on categoricals raise."""

        df = DataFrame({"value": np.random.randint(0, 10000, 100)})
        labels = [f"{i} - {i + 499}" for i in range(0, 10000, 500)]
        cat_labels = Categorical(labels, labels)

        df = df.sort_values(by=["value"], ascending=True)
        # left-closed bins so each label "i - i+499" names its own interval
        df["value_group"] = pd.cut(df.value,
                                   range(0, 10500, 500),
                                   right=False,
                                   labels=cat_labels)

        # (dunder name, regex-escaped symbol expected in the error message)
        arithmetic_ops = [
            ("__add__", r"\+"),
            ("__sub__", "-"),
            ("__mul__", r"\*"),
            ("__truediv__", "/"),
        ]

        # numeric ops should not succeed
        for op, str_rep in arithmetic_ops:
            msg = f"Series cannot perform the operation {str_rep}|unsupported operand"
            with pytest.raises(TypeError, match=msg):
                getattr(df, op)(df)

        # reduction ops should not succeed (unless specifically defined, e.g.
        # min/max)
        s = df["value_group"]
        for op in ["kurt", "skew", "var", "std", "mean", "sum", "median"]:
            msg = f"Categorical cannot perform the operation {op}"
            with pytest.raises(TypeError, match=msg):
                getattr(s, op)(numeric_only=False)

        # mad technically works because it takes always the numeric data

        # numpy ops
        s = Series(Categorical([1, 2, 3, 4]))
        with pytest.raises(
                TypeError,
                match="Categorical cannot perform the operation sum"):
            np.sum(s)

        # numeric ops on a Series
        for op, str_rep in arithmetic_ops:
            msg = f"Series cannot perform the operation {str_rep}|unsupported operand"
            with pytest.raises(TypeError, match=msg):
                getattr(s, op)(2)

        # invalid ufunc
        msg = "Object with dtype category cannot perform the numpy op log"
        with pytest.raises(TypeError, match=msg):
            np.log(s)
    def test_contains(self):
        """``in`` checks the held values, not the integer codes."""
        # GH21508
        cat = pd.Categorical(list("aabbca"), categories=list("cab"))

        assert "b" in cat
        for absent in ("z", np.nan):
            assert absent not in cat
        with pytest.raises(TypeError, match="unhashable type: 'list'"):
            assert [1] in cat

        # assert codes NOT in index
        for code in (0, 1):
            assert code not in cat

        # once a missing value is present, nan is reported as contained
        with_missing = pd.Categorical(list("aabbca") + [np.nan],
                                      categories=list("cab"))
        assert np.nan in with_missing

        "item, expected",
            (pd.Interval(0, 1), True),
            (1.5, True),
            (pd.Interval(0.5, 1.5), False),
            ("a", False),
            (pd.Timestamp(1), False),
            (pd.Timedelta(1), False),
    def test_contains_interval(self, item, expected):
        # GH 23705
        cat = Categorical(pd.IntervalIndex.from_breaks(range(3)))
        result = item in cat
        assert result is expected

    def test_contains_list(self):
        """Membership tests with a list operand raise TypeError."""
        # GH#21729
        cat = Categorical([1, 2, 3])

        assert "a" not in cat

        for probe in (["a"], ["a", "b"]):
            with pytest.raises(TypeError, match="unhashable type"):
                probe in cat
Beispiel #30
    def transform(self, df):
        if self.args.user_type == 'adult' or self.args.user_type == 'bank':
            if self.args.user_type == 'adult':
                    {'income': {
                        ' <=50K.': ' <=50K',
                        ' >50K.': ' >50K'

            df.replace({'income': {' <=50K': '0', ' >50K': '1'}}, inplace=True)

        self.drop_column = list(
        df.drop(self.drop_column, axis=1, inplace=True)
        self.columns = [x for x in df.columns if x not in self.drop_column]

        # print stats
        # for col in self.columns:
        #     print(col, end=': ')
        #     if df[col].dtypes == 'object':
        #         print(df[col].unique())
        #     else:
        #         print(df[col].min(), df[col].max(), df[col].unique())

        for col in self.columns:
            bins = np.array([])
            if col not in self.ranges:
                df[col] = df[col].apply(str)
                df[col] = df[col].str.strip()
                df[col] = pd.Categorical(df[col])
                bins = np.round(
                    np.arange(self.ranges[col]['min'], self.ranges[col]['max'],
                              self.ranges[col]['gran']), 2)
                # too slow when there are many bins
                # df[col] = pd.cut(df[col], bins, right=False)
                start = bins[0]
                step = bins[1] - bins[0]
                if self.args.perturb_type in ['ord2cat2', 'ord2cat2q11', 'ord2cat2q20', 'ord2cat2q22', 'ord2cat2q10',
                                              'ord2cat2q12', 'ord4cat4q11', 'ord4cat4q12', 'ord4cat4q10',
                                              'ord4cat4q02', 'ord4cat4q01', 'ord4cat4q20', 'ord4cat4q21'] \
                        and col in ['INCTOT', 'FTOTINC'] and self.args.user_type[0:5] == 'ipums':
                    df[col] = np.random.randint(0, len(bins), df.shape[0])
                    df[col] = ((df[col].values - start) / step).astype(

  'finish %s' % col)

            if bins.any():
                bins = [
                                np.round(x + self.ranges[col]['gran'], 2),
                                closed='left') for x in bins
                self.kv_map[col] = dict(enumerate(bins))
                self.vk_map[col] = {v: k for k, v in self.kv_map[col].items()}
                self.kv_map[col] = dict(enumerate(df[col].cat.categories))
                self.vk_map[col] = {v: k for k, v in self.kv_map[col].items()}
                df.replace({col: self.vk_map[col]}, inplace=True)

        return df.values.astype(