Beispiel #1
0
                           pd.Series(np.add(ser, Dummy(1))))


@pytest.mark.parametrize(
    "values",
    [
        pd.array([1, 3, 2], dtype=np.int64),
        pd.array([1, 3, 2], dtype="Int64"),
        pd.array([1, 3, 2], dtype="Float32"),
        pd.array([1, 10, 2], dtype="Sparse[int]"),
        pd.to_datetime(["2000", "2010", "2001"]),
        pd.to_datetime(["2000", "2010", "2001"]).tz_localize("CET"),
        pd.to_datetime(["2000", "2010", "2001"]).to_period(freq="D"),
        pd.to_timedelta(["1 Day", "3 Days", "2 Days"]),
        pd.IntervalIndex(
            [pd.Interval(0, 1),
             pd.Interval(2, 3),
             pd.Interval(1, 2)]),
    ],
    ids=lambda x: str(x.dtype),
)
@pytest.mark.parametrize("box", [pd.array, pd.Index, pd.Series, pd.DataFrame])
def test_reduce(values, box, request):
    # TODO: cases with NAs

    same_type = True

    if box is pd.Index:
        if values.dtype.kind in ["i", "f"]:
            # ATM Index casts to object, so we get python ints/floats
            same_type = False
Beispiel #2
0
class TestDataFrameDataTypes:
    """Tests for ``DataFrame.dtypes`` and dtype handling when building frames."""

    def test_empty_frame_dtypes(self):
        """``dtypes`` of frames lacking rows and/or columns, and of an empty slice."""
        empty_df = DataFrame()
        tm.assert_series_equal(empty_df.dtypes, Series(dtype=object))

        nocols_df = DataFrame(index=[1, 2, 3])
        tm.assert_series_equal(nocols_df.dtypes, Series(dtype=object))

        norows_df = DataFrame(columns=list("abc"))
        tm.assert_series_equal(norows_df.dtypes,
                               Series(object, index=list("abc")))

        norows_int_df = DataFrame(columns=list("abc")).astype(np.int32)
        tm.assert_series_equal(norows_int_df.dtypes,
                               Series(np.dtype("int32"), index=list("abc")))

        df = DataFrame({"a": 1, "b": True, "c": 1.0}, index=[1, 2, 3])
        ex_dtypes = Series({"a": np.int64, "b": np.bool_, "c": np.float64})
        tm.assert_series_equal(df.dtypes, ex_dtypes)

        # same but for empty slice of df
        tm.assert_series_equal(df[:0].dtypes, ex_dtypes)

    def test_datetime_with_tz_dtypes(self):
        """Naive and tz-aware datetime columns report their own dtypes after NaT is set."""
        tzframe = DataFrame({
            "A": date_range("20130101", periods=3),
            "B": date_range("20130101", periods=3, tz="US/Eastern"),
            "C": date_range("20130101", periods=3, tz="CET"),
        })
        tzframe.iloc[1, 1] = pd.NaT
        tzframe.iloc[1, 2] = pd.NaT
        result = tzframe.dtypes.sort_index()
        expected = Series(
            [
                np.dtype("datetime64[ns]"),
                DatetimeTZDtype("ns", "US/Eastern"),
                DatetimeTZDtype("ns", "CET"),
            ],
            ["A", "B", "C"],
        )

        tm.assert_series_equal(result, expected)

    def test_dtypes_are_correct_after_column_slice(self):
        """Taking a column slice must not disturb the dtypes of the source frame."""
        # GH6525
        # np.float64 is spelled out: the ``np.float_`` alias is gone in NumPy 2.0.
        df = DataFrame(index=range(5), columns=list("abc"), dtype=np.float64)
        all_float = Series({"a": np.float64, "b": np.float64, "c": np.float64})

        tm.assert_series_equal(df.dtypes, all_float)
        tm.assert_series_equal(df.iloc[:, 2:].dtypes,
                               Series({"c": np.float64}))
        # the original frame is checked again after slicing
        tm.assert_series_equal(df.dtypes, all_float)

    def test_dtypes_gh8722(self, float_string_frame):
        """``dtypes`` matches the per-column dtypes, also under ``use_inf_as_na``."""
        float_string_frame["bool"] = float_string_frame["A"] > 0
        result = float_string_frame.dtypes
        expected = Series({k: v.dtype
                           for k, v in float_string_frame.items()},
                          index=result.index)
        tm.assert_series_equal(result, expected)

        # compat, GH 8722
        # NOTE(review): the "use_inf_as_na" option looks deprecated in recent
        # pandas releases - confirm this section is still wanted.
        with option_context("use_inf_as_na", True):
            df = DataFrame([[1]])
            result = df.dtypes
            tm.assert_series_equal(result, Series({0: np.dtype("int64")}))

    def test_singlerow_slice_categoricaldtype_gives_series(self):
        """Selecting one row of an all-categorical frame yields a categorical Series."""
        # GH29521
        df = DataFrame({"x": pd.Categorical("a b c d e".split())})
        result = df.iloc[0]
        raw_cat = pd.Categorical(["a"], categories=["a", "b", "c", "d", "e"])
        expected = Series(raw_cat, index=["x"], name=0, dtype="category")

        tm.assert_series_equal(result, expected)

    def test_timedeltas(self):
        """dtypes of datetime, timedelta, their sum, and an added int column."""
        df = DataFrame({
            "A": Series(date_range("2012-1-1", periods=3, freq="D")),
            "B": Series([timedelta(days=i) for i in range(3)]),
        })
        result = df.dtypes
        expected = Series(
            [np.dtype("datetime64[ns]"),
             np.dtype("timedelta64[ns]")],
            index=list("AB"))
        tm.assert_series_equal(result, expected)

        # datetime + timedelta is expected to be datetime again
        df["C"] = df["A"] + df["B"]
        result = df.dtypes
        expected = Series(
            [
                np.dtype("datetime64[ns]"),
                np.dtype("timedelta64[ns]"),
                np.dtype("datetime64[ns]"),
            ],
            index=list("ABC"),
        )
        tm.assert_series_equal(result, expected)

        # mixed int types
        df["D"] = 1
        result = df.dtypes
        expected = Series(
            [
                np.dtype("datetime64[ns]"),
                np.dtype("timedelta64[ns]"),
                np.dtype("datetime64[ns]"),
                np.dtype("int64"),
            ],
            index=list("ABCD"),
        )
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "input_vals",
        [
            ([1, 2]),
            (["1", "2"]),
            # lowercase "h": the uppercase "H" alias is deprecated in pandas 2.2
            (list(pd.date_range("1/1/2011", periods=2, freq="h"))),
            (list(
                pd.date_range("1/1/2011", periods=2, freq="h",
                              tz="US/Eastern"))),
            ([pd.Interval(left=0, right=5)]),
        ],
    )
    def test_constructor_list_str(self, input_vals, string_dtype):
        """Constructing with a string dtype equals constructing then ``astype``."""
        # GH 16605
        # Ensure that data elements are converted to strings when
        # dtype is str, 'str', or 'U'

        result = DataFrame({"A": input_vals}, dtype=string_dtype)
        expected = DataFrame({"A": input_vals}).astype({"A": string_dtype})
        tm.assert_frame_equal(result, expected)

    def test_constructor_list_str_na(self, string_dtype):
        """With a string dtype, floats become strings while ``None`` is kept."""
        result = DataFrame({"A": [1.0, 2.0, None]}, dtype=string_dtype)
        expected = DataFrame({"A": ["1.0", "2.0", None]}, dtype=object)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "data, expected",
        [
            # empty
            (DataFrame(), True),
            # multi-same
            (DataFrame({"A": [1, 2], "B": [1, 2]}), True),
            # multi-object
            (
                DataFrame({
                    "A": np.array([1, 2], dtype=object),
                    "B": np.array(["a", "b"], dtype=object),
                }),
                True,
            ),
            # multi-extension
            (
                DataFrame({
                    "A": pd.Categorical(["a", "b"]),
                    "B": pd.Categorical(["a", "b"]),
                }),
                True,
            ),
            # differ types
            (DataFrame({"A": [1, 2], "B": [1.0, 2.0]}), False),
            # differ sizes
            (
                DataFrame({
                    "A": np.array([1, 2], dtype=np.int32),
                    "B": np.array([1, 2], dtype=np.int64),
                }),
                False,
            ),
            # multi-extension differ
            (
                DataFrame({
                    "A": pd.Categorical(["a", "b"]),
                    "B": pd.Categorical(["b", "c"]),
                }),
                False,
            ),
        ],
    )
    def test_is_homogeneous_type(self, data, expected):
        """``_is_homogeneous_type`` is exactly the expected boolean for each frame."""
        assert data._is_homogeneous_type is expected

    def test_asarray_homogenous(self):
        """``np.asarray`` on an all-categorical frame gives an object ndarray."""
        df = DataFrame({
            "A": pd.Categorical([1, 2]),
            "B": pd.Categorical([1, 2]),
        })
        result = np.asarray(df)
        # may change from object in the future
        expected = np.array([[1, 1], [2, 2]], dtype="object")
        tm.assert_numpy_array_equal(result, expected)

    def test_str_to_small_float_conversion_type(self):
        """Strings holding tiny floats stay object until floats are assigned."""
        # GH 20388
        np.random.seed(13)
        col_data = [str(np.random.random() * 1e-12) for _ in range(5)]
        result = DataFrame(col_data, columns=["A"])
        expected = DataFrame(col_data, columns=["A"], dtype=object)
        tm.assert_frame_equal(result, expected)
        # change the dtype of the elements from object to float one by one
        # NOTE(review): whether the column dtype becomes float after this
        # in-place assignment may depend on the pandas version - confirm.
        result.loc[result.index, "A"] = [float(x) for x in col_data]
        expected = DataFrame(col_data, columns=["A"], dtype=float)
        tm.assert_frame_equal(result, expected)
Beispiel #3
0
def test_survival_table_from_events_will_collapse_to_desired_bins():
    """Collapsing with ``intervals=[0, 4, 8]`` indexes the table by (0, 4] and (4, 8]."""
    durations = np.array([1, 3, 4, 5])
    observed = np.array([True, True, True, True])

    table = utils.survival_table_from_events(
        durations, observed, collapse=True, intervals=[0, 4, 8]
    )

    expected_index = [
        pd.Interval(0, 4, closed='right'),
        pd.Interval(4, 8, closed='right'),
    ]
    assert table.index.tolist() == expected_index
def _plot_donation_histogram(amounts, n_bins, markers, right=None, log=False):
    """Draw and show one histogram of donation amounts with a log y-axis.

    Parameters
    ----------
    amounts : array-like
        Values handed to ``plt.hist``.
    n_bins : int
        Number of histogram bins.
    markers : iterable of (value, color)
        Dashed vertical reference lines; each is labelled ``"$<value>"``.
    right : number, optional
        Upper x-axis limit. ``None`` leaves the upper limit unset.
    log : bool, optional
        Forwarded to ``plt.hist``; the y-axis is switched to log scale either way.
    """
    plt.hist(amounts, bins=n_bins, log=log)
    plt.yscale('log')
    plt.xlim(left=0, right=right)
    plt.xlabel("Amount donated in USD", fontsize=30)
    plt.ylabel("Frequency", fontsize=30)
    plt.yticks(fontsize=20)
    plt.xticks(fontsize=20)
    for value, color in markers:
        plt.axvline(value,
                    color=color,
                    linestyle='dashed',
                    label=f"${value}",
                    alpha=0.5)
    plt.legend(prop={'size': 20})
    plt.show()


def catagorize_donation_amounts(donation_df):
    """Plot donation-size histograms and per-bin cumulative sums.

    Row-to-row differences of the ``amount`` column are treated as the
    donated amounts. Three histograms are shown (full range, up to 50000,
    below 300), followed by a line plot of the cumulative sum per donation
    bin. Finally the rows of the highest bin, the grouped data and the bin
    edges are passed to ``correlate_binned_data``.

    Parameters
    ----------
    donation_df : DataFrame
        Must provide ``date`` and ``amount`` columns.
        NOTE(review): ``amount`` is presumably a running total, since it is
        differenced - confirm against the caller.
    """
    # skip the first 11 rows, which (per the original note) are not per 10s
    SKIP = 11

    # donation category edges
    bins = [0, 100, 5000, 50000, 9999999999]

    data_dates = donation_df['date'].iloc[SKIP:]
    donation_data_delta = donation_df.diff(periods=1, axis=0)
    donation_data_delta = donation_data_delta.iloc[SKIP:]['amount']

    merged = pd.concat([data_dates, donation_data_delta],
                       axis=1,
                       keys=['date', 'donated_amount'])

    # Full range
    _plot_donation_histogram(np.abs(merged['donated_amount']),
                             n_bins=500,
                             markers=[(50000, 'k')],
                             log=True)

    # Medium - high: histogram of donation amounts up to 50000
    _plot_donation_histogram(np.abs(merged['donated_amount']),
                             n_bins=10000,
                             markers=[(5000, 'k'), (100, 'r')],
                             right=50000,
                             log=True)

    # Low bound: only amounts below 300 (no absolute value taken here)
    merged = merged[merged['donated_amount'] < 300]
    _plot_donation_histogram(merged['donated_amount'],
                             n_bins=2000,
                             markers=[(100, 'r')],
                             right=300)

    # Binned donations: rebuild the unfiltered frame first
    merged = pd.concat([data_dates, donation_data_delta],
                       axis=1,
                       keys=['date', 'donated_amount'])
    # Bin the donations
    merged['bin'] = pd.cut(x=merged['donated_amount'], bins=bins)
    # cumsum the bins
    merged['cumsum'] = merged.groupby('bin')['donated_amount'].cumsum()

    # Group the rows per category
    binned = merged.groupby(['bin'])

    # Get the donors in the highest interval (the last pair of bin edges)
    TOP_DONORS_INTERVAL = pd.Interval(left=bins[-2], right=bins[-1])
    top_donor_data = binned.get_group(TOP_DONORS_INTERVAL)

    ax = sns.lineplot(x="date",
                      y="cumsum",
                      hue="bin",
                      data=merged,
                      drawstyle="steps-pre")
    ax.set(xlabel='Date', ylabel='Binned Cumulative Sum')
    ax.xaxis.get_label().set_fontsize(30)
    ax.yaxis.get_label().set_fontsize(30)
    ax.tick_params(labelsize=17)
    # NOTE(review): confirm that this label order matches the order in which
    # the hue levels are drawn.
    plt.legend(loc='upper left',
               labels=[
                   'Biggest donors', 'Medium donors', 'Large donors',
                   'Smallest donors'
               ],
               prop={'size': 20})
    # plt.yscale('log')
    plt.ylabel("USD donated", fontsize=30)
    plt.show()

    correlate_binned_data(top_donor_data, binned, bins)
Beispiel #5
0
 def test_is_all_dates(self):
     # GH 23576
     year_2017 = pd.Interval(Timestamp("2017-01-01 00:00:00"),
                             Timestamp("2018-01-01 00:00:00"))
     year_2017_index = pd.IntervalIndex([year_2017])
     assert not year_2017_index._is_all_dates
Beispiel #6
0
class TestSeriesConvertDtypes:
    """Parametrized checks of ``Series.convert_dtypes`` over its four flags."""

    # Each key of an answer dict is a 4-tuple of tuples, one inner tuple per
    # argument (infer_objects, convert_string, convert_integer,
    # convert_boolean), listing the flag values the answer applies to.
    # Expanding the keys covers all 16 flag combinations while letting
    # combinations that share an answer be written once.
    @pytest.mark.parametrize(
        "data, maindtype, answerdict",
        [
            (
                [1, 2, 3],
                np.dtype("int32"),
                {
                    ((True, False), (True, False), (True,), (True, False)): "Int32",
                    ((True, False), (True, False), (False,), (True, False)): np.dtype("int32"),
                },
            ),
            (
                [1, 2, 3],
                np.dtype("int64"),
                {
                    ((True, False), (True, False), (True,), (True, False)): "Int64",
                    ((True, False), (True, False), (False,), (True, False)): np.dtype("int64"),
                },
            ),
            (
                ["x", "y", "z"],
                np.dtype("O"),
                {
                    ((True, False), (True,), (True, False), (True, False)): pd.StringDtype(),
                    ((True, False), (False,), (True, False), (True, False)): np.dtype("O"),
                },
            ),
            (
                [True, False, np.nan],
                np.dtype("O"),
                {
                    ((True, False), (True, False), (True, False), (True,)): pd.BooleanDtype(),
                    ((True, False), (True, False), (True, False), (False,)): np.dtype("O"),
                },
            ),
            (
                ["h", "i", np.nan],
                np.dtype("O"),
                {
                    ((True, False), (True,), (True, False), (True, False)): pd.StringDtype(),
                    ((True, False), (False,), (True, False), (True, False)): np.dtype("O"),
                },
            ),
            (
                [10, np.nan, 20],
                np.dtype("float"),
                {
                    ((True, False), (True, False), (True,), (True, False)): "Int64",
                    ((True, False), (True, False), (False,), (True, False)): np.dtype("float"),
                },
            ),
            (
                [np.nan, 100.5, 200],
                np.dtype("float"),
                {
                    ((True, False), (True, False), (True, False), (True, False)): np.dtype("float"),
                },
            ),
            (
                [3, 4, 5],
                "Int8",
                {
                    ((True, False), (True, False), (True, False), (True, False)): "Int8",
                },
            ),
            (
                [[1, 2], [3, 4], [5]],
                None,
                {
                    ((True, False), (True, False), (True, False), (True, False)): np.dtype("O"),
                },
            ),
            (
                [4, 5, 6],
                np.dtype("uint32"),
                {
                    ((True, False), (True, False), (True,), (True, False)): "UInt32",
                    ((True, False), (True, False), (False,), (True, False)): np.dtype("uint32"),
                },
            ),
            (
                [-10, 12, 13],
                np.dtype("i1"),
                {
                    ((True, False), (True, False), (True,), (True, False)): "Int8",
                    ((True, False), (True, False), (False,), (True, False)): np.dtype("i1"),
                },
            ),
            (
                [1, 2.0],
                object,
                {
                    ((True, False), (True, False), (True,), (True, False)): "Int64",
                    ((True,), (True, False), (False,), (True, False)): np.dtype("float"),
                    ((False,), (True, False), (False,), (True, False)): np.dtype("object"),
                },
            ),
            (
                ["a", "b"],
                pd.CategoricalDtype(),
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ): pd.CategoricalDtype(),
                },
            ),
            (
                pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
                pd.DatetimeTZDtype(tz="UTC"),
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ): pd.DatetimeTZDtype(tz="UTC"),
                },
            ),
            (
                pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
                "datetime64[ns]",
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ): np.dtype("datetime64[ns]"),
                },
            ),
            (
                pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
                object,
                {
                    (
                        (True,),
                        (True, False),
                        (True, False),
                        (True, False),
                    ): np.dtype("datetime64[ns]"),
                    ((False,), (True, False), (True, False), (True, False)): np.dtype("O"),
                },
            ),
            (
                pd.period_range("1/1/2011", freq="M", periods=3),
                None,
                {
                    ((True, False), (True, False), (True, False), (True, False)): pd.PeriodDtype("M"),
                },
            ),
            (
                pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]),
                None,
                {
                    (
                        (True, False),
                        (True, False),
                        (True, False),
                        (True, False),
                    ): pd.IntervalDtype("int64"),
                },
            ),
        ],
    )
    @pytest.mark.parametrize("params", product(*[(True, False)] * 4))
    def test_convert_dtypes(self, data, maindtype, params, answerdict):
        """Check the converted dtype for one flag combination and that a copy is returned.

        ``params`` is passed positionally to ``convert_dtypes``; the expected
        dtype is looked up in the expanded ``answerdict``.
        """
        if maindtype is None:
            series = pd.Series(data)
        else:
            series = pd.Series(data, dtype=maindtype)

        # Expand every compressed key into the concrete flag combinations
        # it stands for.
        answers = {}
        for compressed_key, dtype in answerdict.items():
            for flags in product(*compressed_key):
                answers[flags] = dtype

        converted = series.convert_dtypes(*params)
        expected = pd.Series(series.values, dtype=answers[tuple(params)])
        tm.assert_series_equal(converted, expected)

        # Test that it is a copy: overwrite the result ...
        snapshot = series.copy(deep=True)
        converted[converted.notna()] = np.nan

        # ... and make sure the original did not change
        tm.assert_series_equal(series, snapshot)
Beispiel #7
0
    ("datetime64",
     [np.datetime64("2013-01-01"), np.nan,
      np.datetime64("2018-01-01")]),
    ("datetime", [pd.Timestamp("20130101"), np.nan,
                  pd.Timestamp("20180101")]),
    ("date", [date(2013, 1, 1), np.nan,
              date(2018, 1, 1)]),
    # The following two dtypes are commented out due to GH 23554
    # ('complex', [1 + 1j, np.nan, 2 + 2j]),
    # ('timedelta64', [np.timedelta64(1, 'D'),
    #                  np.nan, np.timedelta64(2, 'D')]),
    ("timedelta", [timedelta(1), np.nan, timedelta(2)]),
    ("time", [time(1), np.nan, time(2)]),
    ("period", [pd.Period(2013), pd.NaT,
                pd.Period(2018)]),
    ("interval", [pd.Interval(0, 1), np.nan,
                  pd.Interval(0, 2)]),
]
ids, _ = zip(*_any_skipna_inferred_dtype)  # use inferred type as fixture-id


@pytest.fixture(params=_any_skipna_inferred_dtype, ids=ids)
def any_skipna_inferred_dtype(request):
    """
    Fixture for all inferred dtypes from _libs.lib.infer_dtype

    The covered (inferred) types are:
    * 'string'
    * 'empty'
    * 'bytes'
    * 'mixed'
Beispiel #8
0
class TestContains:
    """Membership (``in``) checks against ``CategoricalIndex``."""

    def test_contains(self):
        """Only values that actually occur are contained; categories and codes are not."""
        index = CategoricalIndex(
            list("aabbca"), categories=list("cabdef"), ordered=False
        )

        assert "a" in index
        # "z" is no category at all; "e" is a category that never occurs
        for absent in ("z", "e"):
            assert absent not in index
        assert np.nan not in index

        # assert codes NOT in index
        for code in (0, 1):
            assert code not in index

    def test_contains_nan(self):
        """NaN is contained once the index actually holds a NaN entry."""
        values = list("aabbca") + [np.nan]
        index = CategoricalIndex(values, categories=list("cabdef"))
        assert np.nan in index

    @pytest.mark.parametrize("unwrap", [True, False])
    def test_contains_na_dtype(self, unwrap):
        """Generic NA values always match; typed NaT matches only its own kind.

        With ``unwrap`` the checks run against the ``_data`` attribute
        instead of the index itself.
        """
        dti = pd.date_range("2016-01-01", periods=100).insert(0, pd.NaT)
        pi = dti.to_period("D")
        tdi = dti - dti[-1]

        # (source index, expects datetime64 NaT, expects timedelta64 NaT)
        cases = [
            (dti, True, False),
            (tdi, False, True),
            (pi, False, False),
        ]
        for source, has_dt64_nat, has_td64_nat in cases:
            obj = CategoricalIndex(source)
            if unwrap:
                obj = obj._data

            assert np.nan in obj
            assert None in obj
            assert pd.NaT in obj
            assert (np.datetime64("NaT") in obj) is has_dt64_nat
            assert (np.timedelta64("NaT") in obj) is has_td64_nat

    @pytest.mark.parametrize(
        "item, expected",
        [
            (pd.Interval(0, 1), True),
            (1.5, True),
            (pd.Interval(0.5, 1.5), False),
            ("a", False),
            (Timestamp(1), False),
            (pd.Timedelta(1), False),
        ],
        ids=str,
    )
    def test_contains_interval(self, item, expected):
        """Membership in an interval-backed CategoricalIndex returns the expected bool."""
        # GH 23705
        index = CategoricalIndex(IntervalIndex.from_breaks(range(3)))
        assert (item in index) is expected

    def test_contains_list(self):
        """A list on the left of ``in`` raises TypeError because it is unhashable."""
        # GH#21729
        idx = CategoricalIndex([1, 2, 3])

        assert "a" not in idx

        for unhashable in (["a"], ["a", "b"]):
            with pytest.raises(TypeError, match="unhashable type"):
                unhashable in idx
Beispiel #9
0
    def _get(self,
             ep,
             symbol,
             start_date,
             end_date,
             retry,
             retry_wait,
             freq='6H'):
        """Yield pages of JSON from ``/api/v1/{ep}`` for ``symbol``.

        Without ``start_date`` a single request series is made against the
        endpoint with ``reverse=true``. With ``start_date`` the range up to
        ``end_date`` (or the current UTC time when ``end_date`` is falsy) is
        split into windows of length ``freq`` and each window is paged
        through ``API_MAX`` records at a time.

        Responses with status 502/504 are retried after ``retry_wait``
        seconds and 429 after ``API_REFRESH`` seconds; any other non-200
        status is handed to ``self._handle_error``.
        """
        windows = [None]
        if start_date:
            end_date = end_date or pd.Timestamp.utcnow()
            windows = pd.interval_range(API._timestamp(start_date),
                                        API._timestamp(end_date),
                                        freq=freq).tolist()
            if not windows:
                # range shorter than one ``freq`` step: use it as one window
                windows.append(
                    pd.Interval(left=API._timestamp(start_date),
                                right=API._timestamp(end_date)))
            elif windows[-1].right < API._timestamp(end_date):
                # cover the remainder that does not fill a whole step
                windows.append(
                    pd.Interval(windows[-1].right, API._timestamp(end_date)))

        @request_retry(self.ID, retry, retry_wait)
        def helper(start, start_date, end_date):
            if start_date and end_date:
                endpoint = f'/api/v1/{ep}?symbol={symbol}&count={API_MAX}&reverse=false&start={start}&startTime={start_date}&endTime={end_date}'
            else:
                endpoint = f'/api/v1/{ep}?symbol={symbol}&reverse=true'

            # sign the request only when both credentials are configured
            if self.key_id and self.key_secret:
                header = self._generate_signature("GET", endpoint)
            else:
                header = {}
            header['Accept'] = 'application/json'
            return requests.get(f'{self.api}{endpoint}', headers=header)

        for window in windows:
            start = 0
            if window is not None:
                # step back one nanosecond from the window's right bound
                window_end = window.right - pd.Timedelta(nanoseconds=1)

                start_date = str(window.left).replace(" ", "T") + "Z"
                end_date = str(window_end).replace(" ", "T") + "Z"

            while True:
                r = helper(start, start_date, end_date)

                if r.status_code in {502, 504}:
                    LOG.warning("%s: %d for URL %s - %s", self.ID,
                                r.status_code, r.url, r.text)
                    sleep(retry_wait)
                    continue
                if r.status_code == 429:
                    sleep(API_REFRESH)
                    continue
                if r.status_code != 200:
                    self._handle_error(r, LOG)

                limit = int(r.headers['X-RateLimit-Remaining'])
                data = r.json()

                yield data

                # a short page means this window is exhausted
                if len(data) != API_MAX:
                    break

                if limit < 1:
                    sleep(API_REFRESH)

                start += len(data)
Beispiel #10
0
class TestSeriesReplace:
    def test_replace_explicit_none(self):
        # GH#36984 if the user explicitly passes value=None, give it to them
        ser = pd.Series([0, 0, ""], dtype=object)
        result = ser.replace("", None)
        expected = pd.Series([0, 0, None], dtype=object)
        tm.assert_series_equal(result, expected)

        df = pd.DataFrame(np.zeros((3, 3)))
        df.iloc[2, 2] = ""
        result = df.replace("", None)
        expected = pd.DataFrame({
            0: np.zeros(3),
            1: np.zeros(3),
            2: np.array([0.0, 0.0, None], dtype=object),
        })
        assert expected.iloc[2, 2] is None
        tm.assert_frame_equal(result, expected)

        # GH#19998 same thing with object dtype
        ser = pd.Series([10, 20, 30, "a", "a", "b", "a"])
        result = ser.replace("a", None)
        expected = pd.Series([10, 20, 30, None, None, "b", None])
        assert expected.iloc[-1] is None
        tm.assert_series_equal(result, expected)

    def test_replace_noop_doesnt_downcast(self):
        # GH#44498
        ser = pd.Series(
            [None, None, pd.Timestamp("2021-12-16 17:31")], dtype=object)
        res = ser.replace({np.nan: None})  # should be a no-op
        tm.assert_series_equal(res, ser)
        assert res.dtype == object

        # same thing but different calling convention
        res = ser.replace(np.nan, None)
        tm.assert_series_equal(res, ser)
        assert res.dtype == object

    def test_replace(self):
        N = 100
        ser = pd.Series(np.random.randn(N))
        ser[0:4] = np.nan
        ser[6:10] = 0

        # replace list with a single value
        return_value = ser.replace([np.nan], -1, inplace=True)
        assert return_value is None

        exp = ser.fillna(-1)
        tm.assert_series_equal(ser, exp)

        rs = ser.replace(0.0, np.nan)
        ser[ser == 0.0] = np.nan
        tm.assert_series_equal(rs, ser)

        ser = pd.Series(np.fabs(np.random.randn(N)),
                        tm.makeDateIndex(N),
                        dtype=object)
        ser[:5] = np.nan
        ser[6:10] = "foo"
        ser[20:30] = "bar"

        # replace list with a single value
        rs = ser.replace([np.nan, "foo", "bar"], -1)

        assert (rs[:5] == -1).all()
        assert (rs[6:10] == -1).all()
        assert (rs[20:30] == -1).all()
        assert (pd.isna(ser[:5])).all()

        # replace with different values
        rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3})

        assert (rs[:5] == -1).all()
        assert (rs[6:10] == -2).all()
        assert (rs[20:30] == -3).all()
        assert (pd.isna(ser[:5])).all()

        # replace with different values with 2 lists
        rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3])
        tm.assert_series_equal(rs, rs2)

        # replace inplace
        return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True)
        assert return_value is None

        assert (ser[:5] == -1).all()
        assert (ser[6:10] == -1).all()
        assert (ser[20:30] == -1).all()

    def test_replace_nan_with_inf(self):
        ser = pd.Series([np.nan, 0, np.inf])
        tm.assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0))

        ser = pd.Series([np.nan, 0, "foo", "bar", np.inf, None, pd.NaT])
        tm.assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0))
        filled = ser.copy()
        filled[4] = 0
        tm.assert_series_equal(ser.replace(np.inf, 0), filled)

    def test_replace_listlike_value_listlike_target(self, datetime_series):
        ser = pd.Series(datetime_series.index)
        tm.assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0))

        # malformed
        msg = r"Replacement lists must match in length\. Expecting 3 got 2"
        with pytest.raises(ValueError, match=msg):
            ser.replace([1, 2, 3], [np.nan, 0])

        # ser is dt64 so can't hold 1 or 2, so this replace is a no-op
        result = ser.replace([1, 2], [np.nan, 0])
        tm.assert_series_equal(result, ser)

        ser = pd.Series([0, 1, 2, 3, 4])
        result = ser.replace([0, 1, 2, 3, 4], [4, 3, 2, 1, 0])
        tm.assert_series_equal(result, pd.Series([4, 3, 2, 1, 0]))

    def test_replace_gh5319(self):
        # API change from 0.12?
        # GH 5319
        ser = pd.Series([0, np.nan, 2, 3, 4])
        expected = ser.ffill()
        result = ser.replace([np.nan])
        tm.assert_series_equal(result, expected)

        ser = pd.Series([0, np.nan, 2, 3, 4])
        expected = ser.ffill()
        result = ser.replace(np.nan)
        tm.assert_series_equal(result, expected)

    def test_replace_datetime64(self):
        # GH 5797
        ser = pd.Series(pd.date_range("20130101", periods=5))
        expected = ser.copy()
        expected.loc[2] = pd.Timestamp("20120101")
        result = ser.replace(
            {pd.Timestamp("20130103"): pd.Timestamp("20120101")})
        tm.assert_series_equal(result, expected)
        result = ser.replace(pd.Timestamp("20130103"),
                             pd.Timestamp("20120101"))
        tm.assert_series_equal(result, expected)

    def test_replace_nat_with_tz(self):
        # GH 11792: Test with replacing NaT in a list with tz data
        ts = pd.Timestamp("2015/01/01", tz="UTC")
        s = pd.Series([pd.NaT, pd.Timestamp("2015/01/01", tz="UTC")])
        result = s.replace([np.nan, pd.NaT], pd.Timestamp.min)
        expected = pd.Series([pd.Timestamp.min, ts], dtype=object)
        tm.assert_series_equal(expected, result)

    def test_replace_timedelta_td64(self):
        tdi = pd.timedelta_range(0, periods=5)
        ser = pd.Series(tdi)

        # Using a single dict argument means we go through replace_list
        result = ser.replace({ser[1]: ser[3]})

        expected = pd.Series([ser[0], ser[3], ser[2], ser[3], ser[4]])
        tm.assert_series_equal(result, expected)

    def test_replace_with_single_list(self):
        ser = pd.Series([0, 1, 2, 3, 4])
        result = ser.replace([1, 2, 3])
        tm.assert_series_equal(result, pd.Series([0, 0, 0, 0, 4]))

        s = ser.copy()
        return_value = s.replace([1, 2, 3], inplace=True)
        assert return_value is None
        tm.assert_series_equal(s, pd.Series([0, 0, 0, 0, 4]))

        # make sure things don't get corrupted when fillna call fails
        s = ser.copy()
        msg = (r"Invalid fill method\. Expecting pad \(ffill\) or backfill "
               r"\(bfill\)\. Got crash_cymbal")
        with pytest.raises(ValueError, match=msg):
            return_value = s.replace([1, 2, 3],
                                     inplace=True,
                                     method="crash_cymbal")
            assert return_value is None
        tm.assert_series_equal(s, ser)

    def test_replace_mixed_types(self):
        ser = pd.Series(np.arange(5), dtype="int64")

        def check_replace(to_rep, val, expected):
            sc = ser.copy()
            result = ser.replace(to_rep, val)
            return_value = sc.replace(to_rep, val, inplace=True)
            assert return_value is None
            tm.assert_series_equal(expected, result)
            tm.assert_series_equal(expected, sc)

        # 3.0 can still be held in our int64 series, so we do not upcast GH#44940
        tr, v = [3], [3.0]
        check_replace(tr, v, ser)
        # Note this matches what we get with the scalars 3 and 3.0
        check_replace(tr[0], v[0], ser)

        # MUST upcast to float
        e = pd.Series([0, 1, 2, 3.5, 4])
        tr, v = [3], [3.5]
        check_replace(tr, v, e)

        # casts to object
        e = pd.Series([0, 1, 2, 3.5, "a"])
        tr, v = [3, 4], [3.5, "a"]
        check_replace(tr, v, e)

        # again casts to object
        e = pd.Series([0, 1, 2, 3.5, pd.Timestamp("20130101")])
        tr, v = [3, 4], [3.5, pd.Timestamp("20130101")]
        check_replace(tr, v, e)

        # casts to object
        e = pd.Series([0, 1, 2, 3.5, True], dtype="object")
        tr, v = [3, 4], [3.5, True]
        check_replace(tr, v, e)

        # test an object with dates + floats + integers + strings
        dr = pd.Series(pd.date_range("1/1/2001", "1/10/2001", freq="D"))
        result = dr.astype(object).replace([dr[0], dr[1], dr[2]],
                                           [1.0, 2, "a"])
        expected = pd.Series([1.0, 2, "a"] + dr[3:].tolist(), dtype=object)
        tm.assert_series_equal(result, expected)

    def test_replace_bool_with_string_no_op(self):
        s = pd.Series([True, False, True])
        result = s.replace("fun", "in-the-sun")
        tm.assert_series_equal(s, result)

    def test_replace_bool_with_string(self):
        # nonexistent elements
        s = pd.Series([True, False, True])
        result = s.replace(True, "2u")
        expected = pd.Series(["2u", False, "2u"])
        tm.assert_series_equal(expected, result)

    def test_replace_bool_with_bool(self):
        s = pd.Series([True, False, True])
        result = s.replace(True, False)
        expected = pd.Series([False] * len(s))
        tm.assert_series_equal(expected, result)

    def test_replace_with_dict_with_bool_keys(self):
        s = pd.Series([True, False, True])
        result = s.replace({"asdf": "asdb", True: "yes"})
        expected = pd.Series(["yes", False, "yes"])
        tm.assert_series_equal(result, expected)

    def test_replace_Int_with_na(self, any_int_ea_dtype):
        # GH 38267
        result = pd.Series([0, None], dtype=any_int_ea_dtype).replace(0, pd.NA)
        expected = pd.Series([pd.NA, pd.NA], dtype=any_int_ea_dtype)
        tm.assert_series_equal(result, expected)
        result = pd.Series([0, 1], dtype=any_int_ea_dtype).replace(0, pd.NA)
        result.replace(1, pd.NA, inplace=True)
        tm.assert_series_equal(result, expected)

    def test_replace2(self):
        N = 100
        ser = pd.Series(np.fabs(np.random.randn(N)),
                        tm.makeDateIndex(N),
                        dtype=object)
        ser[:5] = np.nan
        ser[6:10] = "foo"
        ser[20:30] = "bar"

        # replace list with a single value
        rs = ser.replace([np.nan, "foo", "bar"], -1)

        assert (rs[:5] == -1).all()
        assert (rs[6:10] == -1).all()
        assert (rs[20:30] == -1).all()
        assert (pd.isna(ser[:5])).all()

        # replace with different values
        rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3})

        assert (rs[:5] == -1).all()
        assert (rs[6:10] == -2).all()
        assert (rs[20:30] == -3).all()
        assert (pd.isna(ser[:5])).all()

        # replace with different values with 2 lists
        rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3])
        tm.assert_series_equal(rs, rs2)

        # replace inplace
        return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True)
        assert return_value is None
        assert (ser[:5] == -1).all()
        assert (ser[6:10] == -1).all()
        assert (ser[20:30] == -1).all()

    def test_replace_with_dictlike_and_string_dtype(self,
                                                    nullable_string_dtype):
        # GH 32621, GH#44940
        ser = pd.Series(["one", "two", np.nan], dtype=nullable_string_dtype)
        expected = pd.Series(["1", "2", np.nan], dtype=nullable_string_dtype)
        result = ser.replace({"one": "1", "two": "2"})
        tm.assert_series_equal(expected, result)

    def test_replace_with_empty_dictlike(self):
        # GH 15289
        s = pd.Series(list("abcd"))
        tm.assert_series_equal(s, s.replace({}))

        with tm.assert_produces_warning(FutureWarning):
            empty_series = pd.Series([])
        tm.assert_series_equal(s, s.replace(empty_series))

    def test_replace_string_with_number(self):
        # GH 15743
        s = pd.Series([1, 2, 3])
        result = s.replace("2", np.nan)
        expected = pd.Series([1, 2, 3])
        tm.assert_series_equal(expected, result)

    def test_replace_replacer_equals_replacement(self):
        # GH 20656
        # make sure all replacers are matching against original values
        s = pd.Series(["a", "b"])
        expected = pd.Series(["b", "a"])
        result = s.replace({"a": "b", "b": "a"})
        tm.assert_series_equal(expected, result)

    def test_replace_unicode_with_number(self):
        # GH 15743
        s = pd.Series([1, 2, 3])
        result = s.replace("2", np.nan)
        expected = pd.Series([1, 2, 3])
        tm.assert_series_equal(expected, result)

    def test_replace_mixed_types_with_string(self):
        # Testing mixed
        s = pd.Series([1, 2, 3, "4", 4, 5])
        result = s.replace([2, "4"], np.nan)
        expected = pd.Series([1, np.nan, 3, np.nan, 4, 5])
        tm.assert_series_equal(expected, result)

    @pytest.mark.parametrize(
        "categorical, numeric",
        [
            (pd.Categorical(["A"], categories=["A", "B"]), [1]),
            (pd.Categorical(["A", "B"], categories=["A", "B"]), [1, 2]),
        ],
    )
    def test_replace_categorical(self, categorical, numeric):
        # GH 24971, GH#23305
        ser = pd.Series(categorical)
        result = ser.replace({"A": 1, "B": 2})
        expected = pd.Series(numeric).astype("category")
        if 2 not in expected.cat.categories:
            # i.e. categories should be [1, 2] even if there are no "B"s present
            # GH#44940
            expected = expected.cat.add_categories(2)
        tm.assert_series_equal(expected, result)

    def test_replace_categorical_single(self):
        # GH 26988
        dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific")
        s = pd.Series(dti)
        c = s.astype("category")

        expected = c.copy()
        expected = expected.cat.add_categories("foo")
        expected[2] = "foo"
        expected = expected.cat.remove_unused_categories()
        assert c[2] != "foo"

        result = c.replace(c[2], "foo")
        tm.assert_series_equal(expected, result)
        assert c[2] != "foo"  # ensure non-inplace call does not alter original

        return_value = c.replace(c[2], "foo", inplace=True)
        assert return_value is None
        tm.assert_series_equal(expected, c)

        first_value = c[0]
        return_value = c.replace(c[1], c[0], inplace=True)
        assert return_value is None
        assert c[0] == c[1] == first_value  # test replacing with existing value

    def test_replace_with_no_overflowerror(self):
        # GH 25616
        # casts to object without Exception from OverflowError
        s = pd.Series([0, 1, 2, 3, 4])
        result = s.replace([3], ["100000000000000000000"])
        expected = pd.Series([0, 1, 2, "100000000000000000000", 4])
        tm.assert_series_equal(result, expected)

        s = pd.Series([0, "100000000000000000000", "100000000000000000001"])
        result = s.replace(["100000000000000000000"], [1])
        expected = pd.Series([0, 1, "100000000000000000001"])
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "ser, to_replace, exp",
        [
            ([1, 2, 3], {
                1: 2,
                2: 3,
                3: 4
            }, [2, 3, 4]),
            (["1", "2", "3"], {
                "1": "2",
                "2": "3",
                "3": "4"
            }, ["2", "3", "4"]),
        ],
    )
    def test_replace_commutative(self, ser, to_replace, exp):
        # GH 16051
        # DataFrame.replace() overwrites when values are non-numeric

        series = pd.Series(ser)

        expected = pd.Series(exp)
        result = series.replace(to_replace)

        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("ser, exp", [([1, 2, 3], [1, True, 3]),
                                          (["x", 2, 3], ["x", True, 3])])
    def test_replace_no_cast(self, ser, exp):
        # GH 9113
        # BUG: replace int64 dtype with bool coerces to int64

        series = pd.Series(ser)
        result = series.replace(2, True)
        expected = pd.Series(exp)

        tm.assert_series_equal(result, expected)

    def test_replace_invalid_to_replace(self):
        # GH 18634
        # API: replace() should raise an exception if invalid argument is given
        series = pd.Series(["a", "b", "c "])
        msg = (r"Expecting 'to_replace' to be either a scalar, array-like, "
               r"dict or None, got invalid type.*")
        with pytest.raises(TypeError, match=msg):
            series.replace(lambda x: x.strip())

    @pytest.mark.parametrize("frame", [False, True])
    def test_replace_nonbool_regex(self, frame):
        obj = pd.Series(["a", "b", "c "])
        if frame:
            obj = obj.to_frame()

        msg = "'to_replace' must be 'None' if 'regex' is not a bool"
        with pytest.raises(ValueError, match=msg):
            obj.replace(to_replace=["a"], regex="foo")

    @pytest.mark.parametrize("frame", [False, True])
    def test_replace_empty_copy(self, frame):
        obj = pd.Series([], dtype=np.float64)
        if frame:
            obj = obj.to_frame()

        res = obj.replace(4, 5, inplace=True)
        assert res is None

        res = obj.replace(4, 5, inplace=False)
        tm.assert_equal(res, obj)
        assert res is not obj

    def test_replace_only_one_dictlike_arg(self, fixed_now_ts):
        # GH#33340

        ser = pd.Series([1, 2, "A", fixed_now_ts, True])
        to_replace = {0: 1, 2: "A"}
        value = "foo"
        msg = "Series.replace cannot use dict-like to_replace and non-None value"
        with pytest.raises(ValueError, match=msg):
            ser.replace(to_replace, value)

        to_replace = 1
        value = {0: "foo", 2: "bar"}
        msg = "Series.replace cannot use dict-value and non-None to_replace"
        with pytest.raises(ValueError, match=msg):
            ser.replace(to_replace, value)

    def test_replace_extension_other(self, frame_or_series):
        # https://github.com/pandas-dev/pandas/issues/34530
        obj = frame_or_series(pd.array([1, 2, 3], dtype="Int64"))
        result = obj.replace("", "")  # no exception
        # should not have changed dtype
        tm.assert_equal(obj, result)

    def _check_replace_with_method(self, ser: pd.Series):
        df = ser.to_frame()

        res = ser.replace(ser[1], method="pad")
        expected = pd.Series([ser[0], ser[0]] + list(ser[2:]), dtype=ser.dtype)
        tm.assert_series_equal(res, expected)

        res_df = df.replace(ser[1], method="pad")
        tm.assert_frame_equal(res_df, expected.to_frame())

        ser2 = ser.copy()
        res2 = ser2.replace(ser[1], method="pad", inplace=True)
        assert res2 is None
        tm.assert_series_equal(ser2, expected)

        res_df2 = df.replace(ser[1], method="pad", inplace=True)
        assert res_df2 is None
        tm.assert_frame_equal(df, expected.to_frame())

    def test_replace_ea_dtype_with_method(self, any_numeric_ea_dtype):
        arr = pd.array([1, 2, pd.NA, 4], dtype=any_numeric_ea_dtype)
        ser = pd.Series(arr)

        self._check_replace_with_method(ser)

    @pytest.mark.parametrize("as_categorical", [True, False])
    def test_replace_interval_with_method(self, as_categorical):
        # in particular interval that can't hold NA

        idx = pd.IntervalIndex.from_breaks(range(4))
        ser = pd.Series(idx)
        if as_categorical:
            ser = ser.astype("category")

        self._check_replace_with_method(ser)

    @pytest.mark.parametrize("as_period", [True, False])
    @pytest.mark.parametrize("as_categorical", [True, False])
    def test_replace_datetimelike_with_method(self, as_period, as_categorical):
        idx = pd.date_range("2016-01-01", periods=5, tz="US/Pacific")
        if as_period:
            idx = idx.tz_localize(None).to_period("D")

        ser = pd.Series(idx)
        ser.iloc[-2] = pd.NaT
        if as_categorical:
            ser = ser.astype("category")

        self._check_replace_with_method(ser)

    def test_replace_with_compiled_regex(self):
        # https://github.com/pandas-dev/pandas/issues/35680
        s = pd.Series(["a", "b", "c"])
        regex = re.compile("^a$")
        result = s.replace({regex: "z"}, regex=True)
        expected = pd.Series(["z", "b", "c"])
        tm.assert_series_equal(result, expected)

    def test_pandas_replace_na(self):
        # GH#43344
        ser = pd.Series(["AA", "BB", "CC", "DD", "EE", "", pd.NA],
                        dtype="string")
        regex_mapping = {
            "AA": "CC",
            "BB": "CC",
            "EE": "CC",
            "CC": "CC-REPL",
        }
        result = ser.replace(regex_mapping, regex=True)
        exp = pd.Series(["CC", "CC", "CC-REPL", "DD", "CC", "", pd.NA],
                        dtype="string")
        tm.assert_series_equal(result, exp)

    @pytest.mark.parametrize(
        "dtype, input_data, to_replace, expected_data",
        [
            ("bool", [True, False], {
                True: False
            }, [False, False]),
            ("int64", [1, 2], {
                1: 10,
                2: 20
            }, [10, 20]),
            ("Int64", [1, 2], {
                1: 10,
                2: 20
            }, [10, 20]),
            ("float64", [1.1, 2.2], {
                1.1: 10.1,
                2.2: 20.5
            }, [10.1, 20.5]),
            ("Float64", [1.1, 2.2], {
                1.1: 10.1,
                2.2: 20.5
            }, [10.1, 20.5]),
            ("string", ["one", "two"], {
                "one": "1",
                "two": "2"
            }, ["1", "2"]),
            (
                pd.IntervalDtype("int64"),
                IntervalArray([pd.Interval(1, 2),
                               pd.Interval(2, 3)]),
                {
                    pd.Interval(1, 2): pd.Interval(10, 20)
                },
                IntervalArray([pd.Interval(10, 20),
                               pd.Interval(2, 3)]),
            ),
            (
                pd.IntervalDtype("float64"),
                IntervalArray([pd.Interval(1.0, 2.7),
                               pd.Interval(2.8, 3.1)]),
                {
                    pd.Interval(1.0, 2.7): pd.Interval(10.6, 20.8)
                },
                IntervalArray([pd.Interval(10.6, 20.8),
                               pd.Interval(2.8, 3.1)]),
            ),
            (
                pd.PeriodDtype("M"),
                [pd.Period("2020-05", freq="M")],
                {
                    pd.Period("2020-05", freq="M"): pd.Period("2020-06",
                                                              freq="M")
                },
                [pd.Period("2020-06", freq="M")],
            ),
        ],
    )
    def test_replace_dtype(self, dtype, input_data, to_replace, expected_data):
        # GH#33484
        ser = pd.Series(input_data, dtype=dtype)
        result = ser.replace(to_replace)
        expected = pd.Series(expected_data, dtype=dtype)
        tm.assert_series_equal(result, expected)

    def test_replace_string_dtype(self):
        # GH#40732, GH#44940
        ser = pd.Series(["one", "two", np.nan], dtype="string")
        res = ser.replace({"one": "1", "two": "2"})
        expected = pd.Series(["1", "2", np.nan], dtype="string")
        tm.assert_series_equal(res, expected)

        # GH#31644
        ser2 = pd.Series(["A", np.nan], dtype="string")
        res2 = ser2.replace("A", "B")
        expected2 = pd.Series(["B", np.nan], dtype="string")
        tm.assert_series_equal(res2, expected2)

        ser3 = pd.Series(["A", "B"], dtype="string")
        res3 = ser3.replace("A", pd.NA)
        expected3 = pd.Series([pd.NA, "B"], dtype="string")
        tm.assert_series_equal(res3, expected3)

    def test_replace_string_dtype_list_to_replace(self):
        # GH#41215, GH#44940
        ser = pd.Series(["abc", "def"], dtype="string")
        res = ser.replace(["abc", "any other string"], "xyz")
        expected = pd.Series(["xyz", "def"], dtype="string")
        tm.assert_series_equal(res, expected)

    def test_replace_string_dtype_regex(self):
        # GH#31644
        ser = pd.Series(["A", "B"], dtype="string")
        res = ser.replace(r".", "C", regex=True)
        expected = pd.Series(["C", "C"], dtype="string")
        tm.assert_series_equal(res, expected)

    def test_replace_nullable_numeric(self):
        # GH#40732, GH#44940

        floats = pd.Series([1.0, 2.0, 3.999, 4.4], dtype=pd.Float64Dtype())
        assert floats.replace({1.0: 9}).dtype == floats.dtype
        assert floats.replace(1.0, 9).dtype == floats.dtype
        assert floats.replace({1.0: 9.0}).dtype == floats.dtype
        assert floats.replace(1.0, 9.0).dtype == floats.dtype

        res = floats.replace(to_replace=[1.0, 2.0], value=[9.0, 10.0])
        assert res.dtype == floats.dtype

        ints = pd.Series([1, 2, 3, 4], dtype=pd.Int64Dtype())
        assert ints.replace({1: 9}).dtype == ints.dtype
        assert ints.replace(1, 9).dtype == ints.dtype
        assert ints.replace({1: 9.0}).dtype == ints.dtype
        assert ints.replace(1, 9.0).dtype == ints.dtype

        # nullable (for now) raises instead of casting
        with pytest.raises(TypeError, match="Invalid value"):
            ints.replace({1: 9.5})
        with pytest.raises(TypeError, match="Invalid value"):
            ints.replace(1, 9.5)

    @pytest.mark.parametrize("regex", [False, True])
    def test_replace_regex_dtype_series(self, regex):
        # GH-48644
        series = pd.Series(["0"])
        expected = pd.Series([1])
        result = series.replace(to_replace="0", value=1, regex=regex)
        tm.assert_series_equal(result, expected)

    def test_replace_different_int_types(self, any_int_numpy_dtype):
        # GH#45311
        labs = pd.Series([1, 1, 1, 0, 0, 2, 2, 2], dtype=any_int_numpy_dtype)

        maps = pd.Series([0, 2, 1], dtype=any_int_numpy_dtype)
        map_dict = {old: new for (old, new) in zip(maps.values, maps.index)}

        result = labs.replace(map_dict)
        expected = labs.replace({0: 0, 2: 1, 1: 2})
        tm.assert_series_equal(result, expected)
def test_slicing_agg_min_max(s1_fix):
    """Check per-slice ``min`` and ``max`` over left-closed width-2 bins.

    ``s1_fix`` is a fixture defined outside this view; it is only required
    to support ``.slice(...).agg([...])`` as used below.
    """
    result = s1_fix.slice(range(-4, 11, 2)).agg(["min", "max"])

    # The seven bins [-4, -2), [-2, 0), ..., [8, 10), in order.
    bins = [
        pd.Interval(left, left + 2, closed="left")
        for left in range(-4, 10, 2)
    ]
    expected_by_stat = {
        "min": [-1.75, -1.75, -1.75, 0.25, 2.0, -0.5, -0.5],
        "max": [-1.75, -1.75, 0.25, 2.75, 2.75, -0.5, -0.5],
    }

    for stat in ("min", "max"):
        pd.testing.assert_series_equal(
            result[stat],
            pd.Series(dict(zip(bins, expected_by_stat[stat]))),
            check_names=False,
            check_index_type=False,
        )
Beispiel #12
0
class TestHashing(object):
    """Tests for ``hash_pandas_object``, ``hash_array`` and tuple hashing."""

    @pytest.fixture(params=[
        Series([1, 2, 3] * 3, dtype='int32'),
        Series([None, 2.5, 3.5] * 3, dtype='float32'),
        Series(['a', 'b', 'c'] * 3, dtype='category'),
        Series(['d', 'e', 'f'] * 3),
        Series([True, False, True] * 3),
        Series(pd.date_range('20130101', periods=9)),
        Series(pd.date_range('20130101', periods=9, tz='US/Eastern')),
        Series(pd.timedelta_range('2000', periods=9))
    ])
    def series(self, request):
        """Provide each parametrized Series in turn."""
        return request.param

    def test_consistency(self):
        """Pin known hash values for a small string Index."""
        # check that our hash doesn't change because of a mistake
        # in the actual code; this is the ground truth
        result = hash_pandas_object(Index(['foo', 'bar', 'baz']))
        known_hashes = np.array(
            [3600424527151052760, 1374399572096150070, 477881037637427054],
            dtype='uint64')
        expected = Series(known_hashes, index=['foo', 'bar', 'baz'])
        tm.assert_series_equal(result, expected)

    def test_hash_array(self, series):
        """Hashing the same values twice gives the same array."""
        values = series.values
        first = hash_array(values)
        second = hash_array(values)
        tm.assert_numpy_array_equal(first, second)

    def test_hash_array_mixed(self):
        """Mixed int/str input hashes like its all-string equivalent."""
        from_mixed = hash_array(np.array([3, 4, 'All']))
        from_strings = hash_array(np.array(['3', '4', 'All']))
        from_objects = hash_array(np.array([3, 4, 'All'], dtype=object))
        for other in (from_strings, from_objects):
            tm.assert_numpy_array_equal(from_mixed, other)

    @pytest.mark.parametrize('val', [5, 'foo', pd.Timestamp('20130101')])
    def test_hash_array_errors(self, val):
        """Scalars are rejected by ``hash_array`` with a TypeError."""
        with tm.assert_raises_regex(TypeError, 'must pass a ndarray-like'):
            hash_array(val)

    def check_equal(self, obj, **kwargs):
        """Assert hashing *obj* is repeatable, with and without 'index'."""
        for drop_index in (False, True):
            if drop_index:
                # second round: same check without any explicit 'index' kwarg
                kwargs.pop('index', None)
            first = hash_pandas_object(obj, **kwargs)
            second = hash_pandas_object(obj, **kwargs)
            tm.assert_series_equal(first, second)

    def check_not_equal_with_index(self, obj):
        """Assert that including the index changes the hash of *obj*."""
        # check that we are not hashing the same if
        # we include the index
        if isinstance(obj, Index):
            return

        with_index = hash_pandas_object(obj, index=True)
        without_index = hash_pandas_object(obj, index=False)
        if len(obj):
            assert not (with_index == without_index).all()

    def test_hash_tuples(self):
        """``hash_tuples`` agrees with hashing the equivalent MultiIndex."""
        tups = [(1, 'one'), (1, 'two'), (2, 'one')]
        result = hash_tuples(tups)
        expected = hash_pandas_object(MultiIndex.from_tuples(tups)).values
        tm.assert_numpy_array_equal(result, expected)

        # a single tuple hashes to the first entry
        single = hash_tuples(tups[0])
        assert single == expected[0]

    @pytest.mark.parametrize('tup', [(1, 'one'), (1, np.nan),
                                     (1.0, pd.NaT, 'A'),
                                     ('A', pd.Timestamp("2012-01-01"))])
    def test_hash_tuple(self, tup):
        """``hash_tuple`` matches ``hash_tuples`` on a one-element list."""
        # test equivalence between hash_tuples and hash_tuple
        result = hash_tuple(tup)
        expected = hash_tuples([tup])[0]
        assert result == expected

    @pytest.mark.parametrize('val', [
        1, 1.4, 'A', b'A', u'A',
        pd.Timestamp("2012-01-01"),
        pd.Timestamp("2012-01-01", tz='Europe/Brussels'),
        datetime.datetime(2012, 1, 1),
        pd.Timestamp("2012-01-01", tz='EST').to_pydatetime(),
        pd.Timedelta('1 days'),
        datetime.timedelta(1),
        pd.Period('2012-01-01', freq='D'),
        pd.Interval(0, 1), np.nan, pd.NaT, None
    ])
    def test_hash_scalar(self, val):
        """``_hash_scalar`` matches ``hash_array`` on a boxed object array."""
        result = _hash_scalar(val)
        boxed = np.array([val], dtype=object)
        expected = hash_array(boxed, categorize=True)
        assert result[0] == expected[0]

    @pytest.mark.parametrize('val', [5, 'foo', pd.Timestamp('20130101')])
    def test_hash_tuples_err(self, val):
        """Non list-of-tuples input is rejected with a TypeError."""
        with tm.assert_raises_regex(
                TypeError, 'must be convertible to a list-of-tuples'):
            hash_tuples(val)

    def test_multiindex_unique(self):
        """A unique MultiIndex produces unique hashes."""
        tuples = [(118, 472), (236, 118), (51, 204), (102, 51)]
        mi = MultiIndex.from_tuples(tuples)
        assert mi.is_unique
        hashed = hash_pandas_object(mi)
        assert hashed.is_unique

    def test_multiindex_objects(self):
        """Level sorting does not change the set of MultiIndex hashes."""
        mi = MultiIndex(levels=[['b', 'd', 'a'], [1, 2, 3]],
                        labels=[[0, 1, 0, 2], [2, 0, 0, 1]],
                        names=['col1', 'col2'])
        recons = mi._sort_levels_monotonic()

        # these are equal
        assert mi.equals(recons)
        assert Index(mi.values).equals(Index(recons.values))

        # _hashed_values and hash_pandas_object(..., index=False)
        # equivalency
        for index in (mi, recons):
            expected = hash_pandas_object(index, index=False).values
            result = index._hashed_values
            tm.assert_numpy_array_equal(result, expected)

        original_hashes = mi._hashed_values
        sorted_level_hashes = recons._hashed_values

        # values should match, but in different order
        tm.assert_numpy_array_equal(np.sort(sorted_level_hashes),
                                    np.sort(original_hashes))

    @pytest.mark.parametrize('obj', [
        Series([1, 2, 3]),
        Series([1.0, 1.5, 3.2]),
        Series([1.0, 1.5, np.nan]),
        Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
        Series(['a', 'b', 'c']),
        Series(['a', np.nan, 'c']),
        Series(['a', None, 'c']),
        Series([True, False, True]),
        Series(),
        Index([1, 2, 3]),
        Index([True, False, True]),
        DataFrame({
            'x': ['a', 'b', 'c'],
            'y': [1, 2, 3]
        }),
        DataFrame(),
        tm.makeMissingDataframe(),
        tm.makeMixedDataFrame(),
        tm.makeTimeDataFrame(),
        tm.makeTimeSeries(),
        tm.makeTimedeltaIndex(),
        tm.makePeriodIndex(),
        Series(tm.makePeriodIndex()),
        Series(pd.date_range('20130101', periods=3, tz='US/Eastern')),
        MultiIndex.from_product([
            range(5), ['foo', 'bar', 'baz'],
            pd.date_range('20130101', periods=2)
        ]),
        MultiIndex.from_product([pd.CategoricalIndex(list('aabc')),
                                 range(3)])
    ])
    def test_hash_pandas_object(self, obj):
        """Run both hashing checks on a variety of pandas objects."""
        for check in (self.check_equal, self.check_not_equal_with_index):
            check(obj)

    def test_hash_pandas_object2(self, series):
        """Run both hashing checks on the ``series`` fixture values."""
        for check in (self.check_equal, self.check_not_equal_with_index):
            check(series)

    @pytest.mark.parametrize(
        'obj',
        [Series([], dtype='float64'),
         Series([], dtype='object'),
         Index([])])
    def test_hash_pandas_empty_object(self, obj):
        """Empty objects hash repeatably."""
        # these are by-definition the same with
        # or w/o the index as the data is empty
        self.check_equal(obj)

    @pytest.mark.parametrize('s1', [
        Series(['a', 'b', 'c', 'd']),
        Series([1000, 2000, 3000, 4000]),
        Series(pd.date_range(0, periods=4))
    ])
    @pytest.mark.parametrize('categorize', [True, False])
    def test_categorical_consistency(self, s1, categorize):
        """Categoricals hash like their values, whatever the category order."""
        # GH15143
        # Check that categoricals hash consistent with their values, not codes
        # This should work for categoricals of any dtype
        s2 = s1.astype('category').cat.set_categories(s1)
        s3 = s2.cat.set_categories(list(reversed(s1)))

        # These should all hash identically
        h1, h2, h3 = [hash_pandas_object(ser, categorize=categorize)
                      for ser in (s1, s2, s3)]
        tm.assert_series_equal(h1, h2)
        tm.assert_series_equal(h1, h3)

    def test_categorical_with_nan_consistency(self):
        """Hashes of a small categorical appear among those of a larger one."""
        five_dates = pd.Categorical.from_codes(
            [-1, 0, 1, 2, 3, 4],
            categories=pd.date_range('2012-01-01', periods=5, name='B'))
        expected = hash_array(five_dates, categorize=False)

        one_date = pd.Categorical.from_codes(
            [-1, 0], categories=[pd.Timestamp('2012-01-01')])
        result = hash_array(one_date, categorize=False)

        assert result[0] in expected
        assert result[1] in expected

    @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
    def test_pandas_errors(self):
        """A Timestamp and a Panel are both rejected with a TypeError."""
        with pytest.raises(TypeError):
            hash_pandas_object(pd.Timestamp('20130101'))

        panel = tm.makePanel()

        with pytest.raises(TypeError):
            hash_pandas_object(panel)

    def test_hash_keys(self):
        """Different hash keys give different hashes for the same data."""
        # using different hash keys, should have different hashes
        # for the same data

        # this only matters for object dtypes
        obj = Series(list('abc'))
        with_key_a = hash_pandas_object(obj, hash_key='9876543210123456')
        with_key_b = hash_pandas_object(obj, hash_key='9876543210123465')
        assert (with_key_a != with_key_b).all()

    def test_invalid_key(self):
        """A hash key of the wrong length raises a ValueError."""
        # this only matters for object dtypes
        with tm.assert_raises_regex(
                ValueError, 'key should be a 16-byte string encoded'):
            hash_pandas_object(Series(list('abc')), hash_key='foo')

    def test_alread_encoded(self):
        """Already-encoded (bytes) values hash repeatably."""
        # if already encoded then ok
        encoded = Series(list('abc')).str.encode('utf8')
        self.check_equal(encoded)

    def test_alternate_encoding(self):
        """Hashing is repeatable with ``encoding='ascii'``."""
        self.check_equal(Series(list('abc')), encoding='ascii')

    @pytest.mark.parametrize('l_exp', range(8))
    @pytest.mark.parametrize('l_add', [0, 1])
    def test_same_len_hash_collisions(self, l_exp, l_add):
        """Two random strings of the same length hash differently."""
        length = 2**(l_exp + 8) + l_add
        strings = tm.rands_array(length, 2)
        hashes = hash_array(strings, 'utf8')
        assert not hashes[0] == hashes[1]

    def test_hash_collisions(self):
        """Two long strings keep distinct, pinned hash values."""
        # hash collisions are bad
        # https://github.com/pandas-dev/pandas/issues/14711#issuecomment-264885726
        L = [
            'Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9',  # noqa
            'Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe'  # noqa
        ]  # noqa

        # these should be different!
        first_only = hash_array(np.asarray(L[0:1], dtype=object), 'utf8')
        expected1 = np.array([14963968704024874985], dtype=np.uint64)
        tm.assert_numpy_array_equal(first_only, expected1)

        second_only = hash_array(np.asarray(L[1:2], dtype=object), 'utf8')
        expected2 = np.array([16428432627716348016], dtype=np.uint64)
        tm.assert_numpy_array_equal(second_only, expected2)

        # hashing both together must reproduce the individual hashes
        both = hash_array(np.asarray(L, dtype=object), 'utf8')
        tm.assert_numpy_array_equal(
            both, np.concatenate([expected1, expected2], axis=0))
Beispiel #13
0
class TestDataFrameDataTypes(TestData):

    def test_concat_empty_dataframe_dtypes(self):
        """Check column dtypes after concatenating empty typed frames."""
        df = DataFrame(columns=list("abc"))
        typed = [('a', np.bool_), ('b', np.int32), ('c', np.float64)]
        for column, dtype in typed:
            df[column] = df[column].astype(dtype)

        # identical frames: every dtype survives
        result = pd.concat([df, df])
        for column, dtype in typed:
            assert result[column].dtype == dtype

        # mixed with an all-float64 copy
        result = pd.concat([df, df.astype(np.float64)])
        after_mix = [('a', np.object_), ('b', np.float64), ('c', np.float64)]
        for column, dtype in after_mix:
            assert result[column].dtype == dtype

    def test_empty_frame_dtypes_ftypes(self):
        """Check ``dtypes``/``ftypes`` on frames lacking rows, columns or both.

        The builtins ``object`` and ``bool`` are used rather than
        ``np.object`` and ``np.bool``: those NumPy names were plain aliases
        of the builtins and were deprecated and later removed from NumPy.
        """
        # no rows, no columns
        empty_df = pd.DataFrame()
        assert_series_equal(empty_df.dtypes, pd.Series(dtype=object))
        assert_series_equal(empty_df.ftypes, pd.Series(dtype=object))

        # rows but no columns
        nocols_df = pd.DataFrame(index=[1, 2, 3])
        assert_series_equal(nocols_df.dtypes, pd.Series(dtype=object))
        assert_series_equal(nocols_df.ftypes, pd.Series(dtype=object))

        # columns but no rows
        norows_df = pd.DataFrame(columns=list("abc"))
        assert_series_equal(norows_df.dtypes, pd.Series(
            object, index=list("abc")))
        assert_series_equal(norows_df.ftypes, pd.Series(
            'object:dense', index=list("abc")))

        # columns but no rows, cast to int32
        norows_int_df = pd.DataFrame(columns=list("abc")).astype(np.int32)
        assert_series_equal(norows_int_df.dtypes, pd.Series(
            np.dtype('int32'), index=list("abc")))
        assert_series_equal(norows_int_df.ftypes, pd.Series(
            'int32:dense', index=list("abc")))

        # a populated frame, used as the reference for its own empty slice
        odict = compat.OrderedDict
        df = pd.DataFrame(odict([('a', 1), ('b', True), ('c', 1.0)]),
                          index=[1, 2, 3])
        ex_dtypes = pd.Series(odict([('a', np.int64),
                                     ('b', bool),
                                     ('c', np.float64)]))
        ex_ftypes = pd.Series(odict([('a', 'int64:dense'),
                                     ('b', 'bool:dense'),
                                     ('c', 'float64:dense')]))
        assert_series_equal(df.dtypes, ex_dtypes)
        assert_series_equal(df.ftypes, ex_ftypes)

        # same but for empty slice of df
        assert_series_equal(df[:0].dtypes, ex_dtypes)
        assert_series_equal(df[:0].ftypes, ex_ftypes)

    def test_datetime_with_tz_dtypes(self):
        """Check ``dtypes`` of naive and tz-aware datetime columns with NaT."""
        naive = date_range('20130101', periods=3)
        eastern = date_range('20130101', periods=3, tz='US/Eastern')
        cet = date_range('20130101', periods=3, tz='CET')
        tzframe = DataFrame({'A': naive, 'B': eastern, 'C': cet})

        # put a missing value into each tz-aware column
        for column_pos in (1, 2):
            tzframe.iloc[1, column_pos] = pd.NaT

        result = tzframe.dtypes.sort_index()
        expected_dtypes = [np.dtype('datetime64[ns]'),
                           DatetimeTZDtype('datetime64[ns, US/Eastern]'),
                           DatetimeTZDtype('datetime64[ns, CET]')]
        expected = Series(expected_dtypes, ['A', 'B', 'C'])

        assert_series_equal(result, expected)

    def test_dtypes_are_correct_after_column_slice(self):
        """Check that slicing columns leaves the parent frame's dtypes intact.

        ``np.float64`` is spelled out instead of the ``np.float_`` alias,
        which refers to the same type but is no longer provided by NumPy 2.
        """
        # GH6525
        odict = compat.OrderedDict
        df = pd.DataFrame(index=range(5), columns=list("abc"),
                          dtype=np.float64)
        all_float = pd.Series(odict([('a', np.float64),
                                     ('b', np.float64),
                                     ('c', np.float64)]))

        assert_series_equal(df.dtypes, all_float)
        assert_series_equal(df.iloc[:, 2:].dtypes,
                            pd.Series(odict([('c', np.float64)])))
        # the parent frame must be unchanged after taking the slice
        assert_series_equal(df.dtypes, all_float)

    def test_select_dtypes_include_using_list_like(self):
        """Check ``select_dtypes`` with list-like ``include``/``exclude``."""
        df = DataFrame({
            'a': list('abc'),
            'b': list(range(1, 4)),
            'c': np.arange(3, 6).astype('u1'),
            'd': np.arange(4.0, 7.0, dtype='float64'),
            'e': [True, False, True],
            'f': pd.Categorical(list('abc')),
            'g': pd.date_range('20130101', periods=3),
            'h': pd.date_range('20130101', periods=3, tz='US/Eastern'),
            'i': pd.date_range('20130101', periods=3, tz='CET'),
            'j': pd.period_range('2013-01', periods=3, freq='M'),
            'k': pd.timedelta_range('1 day', periods=3),
        })

        # (keyword arguments, columns expected to be selected)
        cases = [
            ({'include': [np.number]}, ['b', 'c', 'd', 'k']),
            ({'include': [np.number], 'exclude': ['timedelta']},
             ['b', 'c', 'd']),
            ({'include': [np.number, 'category'], 'exclude': ['timedelta']},
             ['b', 'c', 'd', 'f']),
            ({'include': ['datetime']}, ['g']),
            ({'include': ['datetime64']}, ['g']),
            ({'include': ['datetimetz']}, ['h', 'i']),
        ]
        for kwargs, columns in cases:
            ri = df.select_dtypes(**kwargs)
            ei = df[columns]
            assert_frame_equal(ri, ei)

        with pytest.raises(NotImplementedError):
            df.select_dtypes(include=['period'])

    def test_select_dtypes_exclude_using_list_like(self):
        """Excluding ``np.number`` keeps only the string and bool columns."""
        data = {
            'a': list('abc'),
            'b': list(range(1, 4)),
            'c': np.arange(3, 6).astype('u1'),
            'd': np.arange(4.0, 7.0, dtype='float64'),
            'e': [True, False, True],
        }
        df = DataFrame(data)
        result = df.select_dtypes(exclude=[np.number])
        expected = df[['a', 'e']]
        assert_frame_equal(result, expected)

    def test_select_dtypes_exclude_include_using_list_like(self):
        """Check ``select_dtypes`` with tuple ``include`` and ``exclude``."""
        df = DataFrame({
            'a': list('abc'),
            'b': list(range(1, 4)),
            'c': np.arange(3, 6).astype('u1'),
            'd': np.arange(4.0, 7.0, dtype='float64'),
            'e': [True, False, True],
            'f': pd.date_range('now', periods=3).values,
        })

        # (include, exclude, columns expected to be selected)
        cases = [
            ((np.bool_, 'integer'), (np.datetime64,), ['b', 'c', 'e']),
            (('bool', 'int64', 'int32'), ('datetime',), ['b', 'e']),
        ]
        for include, exclude, columns in cases:
            r = df.select_dtypes(include=include, exclude=exclude)
            e = df[columns]
            assert_frame_equal(r, e)

    def test_select_dtypes_include_using_scalars(self):
        """Check ``select_dtypes`` when ``include`` is a single scalar."""
        df = DataFrame({
            'a': list('abc'),
            'b': list(range(1, 4)),
            'c': np.arange(3, 6).astype('u1'),
            'd': np.arange(4.0, 7.0, dtype='float64'),
            'e': [True, False, True],
            'f': pd.Categorical(list('abc')),
            'g': pd.date_range('20130101', periods=3),
            'h': pd.date_range('20130101', periods=3, tz='US/Eastern'),
            'i': pd.date_range('20130101', periods=3, tz='CET'),
            'j': pd.period_range('2013-01', periods=3, freq='M'),
            'k': pd.timedelta_range('1 day', periods=3),
        })

        # (scalar passed as include, columns expected to be selected)
        cases = [
            (np.number, ['b', 'c', 'd', 'k']),
            ('datetime', ['g']),
            ('datetime64', ['g']),
            ('category', ['f']),
        ]
        for include, columns in cases:
            ri = df.select_dtypes(include=include)
            ei = df[columns]
            assert_frame_equal(ri, ei)

        with pytest.raises(NotImplementedError):
            df.select_dtypes(include='period')

    def test_select_dtypes_exclude_using_scalars(self):
        """Check ``select_dtypes`` when ``exclude`` is a single scalar."""
        df = DataFrame({
            'a': list('abc'),
            'b': list(range(1, 4)),
            'c': np.arange(3, 6).astype('u1'),
            'd': np.arange(4.0, 7.0, dtype='float64'),
            'e': [True, False, True],
            'f': pd.Categorical(list('abc')),
            'g': pd.date_range('20130101', periods=3),
            'h': pd.date_range('20130101', periods=3, tz='US/Eastern'),
            'i': pd.date_range('20130101', periods=3, tz='CET'),
            'j': pd.period_range('2013-01', periods=3, freq='M'),
            'k': pd.timedelta_range('1 day', periods=3),
        })

        # (scalar passed as exclude, columns expected to remain)
        cases = [
            (np.number, ['a', 'e', 'f', 'g', 'h', 'i', 'j']),
            ('category',
             ['a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'j', 'k']),
        ]
        for exclude, columns in cases:
            ri = df.select_dtypes(exclude=exclude)
            ei = df[columns]
            assert_frame_equal(ri, ei)

        with pytest.raises(NotImplementedError):
            df.select_dtypes(exclude='period')

    def test_select_dtypes_include_exclude_using_scalars(self):
        """Check scalar ``include`` combined with scalar ``exclude``."""
        data = {
            'a': list('abc'),
            'b': list(range(1, 4)),
            'c': np.arange(3, 6).astype('u1'),
            'd': np.arange(4.0, 7.0, dtype='float64'),
            'e': [True, False, True],
            'f': pd.Categorical(list('abc')),
            'g': pd.date_range('20130101', periods=3),
            'h': pd.date_range('20130101', periods=3, tz='US/Eastern'),
            'i': pd.date_range('20130101', periods=3, tz='CET'),
            'j': pd.period_range('2013-01', periods=3, freq='M'),
            'k': pd.timedelta_range('1 day', periods=3),
        }
        df = DataFrame(data)

        # numbers minus floats: the int, uint8 and timedelta columns
        result = df.select_dtypes(include=np.number, exclude='floating')
        expected = df[['b', 'c', 'k']]
        assert_frame_equal(result, expected)

    def test_select_dtypes_include_exclude_mixed_scalars_lists(self):
        """Check a scalar for one argument and a list for the other."""
        df = DataFrame({
            'a': list('abc'),
            'b': list(range(1, 4)),
            'c': np.arange(3, 6).astype('u1'),
            'd': np.arange(4.0, 7.0, dtype='float64'),
            'e': [True, False, True],
            'f': pd.Categorical(list('abc')),
            'g': pd.date_range('20130101', periods=3),
            'h': pd.date_range('20130101', periods=3, tz='US/Eastern'),
            'i': pd.date_range('20130101', periods=3, tz='CET'),
            'j': pd.period_range('2013-01', periods=3, freq='M'),
            'k': pd.timedelta_range('1 day', periods=3),
        })

        # (include, exclude, columns expected to be selected)
        cases = [
            (np.number, ['floating', 'timedelta'], ['b', 'c']),
            ([np.number, 'category'], 'floating', ['b', 'c', 'f', 'k']),
        ]
        for include, exclude, columns in cases:
            ri = df.select_dtypes(include=include, exclude=exclude)
            ei = df[columns]
            assert_frame_equal(ri, ei)

    def test_select_dtypes_duplicate_columns(self):
        """Check ``select_dtypes`` on a frame with duplicated column labels."""
        # GH20839
        odict = compat.OrderedDict
        columns = [
            ('a', list('abc')),
            ('b', list(range(1, 4))),
            ('c', np.arange(3, 6).astype('u1')),
            ('d', np.arange(4.0, 7.0, dtype='float64')),
            ('e', [True, False, True]),
            ('f', pd.date_range('now', periods=3).values),
        ]
        df = DataFrame(odict(columns))
        # relabel so that several columns share a name
        df.columns = ['a', 'a', 'b', 'b', 'b', 'c']

        result = df.select_dtypes(include=[np.number], exclude=['floating'])
        expected = DataFrame({'a': list(range(1, 4)),
                              'b': np.arange(3, 6).astype('u1')})
        assert_frame_equal(result, expected)

    def test_select_dtypes_not_an_attr_but_still_valid_dtype(self):
        """Check dtype codes that are valid although not attributes of numpy."""
        df = DataFrame({
            'a': list('abc'),
            'b': list(range(1, 4)),
            'c': np.arange(3, 6).astype('u1'),
            'd': np.arange(4.0, 7.0, dtype='float64'),
            'e': [True, False, True],
            'f': pd.date_range('now', periods=3).values,
        })
        # differences of the datetime column
        df['g'] = df.f.diff()
        assert not hasattr(np, 'u8')

        result = df.select_dtypes(include=['i8', 'O'], exclude=['timedelta'])
        expected = df[['a', 'b']]
        assert_frame_equal(result, expected)

        result = df.select_dtypes(include=['i8', 'O', 'timedelta64[ns]'])
        expected = df[['a', 'b', 'g']]
        assert_frame_equal(result, expected)

    def test_select_dtypes_empty(self):
        """Calling ``select_dtypes`` without arguments raises a ValueError."""
        df = DataFrame({'a': list('abc'), 'b': list(range(1, 4))})
        msg = 'at least one of include or exclude must be nonempty'
        with tm.assert_raises_regex(ValueError, msg):
            df.select_dtypes()

    def test_select_dtypes_bad_datetime64(self):
        """datetime64 dtypes carrying a unit are rejected as too specific."""
        df = DataFrame({
            'a': list('abc'),
            'b': list(range(1, 4)),
            'c': np.arange(3, 6).astype('u1'),
            'd': np.arange(4.0, 7.0, dtype='float64'),
            'e': [True, False, True],
            'f': pd.date_range('now', periods=3).values,
        })
        bad_calls = [
            {'include': ['datetime64[D]']},
            {'exclude': ['datetime64[as]']},
        ]
        for kwargs in bad_calls:
            with tm.assert_raises_regex(ValueError, '.+ is too specific'):
                df.select_dtypes(**kwargs)

    def test_select_dtypes_datetime_with_tz(self):
        """tz-aware columns are not selected by ``datetime64[ns]``."""
        data = dict(A=Timestamp('20130102', tz='US/Eastern'),
                    B=Timestamp('20130603', tz='CET'))
        df2 = DataFrame(data, index=range(5))
        pieces = [df2.A.to_frame(), df2.B.to_frame()]
        df3 = pd.concat(pieces, axis=1)

        result = df3.select_dtypes(include=['datetime64[ns]'])
        # expect a frame with no columns at all
        expected = df3.reindex(columns=[])
        assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "dtype", [str, "str", np.string_, "S1",
                  "unicode", np.unicode_, "U1"] + ([unicode] if PY2 else []))
    @pytest.mark.parametrize("arg", ["include", "exclude"])
    def test_select_dtypes_str_raises(self, dtype, arg):
        """String dtypes are refused for both ``include`` and ``exclude``."""
        data = {
            "a": list("abc"),
            "g": list(u("abc")),
            "b": list(range(1, 4)),
            "c": np.arange(3, 6).astype("u1"),
            "d": np.arange(4.0, 7.0, dtype="float64"),
            "e": [True, False, True],
            "f": pd.date_range("now", periods=3).values,
        }
        df = DataFrame(data)

        # ``arg`` names the keyword under test
        with tm.assert_raises_regex(TypeError,
                                    "string dtypes are not allowed"):
            df.select_dtypes(**{arg: [dtype]})

    def test_select_dtypes_bad_arg_raises(self):
        """An unparseable dtype string raises a TypeError."""
        data = {
            'a': list('abc'),
            'g': list(u('abc')),
            'b': list(range(1, 4)),
            'c': np.arange(3, 6).astype('u1'),
            'd': np.arange(4.0, 7.0, dtype='float64'),
            'e': [True, False, True],
            'f': pd.date_range('now', periods=3).values,
        }
        df = DataFrame(data)
        msg = 'data type.*not understood'
        with tm.assert_raises_regex(TypeError, msg):
            df.select_dtypes(['blargy, blarg, blarg'])

    def test_select_dtypes_typecodes(self):
        """Selecting by all NumPy float typecodes returns the whole frame."""
        # GH 11990
        df = mkdf(30, 3, data_gen_f=lambda x, y: np.random.random())
        float_codes = list(np.typecodes['AllFloat'])
        result = df.select_dtypes(float_codes)
        assert_frame_equal(result, df)

    def test_dtypes_gh8722(self):
        """``dtypes`` matches per-column dtypes, also under use_inf_as_na."""
        self.mixed_frame['bool'] = self.mixed_frame['A'] > 0
        result = self.mixed_frame.dtypes
        per_column = {name: column.dtype
                      for name, column in compat.iteritems(self.mixed_frame)}
        expected = Series(per_column, index=result.index)
        assert_series_equal(result, expected)

        # compat, GH 8722
        with option_context('use_inf_as_na', True):
            single = DataFrame([[1]])
            result = single.dtypes
            assert_series_equal(result, Series({0: np.dtype('int64')}))

    def test_ftypes(self):
        """``ftypes`` reports ``<dtype>:dense`` for each mixed-float column."""
        expected = Series({
            'A': 'float32:dense',
            'B': 'float32:dense',
            'C': 'float16:dense',
            'D': 'float64:dense',
        }).sort_values()
        result = self.mixed_float.ftypes.sort_values()
        assert_series_equal(result, expected)

    def test_astype(self):
        """Check ``DataFrame.astype`` against NumPy casts and across dtypes."""

        def _assert_matches_numpy_cast(dtype):
            # the frame cast must equal casting the underlying values
            casted = self.frame.astype(dtype)
            expected = DataFrame(self.frame.values.astype(dtype),
                                 index=self.frame.index,
                                 columns=self.frame.columns)
            assert_frame_equal(casted, expected)

        _assert_matches_numpy_cast(int)
        _assert_matches_numpy_cast(np.int32)

        # a column of numeric strings must be castable as well
        self.frame['foo'] = '5'
        _assert_matches_numpy_cast(int)

        # mixed casting
        def _check_cast(df, v):
            # Every column must have dtype ``v``. Comparing the whole set
            # (rather than one arbitrary element of it) makes a frame with
            # mixed dtypes fail instead of passing by chance.
            names = set(s.dtype.name for _, s in compat.iteritems(df))
            assert names == {v}

        mn = self.all_mixed._get_numeric_data().copy()
        mn['little_float'] = np.array(12345., dtype='float16')
        mn['big_float'] = np.array(123456789101112., dtype='float64')

        casted = mn.astype('float64')
        _check_cast(casted, 'float64')

        casted = mn.astype('int64')
        _check_cast(casted, 'int64')

        casted = self.mixed_float.reindex(columns=['A', 'B']).astype('float32')
        _check_cast(casted, 'float32')

        casted = mn.reindex(columns=['little_float']).astype('float16')
        _check_cast(casted, 'float16')

        casted = self.mixed_float.reindex(columns=['A', 'B']).astype('float16')
        _check_cast(casted, 'float16')

        casted = mn.astype('float32')
        _check_cast(casted, 'float32')

        casted = mn.astype('int32')
        _check_cast(casted, 'int32')

        # to object
        casted = mn.astype('O')
        _check_cast(casted, 'object')

    def test_astype_with_exclude_string(self):
        """``errors='ignore'`` leaves a string column untouched when casting."""
        for dtype in (int, np.int32):
            df = self.frame.copy()
            expected = self.frame.astype(dtype)
            df['string'] = 'foo'
            casted = df.astype(dtype, errors='ignore')

            # the string column is expected to come through unchanged
            expected['string'] = 'foo'
            assert_frame_equal(casted, expected)

    def test_astype_with_view(self):
        """Smoke-test ``astype`` with and without ``copy=False``.

        Nothing is asserted; the casts only have to run.
        """
        mixed = self.mixed_float.reindex(columns=['A', 'B', 'C'])
        mixed.astype(np.int64)
        mixed.astype(np.float32)

        # this is the only real reason to do it this way
        rounded = np.round(self.frame).astype(np.int32)
        rounded.astype(np.float32, copy=False)

        # TODO(wesm): verification?
        as_float = self.frame.astype(np.float64)
        as_float.astype(np.int64, copy=False)

    @pytest.mark.parametrize("dtype", [np.int32, np.int64])
    @pytest.mark.parametrize("val", [np.nan, np.inf])
    def test_astype_cast_nan_inf_int(self, val, dtype):
        # see gh-14265
        #
        # Check NaN and inf --> raise error when converting to int.
        msg = "Cannot convert non-finite values \\(NA or inf\\) to integer"
        df = DataFrame([val])

        with tm.assert_raises_regex(ValueError, msg):
            df.astype(dtype)

    def test_astype_str(self, text_dtype):
        # see gh-9757
        #
        # One column per value kind (naive datetimes, tz-aware datetimes,
        # timedeltas, ints, floats); the frame-wide cast is compared with
        # element-wise conversions through ``text_dtype``.
        naive = Series(date_range("2010-01-04", periods=5))
        aware = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern"))
        deltas = Series([Timedelta(x, unit="d") for x in range(5)])
        ints = Series(range(5))
        floats = Series([0.0, 0.2, 0.4, 0.6, 0.8])

        df = DataFrame({"a": naive, "b": aware, "c": deltas,
                        "d": ints, "e": floats})

        # Datetime-like
        # Test str and unicode on Python 2.x and just str on Python 3.x
        result = df.astype(text_dtype)

        expected = DataFrame({
            "a": [text_dtype(Timestamp(v)._date_repr)
                  for v in naive._values],
            "b": [text_dtype(Timestamp(v)) for v in aware._values],
            "c": [text_dtype(Timedelta(v)._repr_base(format="all"))
                  for v in deltas._values],
            "d": [text_dtype(v) for v in ints._values],
            "e": [text_dtype(v) for v in floats._values],
        })

        assert_frame_equal(result, expected)

    def test_astype_str_float(self, text_dtype):
        # see gh-11302
        #
        # A NaN cell is expected to come out as the text "nan".
        # ``np.nan`` is the canonical spelling (the ``np.NaN`` alias no
        # longer exists in NumPy 2.0).
        result = DataFrame([np.nan]).astype(text_dtype)
        expected = DataFrame(["nan"])

        assert_frame_equal(result, expected)
        result = DataFrame([1.12345678901234567890]).astype(text_dtype)

        # < 1.14 truncates
        # >= 1.14 preserves the full repr
        # (selected through the ``_np_version_under1p14`` flag)
        val = ("1.12345678901" if _np_version_under1p14
               else "1.1234567890123457")
        expected = DataFrame([val])
        assert_frame_equal(result, expected)

    @pytest.mark.parametrize("dtype_class", [dict, Series])
    def test_astype_dict_like(self, dtype_class):
        # GH7271 & GH16717
        a = Series(date_range('2010-01-04', periods=5))
        b = Series(range(5))
        c = Series([0.0, 0.2, 0.4, 0.6, 0.8])
        d = Series(['1.0', '2', '3.14', '4', '5.4'])
        df = DataFrame({'a': a, 'b': b, 'c': c, 'd': d})
        original = df.copy(deep=True)

        # change type of a subset of columns
        dt1 = dtype_class({'b': 'str', 'd': 'float32'})
        result = df.astype(dt1)
        expected = DataFrame({
            'a': a,
            'b': Series(['0', '1', '2', '3', '4']),
            'c': c,
            'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float32')})
        assert_frame_equal(result, expected)
        assert_frame_equal(df, original)

        dt2 = dtype_class({'b': np.float32, 'c': 'float32', 'd': np.float64})
        result = df.astype(dt2)
        expected = DataFrame({
            'a': a,
            'b': Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype='float32'),
            'c': Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype='float32'),
            'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float64')})
        assert_frame_equal(result, expected)
        assert_frame_equal(df, original)

        # change all columns
        dt3 = dtype_class({'a': str, 'b': str, 'c': str, 'd': str})
        assert_frame_equal(df.astype(dt3),
                           df.astype(str))
        assert_frame_equal(df, original)

        # error should be raised when using something other than column labels
        # in the keys of the dtype dict
        dt4 = dtype_class({'b': str, 2: str})
        dt5 = dtype_class({'e': str})
        pytest.raises(KeyError, df.astype, dt4)
        pytest.raises(KeyError, df.astype, dt5)
        assert_frame_equal(df, original)

        # if the dtypes provided are the same as the original dtypes, the
        # resulting DataFrame should be the same as the original DataFrame
        dt6 = dtype_class({col: df[col].dtype for col in df.columns})
        equiv = df.astype(dt6)
        assert_frame_equal(df, equiv)
        assert_frame_equal(df, original)

        # GH 16717
        # if dtypes provided is empty, the resulting DataFrame
        # should be the same as the original DataFrame
        dt7 = dtype_class({})
        result = df.astype(dt7)
        assert_frame_equal(df, equiv)
        assert_frame_equal(df, original)

    def test_astype_duplicate_col(self):
        a1 = Series([1, 2, 3, 4, 5], name='a')
        b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name='b')
        a2 = Series([0, 1, 2, 3, 4], name='a')
        df = concat([a1, b, a2], axis=1)

        result = df.astype(str)
        a1_str = Series(['1', '2', '3', '4', '5'], dtype='str', name='a')
        b_str = Series(['0.1', '0.2', '0.4', '0.6', '0.8'], dtype=str,
                       name='b')
        a2_str = Series(['0', '1', '2', '3', '4'], dtype='str', name='a')
        expected = concat([a1_str, b_str, a2_str], axis=1)
        assert_frame_equal(result, expected)

        result = df.astype({'a': 'str'})
        expected = concat([a1_str, b, a2_str], axis=1)
        assert_frame_equal(result, expected)

    @pytest.mark.parametrize('dtype', [
        'category',
        CategoricalDtype(),
        CategoricalDtype(ordered=True),
        CategoricalDtype(ordered=False),
        CategoricalDtype(categories=list('abcdef')),
        CategoricalDtype(categories=list('edba'), ordered=False),
        CategoricalDtype(categories=list('edcb'), ordered=True)], ids=repr)
    def test_astype_categorical(self, dtype):
        # GH 18099
        d = {'A': list('abbc'), 'B': list('bccd'), 'C': list('cdde')}
        df = DataFrame(d)
        result = df.astype(dtype)
        expected = DataFrame({k: Categorical(d[k], dtype=dtype) for k in d})
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("cls", [
        pd.api.types.CategoricalDtype,
        pd.api.types.DatetimeTZDtype,
        pd.api.types.IntervalDtype
    ])
    def test_astype_categoricaldtype_class_raises(self, cls):
        df = DataFrame({"A": ['a', 'a', 'b', 'c']})
        xpr = "Expected an instance of {}".format(cls.__name__)
        with tm.assert_raises_regex(TypeError, xpr):
            df.astype({"A": cls})

        with tm.assert_raises_regex(TypeError, xpr):
            df['A'].astype(cls)

    @pytest.mark.parametrize('dtype', [
        {100: 'float64', 200: 'uint64'}, 'category', 'float64'])
    def test_astype_column_metadata(self, dtype):
        # GH 19920
        #
        # The named UInt64Index used for the columns must compare equal
        # to the columns of the frame returned by ``astype``.
        wanted_columns = pd.UInt64Index([100, 200, 300], name='foo')
        values = np.arange(15).reshape(5, 3)
        casted = DataFrame(values, columns=wanted_columns).astype(dtype)
        tm.assert_index_equal(casted.columns, wanted_columns)

    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
    def test_astype_from_datetimelike_to_objectt(self, dtype, unit):
        # tests astype to object dtype
        # gh-19223 / gh-12425
        dtype = "{}[{}]".format(dtype, unit)
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(object)
        assert (result.dtypes == object).all()

        if dtype.startswith('M8'):
            assert result.iloc[0, 0] == pd.to_datetime(1, unit=unit)
        else:
            assert result.iloc[0, 0] == pd.to_timedelta(1, unit=unit)

    @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64])
    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
    def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit):
        # tests all units from numeric origination
        # gh-19223 / gh-12425
        #
        # The frame-level cast is compared with a frame built from the
        # same cast done directly on the numpy array.
        target = "{}[{}]".format(dtype, unit)
        raw = np.array([[1, 2, 3]], dtype=arr_dtype)

        expected = DataFrame(raw.astype(target))
        result = DataFrame(raw).astype(target)

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
    def test_astype_to_datetime_unit(self, unit):
        # tests all units from datetime origination
        # gh-19223
        #
        # Source array and cast target share the same ``M8[unit]`` spec.
        spec = "M8[{}]".format(unit)
        raw = np.array([[1, 2, 3]], dtype=spec)

        expected = DataFrame(raw.astype(spec))
        result = DataFrame(raw).astype(spec)

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ['ns'])
    def test_astype_to_timedelta_unit_ns(self, unit):
        # preserver the timedelta conversion
        # gh-19223
        dtype = "m8[{}]".format(unit)
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(arr.astype(dtype))

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ['us', 'ms', 's', 'h', 'm', 'D'])
    def test_astype_to_timedelta_unit(self, unit):
        # coerce to float
        # gh-19223
        #
        # The expected frame is built from the frame's values cast to the
        # timedelta spec and then to float.
        spec = "m8[{}]".format(unit)
        df = DataFrame(np.array([[1, 2, 3]], dtype=spec))

        result = df.astype(spec)
        as_float = df.values.astype(spec).astype(float)

        tm.assert_frame_equal(result, DataFrame(as_float))

    @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
    def test_astype_to_incorrect_datetimelike(self, unit):
        # trying to astype a m to a M, or vice-versa
        # gh-19224
        dtype = "M8[{}]".format(unit)
        other = "m8[{}]".format(unit)

        df = DataFrame(np.array([[1, 2, 3]], dtype=dtype))
        with pytest.raises(TypeError):
            df.astype(other)

        df = DataFrame(np.array([[1, 2, 3]], dtype=other))
        with pytest.raises(TypeError):
            df.astype(dtype)

    def test_timedeltas(self):
        # Follow ``get_dtype_counts`` while datetime, timedelta and integer
        # columns are added to the frame.
        stamps = Series(date_range('2012-1-1', periods=3, freq='D'))
        deltas = Series([timedelta(days=i) for i in range(3)])
        df = DataFrame(dict(A=stamps, B=deltas))

        counts = df.get_dtype_counts().sort_index()
        wanted = Series(
            {'datetime64[ns]': 1, 'timedelta64[ns]': 1}).sort_index()
        assert_series_equal(counts, wanted)

        # the sum column is expected to count as a second datetime column
        df['C'] = df['A'] + df['B']
        wanted = Series(
            {'datetime64[ns]': 2, 'timedelta64[ns]': 1}).sort_values()
        counts = df.get_dtype_counts().sort_values()
        assert_series_equal(counts, wanted)

        # mixed int types
        df['D'] = 1
        wanted = Series({'datetime64[ns]': 2,
                         'timedelta64[ns]': 1,
                         'int64': 1}).sort_values()
        counts = df.get_dtype_counts().sort_values()
        assert_series_equal(counts, wanted)

    def test_arg_for_errors_in_astype(self):
        # issue #14878
        frame = DataFrame([1, 2, 3])

        # ``errors=True`` is expected to raise ValueError
        with pytest.raises(ValueError):
            frame.astype(np.float64, errors=True)

        # ``raise_on_error`` is expected to emit a FutureWarning
        with tm.assert_produces_warning(FutureWarning):
            frame.astype(np.int8, raise_on_error=False)

        # ``errors='ignore'`` must simply run
        frame.astype(np.int8, errors='ignore')

    @pytest.mark.parametrize('input_vals', [
        ([1, 2]),
        (['1', '2']),
        (list(pd.date_range('1/1/2011', periods=2, freq='H'))),
        (list(pd.date_range('1/1/2011', periods=2, freq='H',
                            tz='US/Eastern'))),
        ([pd.Interval(left=0, right=5)]),
    ])
    def test_constructor_list_str(self, input_vals, string_dtype):
        # GH 16605
        # Ensure that data elements are converted to strings when
        # dtype is str, 'str', or 'U'
        #
        # Passing the dtype to the constructor must match constructing
        # first and casting column 'A' afterwards.
        via_constructor = DataFrame({'A': input_vals}, dtype=string_dtype)
        plain = DataFrame({'A': input_vals})
        via_astype = plain.astype({'A': string_dtype})
        assert_frame_equal(via_constructor, via_astype)

    def test_constructor_list_str_na(self, string_dtype):
        # The two floats are expected as their text form while the ``None``
        # entry is expected to stay ``None`` in an object column.
        wanted = DataFrame({"A": ['1.0', '2.0', None]}, dtype=object)
        built = DataFrame({"A": [1.0, 2.0, None]}, dtype=string_dtype)
        assert_frame_equal(built, wanted)
Beispiel #14
0
    result = hash_tuple(tup)
    expected = hash_tuples([tup])[0]

    assert result == expected


@pytest.mark.parametrize("val", [
    1, 1.4, "A", b"A", u"A",
    pd.Timestamp("2012-01-01"),
    pd.Timestamp("2012-01-01", tz="Europe/Brussels"),
    datetime.datetime(2012, 1, 1),
    pd.Timestamp("2012-01-01", tz="EST").to_pydatetime(),
    pd.Timedelta("1 days"),
    datetime.timedelta(1),
    pd.Period("2012-01-01", freq="D"),
    pd.Interval(0, 1), np.nan, pd.NaT, None
])
def test_hash_scalar(val):
    # The scalar hash must equal the hash of a one-element object array
    # holding the same value.
    boxed = np.array([val], dtype=object)
    from_array = hash_array(boxed, categorize=True)
    from_scalar = _hash_scalar(val)

    assert from_scalar[0] == from_array[0]


@pytest.mark.parametrize("val", [5, "foo", pd.Timestamp("20130101")])
def test_hash_tuples_err(val):
    # Each parametrized scalar is expected to be rejected with a TypeError.
    with pytest.raises(
        TypeError, match="must be convertible to a list-of-tuples"
    ):
        hash_tuples(val)

Beispiel #15
0
class TestDataFrameDataTypes:
    def test_concat_empty_dataframe_dtypes(self):
        df = DataFrame(columns=list("abc"))
        df["a"] = df["a"].astype(np.bool_)
        df["b"] = df["b"].astype(np.int32)
        df["c"] = df["c"].astype(np.float64)

        result = pd.concat([df, df])
        assert result["a"].dtype == np.bool_
        assert result["b"].dtype == np.int32
        assert result["c"].dtype == np.float64

        result = pd.concat([df, df.astype(np.float64)])
        assert result["a"].dtype == np.object_
        assert result["b"].dtype == np.float64
        assert result["c"].dtype == np.float64

    def test_empty_frame_dtypes_ftypes(self):
        # ``dtypes`` / ``ftypes`` of frames with no columns, no rows, or an
        # empty row slice.  The builtin ``object`` and ``bool`` are used
        # instead of the ``np.object`` / ``np.bool`` aliases, which were
        # deprecated in NumPy 1.20 and later removed; they are the same
        # objects, so the expectations are unchanged.
        empty_df = pd.DataFrame()
        assert_series_equal(empty_df.dtypes, pd.Series(dtype=object))

        # GH 26705 - Assert .ftypes is deprecated
        with tm.assert_produces_warning(FutureWarning):
            assert_series_equal(empty_df.ftypes, pd.Series(dtype=object))

        nocols_df = pd.DataFrame(index=[1, 2, 3])
        assert_series_equal(nocols_df.dtypes, pd.Series(dtype=object))

        # GH 26705 - Assert .ftypes is deprecated
        with tm.assert_produces_warning(FutureWarning):
            assert_series_equal(nocols_df.ftypes, pd.Series(dtype=object))

        norows_df = pd.DataFrame(columns=list("abc"))
        assert_series_equal(norows_df.dtypes, pd.Series(object, index=list("abc")))

        # GH 26705 - Assert .ftypes is deprecated
        with tm.assert_produces_warning(FutureWarning):
            assert_series_equal(
                norows_df.ftypes, pd.Series("object:dense", index=list("abc"))
            )

        norows_int_df = pd.DataFrame(columns=list("abc")).astype(np.int32)
        assert_series_equal(
            norows_int_df.dtypes, pd.Series(np.dtype("int32"), index=list("abc"))
        )
        # GH 26705 - Assert .ftypes is deprecated
        with tm.assert_produces_warning(FutureWarning):
            assert_series_equal(
                norows_int_df.ftypes, pd.Series("int32:dense", index=list("abc"))
            )

        odict = OrderedDict
        df = pd.DataFrame(odict([("a", 1), ("b", True), ("c", 1.0)]), index=[1, 2, 3])
        ex_dtypes = pd.Series(
            odict([("a", np.int64), ("b", bool), ("c", np.float64)])
        )
        ex_ftypes = pd.Series(
            odict([("a", "int64:dense"), ("b", "bool:dense"), ("c", "float64:dense")])
        )
        assert_series_equal(df.dtypes, ex_dtypes)

        # GH 26705 - Assert .ftypes is deprecated
        with tm.assert_produces_warning(FutureWarning):
            assert_series_equal(df.ftypes, ex_ftypes)

        # same but for empty slice of df
        assert_series_equal(df[:0].dtypes, ex_dtypes)

        # GH 26705 - Assert .ftypes is deprecated
        with tm.assert_produces_warning(FutureWarning):
            assert_series_equal(df[:0].ftypes, ex_ftypes)

    def test_datetime_with_tz_dtypes(self):
        tzframe = DataFrame(
            {
                "A": date_range("20130101", periods=3),
                "B": date_range("20130101", periods=3, tz="US/Eastern"),
                "C": date_range("20130101", periods=3, tz="CET"),
            }
        )
        tzframe.iloc[1, 1] = pd.NaT
        tzframe.iloc[1, 2] = pd.NaT
        result = tzframe.dtypes.sort_index()
        expected = Series(
            [
                np.dtype("datetime64[ns]"),
                DatetimeTZDtype("ns", "US/Eastern"),
                DatetimeTZDtype("ns", "CET"),
            ],
            ["A", "B", "C"],
        )

        assert_series_equal(result, expected)

    def test_dtypes_are_correct_after_column_slice(self):
        # GH6525
        df = pd.DataFrame(index=range(5), columns=list("abc"), dtype=np.float_)
        odict = OrderedDict
        assert_series_equal(
            df.dtypes,
            pd.Series(odict([("a", np.float_), ("b", np.float_), ("c", np.float_)])),
        )
        assert_series_equal(df.iloc[:, 2:].dtypes, pd.Series(odict([("c", np.float_)])))
        assert_series_equal(
            df.dtypes,
            pd.Series(odict([("a", np.float_), ("b", np.float_), ("c", np.float_)])),
        )

    def test_select_dtypes_include_using_list_like(self):
        df = DataFrame(
            {
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.Categorical(list("abc")),
                "g": pd.date_range("20130101", periods=3),
                "h": pd.date_range("20130101", periods=3, tz="US/Eastern"),
                "i": pd.date_range("20130101", periods=3, tz="CET"),
                "j": pd.period_range("2013-01", periods=3, freq="M"),
                "k": pd.timedelta_range("1 day", periods=3),
            }
        )

        ri = df.select_dtypes(include=[np.number])
        ei = df[["b", "c", "d", "k"]]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=[np.number], exclude=["timedelta"])
        ei = df[["b", "c", "d"]]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=[np.number, "category"], exclude=["timedelta"])
        ei = df[["b", "c", "d", "f"]]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=["datetime"])
        ei = df[["g"]]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=["datetime64"])
        ei = df[["g"]]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=["datetimetz"])
        ei = df[["h", "i"]]
        assert_frame_equal(ri, ei)

        with pytest.raises(NotImplementedError, match=r"^$"):
            df.select_dtypes(include=["period"])

    def test_select_dtypes_exclude_using_list_like(self):
        df = DataFrame(
            {
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
            }
        )
        re = df.select_dtypes(exclude=[np.number])
        ee = df[["a", "e"]]
        assert_frame_equal(re, ee)

    def test_select_dtypes_exclude_include_using_list_like(self):
        df = DataFrame(
            {
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.date_range("now", periods=3).values,
            }
        )
        exclude = (np.datetime64,)
        include = np.bool_, "integer"
        r = df.select_dtypes(include=include, exclude=exclude)
        e = df[["b", "c", "e"]]
        assert_frame_equal(r, e)

        exclude = ("datetime",)
        include = "bool", "int64", "int32"
        r = df.select_dtypes(include=include, exclude=exclude)
        e = df[["b", "e"]]
        assert_frame_equal(r, e)

    def test_select_dtypes_include_using_scalars(self):
        df = DataFrame(
            {
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.Categorical(list("abc")),
                "g": pd.date_range("20130101", periods=3),
                "h": pd.date_range("20130101", periods=3, tz="US/Eastern"),
                "i": pd.date_range("20130101", periods=3, tz="CET"),
                "j": pd.period_range("2013-01", periods=3, freq="M"),
                "k": pd.timedelta_range("1 day", periods=3),
            }
        )

        ri = df.select_dtypes(include=np.number)
        ei = df[["b", "c", "d", "k"]]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include="datetime")
        ei = df[["g"]]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include="datetime64")
        ei = df[["g"]]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include="category")
        ei = df[["f"]]
        assert_frame_equal(ri, ei)

        with pytest.raises(NotImplementedError, match=r"^$"):
            df.select_dtypes(include="period")

    def test_select_dtypes_exclude_using_scalars(self):
        df = DataFrame(
            {
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.Categorical(list("abc")),
                "g": pd.date_range("20130101", periods=3),
                "h": pd.date_range("20130101", periods=3, tz="US/Eastern"),
                "i": pd.date_range("20130101", periods=3, tz="CET"),
                "j": pd.period_range("2013-01", periods=3, freq="M"),
                "k": pd.timedelta_range("1 day", periods=3),
            }
        )

        ri = df.select_dtypes(exclude=np.number)
        ei = df[["a", "e", "f", "g", "h", "i", "j"]]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(exclude="category")
        ei = df[["a", "b", "c", "d", "e", "g", "h", "i", "j", "k"]]
        assert_frame_equal(ri, ei)

        with pytest.raises(NotImplementedError, match=r"^$"):
            df.select_dtypes(exclude="period")

    def test_select_dtypes_include_exclude_using_scalars(self):
        df = DataFrame(
            {
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.Categorical(list("abc")),
                "g": pd.date_range("20130101", periods=3),
                "h": pd.date_range("20130101", periods=3, tz="US/Eastern"),
                "i": pd.date_range("20130101", periods=3, tz="CET"),
                "j": pd.period_range("2013-01", periods=3, freq="M"),
                "k": pd.timedelta_range("1 day", periods=3),
            }
        )

        ri = df.select_dtypes(include=np.number, exclude="floating")
        ei = df[["b", "c", "k"]]
        assert_frame_equal(ri, ei)

    def test_select_dtypes_include_exclude_mixed_scalars_lists(self):
        df = DataFrame(
            {
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.Categorical(list("abc")),
                "g": pd.date_range("20130101", periods=3),
                "h": pd.date_range("20130101", periods=3, tz="US/Eastern"),
                "i": pd.date_range("20130101", periods=3, tz="CET"),
                "j": pd.period_range("2013-01", periods=3, freq="M"),
                "k": pd.timedelta_range("1 day", periods=3),
            }
        )

        ri = df.select_dtypes(include=np.number, exclude=["floating", "timedelta"])
        ei = df[["b", "c"]]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=[np.number, "category"], exclude="floating")
        ei = df[["b", "c", "f", "k"]]
        assert_frame_equal(ri, ei)

    def test_select_dtypes_duplicate_columns(self):
        # GH20839
        odict = OrderedDict
        df = DataFrame(
            odict(
                [
                    ("a", list("abc")),
                    ("b", list(range(1, 4))),
                    ("c", np.arange(3, 6).astype("u1")),
                    ("d", np.arange(4.0, 7.0, dtype="float64")),
                    ("e", [True, False, True]),
                    ("f", pd.date_range("now", periods=3).values),
                ]
            )
        )
        df.columns = ["a", "a", "b", "b", "b", "c"]

        expected = DataFrame(
            {"a": list(range(1, 4)), "b": np.arange(3, 6).astype("u1")}
        )

        result = df.select_dtypes(include=[np.number], exclude=["floating"])
        assert_frame_equal(result, expected)

    def test_select_dtypes_not_an_attr_but_still_valid_dtype(self):
        # Dtype strings such as "i8" and "O" are passed to ``include``; the
        # test first asserts that numpy exposes no attribute named "u8".
        columns = {
            "a": list("abc"),
            "b": list(range(1, 4)),
            "c": np.arange(3, 6).astype("u1"),
            "d": np.arange(4.0, 7.0, dtype="float64"),
            "e": [True, False, True],
            "f": pd.date_range("now", periods=3).values,
        }
        df = DataFrame(columns)
        df["g"] = df.f.diff()
        assert not hasattr(np, "u8")

        without_delta = df.select_dtypes(
            include=["i8", "O"], exclude=["timedelta"]
        )
        assert_frame_equal(without_delta, df[["a", "b"]])

        with_delta = df.select_dtypes(include=["i8", "O", "timedelta64[ns]"])
        assert_frame_equal(with_delta, df[["a", "b", "g"]])

    def test_select_dtypes_empty(self):
        df = DataFrame({"a": list("abc"), "b": list(range(1, 4))})
        msg = "at least one of include or exclude must be nonempty"
        with pytest.raises(ValueError, match=msg):
            df.select_dtypes()

    def test_select_dtypes_bad_datetime64(self):
        df = DataFrame(
            {
                "a": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.date_range("now", periods=3).values,
            }
        )
        with pytest.raises(ValueError, match=".+ is too specific"):
            df.select_dtypes(include=["datetime64[D]"])

        with pytest.raises(ValueError, match=".+ is too specific"):
            df.select_dtypes(exclude=["datetime64[as]"])

    def test_select_dtypes_datetime_with_tz(self):

        df2 = DataFrame(
            dict(
                A=Timestamp("20130102", tz="US/Eastern"),
                B=Timestamp("20130603", tz="CET"),
            ),
            index=range(5),
        )
        df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1)
        result = df3.select_dtypes(include=["datetime64[ns]"])
        expected = df3.reindex(columns=[])
        assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "dtype", [str, "str", np.string_, "S1", "unicode", np.unicode_, "U1"]
    )
    @pytest.mark.parametrize("arg", ["include", "exclude"])
    def test_select_dtypes_str_raises(self, dtype, arg):
        df = DataFrame(
            {
                "a": list("abc"),
                "g": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.date_range("now", periods=3).values,
            }
        )
        msg = "string dtypes are not allowed"
        kwargs = {arg: [dtype]}

        with pytest.raises(TypeError, match=msg):
            df.select_dtypes(**kwargs)

    def test_select_dtypes_bad_arg_raises(self):
        df = DataFrame(
            {
                "a": list("abc"),
                "g": list("abc"),
                "b": list(range(1, 4)),
                "c": np.arange(3, 6).astype("u1"),
                "d": np.arange(4.0, 7.0, dtype="float64"),
                "e": [True, False, True],
                "f": pd.date_range("now", periods=3).values,
            }
        )

        msg = "data type.*not understood"
        with pytest.raises(TypeError, match=msg):
            df.select_dtypes(["blargy, blarg, blarg"])

    def test_select_dtypes_typecodes(self):
        # GH 11990
        #
        # Selecting with every numpy "AllFloat" typecode is expected to
        # return the random-valued frame unchanged.
        frame = mkdf(30, 3, data_gen_f=lambda x, y: np.random.random())
        float_codes = list(np.typecodes["AllFloat"])
        assert_frame_equal(frame.select_dtypes(float_codes), frame)

    def test_dtypes_gh8722(self, float_string_frame):
        # ``dtypes`` must agree with the dtype of every column, including
        # a boolean column added to the fixture here.
        float_string_frame["bool"] = float_string_frame["A"] > 0
        reported = float_string_frame.dtypes
        per_column = {
            label: column.dtype for label, column in float_string_frame.items()
        }
        assert_series_equal(reported, Series(per_column, index=reported.index))

        # compat, GH 8722
        with option_context("use_inf_as_na", True):
            single = DataFrame([[1]])
            assert_series_equal(single.dtypes, Series({0: np.dtype("int64")}))

    def test_ftypes(self, mixed_float_frame):
        # Expected ``ftypes`` label for each of the fixture's columns.
        labels = {
            "A": "float32:dense",
            "B": "float32:dense",
            "C": "float16:dense",
            "D": "float64:dense",
        }
        expected = Series(labels).sort_values()

        # GH 26705 - Assert .ftypes is deprecated
        with tm.assert_produces_warning(FutureWarning):
            result = mixed_float_frame.ftypes.sort_values()
        assert_series_equal(result, expected)

    def test_astype_float(self, float_frame):
        casted = float_frame.astype(int)
        expected = DataFrame(
            float_frame.values.astype(int),
            index=float_frame.index,
            columns=float_frame.columns,
        )
        assert_frame_equal(casted, expected)

        casted = float_frame.astype(np.int32)
        expected = DataFrame(
            float_frame.values.astype(np.int32),
            index=float_frame.index,
            columns=float_frame.columns,
        )
        assert_frame_equal(casted, expected)

        float_frame["foo"] = "5"
        casted = float_frame.astype(int)
        expected = DataFrame(
            float_frame.values.astype(int),
            index=float_frame.index,
            columns=float_frame.columns,
        )
        assert_frame_equal(casted, expected)

    def test_astype_mixed_float(self, mixed_float_frame):
        # mixed casting
        #
        # Columns "A" and "B" are cast to each float width in turn and
        # handed to ``_check_cast`` together with the target dtype name.
        for target in ("float32", "float16"):
            subset = mixed_float_frame.reindex(columns=["A", "B"])
            _check_cast(subset.astype(target), target)

    def test_astype_mixed_type(self, mixed_type_frame):
        # mixed casting
        #
        # Work on a copy of the numeric part of the fixture, extended with
        # a float16 and a float64 column.
        numeric = mixed_type_frame._get_numeric_data().copy()
        numeric["little_float"] = np.array(12345.0, dtype="float16")
        numeric["big_float"] = np.array(123456789101112.0, dtype="float64")

        for target in ("float64", "int64"):
            _check_cast(numeric.astype(target), target)

        only_little = numeric.reindex(columns=["little_float"])
        _check_cast(only_little.astype("float16"), "float16")

        for target in ("float32", "int32"):
            _check_cast(numeric.astype(target), target)

        # to object
        _check_cast(numeric.astype("O"), "object")

    def test_astype_with_exclude_string(self, float_frame):
        df = float_frame.copy()
        expected = float_frame.astype(int)
        df["string"] = "foo"
        casted = df.astype(int, errors="ignore")

        expected["string"] = "foo"
        assert_frame_equal(casted, expected)

        df = float_frame.copy()
        expected = float_frame.astype(np.int32)
        df["string"] = "foo"
        casted = df.astype(np.int32, errors="ignore")

        expected["string"] = "foo"
        assert_frame_equal(casted, expected)

    def test_astype_with_view_float(self, float_frame):

        # this is the only real reason to do it this way
        tf = np.round(float_frame).astype(np.int32)
        casted = tf.astype(np.float32, copy=False)

        # TODO(wesm): verification?
        tf = float_frame.astype(np.float64)
        casted = tf.astype(np.int64, copy=False)  # noqa

    def test_astype_with_view_mixed_float(self, mixed_float_frame):

        tf = mixed_float_frame.reindex(columns=["A", "B", "C"])

        casted = tf.astype(np.int64)
        casted = tf.astype(np.float32)  # noqa

    @pytest.mark.parametrize("dtype", [np.int32, np.int64])
    @pytest.mark.parametrize("val", [np.nan, np.inf])
    def test_astype_cast_nan_inf_int(self, val, dtype):
        """Converting NaN or inf to an integer dtype must raise ``ValueError``."""
        # see gh-14265
        frame = DataFrame([val])
        expected_message = "Cannot convert non-finite values \\(NA or inf\\) to integer"

        with pytest.raises(ValueError, match=expected_message):
            frame.astype(dtype)

    def test_astype_str(self):
        """``astype(str)`` on datetime, tz-aware datetime, timedelta, int and float columns.

        The expected strings are built per element from the underlying values
        of each column.
        """
        # see gh-9757
        a = Series(date_range("2010-01-04", periods=5))
        b = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern"))
        c = Series([Timedelta(x, unit="d") for x in range(5)])
        d = Series(range(5))
        e = Series([0.0, 0.2, 0.4, 0.6, 0.8])

        frame = DataFrame({"a": a, "b": b, "c": c, "d": d, "e": e})

        # Datetime-like
        result = frame.astype(str)

        expected = DataFrame(
            {
                "a": [str(Timestamp(v)._date_repr) for v in a._values],
                "b": [str(Timestamp(v)) for v in b._values],
                "c": [
                    str(Timedelta(v)._repr_base(format="all")) for v in c._values
                ],
                "d": [str(v) for v in d._values],
                "e": [str(v) for v in e._values],
            }
        )

        assert_frame_equal(result, expected)

    def test_astype_str_float(self):
        """``astype(str)`` on float data: NaN becomes "nan", long floats keep their repr.

        Uses ``np.nan`` rather than the ``np.NaN`` alias, which no longer exists
        in NumPy 2.0; ``np.nan`` is available in every NumPy release.
        """
        # see gh-11302
        result = DataFrame([np.nan]).astype(str)
        expected = DataFrame(["nan"])
        assert_frame_equal(result, expected)

        result = DataFrame([1.12345678901234567890]).astype(str)

        # < 1.14 truncates
        # >= 1.14 preserves the full repr
        val = "1.12345678901" if _np_version_under1p14 else "1.1234567890123457"
        expected = DataFrame([val])
        assert_frame_equal(result, expected)

    @pytest.mark.parametrize("dtype_class", [dict, Series])
    def test_astype_dict_like(self, dtype_class):
        """Check ``DataFrame.astype`` with a column -> dtype mapping.

        ``dtype_class`` is the container used for the mapping (``dict`` or
        ``Series``). Covered cases: casting a subset of columns, casting every
        column, invalid keys (``KeyError``), a mapping equal to the current
        dtypes, and an empty mapping. After every step the input frame must be
        unchanged.
        """
        # GH7271 & GH16717
        a = Series(date_range("2010-01-04", periods=5))
        b = Series(range(5))
        c = Series([0.0, 0.2, 0.4, 0.6, 0.8])
        d = Series(["1.0", "2", "3.14", "4", "5.4"])
        df = DataFrame({"a": a, "b": b, "c": c, "d": d})
        original = df.copy(deep=True)

        # change type of a subset of columns
        dt1 = dtype_class({"b": "str", "d": "float32"})
        result = df.astype(dt1)
        expected = DataFrame(
            {
                "a": a,
                "b": Series(["0", "1", "2", "3", "4"]),
                "c": c,
                "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"),
            }
        )
        assert_frame_equal(result, expected)
        assert_frame_equal(df, original)

        dt2 = dtype_class({"b": np.float32, "c": "float32", "d": np.float64})
        result = df.astype(dt2)
        expected = DataFrame(
            {
                "a": a,
                "b": Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype="float32"),
                "c": Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype="float32"),
                "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float64"),
            }
        )
        assert_frame_equal(result, expected)
        assert_frame_equal(df, original)

        # change all columns
        dt3 = dtype_class({"a": str, "b": str, "c": str, "d": str})
        assert_frame_equal(df.astype(dt3), df.astype(str))
        assert_frame_equal(df, original)

        # error should be raised when using something other than column labels
        # in the keys of the dtype dict
        dt4 = dtype_class({"b": str, 2: str})
        dt5 = dtype_class({"e": str})
        msg = "Only a column name can be used for the key in a dtype mappings argument"
        with pytest.raises(KeyError, match=msg):
            df.astype(dt4)
        with pytest.raises(KeyError, match=msg):
            df.astype(dt5)
        assert_frame_equal(df, original)

        # if the dtypes provided are the same as the original dtypes, the
        # resulting DataFrame should be the same as the original DataFrame
        dt6 = dtype_class({col: df[col].dtype for col in df.columns})
        equiv = df.astype(dt6)
        assert_frame_equal(df, equiv)
        assert_frame_equal(df, original)

        # GH 16717
        # if dtypes provided is empty, the resulting DataFrame
        # should be the same as the original DataFrame
        dt7 = dtype_class({})
        result = df.astype(dt7)
        # Compare against the frame produced by the empty mapping itself; the
        # previous version re-checked the stale ``equiv`` from the case above,
        # so the empty-mapping result was never verified.
        assert_frame_equal(df, result)
        assert_frame_equal(df, original)

    def test_astype_duplicate_col(self):
        a1 = Series([1, 2, 3, 4, 5], name="a")
        b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name="b")
        a2 = Series([0, 1, 2, 3, 4], name="a")
        df = concat([a1, b, a2], axis=1)

        result = df.astype(str)
        a1_str = Series(["1", "2", "3", "4", "5"], dtype="str", name="a")
        b_str = Series(["0.1", "0.2", "0.4", "0.6", "0.8"], dtype=str, name="b")
        a2_str = Series(["0", "1", "2", "3", "4"], dtype="str", name="a")
        expected = concat([a1_str, b_str, a2_str], axis=1)
        assert_frame_equal(result, expected)

        result = df.astype({"a": "str"})
        expected = concat([a1_str, b, a2_str], axis=1)
        assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "dtype",
        [
            "category",
            CategoricalDtype(),
            CategoricalDtype(ordered=True),
            CategoricalDtype(ordered=False),
            CategoricalDtype(categories=list("abcdef")),
            CategoricalDtype(categories=list("edba"), ordered=False),
            CategoricalDtype(categories=list("edcb"), ordered=True),
        ],
        ids=repr,
    )
    def test_astype_categorical(self, dtype):
        """A frame-wide categorical cast equals building each column as ``Categorical``."""
        # GH 18099
        raw = {"A": list("abbc"), "B": list("bccd"), "C": list("cdde")}
        casted = DataFrame(raw).astype(dtype)
        wanted = DataFrame(
            {label: Categorical(values, dtype=dtype) for label, values in raw.items()}
        )
        tm.assert_frame_equal(casted, wanted)

    @pytest.mark.parametrize(
        "cls",
        [
            pd.api.types.CategoricalDtype,
            pd.api.types.DatetimeTZDtype,
            pd.api.types.IntervalDtype,
        ],
    )
    def test_astype_categoricaldtype_class_raises(self, cls):
        """Passing a dtype *class* instead of an instance must raise ``TypeError``.

        Checked on both ``DataFrame.astype`` (dict form) and ``Series.astype``.
        """
        frame = DataFrame({"A": ["a", "a", "b", "c"]})
        pattern = "Expected an instance of {}".format(cls.__name__)

        # frame-level cast through a column mapping
        with pytest.raises(TypeError, match=pattern):
            frame.astype({"A": cls})

        # column-level cast
        column = frame["A"]
        with pytest.raises(TypeError, match=pattern):
            column.astype(cls)

    @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"])
    def test_astype_extension_dtypes(self, dtype):
        """Cast a two-column float frame to a nullable integer extension dtype.

        Checked from an all-float frame and from a frame in which only "b" has
        already been converted to the extension dtype.
        """
        # GH 22578
        rows = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]
        floats = pd.DataFrame(rows, columns=["a", "b"])

        all_extension = pd.DataFrame(
            {
                "a": integer_array([1, 3, 5], dtype=dtype),
                "b": integer_array([2, 4, 6], dtype=dtype),
            }
        )
        tm.assert_frame_equal(floats.astype(dtype), all_extension)
        tm.assert_frame_equal(floats.astype("int64").astype(dtype), all_extension)
        # round trip back to float64
        tm.assert_frame_equal(floats.astype(dtype).astype("float64"), floats)

        # Start again, converting only column "b" first.
        mixed = pd.DataFrame(rows, columns=["a", "b"])
        mixed["b"] = mixed["b"].astype(dtype)
        mixed_wanted = pd.DataFrame(
            {"a": [1.0, 3.0, 5.0], "b": integer_array([2, 4, 6], dtype=dtype)}
        )
        tm.assert_frame_equal(mixed, mixed_wanted)

        tm.assert_frame_equal(mixed.astype(dtype), all_extension)
        tm.assert_frame_equal(mixed.astype("int64").astype(dtype), all_extension)

    @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"])
    def test_astype_extension_dtypes_1d(self, dtype):
        """Single-column variant of the nullable-integer ``astype`` check."""
        # GH 22578
        floats = pd.DataFrame({"a": [1.0, 2.0, 3.0]})

        wanted = pd.DataFrame({"a": integer_array([1, 2, 3], dtype=dtype)})
        tm.assert_frame_equal(floats.astype(dtype), wanted)
        tm.assert_frame_equal(floats.astype("int64").astype(dtype), wanted)

        # Convert the column itself first, then cast the whole frame again.
        converted = pd.DataFrame({"a": [1.0, 2.0, 3.0]})
        converted["a"] = converted["a"].astype(dtype)
        converted_wanted = pd.DataFrame({"a": integer_array([1, 2, 3], dtype=dtype)})
        tm.assert_frame_equal(converted, converted_wanted)

        tm.assert_frame_equal(converted.astype(dtype), wanted)
        tm.assert_frame_equal(converted.astype("int64").astype(dtype), wanted)

    @pytest.mark.parametrize("dtype", ["category", "Int64"])
    def test_astype_extension_dtypes_duplicate_col(self, dtype):
        """Extension-dtype cast on a frame whose two columns share the name "a"."""
        # GH 24704
        left = Series([0, np.nan, 4], name="a")
        right = Series([np.nan, 3, 5], name="a")
        frame = concat([left, right], axis=1)

        casted = frame.astype(dtype)
        # casting column by column and re-joining must give the same frame
        wanted = concat([left.astype(dtype), right.astype(dtype)], axis=1)
        assert_frame_equal(casted, wanted)

    @pytest.mark.parametrize(
        "dtype", [{100: "float64", 200: "uint64"}, "category", "float64"]
    )
    def test_astype_column_metadata(self, dtype):
        """``astype`` must return a frame whose columns equal the original index."""
        # GH 19920
        columns = pd.UInt64Index([100, 200, 300], name="foo")
        frame = DataFrame(np.arange(15).reshape(5, 3), columns=columns)
        casted = frame.astype(dtype)
        tm.assert_index_equal(casted.columns, columns)

    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_from_datetimelike_to_objectt(self, dtype, unit):
        """Casting datetime64/timedelta64 data of any unit to object dtype."""
        # tests astype to object dtype
        # gh-19223 / gh-12425
        unit_dtype = "{}[{}]".format(dtype, unit)
        values = np.array([[1, 2, 3]], dtype=unit_dtype)
        result = DataFrame(values).astype(object)
        assert (result.dtypes == object).all()

        # the first element must compare equal to the matching pandas scalar
        if unit_dtype.startswith("M8"):
            converter = pd.to_datetime
        else:
            converter = pd.to_timedelta
        assert result.iloc[0, 0] == converter(1, unit=unit)

    @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64])
    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit):
        """Casting numeric data to datetime64/timedelta64 matches the NumPy cast."""
        # tests all units from numeric origination
        # gh-19223 / gh-12425
        unit_dtype = "{}[{}]".format(dtype, unit)
        values = np.array([[1, 2, 3]], dtype=arr_dtype)

        casted = DataFrame(values).astype(unit_dtype)
        wanted = DataFrame(values.astype(unit_dtype))

        tm.assert_frame_equal(casted, wanted)

    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_to_datetime_unit(self, unit):
        """Casting datetime64 data to the dtype it already has, for every unit."""
        # tests all units from datetime origination
        # gh-19223
        unit_dtype = "M8[{}]".format(unit)
        values = np.array([[1, 2, 3]], dtype=unit_dtype)

        casted = DataFrame(values).astype(unit_dtype)
        wanted = DataFrame(values.astype(unit_dtype))

        tm.assert_frame_equal(casted, wanted)

    @pytest.mark.parametrize("unit", ["ns"])
    def test_astype_to_timedelta_unit_ns(self, unit):
        """Casting timedelta64[ns] data to timedelta64[ns] keeps it a timedelta."""
        # preserve the timedelta conversion
        # gh-19223
        unit_dtype = "m8[{}]".format(unit)
        values = np.array([[1, 2, 3]], dtype=unit_dtype)

        casted = DataFrame(values).astype(unit_dtype)
        wanted = DataFrame(values.astype(unit_dtype))

        tm.assert_frame_equal(casted, wanted)

    @pytest.mark.parametrize("unit", ["us", "ms", "s", "h", "m", "D"])
    def test_astype_to_timedelta_unit(self, unit):
        """Casting to a non-nanosecond timedelta64 unit is expected to give floats."""
        # coerce to float
        # gh-19223
        unit_dtype = "m8[{}]".format(unit)
        values = np.array([[1, 2, 3]], dtype=unit_dtype)
        frame = DataFrame(values)

        casted = frame.astype(unit_dtype)
        # expected values: the frame's own values converted to the unit, as floats
        in_unit = frame.values.astype(unit_dtype)
        wanted = DataFrame(in_unit.astype(float))

        tm.assert_frame_equal(casted, wanted)

    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_to_incorrect_datetimelike(self, unit):
        """Casting between datetime64 and timedelta64 must raise ``TypeError``."""
        # trying to astype a m to a M, or vice-versa
        # gh-19224
        datetime_dtype = "M8[{}]".format(unit)
        timedelta_dtype = "m8[{}]".format(unit)

        # datetime -> timedelta
        datetime_frame = DataFrame(np.array([[1, 2, 3]], dtype=datetime_dtype))
        to_timedelta_msg = (
            r"cannot astype a datetimelike from \[datetime64\[ns\]\] to"
            r" \[timedelta64\[{}\]\]"
        ).format(unit)
        with pytest.raises(TypeError, match=to_timedelta_msg):
            datetime_frame.astype(timedelta_dtype)

        # timedelta -> datetime
        to_datetime_msg = (
            r"cannot astype a timedelta from \[timedelta64\[ns\]\] to"
            r" \[datetime64\[{}\]\]"
        ).format(unit)
        timedelta_frame = DataFrame(np.array([[1, 2, 3]], dtype=timedelta_dtype))
        with pytest.raises(TypeError, match=to_datetime_msg):
            timedelta_frame.astype(datetime_dtype)

    def test_timedeltas(self):
        df = DataFrame(
            dict(
                A=Series(date_range("2012-1-1", periods=3, freq="D")),
                B=Series([timedelta(days=i) for i in range(3)]),
            )
        )
        result = df.dtypes
        expected = Series(
            [np.dtype("datetime64[ns]"), np.dtype("timedelta64[ns]")], index=list("AB")
        )
        assert_series_equal(result, expected)

        df["C"] = df["A"] + df["B"]
        result = df.dtypes
        expected = Series(
            [
                np.dtype("datetime64[ns]"),
                np.dtype("timedelta64[ns]"),
                np.dtype("datetime64[ns]"),
            ],
            index=list("ABC"),
        )
        assert_series_equal(result, expected)

        # mixed int types
        df["D"] = 1
        result = df.dtypes
        expected = Series(
            [
                np.dtype("datetime64[ns]"),
                np.dtype("timedelta64[ns]"),
                np.dtype("datetime64[ns]"),
                np.dtype("int64"),
            ],
            index=list("ABCD"),
        )
        assert_series_equal(result, expected)

    def test_arg_for_errors_in_astype(self):
        """``errors=True`` is rejected with ``ValueError``; ``errors="ignore"`` is accepted."""
        # issue #14878
        frame = DataFrame([1, 2, 3])

        # a boolean is not a valid value for ``errors``
        with pytest.raises(ValueError):
            frame.astype(np.float64, errors=True)

        # a valid value must not raise
        frame.astype(np.int8, errors="ignore")

    def test_arg_for_errors_in_astype_dictlist(self):
        # GH-25905
        df = pd.DataFrame(
            [
                {"a": "1", "b": "16.5%", "c": "test"},
                {"a": "2.2", "b": "15.3", "c": "another_test"},
            ]
        )
        expected = pd.DataFrame(
            [
                {"a": 1.0, "b": "16.5%", "c": "test"},
                {"a": 2.2, "b": "15.3", "c": "another_test"},
            ]
        )
        type_dict = {"a": "float64", "b": "float64", "c": "object"}

        result = df.astype(dtype=type_dict, errors="ignore")

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "input_vals",
        [
            [1, 2],
            ["1", "2"],
            list(pd.date_range("1/1/2011", periods=2, freq="H")),
            list(pd.date_range("1/1/2011", periods=2, freq="H", tz="US/Eastern")),
            [pd.Interval(left=0, right=5)],
        ],
    )
    def test_constructor_list_str(self, input_vals, string_dtype):
        """Constructing with a string dtype equals constructing and then casting."""
        # GH 16605
        # Ensure that data elements are converted to strings when
        # dtype is str, 'str', or 'U'
        built_with_dtype = DataFrame({"A": input_vals}, dtype=string_dtype)
        cast_afterwards = DataFrame({"A": input_vals}).astype({"A": string_dtype})
        assert_frame_equal(built_with_dtype, cast_afterwards)

    def test_constructor_list_str_na(self, string_dtype):

        result = DataFrame({"A": [1.0, 2.0, None]}, dtype=string_dtype)
        expected = DataFrame({"A": ["1.0", "2.0", None]}, dtype=object)
        assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "data, expected",
        [
            # no columns at all
            (DataFrame(), True),
            # several columns, one shared dtype
            (DataFrame({"A": [1, 2], "B": [1, 2]}), True),
            # several object columns
            (
                DataFrame(
                    {
                        "A": np.array([1, 2], dtype=object),
                        "B": np.array(["a", "b"], dtype=object),
                    }
                ),
                True,
            ),
            # several extension columns with equal dtype
            (
                DataFrame(
                    {"A": pd.Categorical(["a", "b"]), "B": pd.Categorical(["a", "b"])}
                ),
                True,
            ),
            # different kinds (int vs float)
            (DataFrame({"A": [1, 2], "B": [1.0, 2.0]}), False),
            # same kind, different item size
            (
                DataFrame(
                    {
                        "A": np.array([1, 2], dtype=np.int32),
                        "B": np.array([1, 2], dtype=np.int64),
                    }
                ),
                False,
            ),
            # extension columns whose categories differ
            (
                DataFrame(
                    {"A": pd.Categorical(["a", "b"]), "B": pd.Categorical(["b", "c"])}
                ),
                False,
            ),
        ],
    )
    def test_is_homogeneous_type(self, data, expected):
        """``_is_homogeneous_type`` must be exactly the expected boolean."""
        homogeneous = data._is_homogeneous_type
        assert homogeneous is expected

    def test_asarray_homogenous(self):
        df = pd.DataFrame({"A": pd.Categorical([1, 2]), "B": pd.Categorical([1, 2])})
        result = np.asarray(df)
        # may change from object in the future
        expected = np.array([[1, 1], [2, 2]], dtype="object")
        tm.assert_numpy_array_equal(result, expected)
Beispiel #16
0
 def test_is_all_dates(self):
     """An ``IntervalIndex`` of timestamp intervals must not report ``is_all_dates``."""
     # GH 23576
     start = pd.Timestamp('2017-01-01 00:00:00')
     end = pd.Timestamp('2018-01-01 00:00:00')
     single_year_index = pd.IntervalIndex([pd.Interval(start, end)])
     assert not single_year_index.is_all_dates
Beispiel #17
0
    def testJSONSerialize(self):
        """Round-trip Node2/Node3 objects through ``JsonSerializeProvider``.

        For every serial type returned by ``self._get_serial_types()`` the test
        serializes a small object graph, pushes the result through
        ``json.dumps``/``json.loads``, deserializes it again and compares the
        fields of the original and the deserialized objects. It finally checks
        that serializing a ``Node3`` holding a plain string raises
        ``ValueError``.
        """
        for serial_type in self._get_serial_types():
            provider = JsonSerializeProvider(
                data_serial_type=serial_type,
                pickle_protocol=TEST_PICKLE_PROTOCOL)

            # Build the object graph; ``node2`` is referenced several times
            # (f1, f2 and inside h) on purpose, see the identity check below.
            node2 = Node2(a=[['ss'], ['dd']], data=[3, 7, 212])
            node1 = Node1(
                a='test1',
                b1=2,
                b2=2000,
                b3=5000,
                b4=500000,
                c1=2,
                c2=2000,
                c3=5000,
                c4=500000,
                d1=2.5,
                d2=7.37,
                d3=5.976321,
                cl1=1 + 2j,
                cl2=2.5 + 3.1j,
                e=False,
                f1=Node2Entity(node2),
                f2=Node2Entity(node2),
                g=Node2(a=[['1', '2'], ['3', '4']]),
                h=[[2, 3], node2, True, {
                    1: node2
                },
                   np.datetime64('1066-10-13'),
                   np.timedelta64(1, 'D'),
                   np.complex64(1 + 2j),
                   np.complex128(2 + 3j), lambda x: x + 2,
                   pytz.timezone('Asia/Shanghai'),
                   pd.arrays.IntervalArray(
                       [pd.Interval(0, 1),
                        pd.Interval(1, 5)]),
                   nt(1, 2)],
                i=[Node8(b1=111), Node8(b1=222)],
                j=Node2(a=[['u'], ['v']]),
                k=[Node5(a='uvw'),
                   Node8(b1=222, j=Node5(a='xyz')), None],
                l=lambda x: x + 1,
                m=pytz.timezone('Asia/Shanghai'),
                n=pd.arrays.IntervalArray(
                    [pd.Interval(0, 1), pd.Interval(1, 5)]),
                o=nt(3, 4))
            node3 = Node3(value=node1)

            # Serialize, then force an actual JSON text round trip so that only
            # JSON-representable data survives.
            serials = serializes(provider, [node2, node3])
            serials = [
                json.loads(json.dumps(s), object_hook=OrderedDict)
                for s in serials
            ]

            # ``pickle.loads`` is replaced by ``_loads_with_check`` only for the
            # PICKLE serial type; otherwise the original loader is patched in.
            # NOTE(review): what ``_loads_with_check`` verifies is defined
            # elsewhere in the file - confirm there.
            loads_fun = _loads_with_check if serial_type == dataserializer.SerialType.PICKLE \
                else original_pickle_loads
            with unittest.mock.patch('pickle.loads', new=loads_fun):
                d_node2, d_node3 = deserializes(provider, [Node2, Node3],
                                                serials)

            # node2: a new object with equal fields
            self.assertIsNot(node2, d_node2)
            self.assertEqual(node2.a, d_node2.a)
            self.assertEqual(node2.data, d_node2.data)

            # node3: the wrapped value is expected to come back as a Node8
            # although a Node1 was serialized.
            # NOTE(review): presumably the JSON provider maps Node1 to Node8 -
            # confirm against the Node1/Node8 definitions.
            self.assertIsNot(node3, d_node3)
            self.assertIsInstance(d_node3.value, Node8)
            self.assertIsNot(node3.value, d_node3.value)
            self.assertEqual(node3.value.a, d_node3.value.a)
            self.assertEqual(node3.value.b1, d_node3.value.b1)
            self.assertEqual(node3.value.b2, d_node3.value.b2)
            self.assertEqual(node3.value.b3, d_node3.value.b3)
            self.assertEqual(node3.value.b4, d_node3.value.b4)
            self.assertEqual(node3.value.c1, d_node3.value.c1)
            self.assertEqual(node3.value.c2, d_node3.value.c2)
            self.assertEqual(node3.value.c3, d_node3.value.c3)
            self.assertEqual(node3.value.c4, d_node3.value.c4)
            # float and complex fields are compared approximately
            self.assertAlmostEqual(node3.value.d1, d_node3.value.d1, places=2)
            self.assertAlmostEqual(node3.value.d2, d_node3.value.d2, places=4)
            self.assertAlmostEqual(node3.value.d3, d_node3.value.d3)
            self.assertAlmostEqual(node3.value.cl1, d_node3.value.cl1)
            self.assertAlmostEqual(node3.value.cl2, d_node3.value.cl2)
            self.assertEqual(node3.value.e, d_node3.value.e)
            self.assertIsNot(node3.value.f1, d_node3.value.f1)
            self.assertEqual(node3.value.f1.a, d_node3.value.f1.a)
            self.assertIsNot(node3.value.f2, d_node3.value.f2)
            self.assertEqual(node3.value.f2.a, d_node3.value.f2.a)
            self.assertIsNot(node3.value.g, d_node3.value.g)
            self.assertEqual(node3.value.g.a, d_node3.value.g.a)
            # h is the heterogeneous list; indices follow its construction order
            self.assertEqual(node3.value.h[0], d_node3.value.h[0])
            self.assertNotIsInstance(d_node3.value.h[1], str)
            # the shared node2 reference must deserialize to one shared object
            self.assertIs(d_node3.value.h[1], d_node3.value.f1)
            # NOTE(review): the h[2] and h[8] checks (and k[2] further down)
            # inspect the ORIGINAL node3, not d_node3 - confirm that is intended.
            self.assertEqual(node3.value.h[2], True)
            self.assertAlmostEqual(node3.value.h[6], d_node3.value.h[6])
            self.assertAlmostEqual(node3.value.h[7], d_node3.value.h[7])
            self.assertEqual(node3.value.h[8](2), 4)
            self.assertEqual(node3.value.h[9], d_node3.value.h[9])
            np.testing.assert_array_equal(node3.value.h[10],
                                          d_node3.value.h[10])
            self.assertEqual(node3.value.h[11], d_node3.value.h[11])
            self.assertEqual([n.b1 for n in node3.value.i],
                             [n.b1 for n in d_node3.value.i])
            self.assertIsInstance(d_node3.value.i[0], Node8)
            self.assertIsInstance(d_node3.value.j, Node2)
            self.assertEqual(node3.value.j.a, d_node3.value.j.a)
            self.assertIsInstance(d_node3.value.k[0], Node5)
            self.assertEqual(node3.value.k[0].a, d_node3.value.k[0].a)
            self.assertIsInstance(d_node3.value.k[1], Node8)
            self.assertEqual(node3.value.k[1].b1, d_node3.value.k[1].b1)
            self.assertIsInstance(d_node3.value.k[1].j, Node5)
            self.assertEqual(node3.value.k[1].j.a, d_node3.value.k[1].j.a)
            self.assertIsNone(node3.value.k[2])
            # the deserialized lambda must still be callable
            self.assertEqual(d_node3.value.l(1), 2)
            self.assertEqual(d_node3.value.m, node3.value.m)
            np.testing.assert_array_equal(d_node3.value.n, node3.value.n)
            self.assertEqual(d_node3.value.o, node3.value.o)

            # a value of an unsupported type must be rejected at serialization
            with self.assertRaises(ValueError):
                serializes(provider, [Node3(value='sth else')])
Beispiel #18
0
def create_file_index_for_climate_observations(
    parameter_set: DwdObservationDataset,
    resolution: Resolution,
    period: Period,
) -> pd.DataFrame:
    """
    Create the file index of DWD climate observation station data for one
    combination of parameter set, resolution and period.

    Only files whose name ends with the zip extension are kept, the station id
    is extracted from the file name, and rows with missing values are dropped.
    For high resolutions combined with the historical period, the date range
    encoded in the file name is additionally parsed into from/to dates and an
    interval column.

    Args:
        parameter_set: dataset (parameter set) to list files for
        resolution: time resolution of the data
        period: period type of the data
    Returns:
        file index in a pandas.DataFrame, sorted by station id and file name
    """
    file_index = _create_file_index_for_dwd_server(
        parameter_set, resolution, period, DWDCDCBase.CLIMATE_OBSERVATIONS
    )

    # Take an explicit copy of the filtered rows: the next statement adds a
    # column, and doing that on a boolean-mask slice triggers pandas'
    # SettingWithCopyWarning.
    file_index = file_index[
        file_index[DwdColumns.FILENAME.value].str.endswith(Extension.ZIP.value)
    ].copy()

    file_index[DwdColumns.STATION_ID.value] = (
        file_index[DwdColumns.FILENAME.value].str.findall(STATION_ID_REGEX).str[0]
    )

    # File names without a station id yield NaN above and are removed here
    # (together with any other row containing a missing value).
    file_index = file_index.dropna().reset_index(drop=True)

    file_index.loc[:, DwdColumns.STATION_ID.value] = file_index[
        DwdColumns.STATION_ID.value
    ].astype(str)

    if resolution in HIGH_RESOLUTIONS and period == Period.HISTORICAL:
        # Date range string for additional filtering of historical files
        file_index[DwdColumns.DATE_RANGE.value] = (
            file_index[DwdColumns.FILENAME.value].str.findall(DATE_RANGE_REGEX).str[0]
        )

        file_index[[DwdColumns.FROM_DATE.value, DwdColumns.TO_DATE.value]] = file_index[
            DwdColumns.DATE_RANGE.value
        ].str.split("_", expand=True)

        # Both boundary columns are parsed the same way (UTC timestamps).
        for date_column in (DwdColumns.FROM_DATE.value, DwdColumns.TO_DATE.value):
            file_index[date_column] = pd.to_datetime(
                file_index[date_column],
                format=DatetimeFormat.YMD.value,
                utc=True,
            )

        # Temporary fix for filenames with wrong ordered/faulty dates
        # Fill those cases with minimum/maximum date to ensure that they are loaded as
        # we don't know what exact date range the included data has
        wrong_date_order_index = (
            file_index[DwdColumns.FROM_DATE.value]
            > file_index[DwdColumns.TO_DATE.value]
        )

        file_index.loc[wrong_date_order_index, DwdColumns.FROM_DATE.value] = file_index[
            DwdColumns.FROM_DATE.value
        ].min()
        file_index.loc[wrong_date_order_index, DwdColumns.TO_DATE.value] = file_index[
            DwdColumns.TO_DATE.value
        ].max()

        # One closed interval [from, to] per file
        file_index[DwdColumns.INTERVAL.value] = file_index.apply(
            lambda x: pd.Interval(
                left=x[DwdColumns.FROM_DATE.value],
                right=x[DwdColumns.TO_DATE.value],
                closed="both",
            ),
            axis=1,
        )

    file_index = file_index.sort_values(
        by=[DwdColumns.STATION_ID.value, DwdColumns.FILENAME.value]
    )

    return file_index
Beispiel #19
0
        "datetime64[ns]",
        np.dtype("datetime64[ns]"),
        {},
    ),
    (
        pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
        object,
        np.dtype("datetime64[ns]"),
        {
            ("infer_objects", False): np.dtype("object")
        },
    ),
    (pd.period_range("1/1/2011", freq="M",
                     periods=3), None, pd.PeriodDtype("M"), {}),
    (
        pd.arrays.IntervalArray([pd.Interval(0, 1),
                                 pd.Interval(1, 5)]),
        None,
        pd.IntervalDtype("int64"),
        {},
    ),
]


class TestSeriesConvertDtypes:
    @pytest.mark.parametrize(
        "data, maindtype, expected_default, expected_other",
        test_cases,
    )
    @pytest.mark.parametrize("params", product(*[(True, False)] * 5))
    def test_convert_dtypes(self, data, maindtype, params, expected_default,
Beispiel #20
0
    ('datetime64',
     [np.datetime64('2013-01-01'), np.nan,
      np.datetime64('2018-01-01')]),
    ('datetime', [pd.Timestamp('20130101'), np.nan,
                  pd.Timestamp('20180101')]),
    ('date', [date(2013, 1, 1), np.nan,
              date(2018, 1, 1)]),
    # The following two dtypes are commented out due to GH 23554
    # ('complex', [1 + 1j, np.nan, 2 + 2j]),
    # ('timedelta64', [np.timedelta64(1, 'D'),
    #                  np.nan, np.timedelta64(2, 'D')]),
    ('timedelta', [timedelta(1), np.nan, timedelta(2)]),
    ('time', [time(1), np.nan, time(2)]),
    ('period', [pd.Period(2013), pd.NaT,
                pd.Period(2018)]),
    ('interval', [pd.Interval(0, 1), np.nan,
                  pd.Interval(0, 2)])
]
ids, _ = zip(*_any_skipna_inferred_dtype)  # use inferred type as fixture-id


@pytest.fixture(params=_any_skipna_inferred_dtype, ids=ids)
def any_skipna_inferred_dtype(request):
    """
    Fixture for all inferred dtypes from _libs.lib.infer_dtype

    The covered (inferred) types are:
    * 'string'
    * 'unicode' (if PY2)
    * 'empty'
    * 'bytes' (if PY3)
def correlate_binned_data(top_donor_data, binned, bins):
    """Plot the shifted correlation between top-donor data and each lower bin.

    For every interval ``(bins[i - 1], bins[i]]`` with ``i`` in
    ``range(1, len(bins) - 1)`` the group's data is correlated with the
    top-donor data for every shift in ``[-300, 300)``; the correlation curves
    and their maxima are printed and plotted.

    Args:
        top_donor_data: frame with "donated_amount" and "bin" columns (both are
            dropped before use) - presumably the highest-donation bin; verify
            against the caller.
        binned: grouped object supporting ``get_group(pd.Interval)``.
        bins: sequence of bin edges.

    NOTE(review): the legend labels are hard-coded for three intervals
    (low/medium/large donors); confirm the caller always passes matching bins.
    """
    # PREP TOP DONATORS
    plot_hourly_runned_summed_data(binned, bins)

    print("TOP DONATION DF")
    print(top_donor_data.head())

    top_donor_data = top_donor_data.drop("donated_amount", axis=1)
    top_donor_data = top_donor_data.drop("bin", axis=1)
    top_donor_data_per_hour = fix_intervals_for_data(top_donor_data)

    corrs = []
    hours_to_shift = 300
    shifts = range(-hours_to_shift, hours_to_shift)

    # get and print correlations for binned groups
    for bin_index in range(1, len(bins) - 1):
        left = bins[bin_index - 1]
        right = bins[bin_index]
        interval = pd.Interval(left=left, right=right)
        print("Interval to compare with top-donors", interval)

        data_to_comp = binned.get_group(interval)

        print(data_to_comp.head())
        data_to_comp = data_to_comp.drop("donated_amount", axis=1)
        data_to_comp = data_to_comp.drop("bin", axis=1)
        data_to_comp = fix_intervals_for_data(data_to_comp)

        # Assuming top donor data is the most influential

        corr_per_range = []
        # The shift variable has its own name: the previous version reused the
        # outer loop variable here. The per-iteration deep copies were dropped
        # because the data is only sliced and read, never modified.
        for shift in shifts:
            # hours are actually reversed
            if shift > 0:
                top_shifted = top_donor_data_per_hour[:-shift]
                comp_shifted = data_to_comp[shift:]
            elif shift < 0:
                top_shifted = top_donor_data_per_hour[-shift:]
                comp_shifted = data_to_comp[:shift]
            else:
                # zero shift: compare the unshifted data
                top_shifted = top_donor_data_per_hour
                comp_shifted = data_to_comp
            corr = np.corrcoef(
                top_shifted,
                comp_shifted)[0, 1]  # grab the compared correlation
            corr_per_range.append(corr)

        corrs.append(
            (interval,
             corr_per_range))  # append tuple -> interval, corrs over the hours

    # init new plot
    fig, ax = plt.subplots()
    # reverse the hour amount as this is logical for the graph
    x = np.asarray(shifts) * -1
    for interval, corr_per_range in corrs:
        corrs_shifted = np.asarray(corr_per_range)

        ax.plot(x,
                corrs_shifted,
                label="Interval: " + str(interval),
                alpha=0.5)
        xmax = x[np.argmax(corrs_shifted)]
        ymax = corrs_shifted.max()

        print("Max correlation coefficients")
        print(xmax, ymax, interval)

        ax.plot(xmax, ymax, marker="o", ls="", ms=3)

    plt.ylabel("Correlation coefficient", fontsize=30)
    plt.xlabel("Hours shifted", fontsize=30)
    plt.legend(loc='upper left',
               labels=[
                   'Low donors', 'Peak correlation: Low donors',
                   'Medium donors', 'Peak correlation: medium donors',
                   'Large donors', 'Peak correlation: Large donors'
               ],
               prop={'size': 20})
    plt.yticks(fontsize=20)
    plt.xticks(fontsize=20)
    plt.show()
Beispiel #22
0
class TestFillnaSeriesCoercion(CoercionBase):
    """Coercion checks for ``fillna`` on Index and Series objects.

    ``fillna`` is not an indexing operation; the tests are kept here for
    consistency with the other coercion test classes.
    """

    method = "fillna"

    @pytest.mark.xfail(reason="Test not implemented")
    def test_has_comprehensive_tests(self):
        raise NotImplementedError

    def _assert_fillna_conversion(self, original, value, expected,
                                  expected_dtype):
        """Fill a copy of ``original`` and compare against ``expected``.

        The result of ``fillna(value)`` must equal ``expected`` and carry
        ``expected_dtype`` as its dtype.
        """
        filled = original.copy().fillna(value)
        tm.assert_equal(filled, expected)
        assert filled.dtype == expected_dtype

    @pytest.mark.parametrize(
        "fill_val, fill_dtype",
        [
            (1, object),
            (1.1, object),
            (1 + 1j, object),
            (True, object),
        ],
    )
    def test_fillna_object(self, index_or_series, fill_val, fill_dtype):
        """Fill the NaN slot of object data with each parametrized value."""
        source = index_or_series(["a", np.nan, "c", "d"])
        assert source.dtype == object

        wanted = index_or_series(["a", fill_val, "c", "d"])
        self._assert_fillna_conversion(source, fill_val, wanted, fill_dtype)

    @pytest.mark.parametrize(
        "fill_val,fill_dtype",
        [
            (1, np.float64),
            (1.1, np.float64),
            (1 + 1j, np.complex128),
            (True, object),
        ],
    )
    def test_fillna_float64(self, index_or_series, fill_val, fill_dtype):
        """Fill the NaN slot of float64 data with each parametrized value."""
        source = index_or_series([1.1, np.nan, 3.3, 4.4])
        assert source.dtype == np.float64

        wanted = index_or_series([1.1, fill_val, 3.3, 4.4])
        self._assert_fillna_conversion(source, fill_val, wanted, fill_dtype)

    @pytest.mark.parametrize(
        "fill_val,fill_dtype",
        [
            (1, np.complex128),
            (1.1, np.complex128),
            (1 + 1j, np.complex128),
            (True, object),
        ],
    )
    def test_fillna_complex128(self, index_or_series, fill_val, fill_dtype):
        """Fill the NaN slot of complex128 data with each parametrized value."""
        source = index_or_series([1 + 1j, np.nan, 3 + 3j, 4 + 4j],
                                 dtype=np.complex128)
        assert source.dtype == np.complex128

        wanted = index_or_series([1 + 1j, fill_val, 3 + 3j, 4 + 4j])
        self._assert_fillna_conversion(source, fill_val, wanted, fill_dtype)

    @pytest.mark.parametrize(
        "fill_val,fill_dtype",
        [
            (pd.Timestamp("2012-01-01"), "datetime64[ns]"),
            (pd.Timestamp("2012-01-01", tz="US/Eastern"), object),
            (1, object),
            ("x", object),
        ],
        ids=["datetime64", "datetime64tz", "object", "object"],
    )
    def test_fillna_datetime(self, index_or_series, fill_val, fill_dtype):
        """Fill the ``NaT`` slot of tz-naive datetime data."""
        first, third, fourth = (
            pd.Timestamp(day)
            for day in ("2011-01-01", "2011-01-03", "2011-01-04")
        )
        source = index_or_series([first, pd.NaT, third, fourth])
        assert source.dtype == "datetime64[ns]"

        wanted = index_or_series([first, fill_val, third, fourth])
        self._assert_fillna_conversion(source, fill_val, wanted, fill_dtype)

    @pytest.mark.parametrize(
        "fill_val,fill_dtype",
        [
            (pd.Timestamp("2012-01-01",
                          tz="US/Eastern"), "datetime64[ns, US/Eastern]"),
            (pd.Timestamp("2012-01-01"), object),
            (pd.Timestamp("2012-01-01", tz="Asia/Tokyo"), object),
            (1, object),
            ("x", object),
        ],
    )
    def test_fillna_datetime64tz(self, index_or_series, fill_val, fill_dtype):
        """Fill the ``NaT`` slot of US/Eastern tz-aware datetime data.

        A ``FutureWarning`` is expected when the fill value carries a
        timezone different from the data's.
        """
        tz = "US/Eastern"
        first, third, fourth = (
            pd.Timestamp(day, tz=tz)
            for day in ("2011-01-01", "2011-01-03", "2011-01-04")
        )

        source = index_or_series([first, pd.NaT, third, fourth])
        assert source.dtype == "datetime64[ns, US/Eastern]"

        # Once deprecation is enforced, the filled slot becomes:
        # fill_val.tz_convert(tz) if getattr(fill_val, "tz", None)
        #  is not None else fill_val
        wanted = index_or_series([first, fill_val, third, fourth])

        fill_tz = getattr(fill_val, "tz", None)
        if fill_tz is not None and fill_val.tz != source[0].tz:
            warn = FutureWarning
        else:
            warn = None
        with tm.assert_produces_warning(warn, match="mismatched timezone"):
            self._assert_fillna_conversion(source, fill_val, wanted,
                                           fill_dtype)

    @pytest.mark.parametrize(
        "fill_val",
        [
            1,
            1.1,
            1 + 1j,
            True,
            pd.Interval(1, 2, inclusive="left"),
            pd.Timestamp("2012-01-01", tz="US/Eastern"),
            pd.Timestamp("2012-01-01"),
            pd.Timedelta(days=1),
            pd.Period("2016-01-01", "D"),
        ],
    )
    def test_fillna_interval(self, index_or_series, fill_val):
        """Fill the NaN slot of interval data; object dtype is expected."""
        intervals = pd.interval_range(1.0, 5.0, inclusive="right").insert(
            1, np.nan)
        assert isinstance(intervals.dtype, pd.IntervalDtype)
        source = index_or_series(intervals)

        wanted_values = [
            intervals[0], fill_val, intervals[2], intervals[3], intervals[4]
        ]
        wanted = index_or_series(wanted_values, dtype=object)

        self._assert_fillna_conversion(source, fill_val, wanted, object)

    @pytest.mark.xfail(reason="Test not implemented")
    def test_fillna_series_int64(self):
        raise NotImplementedError

    @pytest.mark.xfail(reason="Test not implemented")
    def test_fillna_index_int64(self):
        raise NotImplementedError

    @pytest.mark.xfail(reason="Test not implemented")
    def test_fillna_series_bool(self):
        raise NotImplementedError

    @pytest.mark.xfail(reason="Test not implemented")
    def test_fillna_index_bool(self):
        raise NotImplementedError

    @pytest.mark.xfail(reason="Test not implemented")
    def test_fillna_series_timedelta64(self):
        raise NotImplementedError

    @pytest.mark.parametrize(
        "fill_val",
        [
            1,
            1.1,
            1 + 1j,
            True,
            pd.Interval(1, 2, inclusive="left"),
            pd.Timestamp("2012-01-01", tz="US/Eastern"),
            pd.Timestamp("2012-01-01"),
            pd.Timedelta(days=1),
            pd.Period("2016-01-01", "W"),
        ],
    )
    def test_fillna_series_period(self, index_or_series, fill_val):
        """Fill the ``NaT`` slot of period data; object dtype is expected."""
        periods = pd.period_range("2016-01-01", periods=4,
                                  freq="D").insert(1, pd.NaT)
        assert isinstance(periods.dtype, pd.PeriodDtype)
        source = index_or_series(periods)

        wanted_values = [
            periods[0], fill_val, periods[2], periods[3], periods[4]
        ]
        wanted = index_or_series(wanted_values, dtype=object)

        self._assert_fillna_conversion(source, fill_val, wanted, object)

    @pytest.mark.xfail(reason="Test not implemented")
    def test_fillna_index_timedelta64(self):
        raise NotImplementedError

    @pytest.mark.xfail(reason="Test not implemented")
    def test_fillna_index_period(self):
        raise NotImplementedError
Beispiel #23
0
 ),
 (
     pd.TimedeltaIndex(["1H", "2H"]),
     None,
     TimedeltaArray._from_sequence(["1H", "2H"]),
 ),
 # Category
 (["a", "b"], "category", pd.Categorical(["a", "b"])),
 (
     ["a", "b"],
     pd.CategoricalDtype(None, ordered=True),
     pd.Categorical(["a", "b"], ordered=True),
 ),
 # Interval
 (
     [pd.Interval(1, 2), pd.Interval(3, 4)],
     "interval",
     IntervalArray.from_tuples([(1, 2), (3, 4)]),
 ),
 # Sparse
 ([0, 1], "Sparse[int64]", SparseArray([0, 1], dtype="int64")),
 # IntegerNA
 ([1, None], "Int16", integer_array([1, None], dtype="Int16")),
 (pd.Series([1, 2]), None, PandasArray(np.array([1, 2],
                                                dtype=np.int64))),
 # String
 (["a", None], "string", StringArray._from_sequence(["a", None])),
 (
     ["a", None],
     pd.StringDtype(),
     StringArray._from_sequence(["a", None]),
Beispiel #24
0
@pytest.mark.parametrize(
    "array, expected",
    [
        (np.array([1, 2], dtype=np.int64), np.array([1, 2], dtype=np.int64)),
        (pd.Categorical(["a", "b"]), np.array(["a", "b"], dtype=object)),
        (
            pd.core.arrays.period_array(["2000", "2001"], freq="D"),
            np.array(
                [pd.Period("2000", freq="D"),
                 pd.Period("2001", freq="D")]),
        ),
        (pd.array([0, np.nan],
                  dtype="Int64"), np.array([0, pd.NA], dtype=object)),
        (
            IntervalArray.from_breaks([0, 1, 2]),
            np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object),
        ),
        (SparseArray([0, 1]), np.array([0, 1], dtype=np.int64)),
        # tz-naive datetime
        (
            DatetimeArray(np.array(["2000", "2001"], dtype="M8[ns]")),
            np.array(["2000", "2001"], dtype="M8[ns]"),
        ),
        # tz-aware stays tz-aware
        (
            DatetimeArray(
                np.array(["2000-01-01T06:00:00", "2000-01-02T06:00:00"],
                         dtype="M8[ns]"),
                dtype=DatetimeTZDtype(tz="US/Central"),
            ),
            np.array([
Beispiel #25
0
class TestCategoricalIndex(Base):
    _holder = CategoricalIndex

    @pytest.fixture
    def indices(self, request):
        """Fixture: the index produced by ``tm.makeCategoricalIndex(100)``."""
        made = tm.makeCategoricalIndex(100)
        return made

    def create_index(self, categories=None, ordered=False):
        """Build the ``"aabbca"`` CategoricalIndex shared by these tests.

        ``categories`` falls back to ``list("cab")`` when it is ``None``;
        ``ordered`` is forwarded unchanged.
        """
        chosen = list("cab") if categories is None else categories
        return CategoricalIndex(list("aabbca"),
                                categories=chosen,
                                ordered=ordered)

    def test_can_hold_identifiers(self):
        """The first label of the index is accepted as an identifier."""
        index = self.create_index(categories=list("abcd"))
        first_label = index[0]
        holds = index._can_hold_identifiers_and_holds_name(first_label)
        assert holds is True

    @pytest.mark.parametrize(
        "func,op_name",
        [
            (lambda idx: idx - idx, "__sub__"),
            (lambda idx: idx + idx, "__add__"),
            (lambda idx: idx - ["a", "b"], "__sub__"),
            (lambda idx: idx + ["a", "b"], "__add__"),
            (lambda idx: ["a", "b"] - idx, "__rsub__"),
            (lambda idx: ["a", "b"] + idx, "__radd__"),
        ],
    )
    def test_disallow_addsub_ops(self, func, op_name):
        """Set-style ``+``/``-`` operations must raise TypeError (GH 10039)."""
        cat_index = pd.Index(pd.Categorical(["a", "b"]))
        expected_msg = (
            f"cannot perform {op_name} with this index type: CategoricalIndex"
        )
        with pytest.raises(TypeError, match=expected_msg):
            func(cat_index)

    def test_method_delegation(self):
        """Category-manipulation methods are available on CategoricalIndex."""
        labels = list("aabbca")

        # set_categories
        source = CategoricalIndex(labels, categories=list("cabdef"))
        outcome = source.set_categories(list("cab"))
        wanted = CategoricalIndex(labels, categories=list("cab"))
        tm.assert_index_equal(outcome, wanted)

        # rename_categories with a list
        source = CategoricalIndex(labels, categories=list("cab"))
        outcome = source.rename_categories(list("efg"))
        wanted = CategoricalIndex(list("ffggef"), categories=list("efg"))
        tm.assert_index_equal(outcome, wanted)

        # GH18862 (let rename_categories take callables)
        outcome = source.rename_categories(lambda x: x.upper())
        wanted = CategoricalIndex(list("AABBCA"), categories=list("CAB"))
        tm.assert_index_equal(outcome, wanted)

        # add_categories
        source = CategoricalIndex(labels, categories=list("cab"))
        outcome = source.add_categories(["d"])
        wanted = CategoricalIndex(labels, categories=list("cabd"))
        tm.assert_index_equal(outcome, wanted)

        # remove_categories: the removed label shows up as NaN
        source = CategoricalIndex(labels, categories=list("cab"))
        outcome = source.remove_categories(["c"])
        wanted = CategoricalIndex(list("aabb") + [np.nan] + ["a"],
                                  categories=list("ab"))
        tm.assert_index_equal(outcome, wanted)

        # as_unordered
        source = CategoricalIndex(labels, categories=list("cabdef"))
        tm.assert_index_equal(source.as_unordered(), source)

        # as_ordered
        source = CategoricalIndex(labels, categories=list("cabdef"))
        outcome = source.as_ordered()
        wanted = CategoricalIndex(labels,
                                  categories=list("cabdef"),
                                  ordered=True)
        tm.assert_index_equal(outcome, wanted)

        # invalid
        with pytest.raises(ValueError,
                           match="cannot use inplace with CategoricalIndex"):
            source.set_categories(list("cab"), inplace=True)

    def test_contains(self):
        """``in`` checks labels actually present, not categories or codes."""
        index = self.create_index(categories=list("cabdef"))

        assert "a" in index

        # Unknown label, unused category, NaN, and raw codes (0, 1) must
        # all be reported as absent.
        for absent in ("z", "e", np.nan, 0, 1):
            assert absent not in index

        with_nan = CategoricalIndex(list("aabbca") + [np.nan],
                                    categories=list("cabdef"))
        assert np.nan in with_nan

    @pytest.mark.parametrize(
        "item, expected",
        [
            (pd.Interval(0, 1), True),
            (1.5, True),
            (pd.Interval(0.5, 1.5), False),
            ("a", False),
            (pd.Timestamp(1), False),
            (pd.Timedelta(1), False),
        ],
        ids=str,
    )
    def test_contains_interval(self, item, expected):
        """Membership on an interval-backed CategoricalIndex (GH 23705)."""
        breaks = range(3)
        index = CategoricalIndex(IntervalIndex.from_breaks(breaks))
        assert (item in index) is expected

    def test_contains_list(self):
        """Membership tests with a list operand raise TypeError (GH#21729)."""
        index = pd.CategoricalIndex([1, 2, 3])

        assert "a" not in index

        for unhashable in (["a"], ["a", "b"]):
            with pytest.raises(TypeError, match="unhashable type"):
                unhashable in index

    def test_map(self):
        """Exercise ``map`` with callables, a Series and a dict."""

        def _lower(label):
            return label.lower()

        upper_labels = list("ABABC")
        lower_labels = list("ababc")

        ordered = pd.CategoricalIndex(upper_labels,
                                      categories=list("CBA"),
                                      ordered=True)
        mapped = ordered.map(_lower)
        wanted = pd.CategoricalIndex(lower_labels,
                                     categories=list("cba"),
                                     ordered=True)
        tm.assert_index_equal(mapped, wanted)

        named = pd.CategoricalIndex(upper_labels,
                                    categories=list("BAC"),
                                    ordered=False,
                                    name="XXX")
        mapped = named.map(_lower)
        wanted = pd.CategoricalIndex(lower_labels,
                                     categories=list("bac"),
                                     ordered=False,
                                     name="XXX")
        tm.assert_index_equal(mapped, wanted)

        # GH 12766: Return an index not an array
        constant = named.map(lambda x: 1)
        wanted = Index(np.array([1] * 5, dtype=np.int64), name="XXX")
        tm.assert_index_equal(constant, wanted)

        # change categories dtype
        unnamed = pd.CategoricalIndex(upper_labels,
                                      categories=list("BAC"),
                                      ordered=False)

        def _to_number(label):
            return {"A": 10, "B": 20, "C": 30}.get(label)

        wanted = pd.CategoricalIndex([10, 20, 10, 20, 30],
                                     categories=[20, 10, 30],
                                     ordered=False)

        # The same mapping expressed as a function, a Series and a dict.
        mappers = (
            _to_number,
            pd.Series([10, 20, 30], index=["A", "B", "C"]),
            {"A": 10, "B": 20, "C": 30},
        )
        for mapper in mappers:
            tm.assert_index_equal(unnamed.map(mapper), wanted)

    def test_map_with_categorical_series(self):
        """Map an Index through categorical and object Series (GH 12756)."""
        keys = pd.Index([1, 2, 3, 4])
        parity = ["even", "odd", "even", "odd"]
        wanted_labels = ["odd", "even", "odd", np.nan]

        categorical_mapper = pd.Series(parity, dtype="category")
        object_mapper = pd.Series(parity)

        tm.assert_index_equal(keys.map(categorical_mapper),
                              CategoricalIndex(wanted_labels))
        tm.assert_index_equal(keys.map(object_mapper),
                              pd.Index(wanted_labels))

    @pytest.mark.parametrize(
        ("data", "f"),
        (
            ([1, 1, np.nan], pd.isna),
            ([1, 2, np.nan], pd.isna),
            ([1, 1, np.nan], {
                1: False
            }),
            ([1, 2, np.nan], {
                1: False,
                2: False
            }),
            ([1, 1, np.nan], pd.Series([False, False])),
            ([1, 2, np.nan], pd.Series([False, False, False])),
        ),
    )
    def test_map_with_nan(self, data, f):  # GH 24241
        """Map a Categorical containing NaN through each kind of mapper.

        When the second data element is ``1`` a Categorical is expected;
        otherwise an Index is expected.
        """
        mapped = pd.Categorical(data).map(f)
        wanted_values = [False, False, np.nan]
        if data[1] != 1:
            tm.assert_index_equal(mapped, pd.Index(wanted_values))
            return
        tm.assert_categorical_equal(mapped, pd.Categorical(wanted_values))

    @pytest.mark.parametrize("klass", [list, tuple, np.array, pd.Series])
    def test_where(self, klass):
        """``where`` keeps True positions and puts NaN at False positions."""
        index = self.create_index()
        size = len(index)

        keep_all = klass([True] * size)
        tm.assert_index_equal(index.where(keep_all), index)

        drop_first = klass([False] + [True] * (size - 1))
        wanted = CategoricalIndex([np.nan] + index[1:].tolist(),
                                  categories=index.categories)
        tm.assert_index_equal(index.where(drop_first), wanted)

    def test_append(self):
        """Cover valid and invalid ``append`` combinations."""
        index = self.create_index()
        cats = index.categories

        # append cats with the same categories
        rejoined = index[:3].append(index[3:])
        tm.assert_index_equal(rejoined, index, exact=True)

        head, middle, tail = index[:1], index[1:3], index[3:]
        rejoined = head.append([middle, tail])
        tm.assert_index_equal(rejoined, index, exact=True)

        # empty
        tm.assert_index_equal(index.append([]), index, exact=True)

        # appending with different categories or reordered is not ok
        not_index_msg = "all inputs must be Index"
        with pytest.raises(TypeError, match=not_index_msg):
            index.append(index.values.set_categories(list("abcd")))
        with pytest.raises(TypeError, match=not_index_msg):
            index.append(index.values.reorder_categories(list("abc")))

        # with objects
        combined = index.append(Index(["c", "a"]))
        wanted = CategoricalIndex(list("aabbcaca"), categories=cats)
        tm.assert_index_equal(combined, wanted, exact=True)

        # invalid objects
        with pytest.raises(
                TypeError,
                match="cannot append a non-category item to a CategoricalIndex",
        ):
            index.append(Index(["a", "d"]))

        # GH14298 - if base object is not categorical -> coerce to object
        combined = Index(["c", "a"]).append(index)
        wanted = Index(list("caaabbca"))
        tm.assert_index_equal(combined, wanted, exact=True)

    def test_append_to_another(self):
        """Appending a CategoricalIndex to an object Index gives an Index."""
        # hits Index._concat_same_dtype
        plain = Index(["a", "b"])
        categorical = CategoricalIndex(["d", "e"])
        combined = plain.append(categorical)
        tm.assert_index_equal(combined, Index(["a", "b", "d", "e"]))

    def test_insert(self):
        """Cover ``insert`` at both ends, into an empty index, and failures."""
        index = self.create_index()
        cats = index.categories

        # position 0, then position -1 (follows Python list behavior)
        for position, labels in ((0, "aaabbca"), (-1, "aabbcaa")):
            inserted = index.insert(position, "a")
            wanted = CategoricalIndex(list(labels), categories=cats)
            tm.assert_index_equal(inserted, wanted, exact=True)

        # test empty
        inserted = CategoricalIndex(categories=cats).insert(0, "a")
        wanted = CategoricalIndex(["a"], categories=cats)
        tm.assert_index_equal(inserted, wanted, exact=True)

        # invalid
        not_a_category = ("cannot insert an item into a CategoricalIndex "
                          "that is not already an existing category")
        with pytest.raises(TypeError, match=not_a_category):
            index.insert(0, "d")

        # GH 18295 (test missing)
        wanted = CategoricalIndex(["a", np.nan, "a", "b", "c", "b"])
        for missing in (np.nan, pd.NaT, None):
            inserted = CategoricalIndex(list("aabcb")).insert(1, missing)
            tm.assert_index_equal(inserted, wanted)

    def test_delete(self):
        """Delete the first and last element; an out-of-range index raises."""
        index = self.create_index()
        cats = index.categories

        for position, remaining in ((0, "abbca"), (-1, "aabbc")):
            trimmed = index.delete(position)
            wanted = CategoricalIndex(list(remaining), categories=cats)
            tm.assert_index_equal(trimmed, wanted, exact=True)

        # Either exception may be raised, depending on the NumPy version.
        with pytest.raises((IndexError, ValueError)):
            index.delete(10)

    def test_astype(self):
        """Cast to object and to interval dtype."""
        index = self.create_index()
        as_object = index.astype(object)
        tm.assert_index_equal(as_object, Index(np.array(index)))

        # this IS equal, but not the same class
        assert as_object.equals(index)
        assert isinstance(as_object, Index)
        assert not isinstance(as_object, CategoricalIndex)

        # interval
        intervals = IntervalIndex.from_arrays(left=[-0.001, 2.0],
                                              right=[2, 4],
                                              closed="right")
        codes = [0, 1, -1]
        categorical = Categorical.from_codes(codes,
                                             categories=intervals,
                                             ordered=True)
        index = CategoricalIndex(categorical)

        as_interval = index.astype("interval")
        wanted = intervals.take([0, 1, -1])
        tm.assert_index_equal(as_interval, wanted)

        rebuilt = IntervalIndex(as_interval.values)
        tm.assert_index_equal(rebuilt, wanted)

    @pytest.mark.parametrize("name", [None, "foo"])
    @pytest.mark.parametrize("dtype_ordered", [True, False])
    @pytest.mark.parametrize("index_ordered", [True, False])
    def test_astype_category(self, name, dtype_ordered, index_ordered):
        """Cast to categorical dtypes with and without explicit categories.

        GH 18630
        """
        base = self.create_index(ordered=index_ordered)
        index = base.rename(name) if name else base

        # standard categories
        target_dtype = CategoricalDtype(ordered=dtype_ordered)
        converted = index.astype(target_dtype)
        wanted = CategoricalIndex(
            index.tolist(),
            name=name,
            categories=index.categories,
            ordered=dtype_ordered,
        )
        tm.assert_index_equal(converted, wanted)

        # non-standard categories
        reduced_categories = index.unique().tolist()[:-1]
        target_dtype = CategoricalDtype(reduced_categories, dtype_ordered)
        converted = index.astype(target_dtype)
        wanted = CategoricalIndex(index.tolist(),
                                  name=name,
                                  dtype=target_dtype)
        tm.assert_index_equal(converted, wanted)

        if dtype_ordered is False:
            # dtype='category' can't specify ordered, so only test once
            tm.assert_index_equal(index.astype("category"), index)

    def test_reindex_base(self):
        """``get_indexer`` against itself; an unknown method raises."""
        # Determined by cat ordering.
        labels = list("cab")
        index = CategoricalIndex(labels, categories=list("cab"))
        positions = np.arange(len(index), dtype=np.intp)

        tm.assert_numpy_array_equal(positions, index.get_indexer(index))

        with pytest.raises(ValueError, match="Invalid fill method"):
            index.get_indexer(index, method="invalid")

    def test_reindexing(self):
        """``get_indexer`` must agree with the object Index equivalent."""
        np.random.seed(123456789)

        cat_index = self.create_index()
        obj_index = Index(np.array(cat_index))

        def _check(finder):
            # Reference answer comes from the plain object index.
            reference = obj_index.get_indexer_non_unique(finder)[0]
            observed = cat_index.get_indexer(finder)
            tm.assert_numpy_array_equal(reference, observed)

        for size in (1, 2, 5, len(cat_index)):
            picks = np.random.randint(0, len(cat_index), size=size)
            _check(obj_index[picks])

        # see gh-17323
        #
        # Even when indexer is equal to the
        # members in the index, we should
        # respect duplicates instead of taking
        # the fast-track path.
        for labels in ("aabbca", "aababca"):
            _check(list(labels))

    def test_reindex_dtype(self):
        """Result type of ``reindex`` for list and Categorical targets."""
        labels = ["a", "b", "c", "a"]
        wide_categories = ["a", "b", "c", "d"]
        wanted_indexer = np.array([0, 3, 2], dtype=np.intp)

        # list target
        source = CategoricalIndex(labels)
        outcome, indexer = source.reindex(["a", "c"])
        tm.assert_index_equal(outcome, Index(["a", "a", "c"]), exact=True)
        tm.assert_numpy_array_equal(indexer, wanted_indexer)

        # Categorical target
        source = CategoricalIndex(labels)
        outcome, indexer = source.reindex(Categorical(["a", "c"]))
        wanted = CategoricalIndex(["a", "a", "c"], categories=["a", "c"])
        tm.assert_index_equal(outcome, wanted, exact=True)
        tm.assert_numpy_array_equal(indexer, wanted_indexer)

        # list target, source has an unused category
        source = CategoricalIndex(labels, categories=wide_categories)
        outcome, indexer = source.reindex(["a", "c"])
        wanted = Index(["a", "a", "c"], dtype="object")
        tm.assert_index_equal(outcome, wanted, exact=True)
        tm.assert_numpy_array_equal(indexer, wanted_indexer)

        # Categorical target, source has an unused category
        source = CategoricalIndex(labels, categories=wide_categories)
        outcome, indexer = source.reindex(Categorical(["a", "c"]))
        wanted = CategoricalIndex(["a", "a", "c"], categories=["a", "c"])
        tm.assert_index_equal(outcome, wanted, exact=True)
        tm.assert_numpy_array_equal(indexer, wanted_indexer)

    def test_reindex_duplicate_target(self):
        """Reindex against targets that repeat a label (GH25459)."""
        all_categories = ["a", "b", "c", "d"]
        target_labels = ["a", "c", "c"]
        wanted_indexer = np.array([0, 2, 2], dtype=np.intp)

        source = CategoricalIndex(["a", "b", "c"], categories=all_categories)

        # list target
        outcome, indexer = source.reindex(target_labels)
        wanted = Index(target_labels, dtype="object")
        tm.assert_index_equal(outcome, wanted, exact=True)
        tm.assert_numpy_array_equal(indexer, wanted_indexer)

        # CategoricalIndex target
        target = CategoricalIndex(target_labels, categories=all_categories)
        outcome, indexer = source.reindex(target)
        wanted = CategoricalIndex(target_labels, categories=all_categories)
        tm.assert_index_equal(outcome, wanted, exact=True)
        tm.assert_numpy_array_equal(indexer, wanted_indexer)

    def test_reindex_empty_index(self):
        """Reindexing an empty CategoricalIndex (GH16770)."""
        empty = CategoricalIndex([])
        outcome, indexer = empty.reindex(["a", "b"])

        tm.assert_index_equal(outcome, Index(["a", "b"]), exact=True)

        no_match = np.array([-1, -1], dtype=np.intp)
        tm.assert_numpy_array_equal(indexer, no_match)

    @pytest.mark.parametrize(
        "data, non_lexsorted_data",
        [[[1, 2, 3], [9, 0, 1, 2, 3]], [list("abc"),
                                        list("fabcd")]],
    )
    def test_is_monotonic(self, data, non_lexsorted_data):
        """Monotonicity checks under several category orderings."""
        ordered_variants = ({}, {"ordered": True})

        # default category order: increasing
        for extra in ordered_variants:
            index = CategoricalIndex(data, **extra)
            assert index.is_monotonic_increasing is True
            assert index.is_monotonic_decreasing is False

        # reversed category order: decreasing
        for extra in ordered_variants:
            index = CategoricalIndex(data, categories=reversed(data), **extra)
            assert index.is_monotonic_increasing is False
            assert index.is_monotonic_decreasing is True

        # test when data is neither monotonic increasing nor decreasing
        shuffled = [data[0], data[2], data[1]]
        index = CategoricalIndex(shuffled, categories=reversed(data))
        assert index.is_monotonic_increasing is False
        assert index.is_monotonic_decreasing is False

        # non lexsorted categories
        cats = non_lexsorted_data
        for window in (cats[:2], cats[1:3]):
            index = CategoricalIndex(window, categories=cats)
            assert index.is_monotonic_increasing is True
            assert index.is_monotonic_decreasing is False

    def test_has_duplicates(self):
        """An index of repeated values is non-unique and has duplicates."""
        repeated = CategoricalIndex([0, 0, 0], name="foo")
        assert repeated.is_unique is False
        assert repeated.has_duplicates is True

    def test_drop_duplicates(self):
        """``drop_duplicates`` and ``unique`` both collapse repeats."""
        repeated = CategoricalIndex([0, 0, 0], name="foo")
        collapsed = CategoricalIndex([0], name="foo")

        tm.assert_index_equal(repeated.drop_duplicates(), collapsed)
        tm.assert_index_equal(repeated.unique(), collapsed)

    def test_get_indexer(self):
        """Check ``get_indexer`` results and its unsupported fill methods.

        The same lookup is performed with the target given as a
        CategoricalIndex, a list and a plain Index; all three are asserted
        against the same expected positions. The ``pad``, ``backfill`` and
        ``nearest`` methods must raise ``NotImplementedError``.
        """
        idx1 = CategoricalIndex(list("aabcde"), categories=list("edabc"))
        idx2 = CategoricalIndex(list("abf"))

        expected = np.array([0, 1, 2, -1], dtype=np.intp)
        # Pass the loop variable itself so that the list and Index spellings
        # of the target are exercised, not just ``idx2`` three times.
        for indexer in [idx2, list("abf"), Index(list("abf"))]:
            r1 = idx1.get_indexer(indexer)
            tm.assert_almost_equal(r1, expected)

        msg = ("method='pad' and method='backfill' not implemented yet for"
               " CategoricalIndex")
        with pytest.raises(NotImplementedError, match=msg):
            idx2.get_indexer(idx1, method="pad")
        with pytest.raises(NotImplementedError, match=msg):
            idx2.get_indexer(idx1, method="backfill")

        msg = "method='nearest' not implemented yet for CategoricalIndex"
        with pytest.raises(NotImplementedError, match=msg):
            idx2.get_indexer(idx1, method="nearest")

    def test_get_loc(self):
        """``get_loc`` must match the object Index equivalent (GH 12531)."""
        # unique
        unique_cat = CategoricalIndex(list("abcde"), categories=list("edabc"))
        unique_obj = Index(list("abcde"))
        for label in ("a", "e"):
            assert unique_cat.get_loc(label) == unique_obj.get_loc(label)

        for index in (unique_cat, unique_obj):
            with pytest.raises(KeyError, match="'NOT-EXIST'"):
                index.get_loc("NOT-EXIST")

        # non-unique
        dup_cat = CategoricalIndex(list("aacded"), categories=list("edabc"))
        dup_obj = Index(list("aacded"))

        # results in bool array
        located = dup_cat.get_loc("d")
        tm.assert_numpy_array_equal(located, dup_obj.get_loc("d"))
        mask = np.array([False, False, False, True, False, True])
        tm.assert_numpy_array_equal(located, mask)

        # unique element results in scalar
        located = dup_cat.get_loc("e")
        assert located == dup_obj.get_loc("e")
        assert located == 4

        for index in (dup_cat, dup_obj):
            with pytest.raises(KeyError, match="'NOT-EXIST'"):
                index.get_loc("NOT-EXIST")

        # non-unique, sliceable
        run_cat = CategoricalIndex(list("aabbb"), categories=list("abc"))
        run_obj = Index(list("aabbb"))

        # results in slice
        located = run_cat.get_loc("a")
        assert located == run_obj.get_loc("a")
        assert located == slice(0, 2, None)

        located = run_cat.get_loc("b")
        assert located == run_obj.get_loc("b")
        assert located == slice(2, 5, None)

        for index in (run_cat, run_obj):
            with pytest.raises(KeyError, match="'c'"):
                index.get_loc("c")

    def test_repr_roundtrip(self):
        """``repr`` of a short index evaluates back to an equal index."""
        short = CategoricalIndex(["a", "b"],
                                 categories=["a", "b"],
                                 ordered=True)
        str(short)
        # eval is applied only to the repr of the index built just above.
        rebuilt = eval(repr(short))
        tm.assert_index_equal(rebuilt, short, exact=True)

        # formatting
        str(short)

        # long format
        # this is not reprable
        codes = np.random.randint(0, 5, size=100)
        long_index = CategoricalIndex(codes)
        str(long_index)

    def test_isin(self):
        """``isin`` with plain lists and with mismatched categoricals."""
        index = CategoricalIndex(list("aabca") + [np.nan],
                                 categories=["c", "a", "b"])

        list_cases = (
            (["c"], np.array([False, False, False, True, False, False])),
            (["c", "a", "b"], np.array([True] * 5 + [False])),
            (["c", "a", "b", np.nan], np.array([True] * 6)),
        )
        for candidates, wanted in list_cases:
            tm.assert_numpy_array_equal(index.isin(candidates), wanted)

        # mismatched categorical -> coerced to ndarray so doesn't matter
        superset = index.set_categories(list("abcdefghi"))
        tm.assert_numpy_array_equal(index.isin(superset),
                                    np.array([True] * 6))

        disjoint = index.set_categories(list("defghi"))
        tm.assert_numpy_array_equal(index.isin(disjoint),
                                    np.array([False] * 5 + [True]))

    def test_identical(self):

        ci1 = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True)
        ci2 = CategoricalIndex(["a", "b"],
                               categories=["a", "b", "c"],
                               ordered=True)
        assert ci1.identical(ci1)
        assert ci1.identical(ci1.copy())
        assert not ci1.identical(ci2)

    def test_ensure_copied_data(self, indices):
        # gh-12309: Check the "copy" argument of each
        # Index.__new__ is honored.
        #
        # Must be tested separately from other indexes because
        # self.values is not an ndarray.
        # GH#29918 Index.base has been removed
        # FIXME: is this test still meaningful?
        def _root(arr):
            # Return ``arr.base`` when it exists and is not None, else ``arr``.
            if getattr(arr, "base", None) is None:
                return arr
            return arr.base

        # copy=True must yield an equal index backed by different data
        copied = CategoricalIndex(indices.values, copy=True)
        tm.assert_index_equal(indices, copied)
        assert _root(indices.values) is not _root(copied.values)

        # copy=False must share the underlying data
        shared = CategoricalIndex(indices.values, copy=False)
        assert _root(indices.values) is _root(shared.values)

    def test_equals_categorical(self):
        """Check ``equals`` and the comparison operators of CategoricalIndex.

        Covers equality against object-dtype data, elementwise comparisons,
        the errors raised for mismatched lengths, categories or orderedness,
        and ``equals`` with reordered categories and missing values.
        """
        two_cats = CategoricalIndex(["a", "b"],
                                    categories=["a", "b"],
                                    ordered=True)
        three_cats = CategoricalIndex(["a", "b"],
                                      categories=["a", "b", "c"],
                                      ordered=True)

        assert two_cats.equals(two_cats)
        assert not two_cats.equals(three_cats)
        assert two_cats.equals(two_cats.astype(object))
        assert two_cats.astype(object).equals(two_cats)

        # elementwise comparisons of the index with itself
        assert (two_cats == two_cats).all()
        for never_all_true in (two_cats != two_cats,
                               two_cats > two_cats,
                               two_cats < two_cats):
            assert not never_all_true.all()
        for always_true in (two_cats <= two_cats, two_cats >= two_cats):
            assert always_true.all()

        assert not (two_cats == 1).all()
        assert (two_cats == Index(["a", "b"])).all()
        assert (two_cats == two_cats.values).all()

        # invalid comparisons
        with pytest.raises(ValueError, match="Lengths must match"):
            two_cats == Index(["a", "b", "c"])

        # any one of these messages is accepted for a dtype mismatch
        msg = "|".join([
            "categorical index comparisons must have the same categories"
            " and ordered attributes",
            "Categoricals can only be compared if 'categories' are the same. "
            "Categories are different lengths",
            "Categoricals can only be compared if 'ordered' is the same",
        ])
        with pytest.raises(TypeError, match=msg):
            two_cats == three_cats
        with pytest.raises(TypeError, match=msg):
            two_cats == Categorical(two_cats.values, ordered=False)
        with pytest.raises(TypeError, match=msg):
            two_cats == Categorical(two_cats.values, categories=list("abc"))

        # tests
        # make sure that we are testing for category inclusion properly
        letters = list("aabca")
        cat_order = ["c", "a", "b"]

        ci = CategoricalIndex(letters, categories=cat_order)
        assert not ci.equals(letters)
        # Same categories, but different order
        # Unordered
        assert ci.equals(CategoricalIndex(letters))
        # Ordered
        assert not ci.equals(CategoricalIndex(letters, ordered=True))
        assert ci.equals(ci.copy())

        # a trailing missing value makes the index differ from the NaN-free data
        ci = CategoricalIndex(letters + [np.nan], categories=cat_order)
        assert not ci.equals(letters)
        assert not ci.equals(CategoricalIndex(letters))
        assert ci.equals(ci.copy())

        # same index, now compared with data that also carries the NaN
        ci = CategoricalIndex(letters + [np.nan], categories=cat_order)
        assert not ci.equals(letters + [np.nan])
        assert ci.equals(CategoricalIndex(letters + [np.nan]))
        assert not ci.equals(
            CategoricalIndex(letters + [np.nan], ordered=True))
        assert ci.equals(ci.copy())

    def test_equals_categoridcal_unordered(self):
        # https://github.com/pandas-dev/pandas/issues/16603
        a = pd.CategoricalIndex(["A"], categories=["A", "B"])
        b = pd.CategoricalIndex(["A"], categories=["B", "A"])
        c = pd.CategoricalIndex(["C"], categories=["B", "A"])
        assert a.equals(b)
        assert not a.equals(c)
        assert not b.equals(c)

    def test_frame_repr(self):
        df = pd.DataFrame({"A": [1, 2, 3]},
                          index=pd.CategoricalIndex(["a", "b", "c"]))
        result = repr(df)
        expected = "   A\na  1\nb  2\nc  3"
        assert result == expected

    def test_string_categorical_index_repr(self):
        """Pin the exact ``repr`` of string-valued CategoricalIndex objects.

        The same four shapes (short, multi-line, truncated, many categories)
        are checked for ASCII values, for East Asian values with default
        options, and for East Asian values with the
        ``display.unicode.east_asian_width`` option enabled. The expected
        strings are whitespace-sensitive.
        """
        # ASCII values ---------------------------------------------------
        # short
        idx = pd.CategoricalIndex(["a", "bb", "ccc"])
        expected = """CategoricalIndex(['a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')"""  # noqa
        assert repr(idx) == expected

        # multiple lines
        idx = pd.CategoricalIndex(["a", "bb", "ccc"] * 10)
        expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a',
                  'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb',
                  'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'],
                 categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')"""  # noqa

        assert repr(idx) == expected

        # truncated (the expected repr elides the middle and reports length)
        idx = pd.CategoricalIndex(["a", "bb", "ccc"] * 100)
        expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a',
                  ...
                  'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'],
                 categories=['a', 'bb', 'ccc'], ordered=False, dtype='category', length=300)"""  # noqa

        assert repr(idx) == expected

        # larger categories (the expected categories list is elided with ...)
        idx = pd.CategoricalIndex(list("abcdefghijklmmo"))
        expected = """CategoricalIndex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                  'm', 'm', 'o'],
                 categories=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', ...], ordered=False, dtype='category')"""  # noqa

        assert repr(idx) == expected

        # East Asian values, default display options ---------------------
        # short
        idx = pd.CategoricalIndex(["あ", "いい", "ううう"])
        expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')"""  # noqa
        assert repr(idx) == expected

        # multiple lines
        idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 10)
        expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ',
                  'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい',
                  'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'],
                 categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')"""  # noqa

        assert repr(idx) == expected

        # truncated
        idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 100)
        expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ',
                  ...
                  'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'],
                 categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)"""  # noqa

        assert repr(idx) == expected

        # larger categories
        idx = pd.CategoricalIndex(list("あいうえおかきくけこさしすせそ"))
        expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し',
                  'す', 'せ', 'そ'],
                 categories=['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', ...], ordered=False, dtype='category')"""  # noqa

        assert repr(idx) == expected

        # Enable Unicode option -----------------------------------------
        # With this option on, the expected strings below break lines
        # earlier than the default-option versions above.
        with cf.option_context("display.unicode.east_asian_width", True):

            # short
            idx = pd.CategoricalIndex(["あ", "いい", "ううう"])
            expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')"""  # noqa
            assert repr(idx) == expected

            # multiple lines
            idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 10)
            expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい',
                  'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう',
                  'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい',
                  'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'],
                 categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')"""  # noqa

            assert repr(idx) == expected

            # truncated
            idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 100)
            expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい',
                  'ううう', 'あ',
                  ...
                  'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう',
                  'あ', 'いい', 'ううう'],
                 categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)"""  # noqa

            assert repr(idx) == expected

            # larger categories
            idx = pd.CategoricalIndex(list("あいうえおかきくけこさしすせそ"))
            expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ',
                  'さ', 'し', 'す', 'せ', 'そ'],
                 categories=['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', ...], ordered=False, dtype='category')"""  # noqa

            assert repr(idx) == expected

    def test_fillna_categorical(self):
        # GH 11343
        with_gap = CategoricalIndex([1.0, np.nan, 3.0, 1.0], name="x")

        # fill by value in categories
        filled = with_gap.fillna(1.0)
        tm.assert_index_equal(
            filled, CategoricalIndex([1.0, 1.0, 3.0, 1.0], name="x"))

        # fill by value not in categories raises ValueError
        with pytest.raises(ValueError,
                           match="fill value must be in categories"):
            with_gap.fillna(2.0)

    def test_take_fill_value(self):
        # GH 12631
        def check(result, expected):
            # Compare both the index and the underlying categorical values.
            tm.assert_index_equal(result, expected)
            tm.assert_categorical_equal(result.values, expected.values)

        # numeric category
        idx = pd.CategoricalIndex([1, 2, 3], name="xxx")
        wrapped = pd.CategoricalIndex([2, 1, 3], name="xxx")
        check(idx.take(np.array([1, 0, -1])), wrapped)

        # fill_value: -1 is treated as "missing" instead of "last element"
        filled = pd.CategoricalIndex([2, 1, np.nan],
                                     categories=[1, 2, 3],
                                     name="xxx")
        check(idx.take(np.array([1, 0, -1]), fill_value=True), filled)

        # allow_fill=False: fill_value is ignored and -1 wraps around again
        check(
            idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True),
            pd.CategoricalIndex([2, 1, 3], name="xxx"),
        )

        # object category
        idx = pd.CategoricalIndex(list("CBA"),
                                  categories=list("ABC"),
                                  ordered=True,
                                  name="xxx")
        wrapped = pd.CategoricalIndex(list("BCA"),
                                      categories=list("ABC"),
                                      ordered=True,
                                      name="xxx")
        check(idx.take(np.array([1, 0, -1])), wrapped)

        # fill_value
        filled = pd.CategoricalIndex(["B", "C", np.nan],
                                     categories=list("ABC"),
                                     ordered=True,
                                     name="xxx")
        check(idx.take(np.array([1, 0, -1]), fill_value=True), filled)

        # allow_fill=False
        check(
            idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True),
            pd.CategoricalIndex(list("BCA"),
                                categories=list("ABC"),
                                ordered=True,
                                name="xxx"),
        )

        # positions below -1 are rejected when filling is requested
        msg = ("When allow_fill=True and fill_value is not None, "
               "all indices must be >= -1")
        for bad_position in (-2, -5):
            with pytest.raises(ValueError, match=msg):
                idx.take(np.array([1, 0, bad_position]), fill_value=True)

        with pytest.raises(IndexError):
            idx.take(np.array([1, -5]))

    def test_take_fill_value_datetime(self):
        """``take`` on a CategoricalIndex built from a DatetimeIndex."""
        # datetime category
        dates = pd.DatetimeIndex(["2011-01-01", "2011-02-01", "2011-03-01"],
                                 name="xxx")
        idx = pd.CategoricalIndex(dates)

        reordered = pd.CategoricalIndex(
            pd.DatetimeIndex(["2011-02-01", "2011-01-01", "2011-03-01"],
                             name="xxx"))
        tm.assert_index_equal(idx.take(np.array([1, 0, -1])), reordered)

        # fill_value
        with_nat = pd.DatetimeIndex(["2011-02-01", "2011-01-01", "NaT"],
                                    name="xxx")
        exp_cats = pd.DatetimeIndex(["2011-01-01", "2011-02-01", "2011-03-01"])
        tm.assert_index_equal(
            idx.take(np.array([1, 0, -1]), fill_value=True),
            pd.CategoricalIndex(with_nat, categories=exp_cats),
        )

        # allow_fill=False
        reordered = pd.CategoricalIndex(
            pd.DatetimeIndex(["2011-02-01", "2011-01-01", "2011-03-01"],
                             name="xxx"))
        tm.assert_index_equal(
            idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True),
            reordered,
        )

        # positions below -1 are rejected when filling is requested
        msg = ("When allow_fill=True and fill_value is not None, "
               "all indices must be >= -1")
        for bad_position in (-2, -5):
            with pytest.raises(ValueError, match=msg):
                idx.take(np.array([1, 0, bad_position]), fill_value=True)

        with pytest.raises(IndexError):
            idx.take(np.array([1, -5]))

    def test_take_invalid_kwargs(self):
        """``take`` must reject unknown and unsupported keyword arguments."""
        idx = pd.CategoricalIndex([1, 2, 3], name="foo")
        indices = [1, 0, -1]

        # (expected exception, message pattern, offending keyword arguments)
        cases = [
            (TypeError,
             r"take\(\) got an unexpected keyword argument 'foo'",
             {"foo": 2}),
            (ValueError,
             "the 'out' parameter is not supported",
             {"out": indices}),
            (ValueError,
             "the 'mode' parameter is not supported",
             {"mode": "clip"}),
        ]
        for exc_type, msg, bad_kwargs in cases:
            with pytest.raises(exc_type, match=msg):
                idx.take(indices, **bad_kwargs)

    @pytest.mark.parametrize(
        "dtype, engine_type",
        [
            (np.int8, libindex.Int8Engine),
            (np.int16, libindex.Int16Engine),
            (np.int32, libindex.Int32Engine),
            (np.int64, libindex.Int64Engine),
        ],
    )
    def test_engine_type(self, dtype, engine_type):
        """The engine class must correspond to the dtype of the codes."""
        if dtype == np.int64:
            # having 2**32 - 2**31 categories would be very memory-intensive,
            # so we cheat a bit with the dtype
            ci = pd.CategoricalIndex(range(32768))  # == 2**16 - 2**(16 - 1)
            ci.values._codes = ci.values._codes.astype("int64")
        else:
            # num. of uniques required to push CategoricalIndex.codes to a
            # dtype (128 categories required for .codes dtype to be int16 etc.)
            uniques_for_dtype = {np.int8: 1, np.int16: 128, np.int32: 32768}
            ci = pd.CategoricalIndex(range(uniques_for_dtype[dtype]))

        assert np.issubdtype(ci.codes.dtype, dtype)
        assert isinstance(ci._engine, engine_type)
Beispiel #26
0
class TestSeriesConstructors:
    @pytest.mark.parametrize(
        "constructor,check_index_type",
        [
            # NOTE: some overlap with test_constructor_empty but that test does not
            # test for None or an empty generator.
            # test_constructor_pass_none tests None but only with the index also
            # passed.
            (lambda: Series(), True),
            (lambda: Series(None), True),
            (lambda: Series({}), True),
            (lambda: Series(()), False),  # creates a RangeIndex
            (lambda: Series([]), False),  # creates a RangeIndex
            (lambda: Series((x for x in [])), False),  # creates a RangeIndex
            (lambda: Series(data=None), True),
            (lambda: Series(data={}), True),
            (lambda: Series(data=()), False),  # creates a RangeIndex
            (lambda: Series(data=[]), False),  # creates a RangeIndex
            (lambda: Series(data=(x for x in [])), False),  # creates a RangeIndex
        ],
    )
    def test_empty_constructor(self, constructor, check_index_type):
        """Every way of building an empty Series must equal ``Series()``."""
        reference = Series()
        built = constructor()

        assert len(built.index) == 0
        # the index type is only compared where the parametrization says so
        tm.assert_series_equal(built,
                               reference,
                               check_index_type=check_index_type)

    def test_invalid_dtype(self):
        # GH15520
        # Each of these is rejected as a dtype with a "not understood" error.
        for bad_dtype in (pd.Timestamp, "pd.Timestamp", list):
            with pytest.raises(TypeError, match="not understood"):
                Series([], name="time", dtype=bad_dtype)

    def test_scalar_conversion(self):

        # Pass in scalar is disabled
        scalar = Series(0.5)
        assert not isinstance(scalar, float)

        # Coercion
        assert float(Series([1.0])) == 1.0
        assert int(Series([1.0])) == 1

    def test_constructor(self, datetime_series):
        """Smoke-test several basic Series construction paths."""
        empty_series = Series()

        source_index = datetime_series.index
        assert source_index.is_all_dates

        # Pass in Series
        derived = Series(datetime_series)
        assert derived.index.is_all_dates

        assert tm.equalContents(derived.index, datetime_series.index)
        # Ensure new index is not created
        original_id = id(datetime_series.index)
        derived_id = id(derived.index)
        assert original_id == derived_id

        # Mixed type Series
        mixed = Series(["hello", np.NaN], index=[0, 1])
        assert mixed.dtype == np.object_
        assert mixed[1] is np.NaN

        # empty Series do not report an all-dates index
        assert not empty_series.index.is_all_dates
        assert not Series().index.is_all_dates

        # exception raised is of type Exception
        two_dimensional = np.random.randn(3, 3)
        with pytest.raises(Exception, match="Data must be 1-dimensional"):
            Series(two_dimensional, index=np.arange(3))

        # the name survives construction from another Series
        mixed.name = "Series"
        assert Series(mixed).name == "Series"

        # raise on MultiIndex GH4187
        multi = MultiIndex.from_arrays([[1, 2], [3, 4]])
        with pytest.raises(
                NotImplementedError,
                match="initializing a Series from a MultiIndex is not supported"):
            Series(multi)

    @pytest.mark.parametrize("input_class", [list, dict, OrderedDict])
    def test_constructor_empty(self, input_class):
        """An empty container must build the same Series as no data at all."""
        # No dtype, then explicit float64, then GH 18515 (dtype=category).
        # These are Index() and RangeIndex() which don't compare type equal
        # but are just .equals, hence check_index_type=False.
        for dtype_kwargs in ({}, {"dtype": "float64"}, {"dtype": "category"}):
            without_data = Series(**dtype_kwargs)
            from_container = Series(input_class(), **dtype_kwargs)
            assert_series_equal(without_data,
                                from_container,
                                check_index_type=False)

        if input_class is list:
            # the index-based checks below are skipped for list
            return

        # With index:
        without_data = Series(index=range(10))
        from_container = Series(input_class(), index=range(10))
        assert_series_equal(without_data, from_container)

        # With index and dtype float64:
        all_nan = Series(np.nan, index=range(10))
        from_container = Series(input_class(), index=range(10), dtype="float64")
        assert_series_equal(all_nan, from_container)

        # GH 19853 : with empty string, index and dtype str
        explicit_str = Series("", dtype=str, index=range(3))
        inferred = Series("", index=range(3))
        assert_series_equal(explicit_str, inferred)

    @pytest.mark.parametrize("input_arg", [np.nan, float("nan")])
    def test_constructor_nan(self, input_arg):
        """A NaN scalar broadcasts like a dtype-only float64 Series."""
        labels = range(10)
        dtype_only = Series(dtype="float64", index=labels)
        from_nan = Series(input_arg, index=labels)

        assert_series_equal(dtype_only, from_nan, check_index_type=False)

    @pytest.mark.parametrize(
        "dtype",
        [
            "f8",
            "i8",
            "M8[ns]",
            "m8[ns]",
            "category",
            "object",
            "datetime64[ns, UTC]",
        ],
    )
    @pytest.mark.parametrize("index", [None, pd.Index([])])
    def test_constructor_dtype_only(self, dtype, index):
        # GH-20865
        # Only a dtype is given: the result is empty and keeps that dtype.
        built = pd.Series(dtype=dtype, index=index)
        assert built.dtype == dtype
        assert len(built) == 0

    def test_constructor_no_data_index_order(self):
        result = pd.Series(index=["b", "a", "c"])
        assert result.index.tolist() == ["b", "a", "c"]

    def test_constructor_no_data_string_type(self):
        # GH 22477
        result = pd.Series(index=[1], dtype=str)
        assert np.isnan(result.iloc[0])

    @pytest.mark.parametrize("item", ["entry", "ѐ", 13])
    def test_constructor_string_element_string_type(self, item):
        # GH 22477
        # A scalar combined with dtype=str is stored as its str() form.
        built = pd.Series(item, index=[1], dtype=str)
        stored = built.iloc[0]
        assert stored == str(item)

    def test_constructor_dtype_str_na_values(self, string_dtype):
        # https://github.com/pandas-dev/pandas/issues/21083
        # None must stay None (and count as missing) rather than become "None"
        with_none = Series(["x", None], dtype=string_dtype)
        tm.assert_series_equal(with_none.isna(), Series([False, True]))
        assert with_none.iloc[1] is None

        # likewise NaN must stay NaN
        with_nan = Series(["x", np.nan], dtype=string_dtype)
        assert np.isnan(with_nan.iloc[1])

    def test_constructor_series(self):
        index1 = ["d", "b", "a", "c"]
        index2 = sorted(index1)
        s1 = Series([4, 7, -5, 3], index=index1)
        s2 = Series(s1, index=index2)

        assert_series_equal(s2, s1.sort_index())

    def test_constructor_iterable(self):
        # GH 21987
        class Iter:
            def __iter__(self):
                for i in range(10):
                    yield i

        expected = Series(list(range(10)), dtype="int64")
        result = Series(Iter(), dtype="int64")
        assert_series_equal(result, expected)

    def test_constructor_sequence(self):
        # GH 21987
        expected = Series(list(range(10)), dtype="int64")
        result = Series(range(10), dtype="int64")
        assert_series_equal(result, expected)

    def test_constructor_single_str(self):
        # GH 21987
        expected = Series(["abc"])
        result = Series("abc")
        assert_series_equal(result, expected)

    def test_constructor_list_like(self):

        # make sure that we are coercing different
        # list-likes to standard dtypes and not
        # platform specific
        expected = Series([1, 2, 3], dtype="int64")
        for obj in [[1, 2, 3], (1, 2, 3), np.array([1, 2, 3], dtype="int64")]:
            result = Series(obj, index=[0, 1, 2])
            assert_series_equal(result, expected)

    @pytest.mark.parametrize("dtype", ["bool", "int32", "int64", "float64"])
    def test_constructor_index_dtype(self, dtype):
        # GH 17088
        # An explicit dtype must be honored when the data is an Index.
        source = Index([0, 2, 4])
        built = Series(source, dtype=dtype)
        assert built.dtype == dtype

    @pytest.mark.parametrize(
        "input_vals",
        [
            ([1, 2]),
            (["1", "2"]),
            (list(pd.date_range("1/1/2011", periods=2, freq="H"))),
            (list(
                pd.date_range("1/1/2011", periods=2, freq="H",
                              tz="US/Eastern"))),
            ([pd.Interval(left=0, right=5)]),
        ],
    )
    def test_constructor_list_str(self, input_vals, string_dtype):
        # GH 16605
        # Ensure that data elements from a list are converted to strings
        # when dtype is str, 'str', or 'U'
        via_constructor = Series(input_vals, dtype=string_dtype)
        via_astype = Series(input_vals).astype(string_dtype)
        assert_series_equal(via_constructor, via_astype)

    def test_constructor_list_str_na(self, string_dtype):
        """Floats become strings under a string dtype, but NaN stays NaN."""
        built = Series([1.0, 2.0, np.nan], dtype=string_dtype)
        assert_series_equal(built, Series(["1.0", "2.0", np.nan], dtype=object))
        last = built[2]
        assert np.isnan(last)

    def test_constructor_generator(self):
        gen = (i for i in range(10))

        result = Series(gen)
        exp = Series(range(10))
        assert_series_equal(result, exp)

        gen = (i for i in range(10))
        result = Series(gen, index=range(10, 20))
        exp.index = range(10, 20)
        assert_series_equal(result, exp)

    def test_constructor_map(self):
        # GH8909
        m = map(lambda x: x, range(10))

        result = Series(m)
        exp = Series(range(10))
        assert_series_equal(result, exp)

        m = map(lambda x: x, range(10))
        result = Series(m, index=range(10, 20))
        exp.index = range(10, 20)
        assert_series_equal(result, exp)

    def test_constructor_categorical(self):
        """Series built from a Categorical: values, casting and dtype checks."""
        codes = [0, 1, 2, 0, 1, 2]
        cat = pd.Categorical(codes, ["a", "b", "c"], fastpath=True)
        wrapped = Series(cat)
        tm.assert_categorical_equal(wrapped.values, cat)

        # can cast to a new dtype
        as_int = Series(pd.Categorical([1, 2, 3]), dtype="int64")
        tm.assert_series_equal(as_int, pd.Series([1, 2, 3], dtype="int64"))

        # GH12574
        from_categorical = Series(pd.Categorical([1, 2, 3]), dtype="category")
        assert is_categorical_dtype(from_categorical)
        assert is_categorical_dtype(from_categorical.dtype)

        from_list = Series([1, 2, 3], dtype="category")
        assert is_categorical_dtype(from_list)
        assert is_categorical_dtype(from_list.dtype)

    def test_constructor_categorical_with_coercion(self):
        factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])
        # test basic creation / coercion of categoricals
        s = Series(factor, name="A")
        assert s.dtype == "category"
        assert len(s) == len(factor)
        str(s.values)
        str(s)

        # in a frame
        df = DataFrame({"A": factor})
        result = df["A"]
        tm.assert_series_equal(result, s)
        result = df.iloc[:, 0]
        tm.assert_series_equal(result, s)
        assert len(df) == len(factor)
        str(df.values)
        str(df)

        df = DataFrame({"A": s})
        result = df["A"]
        tm.assert_series_equal(result, s)
        assert len(df) == len(factor)
        str(df.values)
        str(df)

        # multiples
        df = DataFrame({"A": s, "B": s, "C": 1})
        result1 = df["A"]
        result2 = df["B"]
        tm.assert_series_equal(result1, s)
        tm.assert_series_equal(result2, s, check_names=False)
        assert result2.name == "B"
        assert len(df) == len(factor)
        str(df.values)
        str(df)

        # GH8623
        x = DataFrame(
            [[1, "John P. Doe"], [2, "Jane Dove"], [1, "John P. Doe"]],
            columns=["person_id", "person_name"],
        )
        x["person_name"] = Categorical(
            x.person_name)  # doing this breaks transform

        expected = x.iloc[0].person_name
        result = x.person_name.iloc[0]
        assert result == expected

        result = x.person_name[0]
        assert result == expected

        result = x.person_name.loc[0]
        assert result == expected

    def test_constructor_categorical_dtype(self):
        """A CategoricalDtype passed as ``dtype`` fixes categories and order."""
        ordered_abc = CategoricalDtype(["a", "b", "c"], ordered=True)
        result = pd.Series(["a", "b"], dtype=ordered_abc)
        assert is_categorical_dtype(result) is True
        tm.assert_index_equal(result.cat.categories, pd.Index(["a", "b", "c"]))
        assert result.cat.ordered

        # the category order comes from the dtype, not from the values
        unordered_ba = CategoricalDtype(["b", "a"])
        result = pd.Series(["a", "b"], dtype=unordered_ba)
        assert is_categorical_dtype(result)
        tm.assert_index_equal(result.cat.categories, pd.Index(["b", "a"]))
        assert result.cat.ordered is False

        # GH 19565 - Check broadcasting of scalar with Categorical dtype
        broadcast = Series("a",
                           index=[0, 1],
                           dtype=CategoricalDtype(["a", "b"], ordered=True))
        explicit = Series(["a", "a"],
                          index=[0, 1],
                          dtype=CategoricalDtype(["a", "b"], ordered=True))
        tm.assert_series_equal(broadcast, explicit, check_categorical=True)

    def test_constructor_categorical_string(self):
        # GH 26336: the string 'category' maintains existing CategoricalDtype
        cdt = CategoricalDtype(categories=list("dabc"), ordered=True)
        expected = Series(list("abcabc"), dtype=cdt)

        # Series(Categorical, dtype='category') keeps existing dtype
        cat = Categorical(list("abcabc"), dtype=cdt)
        result = Series(cat, dtype="category")
        tm.assert_series_equal(result, expected)

        # Series(Series[Categorical], dtype='category') keeps existing dtype
        result = Series(result, dtype="category")
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "none, warning",
        [(None, None), (ordered_sentinel, FutureWarning)],
    )
    def test_categorical_ordered_none_deprecated(self, none, warning):
        # GH 26336: only warn if None is not explicitly passed
        source_dtype = CategoricalDtype(categories=list("cdab"), ordered=True)
        target_dtype = CategoricalDtype(categories=list("cedafb"), ordered=none)

        cat = Categorical(list("abcdaba"), dtype=source_dtype)

        # the same expectation applies to a raw Categorical ...
        with tm.assert_produces_warning(warning, check_stacklevel=False):
            Series(cat, dtype=target_dtype)

        # ... and to a Series that wraps it
        wrapped = Series(cat)
        with tm.assert_produces_warning(warning, check_stacklevel=False):
            Series(wrapped, dtype=target_dtype)

    def test_categorical_sideeffects_free(self):
        """Check when a Series shares state with the Categorical it wraps.

        With ``copy=True`` changes to the Series must not reach the original
        Categorical; with the default (no copy) the two share data, so the
        changes are visible through both.
        """
        # Passing a categorical to a Series and then changing values in either
        # the series or the categorical should not change the values in the
        # other one, IF you specify copy!
        cat = Categorical(["a", "b", "c", "a"])
        s = Series(cat, copy=True)
        # Compare the underlying values, mirroring ``s.values is cat`` in the
        # no-copy half below. ``s.cat`` is the accessor object and is never
        # the Categorical itself, so checking it would always pass.
        assert s.values is not cat
        s.cat.categories = [1, 2, 3]
        exp_s = np.array([1, 2, 3, 1], dtype=np.int64)
        exp_cat = np.array(["a", "b", "c", "a"], dtype=np.object_)
        tm.assert_numpy_array_equal(s.__array__(), exp_s)
        tm.assert_numpy_array_equal(cat.__array__(), exp_cat)

        # setting
        s[0] = 2
        exp_s2 = np.array([2, 2, 3, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(s.__array__(), exp_s2)
        tm.assert_numpy_array_equal(cat.__array__(), exp_cat)

        # however, copy is False by default
        # so this WILL change values
        cat = Categorical(["a", "b", "c", "a"])
        s = Series(cat)
        assert s.values is cat
        s.cat.categories = [1, 2, 3]
        exp_s = np.array([1, 2, 3, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(s.__array__(), exp_s)
        tm.assert_numpy_array_equal(cat.__array__(), exp_s)

        # setting through the Series is visible in the shared Categorical
        s[0] = 2
        exp_s2 = np.array([2, 2, 3, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(s.__array__(), exp_s2)
        tm.assert_numpy_array_equal(cat.__array__(), exp_s2)

    def test_unordered_compare_equal(self):
        """A value outside the categories equals an explicit NaN entry."""
        from_dtype = pd.Series(
            ["a", "b", "c"],
            dtype=CategoricalDtype(["a", "b"]),
        )
        from_categorical = pd.Series(
            pd.Categorical(["a", "b", np.nan], categories=["a", "b"]))
        tm.assert_series_equal(from_dtype, from_categorical)

    def test_constructor_maskedarray(self):
        """Build Series from numpy masked arrays of several dtypes.

        For each dtype the array starts fully masked, is then partially
        filled and finally completely filled; the expected Series is checked
        at every stage.
        """
        labels = ["a", "b", "c"]

        # float
        masked = ma.masked_all((3, ), dtype=float)
        assert_series_equal(Series(masked), Series([nan, nan, nan]))

        masked[0] = 0.0
        masked[2] = 2.0
        assert_series_equal(
            Series(masked, index=labels),
            Series([0.0, nan, 2.0], index=labels),
        )

        masked[1] = 1.0
        assert_series_equal(
            Series(masked, index=labels),
            Series([0.0, 1.0, 2.0], index=labels),
        )

        # int: expected as float while any slot is masked
        masked = ma.masked_all((3, ), dtype=int)
        assert_series_equal(Series(masked),
                            Series([nan, nan, nan], dtype=float))

        masked[0] = 0
        masked[2] = 2
        assert_series_equal(
            Series(masked, index=labels),
            Series([0, nan, 2], index=labels, dtype=float),
        )

        masked[1] = 1
        assert_series_equal(
            Series(masked, index=labels),
            Series([0, 1, 2], index=labels, dtype=int),
        )

        # bool: expected as object while any slot is masked
        masked = ma.masked_all((3, ), dtype=bool)
        assert_series_equal(Series(masked),
                            Series([nan, nan, nan], dtype=object))

        masked[0] = True
        masked[2] = False
        assert_series_equal(
            Series(masked, index=labels),
            Series([True, nan, False], index=labels, dtype=object),
        )

        masked[1] = True
        assert_series_equal(
            Series(masked, index=labels),
            Series([True, True, False], index=labels, dtype=bool),
        )

        # datetime64[ns]: masked slots are expected as iNaT
        masked = ma.masked_all((3, ), dtype="M8[ns]")
        assert_series_equal(Series(masked),
                            Series([iNaT, iNaT, iNaT], dtype="M8[ns]"))

        masked[0] = datetime(2001, 1, 1)
        masked[2] = datetime(2001, 1, 3)
        partially_filled = Series(masked, index=labels)
        partially_expected = Series(
            [datetime(2001, 1, 1), iNaT, datetime(2001, 1, 3)],
            index=labels,
            dtype="M8[ns]",
        )
        assert_series_equal(partially_filled, partially_expected)

        masked[1] = datetime(2001, 1, 2)
        fully_filled = Series(masked, index=labels)
        fully_expected = Series(
            [datetime(2001, 1, day) for day in (1, 2, 3)],
            index=labels,
            dtype="M8[ns]",
        )
        assert_series_equal(fully_filled, fully_expected)

    def test_constructor_maskedarray_hardened(self):
        # Check numpy masked arrays with hard masks -- from GH24574
        data = ma.masked_all((3, ), dtype=float).harden_mask()
        result = pd.Series(data)
        expected = pd.Series([nan, nan, nan])
        tm.assert_series_equal(result, expected)

    def test_series_ctor_plus_datetimeindex(self):
        """The index object passed alongside dict data is reused as-is."""
        business_days = date_range("20090415", "20090519", freq="B")
        by_day = dict.fromkeys(business_days, 1)

        built = Series(by_day, index=business_days)
        assert built.index is business_days

    def test_constructor_default_index(self):
        s = Series([0, 1, 2])
        tm.assert_index_equal(s.index, pd.Index(np.arange(3)))

    @pytest.mark.parametrize(
        "input",
        [
            [1, 2, 3],
            (1, 2, 3),
            list(range(3)),
            pd.Categorical(["a", "b", "a"]),
            (i for i in range(3)),
            map(lambda x: x, range(3)),
        ],
    )
    def test_constructor_index_mismatch(self, input):
        """Three values with a four-label index must raise (GH 19342)."""
        too_long_index = np.arange(4)
        expected_message = "Length of passed values is 3, index implies 4"
        with pytest.raises(ValueError, match=expected_message):
            Series(input, index=too_long_index)

    def test_constructor_numpy_scalar(self):
        # GH 19342
        # construction with a numpy scalar
        # should not raise
        result = Series(np.array(100), index=np.arange(4), dtype="int64")
        expected = Series(100, index=np.arange(4), dtype="int64")
        tm.assert_series_equal(result, expected)

    def test_constructor_broadcast_list(self):
        """A one-element list is not broadcast over a longer index (GH 19342)."""
        expected_message = "Length of passed values is 1, index implies 3"
        with pytest.raises(ValueError, match=expected_message):
            Series(["foo"], index=["a", "b", "c"])

    def test_constructor_corner(self):
        """A list of DataFrames can be stored as Series values."""
        frame = tm.makeTimeDataFrame()
        holder = Series([frame, frame], index=[0, 1])
        assert isinstance(holder, Series)

    def test_constructor_sanitize(self):
        """Float input with dtype="i8" is int64 unless it holds a NaN."""
        int8_bytes = np.dtype("i8")
        float8_bytes = np.dtype("f8")

        whole_numbers = Series(np.array([1.0, 1.0, 8.0]), dtype="i8")
        assert whole_numbers.dtype == int8_bytes

        # With a NaN present the asserted result dtype is float64.
        with_nan = Series(np.array([1.0, 1.0, np.nan]), copy=True,
                          dtype="i8")
        assert with_nan.dtype == float8_bytes

    def test_constructor_copy(self):
        """``copy=True`` with a dtype yields an independent Series (GH15125)."""
        for raw in ([1.0], np.array([1.0])):
            source = Series(raw)
            duplicate = pd.Series(source, copy=True, dtype=float)

            # The copy starts out equal to its source ...
            tm.assert_series_equal(source, duplicate)

            # ... and later edits to the source do not reach the copy.
            source[0] = 2.0
            assert not source.equals(duplicate)
            assert source[0] == 2.0
            assert duplicate[0] == 1.0

    @pytest.mark.parametrize(
        "index",
        [
            pd.date_range("20170101", periods=3, tz="US/Eastern"),
            pd.date_range("20170101", periods=3),
            pd.timedelta_range("1 day", periods=3),
            pd.period_range("2012Q1", periods=3, freq="Q"),
            pd.Index(list("abc")),
            pd.Int64Index([1, 2, 3]),
            pd.RangeIndex(0, 3),
        ],
        ids=lambda x: type(x).__name__,
    )
    def test_constructor_limit_copies(self, index):
        """Smoke test that a Series does not store the Index object itself.

        GH 17449: the values of the first internal block must be a different
        object from the index that was passed in.
        """
        wrapped = pd.Series(index)
        stored_values = wrapped._data.blocks[0].values
        assert stored_values is not index

    def test_constructor_pass_none(self):
        """``None`` data defaults to float64 unless a dtype is requested."""
        defaulted = Series(None, index=range(5))
        assert defaulted.dtype == np.float64

        as_object = Series(None, index=range(5), dtype=object)
        assert as_object.dtype == np.object_

        # GH 7431: a None label is inferred identically from ndarray or Index
        from_ndarray = Series(index=np.array([None]))
        from_index = Series(index=Index([None]))
        assert_series_equal(from_ndarray, from_index)

    def test_constructor_pass_nan_nat(self):
        """All-NaN input is float64; any NaT makes it datetime64 (GH 13467)."""
        all_nan = Series([np.nan, np.nan], dtype=np.float64)
        assert all_nan.dtype == np.float64
        tm.assert_series_equal(Series([np.nan, np.nan]), all_nan)
        tm.assert_series_equal(Series(np.array([np.nan, np.nan])), all_nan)

        all_nat = Series([pd.NaT, pd.NaT])
        assert all_nat.dtype == "datetime64[ns]"

        # Each combination is checked both as a list and as an ndarray.
        combos = (
            [pd.NaT, pd.NaT],
            [pd.NaT, np.nan],
            [np.nan, pd.NaT],
        )
        for missing in combos:
            tm.assert_series_equal(Series(missing), all_nat)
            tm.assert_series_equal(Series(np.array(missing)), all_nat)

    def test_constructor_cast(self):
        """Non-numeric strings cannot be cast to float."""
        letters = ["a", "b", "c"]
        with pytest.raises(ValueError,
                           match="could not convert string to float"):
            Series(letters, dtype=float)

    def test_constructor_unsigned_dtype_overflow(self, uint_dtype):
        """Negative values are rejected for unsigned dtypes (gh-15832)."""
        expected_message = (
            "Trying to coerce negative values to unsigned integers")
        with pytest.raises(OverflowError, match=expected_message):
            Series([-1], dtype=uint_dtype)

    def test_constructor_coerce_float_fail(self, any_int_dtype):
        """Fractional values are rejected for integer dtypes (gh-15832)."""
        expected_message = "Trying to coerce float values to integers"
        with pytest.raises(ValueError, match=expected_message):
            Series([1, 2, 3.5], dtype=any_int_dtype)

    def test_constructor_coerce_float_valid(self, float_dtype):
        """Passing a float dtype matches constructing and then ``astype``."""
        values = [1, 2, 3.5]
        direct = Series(values, dtype=float_dtype)
        converted = Series([1, 2, 3.5]).astype(float_dtype)
        assert_series_equal(direct, converted)

    def test_constructor_dtype_no_cast(self):
        """Re-wrapping with the same dtype shares data (gh-1572).

        Writing through the second Series must be visible in the first.
        """
        base = Series([1, 2, 3])
        same_dtype_view = Series(base, dtype=np.int64)

        same_dtype_view[1] = 5
        assert base[1] == 5

    def test_constructor_datelike_coercion(self):

        # GH 9477
        # incorrectly inferring on dateimelike looking when object dtype is
        # specified
        s = Series([Timestamp("20130101"), "NOV"], dtype=object)
        assert s.iloc[0] == Timestamp("20130101")
        assert s.iloc[1] == "NOV"
        assert s.dtype == object

        # the dtype was being reset on the slicing and re-inferred to datetime
        # even thought the blocks are mixed
        belly = "216 3T19".split()
        wing1 = "2T15 4H19".split()
        wing2 = "416 4T20".split()
        mat = pd.to_datetime("2016-01-22 2019-09-07".split())
        df = pd.DataFrame({
            "wing1": wing1,
            "wing2": wing2,
            "mat": mat
        },
                          index=belly)

        result = df.loc["3T19"]
        assert result.dtype == object
        result = df.loc["216"]
        assert result.dtype == object

    def test_constructor_datetimes_with_nulls(self):
        """Object arrays of None plus one datetime infer M8[ns] (gh-15869)."""
        mostly_none = np.array([None, None, None, None,
                                datetime.now(), None])
        shorter = np.array([None, None, datetime.now(), None])

        for values in (mostly_none, shorter):
            inferred = Series(values)
            assert inferred.dtype == "M8[ns]"

    def test_constructor_dtype_datetime64(self):
        """Exercise datetime64 construction, inference and casting on Series.

        The sections run in order and reuse the locals ``s``, ``dates``,
        ``expected`` and ``result``, so statement order matters.
        """
        # iNaT with an explicit datetime dtype is treated as missing
        s = Series(iNaT, dtype="M8[ns]", index=range(5))
        assert isna(s).all()

        # without a dtype the meaning of iNaT is ambiguous; the asserted
        # behavior is that the values are NOT all considered missing
        s = Series(iNaT, index=range(5))
        assert not isna(s).all()

        s = Series(nan, dtype="M8[ns]", index=range(5))
        assert isna(s).all()

        # a missing marker (iNaT or nan) next to a real datetime
        s = Series([datetime(2001, 1, 2, 0, 0), iNaT], dtype="M8[ns]")
        assert isna(s[1])
        assert s.dtype == "M8[ns]"

        s = Series([datetime(2001, 1, 2, 0, 0), nan], dtype="M8[ns]")
        assert isna(s[1])
        assert s.dtype == "M8[ns]"

        # GH3416: a list of np.datetime64 scalars infers M8[ns]
        dates = [
            np.datetime64(datetime(2013, 1, 1)),
            np.datetime64(datetime(2013, 1, 2)),
            np.datetime64(datetime(2013, 1, 3)),
        ]

        s = Series(dates)
        assert s.dtype == "M8[ns]"

        # assigning NaN keeps the datetime dtype
        s.iloc[0] = np.nan
        assert s.dtype == "M8[ns]"

        # GH3414 related
        expected = Series(
            [datetime(2013, 1, 1),
             datetime(2013, 1, 2),
             datetime(2013, 1, 3)],
            dtype="datetime64[ns]",
        )

        # integer nanoseconds scaled to milliseconds, read back as M8[ms]
        result = Series(Series(dates).astype(np.int64) / 1000000,
                        dtype="M8[ms]")
        tm.assert_series_equal(result, expected)

        result = Series(dates, dtype="datetime64[ns]")
        tm.assert_series_equal(result, expected)

        # a leading NaN becomes NaT
        expected = Series(
            [pd.NaT, datetime(2013, 1, 2),
             datetime(2013, 1, 3)],
            dtype="datetime64[ns]")
        result = Series([np.nan] + dates[1:], dtype="datetime64[ns]")
        tm.assert_series_equal(result, expected)

        dts = Series(dates, dtype="datetime64[ns]")

        # valid astype
        dts.astype("int64")

        # invalid casting
        msg = r"cannot astype a datetimelike from \[datetime64\[ns\]\] to" r" \[int32\]"
        with pytest.raises(TypeError, match=msg):
            dts.astype("int32")

        # ints are ok
        # we test with np.int64 to get similar results on
        # windows / 32-bit platforms
        result = Series(dts, dtype=np.int64)
        expected = Series(dts.astype(np.int64))
        tm.assert_series_equal(result, expected)

        # dates that the assertions below expect to round-trip unchanged
        # (the original note says such invalid dates can be held as object)
        result = Series([datetime(2, 1, 1)])
        assert result[0] == datetime(2, 1, 1, 0, 0)

        result = Series([datetime(3000, 1, 1)])
        assert result[0] == datetime(3000, 1, 1, 0, 0)

        # don't mix types
        result = Series([Timestamp("20130101"), 1], index=["a", "b"])
        assert result["a"] == Timestamp("20130101")
        assert result["b"] == 1

        # GH6529
        # coerce datetime64 non-ns properly
        dates = date_range("01-Jan-2015", "01-Dec-2015", freq="M")
        values2 = dates.view(np.ndarray).astype("datetime64[ns]")
        expected = Series(values2, index=dates)

        for dtype in ["s", "D", "ms", "us", "ns"]:
            values1 = dates.view(np.ndarray).astype("M8[{0}]".format(dtype))
            result = Series(values1, dates)
            assert_series_equal(result, expected)

        # GH 13876
        # coerce non-ns to object properly
        expected = Series(values2, index=dates, dtype=object)
        for dtype in ["s", "D", "ms", "us", "ns"]:
            values1 = dates.view(np.ndarray).astype("M8[{0}]".format(dtype))
            result = Series(values1, index=dates, dtype=object)
            assert_series_equal(result, expected)

        # leave datetime.date alone
        dates2 = np.array([d.date() for d in dates.to_pydatetime()],
                          dtype=object)
        series1 = Series(dates2, dates)
        tm.assert_numpy_array_equal(series1.values, dates2)
        assert series1.dtype == object

        # these will correctly infer a datetime
        s = Series([None, pd.NaT, "2013-08-05 15:30:00.000001"])
        assert s.dtype == "datetime64[ns]"
        s = Series([np.nan, pd.NaT, "2013-08-05 15:30:00.000001"])
        assert s.dtype == "datetime64[ns]"
        s = Series([pd.NaT, None, "2013-08-05 15:30:00.000001"])
        assert s.dtype == "datetime64[ns]"
        s = Series([pd.NaT, np.nan, "2013-08-05 15:30:00.000001"])
        assert s.dtype == "datetime64[ns]"

        # tz-aware (UTC and other tz's)
        # GH 8411
        dr = date_range("20130101", periods=3)
        assert Series(dr).iloc[0].tz is None
        dr = date_range("20130101", periods=3, tz="UTC")
        assert str(Series(dr).iloc[0].tz) == "UTC"
        dr = date_range("20130101", periods=3, tz="US/Eastern")
        assert str(Series(dr).iloc[0].tz) == "US/Eastern"

        # non-convertible
        s = Series([1479596223000, -1479590, pd.NaT])
        assert s.dtype == "object"
        assert s[2] is pd.NaT
        assert "NaT" in str(s)

        # if we passed a NaT it remains
        s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), pd.NaT])
        assert s.dtype == "object"
        assert s[2] is pd.NaT
        assert "NaT" in str(s)

        # if we passed a nan it remains
        s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), np.nan])
        assert s.dtype == "object"
        assert s[2] is np.nan
        assert "NaN" in str(s)

    def test_constructor_with_datetime_tz(self):
        """Exercise Series holding timezone-aware datetime64 data (GH 8260).

        Covers dtype reporting, export via ``.values``, indexing, concat,
        string formatting, and dtype inference from tz-aware Timestamps.
        """

        dr = date_range("20130101", periods=3, tz="US/Eastern")
        s = Series(dr)
        assert s.dtype.name == "datetime64[ns, US/Eastern]"
        assert s.dtype == "datetime64[ns, US/Eastern]"
        assert is_datetime64tz_dtype(s.dtype)
        assert "datetime64[ns, US/Eastern]" in str(s)

        # export: .values is a plain datetime64[ns] ndarray
        result = s.values
        assert isinstance(result, np.ndarray)
        assert result.dtype == "datetime64[ns]"

        # localizing the exported values as UTC and converting back to the
        # Series' tz reproduces the original range
        exp = pd.DatetimeIndex(result)
        exp = exp.tz_localize("UTC").tz_convert(tz=s.dt.tz)
        tm.assert_index_equal(dr, exp)

        # indexing
        result = s.iloc[0]
        assert result == Timestamp("2013-01-01 00:00:00-0500",
                                   tz="US/Eastern",
                                   freq="D")
        result = s[0]
        assert result == Timestamp("2013-01-01 00:00:00-0500",
                                   tz="US/Eastern",
                                   freq="D")

        # boolean-mask selection matches the equivalent slice
        result = s[Series([True, True, False], index=s.index)]
        assert_series_equal(result, s[0:2])

        result = s.iloc[0:1]
        assert_series_equal(result, Series(dr[0:1]))

        # concat
        result = pd.concat([s.iloc[0:1], s.iloc[1:]])
        assert_series_equal(result, s)

        # short str
        assert "datetime64[ns, US/Eastern]" in str(s)

        # formatting with NaT
        result = s.shift()
        assert "datetime64[ns, US/Eastern]" in str(result)
        assert "NaT" in str(result)

        # long str
        t = Series(date_range("20130101", periods=1000, tz="US/Eastern"))
        assert "datetime64[ns, US/Eastern]" in str(t)

        # rebuilding a DatetimeIndex with an inferred freq equals the range
        result = pd.DatetimeIndex(s, freq="infer")
        tm.assert_index_equal(result, dr)

        # inference: a single shared tz gives a tz-aware dtype
        s = Series([
            pd.Timestamp("2013-01-01 13:00:00-0800", tz="US/Pacific"),
            pd.Timestamp("2013-01-02 14:00:00-0800", tz="US/Pacific"),
        ])
        assert s.dtype == "datetime64[ns, US/Pacific]"
        assert lib.infer_dtype(s, skipna=True) == "datetime64"

        # inference: differing tzs stay as object
        s = Series([
            pd.Timestamp("2013-01-01 13:00:00-0800", tz="US/Pacific"),
            pd.Timestamp("2013-01-02 14:00:00-0800", tz="US/Eastern"),
        ])
        assert s.dtype == "object"
        assert lib.infer_dtype(s, skipna=True) == "datetime"

        # with all NaT
        s = Series(pd.NaT, index=[0, 1], dtype="datetime64[ns, US/Eastern]")
        expected = Series(pd.DatetimeIndex(["NaT", "NaT"], tz="US/Eastern"))
        assert_series_equal(s, expected)

    @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64])
    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_construction_to_datetimelike_unit(self, arr_dtype, dtype, unit):
        """``Series.astype`` to a unit-qualified M8/m8 dtype matches numpy.

        gh-19223: checked for every listed unit and for int64 and float64
        source arrays.
        """
        target_dtype = "{}[{}]".format(dtype, unit)
        raw = np.array([1, 2, 3], dtype=arr_dtype)

        via_series = Series(raw).astype(target_dtype)
        via_numpy = Series(raw.astype(target_dtype))

        tm.assert_series_equal(via_series, via_numpy)

    @pytest.mark.parametrize(
        "arg",
        ["2013-01-01 00:00:00", pd.NaT, np.nan, None],
    )
    def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg):
        """Naive input with a tz-aware dtype is localized (GH 17415)."""
        localized_at_construction = Series([arg],
                                           dtype="datetime64[ns, CET]")
        naive = Series(pd.Timestamp(arg))
        localized_afterwards = naive.dt.tz_localize("CET")
        assert_series_equal(localized_at_construction, localized_afterwards)

    def test_construction_interval(self):
        """Series built from an IntervalIndex or its values round-trips."""
        intervals = IntervalIndex.from_breaks(np.arange(3), closed="right")

        from_index = Series(intervals)
        # Rendering is exercised only to make sure it does not raise.
        repr(from_index)
        str(from_index)
        tm.assert_index_equal(Index(from_index.values), intervals)

        from_values = Series(intervals.values)
        tm.assert_index_equal(Index(from_values.values), intervals)

    def test_construction_consistency(self):
        """Re-constructing tz-aware data does not re-localize it (GH 14928)."""
        eastern = Series(pd.date_range("20130101", periods=3,
                                       tz="US/Eastern"))

        # Same Series, same dtype.
        tm.assert_series_equal(Series(eastern, dtype=eastern.dtype), eastern)

        # UTC-converted data given the original dtype.
        as_utc = eastern.dt.tz_convert("UTC")
        tm.assert_series_equal(Series(as_utc, dtype=eastern.dtype), eastern)

        # Exported ndarray values given the original dtype.
        tm.assert_series_equal(
            Series(eastern.values, dtype=eastern.dtype), eastern)

    def test_constructor_infer_period(self):
        data = [pd.Period("2000", "D"), pd.Period("2001", "D"), None]
        result = pd.Series(data)
        expected = pd.Series(period_array(data))
        tm.assert_series_equal(result, expected)
        assert result.dtype == "Period[D]"

        data = np.asarray(data, dtype=object)
        tm.assert_series_equal(result, expected)
        assert result.dtype == "Period[D]"

    def test_constructor_period_incompatible_frequency(self):
        """Periods of differing frequency are kept as plain objects."""
        periods = [pd.Period("2000", "D"), pd.Period("2001", "A")]
        held = pd.Series(periods)
        assert held.dtype == object
        assert held.tolist() == periods

    def test_constructor_periodindex(self):
        """A PeriodIndex placed in a Series has Period dtype (GH7932)."""
        daily = period_range("20130101", periods=5, freq="D")
        from_index = Series(daily)
        assert from_index.dtype == "Period[D]"

        from_objects = Series(daily.astype(object))
        assert_series_equal(from_index, from_objects)

    def test_constructor_dict(self):
        """Dict data is aligned to the index; absent keys become NaN."""
        labels = ["b", "c", "d", "a"]
        mapping = {"a": 0.0, "b": 1.0, "c": 2.0}
        aligned = Series(mapping, index=labels)
        assert_series_equal(aligned, Series([1, 2, nan, 0], index=labels))

        # Same alignment with Period keys against a longer PeriodIndex.
        periods = tm.makePeriodIndex(100)
        mapping = {periods[0]: 0, periods[1]: 1}
        aligned = Series(mapping, index=periods)

        expected = Series(np.nan, periods)
        expected.iloc[0] = 0
        expected.iloc[1] = 1
        assert_series_equal(aligned, expected)

    def test_constructor_dict_order(self):
        """Ordering of a Series built from a dict (GH19018).

        Insertion order is expected when ``PY36`` is true; otherwise the
        expected Series is in sorted order.
        """
        built = Series({"b": 1, "a": 0, "c": 2})

        if PY36:
            values, labels = [1, 0, 2], "bac"
        else:
            values, labels = [0, 1, 2], "abc"
        expected = Series(values, index=list(labels))

        tm.assert_series_equal(built, expected)

    @pytest.mark.parametrize("value", [2, np.nan, None, float("nan")])
    def test_constructor_dict_nan_key(self, value):
        """NaN-like dict keys survive as index labels (GH 18480)."""
        flat = {1: "a", value: "b", float("nan"): "c", 4: "d"}
        flat_sorted = Series(flat).sort_values()
        flat_expected = Series(["a", "b", "c", "d"],
                               index=[1, value, np.nan, 4])
        assert_series_equal(flat_sorted, flat_expected)

        # Tuple keys (the MultiIndex case).
        nested = {(1, 1): "a", (2, np.nan): "b", (3, value): "c"}
        nested_sorted = Series(nested).sort_values()
        tuple_index = Index([(1, 1), (2, np.nan), (3, value)])
        nested_expected = Series(["a", "b", "c"], index=tuple_index)
        assert_series_equal(nested_sorted, nested_expected)

    def test_constructor_dict_datetime64_index(self):
        """Dict keys of any datetime flavour give the same Series (GH 9456).

        Keys are built as ``np.datetime64``, ``datetime.datetime`` and
        ``Timestamp``; each dict must match the Timestamp-indexed Series.
        """
        dates_as_str = ["1984-02-19", "1988-11-06", "1989-12-03", "1990-03-15"]
        values = [42544017.198965244, 1234565, 40512335.181958228, -1]

        key_builders = (
            np.datetime64,
            lambda x: datetime.strptime(x, "%Y-%m-%d"),
            Timestamp,
        )

        expected = Series(values, (Timestamp(x) for x in dates_as_str))

        for build_key in key_builders:
            keyed = dict(zip(map(build_key, dates_as_str), values))
            assert_series_equal(Series(keyed), expected)

    def test_constructor_list_of_tuples(self):
        data = [(1, 1), (2, 2), (2, 3)]
        s = Series(data)
        assert list(s) == data

    def test_constructor_tuple_of_tuples(self):
        data = ((1, 1), (2, 2), (2, 3))
        s = Series(data)
        assert tuple(s) == data

    def test_constructor_dict_of_tuples(self):
        """Tuple dict keys, including a None component, form a MultiIndex."""
        keyed = {(1, 2): 3, (None, 5): 6}
        sorted_by_value = Series(keyed).sort_values()

        expected_index = MultiIndex.from_tuples([(1, 2), (None, 5)])
        expected = Series([3, 6], index=expected_index)
        tm.assert_series_equal(sorted_by_value, expected)

    def test_constructor_set(self):
        """Unordered collections (set, frozenset) are rejected."""
        members = {1, 2, 3, 4, 5}
        cases = (
            (members, "'set' type is unordered"),
            (frozenset(members), "'frozenset' type is unordered"),
        )
        for values, expected_message in cases:
            with pytest.raises(TypeError, match=expected_message):
                Series(values)

    # Warning filter: see https://github.com/pandas-dev/pandas/issues/22698
    @pytest.mark.filterwarnings("ignore:elementwise comparison:FutureWarning")
    def test_fromDict(self):
        """Index order and dtype inference for Series built from dicts."""
        numeric = Series({"a": 0, "b": 1, "c": 2, "d": 3})
        tm.assert_is_sorted(numeric.index)

        # Mixed value types are held as object.
        with_datetime = Series(
            {"a": 0, "b": "1", "c": "2", "d": datetime.now()})
        assert with_datetime.dtype == np.object_

        with_strings = Series({"a": 0, "b": "1", "c": "2", "d": "3"})
        assert with_strings.dtype == np.object_

        # Numeric strings with an explicit float dtype.
        forced_float = Series({"a": "0", "b": "1"}, dtype=float)
        assert forced_float.dtype == np.float64

    def test_fromValue(self, datetime_series):
        """A scalar is broadcast over the index with the matching dtype."""
        target_index = datetime_series.index
        expected_length = len(datetime_series)

        nans = Series(np.NaN, index=target_index)
        assert nans.dtype == np.float_
        assert len(nans) == expected_length

        strings = Series("foo", index=target_index)
        assert strings.dtype == np.object_
        assert len(strings) == expected_length

        moment = datetime.now()
        dates = Series(moment, index=target_index)
        assert dates.dtype == "M8[ns]"
        assert len(dates) == expected_length

        # GH12336: a scalar with dtype="category"
        categorical = Series(0, index=target_index, dtype="category")
        expected = Series(0, index=target_index).astype("category")
        assert categorical.dtype == "category"
        assert len(categorical) == expected_length
        tm.assert_series_equal(categorical, expected)

    def test_constructor_dtype_timedelta64(self):
        """Exercise timedelta64 construction, inference and casting on Series.

        ``td`` is rebound repeatedly and the last binding is reused by the
        astype checks, so statement order matters.
        """

        # basic
        td = Series([timedelta(days=i) for i in range(3)])
        assert td.dtype == "timedelta64[ns]"

        td = Series([timedelta(days=1)])
        assert td.dtype == "timedelta64[ns]"

        # datetime.timedelta mixed with np.timedelta64
        td = Series(
            [timedelta(days=1),
             timedelta(days=2),
             np.timedelta64(1, "s")])

        assert td.dtype == "timedelta64[ns]"

        # mixed with NaT
        td = Series([timedelta(days=1), NaT], dtype="m8[ns]")
        assert td.dtype == "timedelta64[ns]"

        td = Series([timedelta(days=1), np.nan], dtype="m8[ns]")
        assert td.dtype == "timedelta64[ns]"

        td = Series([np.timedelta64(300000000), pd.NaT], dtype="m8[ns]")
        assert td.dtype == "timedelta64[ns]"

        # improved inference
        # GH5689
        td = Series([np.timedelta64(300000000), NaT])
        assert td.dtype == "timedelta64[ns]"

        # because iNaT is int, not coerced to timedelta
        td = Series([np.timedelta64(300000000), iNaT])
        assert td.dtype == "object"

        td = Series([np.timedelta64(300000000), np.nan])
        assert td.dtype == "timedelta64[ns]"

        td = Series([pd.NaT, np.timedelta64(300000000)])
        assert td.dtype == "timedelta64[ns]"

        td = Series([np.timedelta64(1, "s")])
        assert td.dtype == "timedelta64[ns]"

        # NOTE: the following check is disabled in the original source
        # (frequency-conversion astypes):
        # for t in ['s', 'D', 'us', 'ms']:
        #    with pytest.raises(TypeError):
        #        td.astype('m8[%s]' % t)

        # valid astype
        td.astype("int64")

        # invalid casting
        msg = r"cannot astype a timedelta from \[timedelta64\[ns\]\] to" r" \[int32\]"
        with pytest.raises(TypeError, match=msg):
            td.astype("int32")

        # a non-timedelta string with an explicit m8[ns] dtype is an
        # invalid cast
        msg = "Could not convert object to NumPy timedelta"
        with pytest.raises(ValueError, match=msg):
            Series([timedelta(days=1), "foo"], dtype="m8[ns]")

        # leave as object here
        td = Series([timedelta(days=i) for i in range(3)] + ["foo"])
        assert td.dtype == "object"

        # these will correctly infer a timedelta
        s = Series([None, pd.NaT, "1 Day"])
        assert s.dtype == "timedelta64[ns]"
        s = Series([np.nan, pd.NaT, "1 Day"])
        assert s.dtype == "timedelta64[ns]"
        s = Series([pd.NaT, None, "1 Day"])
        assert s.dtype == "timedelta64[ns]"
        s = Series([pd.NaT, np.nan, "1 Day"])
        assert s.dtype == "timedelta64[ns]"

    # GH 16406
    def test_constructor_mixed_tz(self):
        """A naive and a tz-aware Timestamp together are held as object."""
        inferred = Series([
            Timestamp("20130101"),
            Timestamp("20130101", tz="US/Eastern"),
        ])
        explicit_object = Series(
            [
                Timestamp("20130101"),
                Timestamp("20130101", tz="US/Eastern"),
            ],
            dtype="object",
        )
        assert_series_equal(inferred, explicit_object)

    def test_NaT_scalar(self):
        """A missing scalar read from a Series can be assigned back."""
        stamps = Series([0, 1000, 2000, iNaT], dtype="M8[ns]")

        missing = stamps[3]
        assert isna(missing)

        stamps[2] = missing
        assert isna(stamps[2])

    def test_NaT_cast(self):
        """Casting a NaN-only Series to M8[ns] gives NaT (GH10747)."""
        converted = Series([np.nan]).astype("M8[ns]")
        assert_series_equal(converted, Series([NaT]))

    def test_constructor_name_hashable(self):
        for n in [777, 777.0, "name", datetime(2001, 11, 11), (1, ), "\u05D0"]:
            for data in [[1, 2, 3], np.ones(3), {"a": 0, "b": 1}]:
                s = Series(data, name=n)
                assert s.name == n

    def test_constructor_name_unhashable(self):
        """Unhashable names (list, ndarray, dict) raise TypeError."""
        expected_message = r"Series\.name must be a hashable type"
        for bad_name in [["name_list"], np.ones(2), {1: 2}]:
            for payload in [["name_list"], np.ones(2), {1: 2}]:
                with pytest.raises(TypeError, match=expected_message):
                    Series(payload, name=bad_name)

    def test_auto_conversion(self):
        """A list of the elements of a date_range infers M8[ns]."""
        stamps = list(date_range("1/1/2000", periods=10))
        assert Series(stamps).dtype == "M8[ns]"

    def test_convert_non_ns(self):
        """Non-nanosecond numpy timedelta64/datetime64 arrays are converted."""
        # timedelta64 in seconds
        seconds = np.array([1, 2, 3], dtype="timedelta64[s]")
        assert_series_equal(
            Series(seconds),
            Series(pd.timedelta_range("00:00:01", periods=3, freq="s")),
        )

        # datetime64 in days.
        # NOTE (from the original author): creating a numpy datetime64 is in
        # LOCAL time; this seems to work for M8[D] but not for M8[s], which
        # is why only the day-resolution case is asserted here. The disabled
        # M8[s] variant built the array from '2013-01-01 00:00:01',
        # '2013-01-01 00:00:02', '2013-01-01 00:00:03' and compared it with
        # date_range('20130101 00:00:01', period=3, freq='s').
        days = np.array(["2013-01-01", "2013-01-02", "2013-01-03"],
                        dtype="datetime64[D]")
        assert_series_equal(
            Series(days),
            Series(date_range("20130101", periods=3, freq="D")),
        )

    @pytest.mark.parametrize(
        "index",
        [
            date_range("1/1/2000", periods=10),
            timedelta_range("1 day", periods=10),
            period_range("2000-Q1", periods=10, freq="Q"),
        ],
        ids=lambda x: type(x).__name__,
    )
    def test_constructor_cant_cast_datetimelike(self, index):
        """Datetime-like indexes refuse a float cast but allow int64.

        The float cast must raise ``TypeError`` with a message starting
        "Cannot cast <kind>"; the int64 cast must equal ``index.astype``.
        """
        # Remove the literal "Index" suffix (e.g. PeriodIndex -> Period) so
        # the pattern does not care whether the message names the Index or
        # its backing Array. ``str.rstrip("Index")`` is not suitable here: it
        # strips any trailing characters from the set {I, n, d, e, x}, which
        # also eats the last letters of "Datetime" and "Period".
        kind = type(index).__name__
        suffix = "Index"
        if kind.endswith(suffix):
            kind = kind[:-len(suffix)]

        # floats are not ok
        msg = "Cannot cast {}.*? to ".format(kind)
        with pytest.raises(TypeError, match=msg):
            Series(index, dtype=float)

        # ints are ok
        # we test with np.int64 to get similar results on
        # windows / 32-bit platforms
        result = Series(index, dtype=np.int64)
        expected = Series(index.astype(np.int64))
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "index",
        [
            date_range("1/1/2000", periods=10),
            timedelta_range("1 day", periods=10),
            period_range("2000-Q1", periods=10, freq="Q"),
        ],
        ids=lambda x: type(x).__name__,
    )
    def test_constructor_cast_object(self, index):
        """``dtype=object`` gives the same Series for three input forms.

        The forms are the index itself, an object-dtype Index built from it,
        and its ``astype(object)`` result.
        """
        inputs = (
            index,
            pd.Index(index, dtype=object),
            index.astype(object),
        )
        for data in inputs:
            as_object = Series(data, dtype=object)
            reference = Series(index).astype(object)
            tm.assert_series_equal(as_object, reference)

    @pytest.mark.parametrize("dtype", [np.datetime64, np.timedelta64])
    def test_constructor_generic_timestamp_no_frequency(self, dtype):
        """Unit-less datetime64/timedelta64 dtypes raise (gh-15524, gh-15987)."""
        expected_message = "dtype has no unit. Please pass in"
        with pytest.raises(ValueError, match=expected_message):
            Series([], dtype=dtype)

    @pytest.mark.parametrize(
        "dtype,msg",
        [
            ("m8[ps]", "cannot convert timedeltalike"),
            ("M8[ps]", "cannot convert datetimelike"),
        ],
    )
    def test_constructor_generic_timestamp_bad_frequency(self, dtype, msg):
        """Picosecond-unit dtypes raise TypeError (gh-15524, gh-15987)."""
        no_values = []
        with pytest.raises(TypeError, match=msg):
            Series(no_values, dtype=dtype)

    @pytest.mark.parametrize("dtype", [None, "uint8", "category"])
    def test_constructor_range_dtype(self, dtype):
        """A ``range`` input honours the requested dtype (GH 16804).

        With no dtype the expected Series is int64.
        """
        expected_dtype = dtype or "int64"
        from_list = Series([0, 1, 2, 3, 4], dtype=expected_dtype)
        from_range = Series(range(5), dtype=dtype)
        tm.assert_series_equal(from_range, from_list)

    def test_constructor_tz_mixed_data(self):
        """Naive plus tz-aware Timestamps match explicit object (GH 13051)."""
        naive = Timestamp("2016-05-01 02:03:37")
        pacific = Timestamp("2016-04-30 19:03:37-0700", tz="US/Pacific")
        stamps = [naive, pacific]

        inferred = Series(stamps)
        explicit_object = Series(stamps, dtype=object)
        tm.assert_series_equal(inferred, explicit_object)
Beispiel #27
0
def test_survival_table_from_events_will_collapse_if_asked():
    """With ``collapse=True`` the table index is the two expected intervals."""
    durations = np.array([1, 3, 4, 5])
    observed = np.array([True, True, True, True])
    collapsed = utils.survival_table_from_events(
        durations, observed, collapse=True)
    expected_index = [
        pd.Interval(0, 3.5089999999999999, closed='right'),
        pd.Interval(3.5089999999999999, 7.0179999999999998, closed='right'),
    ]
    assert collapsed.index.tolist() == expected_index
Beispiel #28
0
class TestSeriesConstructors():

    def test_invalid_dtype(self):
        """Each unsupported ``dtype`` argument raises TypeError."""
        # GH15520
        expected_message = 'not understood'
        for bad_dtype in [pd.Timestamp, 'pd.Timestamp', list]:
            with pytest.raises(TypeError, match=expected_message):
                Series([], name='time', dtype=bad_dtype)

    def test_scalar_conversion(self):
        """A scalar still builds a Series; one-element Series coerce to numbers."""
        # Pass in scalar is disabled
        from_scalar = Series(0.5)
        is_plain_float = isinstance(from_scalar, float)
        assert not is_plain_float

        # Coercion
        as_float = float(Series([1.]))
        assert as_float == 1.0
        as_int = int(Series([1.]))
        assert as_int == 1
        as_long = long(Series([1.]))
        assert as_long == 1

    def test_constructor(self, datetime_series, empty_series):
        """Exercise basic ``Series`` construction paths and failure modes."""
        assert datetime_series.index.is_all_dates

        # Pass in Series
        rewrapped = Series(datetime_series)
        assert rewrapped.index.is_all_dates

        assert tm.equalContents(rewrapped.index, datetime_series.index)
        # Ensure new index is not created
        assert id(datetime_series.index) == id(rewrapped.index)

        # Mixed type Series
        mixed = Series(['hello', np.NaN], index=[0, 1])
        assert mixed.dtype == np.object_
        assert mixed[1] is np.NaN

        assert not empty_series.index.is_all_dates
        assert not Series({}).index.is_all_dates

        # 2-D data with a 1-D index must be rejected
        two_dim = np.random.randn(3, 3)
        short_index = np.arange(3)
        with pytest.raises(Exception):
            Series(two_dim, index=short_index)

        # the name survives re-wrapping
        mixed.name = 'Series'
        assert Series(mixed).name == 'Series'

        # raise on MultiIndex GH4187
        multi = MultiIndex.from_arrays([[1, 2], [3, 4]])
        with pytest.raises(NotImplementedError):
            Series(multi)

    @pytest.mark.parametrize('input_class', [list, dict, OrderedDict])
    def test_constructor_empty(self, input_class):
        """An empty container builds the same Series as passing no data.

        Checked without a dtype, with explicit dtypes and, for the non-list
        containers, together with an index.
        """
        # these are Index() and RangeIndex() which don't compare type equal
        # but are just .equals
        assert_series_equal(Series(), Series(input_class()),
                            check_index_type=False)

        # With explicit dtype (GH 18515 covers dtype=category):
        for dtype in ('float64', 'category'):
            no_data = Series(dtype=dtype)
            from_container = Series(input_class(), dtype=dtype)
            assert_series_equal(no_data, from_container,
                                check_index_type=False)

        if input_class is list:
            return

        # With index:
        no_data = Series(index=lrange(10))
        from_container = Series(input_class(), index=lrange(10))
        assert_series_equal(no_data, from_container)

        # With index and dtype float64:
        all_nan = Series(np.nan, index=lrange(10))
        from_container = Series(input_class(), index=lrange(10),
                                dtype='float64')
        assert_series_equal(all_nan, from_container)

        # GH 19853 : with empty string, index and dtype str
        typed_blank = Series('', dtype=str, index=range(3))
        untyped_blank = Series('', index=range(3))
        assert_series_equal(typed_blank, untyped_blank)

    @pytest.mark.parametrize('input_arg', [np.nan, float('nan')])
    def test_constructor_nan(self, input_arg):
        """Broadcasting a NaN scalar equals a data-less float64 Series."""
        data_less = Series(dtype='float64', index=lrange(10))
        broadcast = Series(input_arg, index=lrange(10))

        assert_series_equal(data_less, broadcast, check_index_type=False)

    @pytest.mark.parametrize('dtype', [
        'f8', 'i8', 'M8[ns]', 'm8[ns]', 'category', 'object',
        'datetime64[ns, UTC]',
    ])
    @pytest.mark.parametrize('index', [None, pd.Index([])])
    def test_constructor_dtype_only(self, dtype, index):
        """Only ``dtype`` (and an empty/absent index) gives an empty Series."""
        # GH-20865
        ser = pd.Series(dtype=dtype, index=index)
        assert ser.dtype == dtype
        length = len(ser)
        assert length == 0

    def test_constructor_no_data_index_order(self):
        """A data-less Series keeps the index labels in the order given."""
        ser = pd.Series(index=['b', 'a', 'c'])
        labels = ser.index.tolist()
        assert labels == ['b', 'a', 'c']

    def test_constructor_no_data_string_type(self):
        """With ``dtype=str`` and no data the single entry is NaN."""
        # GH 22477
        ser = pd.Series(index=[1], dtype=str)
        first_value = ser.iloc[0]
        assert np.isnan(first_value)

    @pytest.mark.parametrize('item', ['entry', 'ѐ', 13])
    def test_constructor_string_element_string_type(self, item):
        """A scalar broadcast with ``dtype=str`` is stored as ``str(item)``."""
        # GH 22477
        ser = pd.Series(item, index=[1], dtype=str)
        stored = ser.iloc[0]
        assert stored == str(item)

    def test_constructor_dtype_str_na_values(self, string_dtype):
        """``None`` and NaN survive construction with a string dtype."""
        # https://github.com/pandas-dev/pandas/issues/21083
        with_none = Series(['x', None], dtype=string_dtype)
        null_mask = with_none.isna()
        tm.assert_series_equal(null_mask, Series([False, True]))
        assert with_none.iloc[1] is None

        with_nan = Series(['x', np.nan], dtype=string_dtype)
        assert np.isnan(with_nan.iloc[1])

    def test_constructor_series(self):
        """Passing a Series plus a sorted index matches ``sort_index()``."""
        unsorted_labels = ['d', 'b', 'a', 'c']
        sorted_labels = sorted(unsorted_labels)
        original = Series([4, 7, -5, 3], index=unsorted_labels)
        realigned = Series(original, index=sorted_labels)

        assert_series_equal(realigned, original.sort_index())

    def test_constructor_iterable(self):
        """An object that only defines ``__iter__`` is accepted as data."""
        # GH 21987
        class OnlyIterable():
            def __iter__(self):
                for value in range(10):
                    yield value

        from_list = Series(list(range(10)), dtype='int64')
        from_iterable = Series(OnlyIterable(), dtype='int64')
        assert_series_equal(from_iterable, from_list)

    def test_constructor_sequence(self):
        """A ``range`` object gives the same Series as the equivalent list."""
        # GH 21987
        from_list = Series(list(range(10)), dtype='int64')
        from_range = Series(range(10), dtype='int64')
        assert_series_equal(from_range, from_list)

    def test_constructor_single_str(self):
        """A bare string is treated as one element, not iterated by char."""
        # GH 21987
        wrapped = Series(['abc'])
        bare = Series('abc')
        assert_series_equal(bare, wrapped)

    def test_constructor_list_like(self):
        """List, tuple and int64 ndarray inputs all equal an int64 Series."""
        # make sure that we are coercing different
        # list-likes to standard dtypes and not
        # platform specific
        expected = Series([1, 2, 3], dtype='int64')
        list_likes = [
            [1, 2, 3],
            (1, 2, 3),
            np.array([1, 2, 3], dtype='int64'),
        ]
        for values in list_likes:
            built = Series(values, index=[0, 1, 2])
            assert_series_equal(built, expected)

    @pytest.mark.parametrize('input_vals', [
        ([1, 2]),
        (['1', '2']),
        (list(pd.date_range('1/1/2011', periods=2, freq='H'))),
        (list(pd.date_range('1/1/2011', periods=2, freq='H',
                            tz='US/Eastern'))),
        ([pd.Interval(left=0, right=5)]),
    ])
    def test_constructor_list_str(self, input_vals, string_dtype):
        """Passing a string dtype matches constructing then ``astype``."""
        # GH 16605
        # Ensure that data elements from a list are converted to strings
        # when dtype is str, 'str', or 'U'
        via_constructor = Series(input_vals, dtype=string_dtype)
        via_astype = Series(input_vals).astype(string_dtype)
        assert_series_equal(via_constructor, via_astype)

    def test_constructor_list_str_na(self, string_dtype):
        """NaN stays NaN when floats are converted with a string dtype."""
        converted = Series([1.0, 2.0, np.nan], dtype=string_dtype)
        as_object = Series(['1.0', '2.0', np.nan], dtype=object)
        assert_series_equal(converted, as_object)
        last_value = converted[2]
        assert np.isnan(last_value)

    def test_constructor_generator(self):
        """A generator is consumed as data, with and without an index."""
        numbers = (i for i in range(10))

        from_generator = Series(numbers)
        expected = Series(lrange(10))
        assert_series_equal(from_generator, expected)

        # a fresh generator is needed: the first one is exhausted
        numbers = (i for i in range(10))
        from_generator = Series(numbers, index=lrange(10, 20))
        expected.index = lrange(10, 20)
        assert_series_equal(from_generator, expected)

    def test_constructor_map(self):
        """The result of ``map`` is accepted as data, with and without index."""
        # GH8909
        mapped = map(lambda x: x, range(10))

        from_map = Series(mapped)
        expected = Series(lrange(10))
        assert_series_equal(from_map, expected)

        # build the map again before reusing it with an explicit index
        mapped = map(lambda x: x, range(10))
        from_map = Series(mapped, index=lrange(10, 20))
        expected.index = lrange(10, 20)
        assert_series_equal(from_map, expected)

    def test_constructor_categorical(self):
        """Categorical input round-trips, can be cast, and stays categorical."""
        fast_cat = pd.Categorical([0, 1, 2, 0, 1, 2], ['a', 'b', 'c'],
                                  fastpath=True)
        wrapped = Series(fast_cat)
        tm.assert_categorical_equal(wrapped.values, fast_cat)

        # can cast to a new dtype
        as_int = Series(pd.Categorical([1, 2, 3]), dtype='int64')
        plain_int = pd.Series([1, 2, 3], dtype='int64')
        tm.assert_series_equal(as_int, plain_int)

        # GH12574
        from_categorical = Series(pd.Categorical([1, 2, 3]), dtype='category')
        assert is_categorical_dtype(from_categorical)
        assert is_categorical_dtype(from_categorical.dtype)
        from_list = Series([1, 2, 3], dtype='category')
        assert is_categorical_dtype(from_list)
        assert is_categorical_dtype(from_list.dtype)

    def test_constructor_categorical_with_coercion(self):
        """Check a Categorical-backed Series alone and inside DataFrames.

        Covers direct construction, extraction from frames built from the
        Categorical or from the Series, frames with several such columns, and
        scalar access on a frame column replaced by a Categorical (GH8623).
        The bare ``str(...)`` calls are smoke tests: only that they do not
        raise is checked.
        """
        factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
        # test basic creation / coercion of categoricals
        s = Series(factor, name='A')
        assert s.dtype == 'category'
        assert len(s) == len(factor)
        str(s.values)
        str(s)

        # in a frame
        df = DataFrame({'A': factor})
        result = df['A']
        tm.assert_series_equal(result, s)
        result = df.iloc[:, 0]
        tm.assert_series_equal(result, s)
        assert len(df) == len(factor)
        str(df.values)
        str(df)

        # same checks with the frame built from the Series instead
        df = DataFrame({'A': s})
        result = df['A']
        tm.assert_series_equal(result, s)
        assert len(df) == len(factor)
        str(df.values)
        str(df)

        # multiples
        df = DataFrame({'A': s, 'B': s, 'C': 1})
        result1 = df['A']
        result2 = df['B']
        tm.assert_series_equal(result1, s)
        # column 'B' carries its own name, so names are compared separately
        tm.assert_series_equal(result2, s, check_names=False)
        assert result2.name == 'B'
        assert len(df) == len(factor)
        str(df.values)
        str(df)

        # GH8623
        x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'],
                       [1, 'John P. Doe']],
                      columns=['person_id', 'person_name'])
        x['person_name'] = Categorical(x.person_name
                                       )  # doing this breaks transform

        # every scalar access path must return the same first value
        expected = x.iloc[0].person_name
        result = x.person_name.iloc[0]
        assert result == expected

        result = x.person_name[0]
        assert result == expected

        result = x.person_name.loc[0]
        assert result == expected

    def test_constructor_categorical_dtype(self):
        """``CategoricalDtype`` categories and orderedness are honoured."""
        ordered_abc = pd.Series(['a', 'b'],
                                dtype=CategoricalDtype(['a', 'b', 'c'],
                                                       ordered=True))
        assert is_categorical_dtype(ordered_abc) is True
        tm.assert_index_equal(ordered_abc.cat.categories,
                              pd.Index(['a', 'b', 'c']))
        assert ordered_abc.cat.ordered

        unordered_ba = pd.Series(['a', 'b'],
                                 dtype=CategoricalDtype(['b', 'a']))
        assert is_categorical_dtype(unordered_ba)
        tm.assert_index_equal(unordered_ba.cat.categories,
                              pd.Index(['b', 'a']))
        assert unordered_ba.cat.ordered is False

        # GH 19565 - Check broadcasting of scalar with Categorical dtype
        broadcast = Series('a', index=[0, 1],
                           dtype=CategoricalDtype(['a', 'b'], ordered=True))
        spelled_out = Series(['a', 'a'], index=[0, 1],
                             dtype=CategoricalDtype(['a', 'b'], ordered=True))
        tm.assert_series_equal(broadcast, spelled_out, check_categorical=True)

    def test_categorical_sideeffects_free(self):
        """Check when a Series and its source Categorical share data.

        With ``copy=True`` changes to the Series must not show up in the
        Categorical; with the default (no copy) they must.
        """
        # Passing a categorical to a Series and then changing values in either
        # the series or the categorical should not change the values in the
        # other one, IF you specify copy!
        cat = Categorical(["a", "b", "c", "a"])
        s = Series(cat, copy=True)
        # NOTE(review): ``s.cat`` looks like the ``.cat`` accessor (it exposes
        # ``.categories`` below), not the stored values, so this identity check
        # presumably always passes — compare ``s.values is cat`` in the
        # no-copy half; confirm the intent.
        assert s.cat is not cat
        s.cat.categories = [1, 2, 3]
        exp_s = np.array([1, 2, 3, 1], dtype=np.int64)
        exp_cat = np.array(["a", "b", "c", "a"], dtype=np.object_)
        tm.assert_numpy_array_equal(s.__array__(), exp_s)
        tm.assert_numpy_array_equal(cat.__array__(), exp_cat)

        # setting
        s[0] = 2
        exp_s2 = np.array([2, 2, 3, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(s.__array__(), exp_s2)
        tm.assert_numpy_array_equal(cat.__array__(), exp_cat)

        # however, copy is False by default
        # so this WILL change values
        cat = Categorical(["a", "b", "c", "a"])
        s = Series(cat)
        assert s.values is cat
        s.cat.categories = [1, 2, 3]
        exp_s = np.array([1, 2, 3, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(s.__array__(), exp_s)
        # the categorical is expected to reflect the renamed categories too
        tm.assert_numpy_array_equal(cat.__array__(), exp_s)

        s[0] = 2
        exp_s2 = np.array([2, 2, 3, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(s.__array__(), exp_s2)
        tm.assert_numpy_array_equal(cat.__array__(), exp_s2)

    def test_unordered_compare_equal(self):
        """A value outside the dtype's categories compares equal to NaN."""
        two_categories = CategoricalDtype(['a', 'b'])
        via_dtype = pd.Series(['a', 'b', 'c'], dtype=two_categories)
        with_nan = pd.Categorical(['a', 'b', np.nan], categories=['a', 'b'])
        via_categorical = pd.Series(with_nan)
        tm.assert_series_equal(via_dtype, via_categorical)

    def test_constructor_maskedarray(self):
        """Build Series from numpy masked arrays of several dtypes.

        For float, int, bool and ``M8[ns]`` the same three stages are
        checked: a fully masked array, a partially filled one, and a fully
        filled one. Each stage mutates the same ``data`` array in place, so
        the statement order matters.
        """
        # --- float ---
        data = ma.masked_all((3, ), dtype=float)
        result = Series(data)
        expected = Series([nan, nan, nan])
        assert_series_equal(result, expected)

        data[0] = 0.0
        data[2] = 2.0
        index = ['a', 'b', 'c']
        result = Series(data, index=index)
        expected = Series([0.0, nan, 2.0], index=index)
        assert_series_equal(result, expected)

        data[1] = 1.0
        result = Series(data, index=index)
        expected = Series([0.0, 1.0, 2.0], index=index)
        assert_series_equal(result, expected)

        # --- int: expected as float while entries are masked, int once full
        data = ma.masked_all((3, ), dtype=int)
        result = Series(data)
        expected = Series([nan, nan, nan], dtype=float)
        assert_series_equal(result, expected)

        data[0] = 0
        data[2] = 2
        index = ['a', 'b', 'c']
        result = Series(data, index=index)
        expected = Series([0, nan, 2], index=index, dtype=float)
        assert_series_equal(result, expected)

        data[1] = 1
        result = Series(data, index=index)
        expected = Series([0, 1, 2], index=index, dtype=int)
        assert_series_equal(result, expected)

        # --- bool: expected as object while entries are masked, bool once full
        data = ma.masked_all((3, ), dtype=bool)
        result = Series(data)
        expected = Series([nan, nan, nan], dtype=object)
        assert_series_equal(result, expected)

        data[0] = True
        data[2] = False
        index = ['a', 'b', 'c']
        result = Series(data, index=index)
        expected = Series([True, nan, False], index=index, dtype=object)
        assert_series_equal(result, expected)

        data[1] = True
        result = Series(data, index=index)
        expected = Series([True, True, False], index=index, dtype=bool)
        assert_series_equal(result, expected)

        # --- datetime64[ns]: masked entries are expected as iNaT
        data = ma.masked_all((3, ), dtype='M8[ns]')
        result = Series(data)
        expected = Series([iNaT, iNaT, iNaT], dtype='M8[ns]')
        assert_series_equal(result, expected)

        data[0] = datetime(2001, 1, 1)
        data[2] = datetime(2001, 1, 3)
        index = ['a', 'b', 'c']
        result = Series(data, index=index)
        expected = Series([datetime(2001, 1, 1), iNaT,
                           datetime(2001, 1, 3)], index=index, dtype='M8[ns]')
        assert_series_equal(result, expected)

        data[1] = datetime(2001, 1, 2)
        result = Series(data, index=index)
        expected = Series([datetime(2001, 1, 1), datetime(2001, 1, 2),
                           datetime(2001, 1, 3)], index=index, dtype='M8[ns]')
        assert_series_equal(result, expected)

    def test_series_ctor_plus_datetimeindex(self):
        """The passed DatetimeIndex object itself becomes the result index."""
        business_days = date_range('20090415', '20090519', freq='B')
        ones_by_day = {day: 1 for day in business_days}

        ser = Series(ones_by_day, index=business_days)
        assert ser.index is business_days

    def test_constructor_default_index(self):
        """Without an index the labels equal ``pd.Index(np.arange(n))``."""
        ser = Series([0, 1, 2])
        expected_index = pd.Index(np.arange(3))
        tm.assert_index_equal(ser.index, expected_index)

    @pytest.mark.parametrize('input', [[1, 2, 3],
                                       (1, 2, 3),
                                       list(range(3)),
                                       pd.Categorical(['a', 'b', 'a']),
                                       (i for i in range(3)),
                                       map(lambda x: x, range(3))])
    def test_constructor_index_mismatch(self, input):
        """Three values with a four-label index raise ValueError."""
        # GH 19342
        # test that construction of a Series with an index of different length
        # raises an error
        expected_message = 'Length of passed values is 3, index implies 4'
        four_labels = np.arange(4)
        with pytest.raises(ValueError, match=expected_message):
            Series(input, index=four_labels)

    def test_constructor_numpy_scalar(self):
        """A zero-dimensional ndarray broadcasts like a Python scalar."""
        # GH 19342
        # construction with a numpy scalar
        # should not raise
        zero_dim = np.array(100)
        from_zero_dim = Series(zero_dim, index=np.arange(4), dtype='int64')
        from_python_int = Series(100, index=np.arange(4), dtype='int64')
        tm.assert_series_equal(from_zero_dim, from_python_int)

    def test_constructor_broadcast_list(self):
        """A one-element list is not broadcast over a longer index."""
        # GH 19342
        # construction with single-element container and index
        # should raise
        with pytest.raises(ValueError):
            Series(['foo'], index=['a', 'b', 'c'])

    def test_constructor_corner(self):
        """A list of DataFrames can be used as Series data."""
        frame = tm.makeTimeDataFrame()
        frames = [frame, frame]
        ser = Series(frames, index=[0, 1])
        assert isinstance(ser, Series)

    def test_constructor_sanitize(self):
        """Float data requested as 'i8' stays 'f8' when it contains NaN."""
        whole_floats = np.array([1., 1., 8.])
        as_int = Series(whole_floats, dtype='i8')
        assert as_int.dtype == np.dtype('i8')

        floats_with_nan = np.array([1., 1., np.nan])
        kept_float = Series(floats_with_nan, copy=True, dtype='i8')
        assert kept_float.dtype == np.dtype('f8')

    def test_constructor_copy(self):
        """``copy=True`` with a dtype yields data independent of the source."""
        # GH15125
        # test dtype parameter has no side effects on copy=True
        inputs = [[1.], np.array([1.])]
        for data in inputs:
            source = Series(data)
            duplicate = pd.Series(source, copy=True, dtype=float)

            # copy=True maintains original data in Series
            tm.assert_series_equal(source, duplicate)

            # changes to origin of copy does not affect the copy
            source[0] = 2.
            assert not source.equals(duplicate)
            assert source[0] == 2.
            assert duplicate[0] == 1.

    @pytest.mark.parametrize(
        "index",
        [
            pd.date_range('20170101', periods=3, tz='US/Eastern'),
            pd.date_range('20170101', periods=3),
            pd.timedelta_range('1 day', periods=3),
            pd.period_range('2012Q1', periods=3, freq='Q'),
            pd.Index(list('abc')),
            pd.Int64Index([1, 2, 3]),
            pd.RangeIndex(0, 3)],
        ids=lambda x: type(x).__name__)
    def test_constructor_limit_copies(self, index):
        """The values stored in the first block are not the index object."""
        # GH 17449
        # limit copies of input
        ser = pd.Series(index)

        # we make 1 copy; this is just a smoke test here
        stored_values = ser._data.blocks[0].values
        assert stored_values is not index

    def test_constructor_pass_none(self):
        """``None`` data gives float64 by default and honours ``dtype``."""
        untyped = Series(None, index=lrange(5))
        assert untyped.dtype == np.float64

        typed = Series(None, index=lrange(5), dtype=object)
        assert typed.dtype == np.object_

        # GH 7431
        # inference on the index
        from_ndarray_index = Series(index=np.array([None]))
        from_index = Series(index=Index([None]))
        assert_series_equal(from_ndarray_index, from_index)

    def test_constructor_pass_nan_nat(self):
        """All-NaN data is float64; any NaT makes it datetime64[ns]."""
        # GH 13467
        all_nan = Series([np.nan, np.nan], dtype=np.float64)
        assert all_nan.dtype == np.float64
        tm.assert_series_equal(Series([np.nan, np.nan]), all_nan)
        tm.assert_series_equal(Series(np.array([np.nan, np.nan])), all_nan)

        all_nat = Series([pd.NaT, pd.NaT])
        assert all_nat.dtype == 'datetime64[ns]'

        # each pair is checked both as a list and as an ndarray
        nat_pairs = (
            [pd.NaT, pd.NaT],
            [pd.NaT, np.nan],
            [np.nan, pd.NaT],
        )
        for pair in nat_pairs:
            tm.assert_series_equal(Series(pair), all_nat)
            tm.assert_series_equal(Series(np.array(pair)), all_nat)

    def test_constructor_cast(self):
        """Non-numeric strings cannot be constructed with ``dtype=float``."""
        expected_message = "could not convert string to float"
        letters = ["a", "b", "c"]
        with pytest.raises(ValueError, match=expected_message):
            Series(letters, dtype=float)

    def test_constructor_unsigned_dtype_overflow(self, uint_dtype):
        """A negative value with an unsigned dtype raises OverflowError."""
        # see gh-15832
        expected_message = (
            'Trying to coerce negative values to unsigned integers')
        with pytest.raises(OverflowError, match=expected_message):
            Series([-1], dtype=uint_dtype)

    def test_constructor_coerce_float_fail(self, any_int_dtype):
        """A fractional value with an integer dtype raises ValueError."""
        # see gh-15832
        expected_message = "Trying to coerce float values to integers"
        with pytest.raises(ValueError, match=expected_message):
            Series([1, 2, 3.5], dtype=any_int_dtype)

    def test_constructor_coerce_float_valid(self, float_dtype):
        """Passing a float dtype matches constructing then ``astype``."""
        via_constructor = Series([1, 2, 3.5], dtype=float_dtype)
        via_astype = Series([1, 2, 3.5]).astype(float_dtype)
        assert_series_equal(via_constructor, via_astype)

    def test_constructor_dtype_no_cast(self):
        """A write through the re-wrapped Series is visible in the original."""
        # see gh-1572
        original = Series([1, 2, 3])
        rewrapped = Series(original, dtype=np.int64)

        rewrapped[1] = 5
        assert original[1] == 5

    def test_constructor_datelike_coercion(self):
        """An explicit object dtype is kept for datetime-looking mixed data."""
        # GH 9477
        # incorrectly inferring on datetimelike looking when object dtype is
        # specified
        ser = Series([Timestamp('20130101'), 'NOV'], dtype=object)
        assert ser.iloc[0] == Timestamp('20130101')
        assert ser.iloc[1] == 'NOV'
        assert ser.dtype == object

        # the dtype was being reset on the slicing and re-inferred to datetime
        # even though the blocks are mixed
        row_labels = '216 3T19'.split()
        first_wing = '2T15 4H19'.split()
        second_wing = '416 4T20'.split()
        dates = pd.to_datetime('2016-01-22 2019-09-07'.split())
        frame = pd.DataFrame(
            {'wing1': first_wing,
             'wing2': second_wing,
             'mat': dates}, index=row_labels)

        for label in ('3T19', '216'):
            row = frame.loc[label]
            assert row.dtype == object

    def test_constructor_datetimes_with_nulls(self):
        """Object arrays of ``None`` plus one datetime infer 'M8[ns]'."""
        # gh-15869
        late_datetime = np.array([None, None, None, None,
                                  datetime.now(), None])
        early_datetime = np.array([None, None, datetime.now(), None])
        for values in [late_datetime, early_datetime]:
            inferred = Series(values)
            assert inferred.dtype == 'M8[ns]'

    def test_constructor_dtype_datetime64(self):
        """Cover datetime64 construction, inference and null handling.

        Sections: explicit ``M8[ns]`` with iNaT/NaN, ``np.datetime64`` input,
        out-of-range datetimes, mixed types, non-nanosecond units,
        ``datetime.date`` objects, inference from strings with nulls,
        tz-aware ranges, and inputs that cannot be converted.
        """
        s = Series(iNaT, dtype='M8[ns]', index=lrange(5))
        assert isna(s).all()

        # in theory this should be all nulls, but since
        # we are not specifying a dtype is ambiguous
        s = Series(iNaT, index=lrange(5))
        assert not isna(s).all()

        s = Series(nan, dtype='M8[ns]', index=lrange(5))
        assert isna(s).all()

        s = Series([datetime(2001, 1, 2, 0, 0), iNaT], dtype='M8[ns]')
        assert isna(s[1])
        assert s.dtype == 'M8[ns]'

        s = Series([datetime(2001, 1, 2, 0, 0), nan], dtype='M8[ns]')
        assert isna(s[1])
        assert s.dtype == 'M8[ns]'

        # GH3416
        dates = [
            np.datetime64(datetime(2013, 1, 1)),
            np.datetime64(datetime(2013, 1, 2)),
            np.datetime64(datetime(2013, 1, 3)),
        ]

        s = Series(dates)
        assert s.dtype == 'M8[ns]'

        # assigning NaN must not change the dtype
        s.iloc[0] = np.nan
        assert s.dtype == 'M8[ns]'

        # GH3414 related
        # NOTE(review): both lambdas declare a required parameter ``x`` but
        # pytest.raises is given no arguments to forward, so the TypeError
        # observed is presumably the "missing argument" error from calling
        # the lambda rather than one raised by Series(...) — confirm, and
        # consider ``lambda: ...`` or the context-manager form.
        pytest.raises(TypeError, lambda x: Series(
            Series(dates).astype('int') / 1000000, dtype='M8[ms]'))
        pytest.raises(TypeError,
                      lambda x: Series(dates, dtype='datetime64'))

        # invalid dates can be held as object
        result = Series([datetime(2, 1, 1)])
        assert result[0] == datetime(2, 1, 1, 0, 0)

        result = Series([datetime(3000, 1, 1)])
        assert result[0] == datetime(3000, 1, 1, 0, 0)

        # don't mix types
        result = Series([Timestamp('20130101'), 1], index=['a', 'b'])
        assert result['a'] == Timestamp('20130101')
        assert result['b'] == 1

        # GH6529
        # coerce datetime64 non-ns properly
        dates = date_range('01-Jan-2015', '01-Dec-2015', freq='M')
        values2 = dates.view(np.ndarray).astype('datetime64[ns]')
        expected = Series(values2, index=dates)

        for dtype in ['s', 'D', 'ms', 'us', 'ns']:
            values1 = dates.view(np.ndarray).astype('M8[{0}]'.format(dtype))
            result = Series(values1, dates)
            assert_series_equal(result, expected)

        # GH 13876
        # coerce to non-ns to object properly
        expected = Series(values2, index=dates, dtype=object)
        for dtype in ['s', 'D', 'ms', 'us', 'ns']:
            values1 = dates.view(np.ndarray).astype('M8[{0}]'.format(dtype))
            result = Series(values1, index=dates, dtype=object)
            assert_series_equal(result, expected)

        # leave datetime.date alone
        dates2 = np.array([d.date() for d in dates.to_pydatetime()],
                          dtype=object)
        series1 = Series(dates2, dates)
        tm.assert_numpy_array_equal(series1.values, dates2)
        assert series1.dtype == object

        # these will correctly infer a datetime
        s = Series([None, pd.NaT, '2013-08-05 15:30:00.000001'])
        assert s.dtype == 'datetime64[ns]'
        s = Series([np.nan, pd.NaT, '2013-08-05 15:30:00.000001'])
        assert s.dtype == 'datetime64[ns]'
        s = Series([pd.NaT, None, '2013-08-05 15:30:00.000001'])
        assert s.dtype == 'datetime64[ns]'
        s = Series([pd.NaT, np.nan, '2013-08-05 15:30:00.000001'])
        assert s.dtype == 'datetime64[ns]'

        # tz-aware (UTC and other tz's)
        # GH 8411
        dr = date_range('20130101', periods=3)
        assert Series(dr).iloc[0].tz is None
        dr = date_range('20130101', periods=3, tz='UTC')
        assert str(Series(dr).iloc[0].tz) == 'UTC'
        dr = date_range('20130101', periods=3, tz='US/Eastern')
        assert str(Series(dr).iloc[0].tz) == 'US/Eastern'

        # non-convertible
        s = Series([1479596223000, -1479590, pd.NaT])
        assert s.dtype == 'object'
        assert s[2] is pd.NaT
        assert 'NaT' in str(s)

        # if we passed a NaT it remains
        s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), pd.NaT])
        assert s.dtype == 'object'
        assert s[2] is pd.NaT
        assert 'NaT' in str(s)

        # if we passed a nan it remains
        s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), np.nan])
        assert s.dtype == 'object'
        assert s[2] is np.nan
        assert 'NaN' in str(s)

    def test_constructor_with_datetime_tz(self):
        """Cover tz-aware datetime Series built from a tz-aware range.

        Sections: dtype reporting, export through ``.values``, indexing,
        concat, string formatting, rebuilding the index, dtype inference
        for same/mixed timezones, and all-NaT construction.
        """

        # 8260
        # support datetime64 with tz

        dr = date_range('20130101', periods=3, tz='US/Eastern')
        s = Series(dr)
        assert s.dtype.name == 'datetime64[ns, US/Eastern]'
        assert s.dtype == 'datetime64[ns, US/Eastern]'
        assert is_datetime64tz_dtype(s.dtype)
        assert 'datetime64[ns, US/Eastern]' in str(s)

        # export
        result = s.values
        assert isinstance(result, np.ndarray)
        assert result.dtype == 'datetime64[ns]'

        # localizing the exported values as UTC and converting back to the
        # series' timezone must reproduce the original range
        exp = pd.DatetimeIndex(result)
        exp = exp.tz_localize('UTC').tz_convert(tz=s.dt.tz)
        tm.assert_index_equal(dr, exp)

        # indexing
        result = s.iloc[0]
        assert result == Timestamp('2013-01-01 00:00:00-0500',
                                   tz='US/Eastern', freq='D')
        result = s[0]
        assert result == Timestamp('2013-01-01 00:00:00-0500',
                                   tz='US/Eastern', freq='D')

        result = s[Series([True, True, False], index=s.index)]
        assert_series_equal(result, s[0:2])

        result = s.iloc[0:1]
        assert_series_equal(result, Series(dr[0:1]))

        # concat
        result = pd.concat([s.iloc[0:1], s.iloc[1:]])
        assert_series_equal(result, s)

        # short str
        assert 'datetime64[ns, US/Eastern]' in str(s)

        # formatting with NaT
        result = s.shift()
        assert 'datetime64[ns, US/Eastern]' in str(result)
        assert 'NaT' in str(result)

        # long str
        t = Series(date_range('20130101', periods=1000, tz='US/Eastern'))
        assert 'datetime64[ns, US/Eastern]' in str(t)

        # rebuilding the index from the Series must match the source range
        result = pd.DatetimeIndex(s, freq='infer')
        tm.assert_index_equal(result, dr)

        # inference
        s = Series([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),
                    pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')])
        assert s.dtype == 'datetime64[ns, US/Pacific]'
        assert lib.infer_dtype(s, skipna=False) == 'datetime64'

        # mixed timezones are expected to stay object dtype
        s = Series([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),
                    pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Eastern')])
        assert s.dtype == 'object'
        assert lib.infer_dtype(s, skipna=False) == 'datetime'

        # with all NaT
        s = Series(pd.NaT, index=[0, 1], dtype='datetime64[ns, US/Eastern]')
        expected = Series(pd.DatetimeIndex(['NaT', 'NaT'], tz='US/Eastern'))
        assert_series_equal(s, expected)

    @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64])
    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
    def test_construction_to_datetimelike_unit(self, arr_dtype, dtype, unit):
        """``Series.astype`` to a unit dtype matches numpy's ``astype``."""
        # tests all units
        # gh-19223
        target_dtype = "{}[{}]".format(dtype, unit)
        values = np.array([1, 2, 3], dtype=arr_dtype)
        via_series = Series(values).astype(target_dtype)
        via_numpy = Series(values.astype(target_dtype))

        tm.assert_series_equal(via_series, via_numpy)

    @pytest.mark.parametrize('arg',
                             ['2013-01-01 00:00:00', pd.NaT, np.nan, None])
    def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg):
        """A tz dtype on naive/null input matches ``tz_localize('CET')``."""
        # GH 17415: With naive string
        constructed = Series([arg], dtype='datetime64[ns, CET]')
        naive = Series(pd.Timestamp(arg))
        localized = naive.dt.tz_localize('CET')
        assert_series_equal(constructed, localized)

    def test_construction_interval(self):
        """Interval data round-trips through a Series back to the index."""
        # construction from interval & array of intervals
        intervals = IntervalIndex.from_breaks(np.arange(3), closed='right')
        from_index = Series(intervals)
        # smoke test: both text representations must not raise
        repr(from_index)
        str(from_index)
        tm.assert_index_equal(Index(from_index.values), intervals)

        from_values = Series(intervals.values)
        tm.assert_index_equal(Index(from_values.values), intervals)

    def test_construction_consistency(self):
        """Re-constructing with the tz dtype reproduces the same Series."""
        # make sure that we are not re-localizing upon construction
        # GH 14928
        eastern = Series(pd.date_range('20130101', periods=3, tz='US/Eastern'))

        same_series = Series(eastern, dtype=eastern.dtype)
        tm.assert_series_equal(same_series, eastern)

        from_utc = Series(eastern.dt.tz_convert('UTC'), dtype=eastern.dtype)
        tm.assert_series_equal(from_utc, eastern)

        from_values = Series(eastern.values, dtype=eastern.dtype)
        tm.assert_series_equal(from_values, eastern)

    def test_constructor_infer_period(self):
        """Period data with a missing value infers the ``Period[D]`` dtype.

        The inference is checked for a plain list and for the same values
        held in an object-dtype ndarray; both must equal the Series built
        from ``period_array`` of the list.
        """
        data = [pd.Period('2000', 'D'), pd.Period('2001', 'D'), None]
        result = pd.Series(data)
        expected = pd.Series(period_array(data))
        tm.assert_series_equal(result, expected)
        assert result.dtype == 'Period[D]'

        # Rebuild the Series from the ndarray so the object-array input is
        # actually exercised instead of re-asserting the list-based result.
        data = np.asarray(data, dtype=object)
        result = pd.Series(data)
        tm.assert_series_equal(result, expected)
        assert result.dtype == 'Period[D]'

    def test_constructor_period_incompatible_frequency(self):
        """Periods with different frequencies are kept as object dtype."""
        periods = [pd.Period('2000', 'D'), pd.Period('2001', 'A')]
        ser = pd.Series(periods)
        assert ser.dtype == object
        round_tripped = ser.tolist()
        assert round_tripped == periods

    def test_constructor_periodindex(self):
        """A PeriodIndex placed in a Series yields ``Period[D]`` (GH7932).

        The result must also equal a Series built from the object-cast index.
        """
        daily = period_range('20130101', periods=5, freq='D')
        from_index = Series(daily)
        assert from_index.dtype == 'Period[D]'

        from_objects = Series(daily.astype(object))
        assert_series_equal(from_index, from_objects)

    def test_constructor_dict(self):
        """Dict data is aligned to an explicit index, filling gaps with NaN."""
        mapping = {'a': 0., 'b': 1., 'c': 2.}
        labels = ['b', 'c', 'd', 'a']
        aligned = Series(mapping, index=labels)
        # 'd' has no entry in the dict, so it becomes NaN
        assert_series_equal(aligned,
                            Series([1, 2, nan, 0], index=labels))

        # same alignment with Period keys against a PeriodIndex
        pidx = tm.makePeriodIndex(100)
        aligned = Series({pidx[0]: 0, pidx[1]: 1}, index=pidx)
        expected = Series(np.nan, pidx)
        for position, value in enumerate((0, 1)):
            expected.iloc[position] = value
        assert_series_equal(aligned, expected)

    def test_constructor_dict_order(self):
        """Dict construction keeps insertion order on Python >= 3.6.

        On older interpreters the expected result is in sorted order
        instead (GH19018).
        """
        built = Series({'b': 1, 'a': 0, 'c': 2})
        expected = (Series([1, 0, 2], index=list('bac')) if PY36
                    else Series([0, 1, 2], index=list('abc')))
        tm.assert_series_equal(built, expected)

    @pytest.mark.parametrize("value", [2, np.nan, None, float('nan')])
    def test_constructor_dict_nan_key(self, value):
        """NaN-like dict keys survive construction (GH 18480).

        Checked for flat keys and for tuple keys.
        """
        flat = {1: 'a', value: 'b', float('nan'): 'c', 4: 'd'}
        flat_expected = Series(['a', 'b', 'c', 'd'],
                               index=[1, value, np.nan, 4])
        assert_series_equal(Series(flat).sort_values(), flat_expected)

        # MultiIndex:
        nested = {(1, 1): 'a', (2, np.nan): 'b', (3, value): 'c'}
        nested_index = Index([(1, 1), (2, np.nan), (3, value)])
        nested_expected = Series(['a', 'b', 'c'], index=nested_index)
        assert_series_equal(Series(nested).sort_values(), nested_expected)

    def test_constructor_dict_datetime64_index(self):
        """Dict keys of several datetime flavours give the same Series.

        Keys built as ``np.datetime64``, ``datetime`` and ``Timestamp`` must
        all produce a Series equal to one indexed by Timestamps (GH 9456).
        """
        dates_as_str = ['1984-02-19', '1988-11-06', '1989-12-03', '1990-03-15']
        values = [42544017.198965244, 1234565, 40512335.181958228, -1]

        key_builders = (
            np.datetime64,
            lambda text: datetime.strptime(text, '%Y-%m-%d'),
            Timestamp,
        )
        payloads = [dict(zip(map(build, dates_as_str), values))
                    for build in key_builders]

        expected = Series(values, (Timestamp(x) for x in dates_as_str))

        for payload in payloads:
            assert_series_equal(Series(payload), expected)

    def test_constructor_list_of_tuples(self):
        data = [(1, 1), (2, 2), (2, 3)]
        s = Series(data)
        assert list(s) == data

    def test_constructor_tuple_of_tuples(self):
        data = ((1, 1), (2, 2), (2, 3))
        s = Series(data)
        assert tuple(s) == data

    def test_constructor_dict_of_tuples(self):
        data = {(1, 2): 3,
                (None, 5): 6}
        result = Series(data).sort_values()
        expected = Series([3, 6],
                          index=MultiIndex.from_tuples([(1, 2), (None, 5)]))
        tm.assert_series_equal(result, expected)

    def test_constructor_set(self):
        """Sets and frozensets are rejected by the Series constructor."""
        members = {1, 2, 3, 4, 5}
        for container in (members, frozenset(members)):
            with pytest.raises(TypeError):
                Series(container)

    # https://github.com/pandas-dev/pandas/issues/22698
    @pytest.mark.filterwarnings("ignore:elementwise comparison:FutureWarning")
    def test_fromDict(self):
        """Check index order and dtype inference when building from dicts."""
        ordered = Series({'a': 0, 'b': 1, 'c': 2, 'd': 3})
        assert tm.is_sorted(ordered.index)

        # mixed value types fall back to object dtype
        object_payloads = (
            {'a': 0, 'b': '1', 'c': '2', 'd': datetime.now()},
            {'a': 0, 'b': '1', 'c': '2', 'd': '3'},
        )
        for payload in object_payloads:
            assert Series(payload).dtype == np.object_

        # numeric strings are cast when a float dtype is requested
        as_float = Series({'a': '0', 'b': '1'}, dtype=float)
        assert as_float.dtype == np.float64

    def test_fromValue(self, datetime_series):
        """Broadcast a scalar over an index and check the resulting dtype.

        ``datetime_series`` is a fixture; only its index is used here, as
        the target index for each broadcast scalar.
        """
        target_index = datetime_series.index

        # np.nan / np.float64 are used instead of the np.NaN / np.float_
        # aliases, which were removed in NumPy 2.0.
        nans = Series(np.nan, index=target_index)
        assert nans.dtype == np.float64
        assert len(nans) == len(datetime_series)

        strings = Series('foo', index=target_index)
        assert strings.dtype == np.object_
        assert len(strings) == len(datetime_series)

        d = datetime.now()
        dates = Series(d, index=target_index)
        assert dates.dtype == 'M8[ns]'
        assert len(dates) == len(datetime_series)

        # GH12336
        # Test construction of categorical series from value
        categorical = Series(0, index=target_index, dtype="category")
        expected = Series(0, index=target_index).astype("category")
        assert categorical.dtype == 'category'
        assert len(categorical) == len(datetime_series)
        tm.assert_series_equal(categorical, expected)

    def test_constructor_dtype_timedelta64(self):
        """Check timedelta64[ns] inference and casting in the constructor."""
        ns_dtype = 'timedelta64[ns]'

        # basic
        inferred_inputs = [
            [timedelta(days=i) for i in range(3)],
            [timedelta(days=1)],
            [timedelta(days=1), timedelta(days=2), np.timedelta64(1, 's')],
        ]
        for values in inferred_inputs:
            assert Series(values).dtype == ns_dtype

        # mixed with NaT, with the dtype requested explicitly
        explicit_inputs = [
            [timedelta(days=1), NaT],
            [timedelta(days=1), np.nan],
            [np.timedelta64(300000000), pd.NaT],
        ]
        for values in explicit_inputs:
            assert Series(values, dtype='m8[ns]').dtype == ns_dtype

        # improved inference
        # GH5689
        assert Series([np.timedelta64(300000000), NaT]).dtype == ns_dtype

        # because iNaT is int, not coerced to timedelta
        assert Series([np.timedelta64(300000000), iNaT]).dtype == 'object'

        assert Series([np.timedelta64(300000000), np.nan]).dtype == ns_dtype
        assert Series([pd.NaT, np.timedelta64(300000000)]).dtype == ns_dtype

        td = Series([np.timedelta64(1, 's')])
        assert td.dtype == ns_dtype

        # NOTE: the frequency-conversion astype checks
        # ('m8[s]', 'm8[D]', 'm8[us]', 'm8[ms]') are disabled in this test.

        # valid astype
        td.astype('int64')

        # invalid casting
        with pytest.raises(TypeError):
            td.astype('int32')

        # this is an invalid casting
        with pytest.raises(Exception):
            Series([timedelta(days=1), 'foo'], dtype='m8[ns]')

        # leave as object here
        mixed = Series([timedelta(days=i) for i in range(3)] + ['foo'])
        assert mixed.dtype == 'object'

        # these will correctly infer a timedelta
        missing_prefixes = (
            [None, pd.NaT],
            [np.nan, pd.NaT],
            [pd.NaT, None],
            [pd.NaT, np.nan],
        )
        for prefix in missing_prefixes:
            s = Series(prefix + ['1 Day'])
            assert s.dtype == ns_dtype

    # GH 16406
    def test_constructor_mixed_tz(self):
        s = Series([Timestamp('20130101'),
                    Timestamp('20130101', tz='US/Eastern')])
        expected = Series([Timestamp('20130101'),
                           Timestamp('20130101', tz='US/Eastern')],
                          dtype='object')
        assert_series_equal(s, expected)

    def test_NaT_scalar(self):
        series = Series([0, 1000, 2000, iNaT], dtype='M8[ns]')

        val = series[3]
        assert isna(val)

        series[2] = val
        assert isna(series[2])

    def test_NaT_cast(self):
        """Casting a NaN-only Series to ``M8[ns]`` gives NaT (GH10747)."""
        casted = Series([np.nan]).astype('M8[ns]')
        assert_series_equal(casted, Series([NaT]))

    def test_constructor_name_hashable(self):
        for n in [777, 777., 'name', datetime(2001, 11, 11), (1, ), u"\u05D0"]:
            for data in [[1, 2, 3], np.ones(3), {'a': 0, 'b': 1}]:
                s = Series(data, name=n)
                assert s.name == n

    def test_constructor_name_unhashable(self):
        """Unhashable names (list, ndarray, dict) raise ``TypeError``."""
        for name in [['name_list'], np.ones(2), {1: 2}]:
            # fresh inputs are built for every candidate name
            for data in [['name_list'], np.ones(2), {1: 2}]:
                with pytest.raises(TypeError):
                    Series(data, name=name)

    def test_auto_conversion(self):
        """A list of timestamps from ``date_range`` is stored as ``M8[ns]``."""
        stamps = list(date_range('1/1/2000', periods=10))
        assert Series(stamps).dtype == 'M8[ns]'

    def test_convert_non_ns(self):
        """Series built from non-ns numpy arrays equal the pandas ranges."""
        # convert from a numpy array of non-ns timedelta64
        seconds = np.array([1, 2, 3], dtype='timedelta64[s]')
        from_seconds = Series(seconds)
        second_range = pd.timedelta_range('00:00:01', periods=3, freq='s')
        assert_series_equal(from_seconds, Series(second_range))

        # convert from a numpy array of non-ns datetime64
        # note that creating a numpy datetime64 is in LOCAL time!!!!
        # seems to work for M8[D], but not for M8[s]
        days = np.array(['2013-01-01', '2013-01-02', '2013-01-03'],
                        dtype='datetime64[D]')
        from_days = Series(days)
        day_range = date_range('20130101', periods=3, freq='D')
        assert_series_equal(from_days, Series(day_range))

        # NOTE: the analogous datetime64[s] check (three consecutive seconds
        # starting at 2013-01-01 00:00:01) is disabled in this test.

    @pytest.mark.parametrize(
        "index",
        [
            date_range('1/1/2000', periods=10),
            timedelta_range('1 day', periods=10),
            period_range('2000-Q1', periods=10, freq='Q')],
        ids=lambda x: type(x).__name__)
    def test_constructor_cant_cast_datetimelike(self, index):
        """Datetime-like indexes cannot be cast to float but can to int64."""
        # floats are not ok
        # We don't care whether the error message says e.g. PeriodIndex or
        # PeriodArray, so match only on the class name without "Index".
        # str.rstrip("Index") strips a *set of characters*, not a suffix
        # (it turns "PeriodIndex" into "Perio"), so slice the suffix off.
        kind = type(index).__name__
        if kind.endswith("Index"):
            kind = kind[:-len("Index")]
        msg = "Cannot cast {}.*? to ".format(kind)
        with pytest.raises(TypeError, match=msg):
            Series(index, dtype=float)

        # ints are ok
        # we test with np.int64 to get similar results on
        # windows / 32-bit platforms
        result = Series(index, dtype=np.int64)
        expected = Series(index.astype(np.int64))
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "index",
        [
            date_range('1/1/2000', periods=10),
            timedelta_range('1 day', periods=10),
            period_range('2000-Q1', periods=10, freq='Q')],
        ids=lambda x: type(x).__name__)
    def test_constructor_cast_object(self, index):
        """Requesting object dtype matches casting after construction.

        Checked for the index itself, for an object ``pd.Index`` built from
        it, and for the result of ``index.astype(object)``.
        """
        sources = (
            index,
            pd.Index(index, dtype=object),
            index.astype(object),
        )
        for source in sources:
            built = Series(source, dtype=object)
            casted = Series(index).astype(object)
            tm.assert_series_equal(built, casted)

    @pytest.mark.parametrize("dtype", [
        np.datetime64,
        np.timedelta64,
    ])
    def test_constructor_generic_timestamp_no_frequency(self, dtype):
        """A unit-less datetime64/timedelta64 dtype raises ``ValueError``.

        See gh-15524, gh-15987.
        """
        expected_message = "dtype has no unit. Please pass in"
        with pytest.raises(ValueError, match=expected_message):
            Series([], dtype=dtype)

    @pytest.mark.parametrize("dtype,msg", [
        ("m8[ps]", "cannot convert timedeltalike"),
        ("M8[ps]", "cannot convert datetimelike"),
    ])
    def test_constructor_generic_timestamp_bad_frequency(self, dtype, msg):
        """A picosecond-unit dtype raises ``TypeError`` with ``msg``.

        See gh-15524, gh-15987.
        """
        with pytest.raises(TypeError, match=msg):
            Series([], dtype=dtype)

    @pytest.mark.parametrize('dtype', [None, 'uint8', 'category'])
    def test_constructor_range_dtype(self, dtype):
        """A ``range`` input honours the requested dtype (GH 16804).

        With no dtype given the expected result is ``int64``.
        """
        expected_dtype = dtype if dtype else 'int64'
        from_list = Series([0, 1, 2, 3, 4], dtype=expected_dtype)
        from_range = Series(range(5), dtype=dtype)
        tm.assert_series_equal(from_range, from_list)

    def test_constructor_tz_mixed_data(self):
        # GH 13051
        dt_list = [Timestamp('2016-05-01 02:03:37'),
                   Timestamp('2016-04-30 19:03:37-0700', tz='US/Pacific')]
        result = Series(dt_list)
        expected = Series(dt_list, dtype=object)
        tm.assert_series_equal(result, expected)
Beispiel #29
0
class TestCategoricalOps:
    """Comparison, arithmetic, reduction and ``in`` checks on Categorical."""

    def test_compare_frame(self):
        """Comparing a Categorical with a transposed frame gives a frame."""
        # GH#24282 check that Categorical.__cmp__(DataFrame) defers to frame
        cat = Categorical(["a", "b", 2, "a"])
        frame = DataFrame(cat)

        equal_mask = cat == frame.T
        tm.assert_frame_equal(equal_mask,
                              DataFrame([[True, True, True, True]]))

        unequal_mask = cat[::-1] != frame.T
        tm.assert_frame_equal(unequal_mask,
                              DataFrame([[False, True, True, False]]))

    def test_compare_frame_raises(self, all_compare_operators):
        """Comparing with the un-transposed frame raises ``ValueError``."""
        # alignment raises unless we transpose
        compare = getattr(operator, all_compare_operators)
        cat = Categorical(["a", "b", 2, "a"])
        frame = DataFrame(cat)
        with pytest.raises(
                ValueError,
                match="Unable to coerce to Series, length must be 1: given 4"):
            compare(cat, frame)

    def test_datetime_categorical_comparison(self):
        """Ordered datetime categoricals compare against their own scalars."""
        dt_cat = Categorical(date_range("2014-01-01", periods=3), ordered=True)
        expected = np.array([False, True, True])
        tm.assert_numpy_array_equal(dt_cat > dt_cat[0], expected)
        tm.assert_numpy_array_equal(dt_cat[0] < dt_cat, expected)

    def test_reflected_comparison_with_scalars(self):
        """Scalar-on-the-left comparisons mirror scalar-on-the-right ones."""
        # GH8658
        cat = Categorical([1, 2, 3], ordered=True)
        expected = np.array([False, True, True])
        tm.assert_numpy_array_equal(cat > cat[0], expected)
        tm.assert_numpy_array_equal(cat[0] < cat, expected)

    def test_comparison_with_unknown_scalars(self):
        """Scalars outside the categories raise for ``<``/``>`` only."""
        # https://github.com/pandas-dev/pandas/issues/9836#issuecomment-92123057
        # and following comparisons with scalars not in categories should raise
        # for unequal comps, but not for equal/not equal
        cat = Categorical([1, 2, 3], ordered=True)

        msg = "Invalid comparison between dtype=category and int"
        ordering_checks = (
            lambda: cat < 4,
            lambda: cat > 4,
            lambda: 4 < cat,
            lambda: 4 > cat,
        )
        for check in ordering_checks:
            with pytest.raises(TypeError, match=msg):
                check()

        tm.assert_numpy_array_equal(cat == 4, np.array([False, False, False]))
        tm.assert_numpy_array_equal(cat != 4, np.array([True, True, True]))

    def test_comparison_of_ordered_categorical_with_nan_to_scalar(
            self, compare_operators_no_eq_ne):
        """Ordered comparison with a scalar matches the ndarray comparison."""
        # https://github.com/pandas-dev/pandas/issues/26504
        # BUG: fix ordered categorical comparison with missing values (#26504 )
        # and following comparisons with scalars in categories with missing
        # values should be evaluated as False
        op_name = compare_operators_no_eq_ne
        cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True)
        scalar = 2
        with warnings.catch_warnings():
            # comparing the NaN entry in the plain ndarray may warn
            warnings.simplefilter("ignore", RuntimeWarning)
            expected = getattr(np.array(cat), op_name)(scalar)
        actual = getattr(cat, op_name)(scalar)
        tm.assert_numpy_array_equal(actual, expected)

    def test_comparison_of_ordered_categorical_with_nan_to_listlike(
            self, compare_operators_no_eq_ne):
        """Ordered comparison with a Categorical matches the ndarray result."""
        # https://github.com/pandas-dev/pandas/issues/26504
        # and following comparisons of missing values in ordered Categorical
        # with listlike should be evaluated as False
        op_name = compare_operators_no_eq_ne
        cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True)
        other = Categorical([2, 2, 2, 2], categories=[1, 2, 3], ordered=True)
        with warnings.catch_warnings():
            # comparing the NaN entry in the plain ndarray may warn
            warnings.simplefilter("ignore", RuntimeWarning)
            expected = getattr(np.array(cat), op_name)(2)
        actual = getattr(cat, op_name)(other)
        tm.assert_numpy_array_equal(actual, expected)

    @pytest.mark.parametrize(
        "data,reverse,base",
        [(list("abc"), list("cba"), list("bbb")),
         ([1, 2, 3], [3, 2, 1], [2, 2, 2])],
    )
    def test_comparisons(self, data, reverse, base):
        """Ordered comparisons honour category order and reject mismatches."""
        rev = Series(Categorical(data, categories=reverse, ordered=True))
        rev_base = Series(
            Categorical(base, categories=reverse, ordered=True))
        fwd = Series(Categorical(data, ordered=True))
        fwd_base = Series(
            Categorical(base, categories=fwd.cat.categories, ordered=True))
        plain_series = Series(base)
        plain_array = np.array(base)

        # comparisons need to take categories ordering into account
        tm.assert_series_equal(rev > rev_base,
                               Series([True, False, False]))
        tm.assert_series_equal(rev < rev_base,
                               Series([False, False, True]))
        tm.assert_series_equal(fwd > fwd_base,
                               Series([False, False, True]))

        # scalar comparisons agree between the Series and its values
        scalar = base[1]
        above = fwd > scalar
        tm.assert_series_equal(above, Series([False, False, True]))
        tm.assert_numpy_array_equal(above.values, fwd.values > scalar)

        above_rev = rev > scalar
        tm.assert_series_equal(above_rev, Series([True, False, False]))
        tm.assert_numpy_array_equal(above_rev.values, rev.values > scalar)

        # Only categories with same categories can be compared
        msg = "Categoricals can only be compared if 'categories' are the same"
        with pytest.raises(TypeError, match=msg):
            fwd > rev

        # categorical cannot be compared to Series or numpy array, and also
        # not the other way around
        msg = ("Cannot compare a Categorical for op __gt__ with type "
               r"<class 'numpy\.ndarray'>")
        for other in (plain_series, plain_array):
            for categorical in (fwd, rev):
                with pytest.raises(TypeError, match=msg):
                    categorical > other

        for other in (plain_series, plain_array):
            for categorical in (fwd, rev):
                with pytest.raises(TypeError, match=msg):
                    other < categorical

    @pytest.mark.parametrize(
        "ctor",
        [
            lambda *args, **kwargs: Categorical(*args, **kwargs),
            lambda *args, **kwargs: Series(Categorical(*args, **kwargs)),
        ],
    )
    def test_unordered_different_order_equal(self, ctor):
        """Unordered categoricals compare by value, not category order."""
        # https://github.com/pandas-dev/pandas/issues/16014
        left = ctor(["a", "b"], categories=["a", "b"], ordered=False)
        right = ctor(["a", "b"], categories=["b", "a"], ordered=False)
        assert (left == right).all()

        left = ctor(["a", "b"], categories=["a", "b"], ordered=False)
        right = ctor(["b", "a"], categories=["b", "a"], ordered=False)
        assert (left != right).all()

        left = ctor(["a", "a"], categories=["a", "b"], ordered=False)
        right = ctor(["b", "b"], categories=["b", "a"], ordered=False)
        assert (left != right).all()

        left = ctor(["a", "a"], categories=["a", "b"], ordered=False)
        right = ctor(["a", "b"], categories=["b", "a"], ordered=False)
        mask = left == right
        tm.assert_numpy_array_equal(np.array(mask), np.array([True, False]))

    def test_unordered_different_categories_raises(self):
        """Unordered categoricals with different categories cannot compare."""
        left = Categorical(["a", "b"], categories=["a", "b"], ordered=False)
        right = Categorical(["a", "c"], categories=["c", "a"], ordered=False)

        with pytest.raises(TypeError,
                           match=("Categoricals can only be compared")):
            left == right

    def test_compare_different_lengths(self):
        """Empty categoricals with different category sets cannot compare."""
        two_categories = Categorical([], categories=["a", "b"])
        one_category = Categorical([], categories=["a"])

        msg = "Categoricals can only be compared if 'categories' are the same."
        with pytest.raises(TypeError, match=msg):
            two_categories == one_category

    def test_compare_unordered_different_order(self):
        """``equals`` is False when the held values differ."""
        # https://github.com/pandas-dev/pandas/issues/16603#issuecomment-
        # 349290078
        holds_a = pd.Categorical(["a"], categories=["a", "b"])
        holds_b = pd.Categorical(["b"], categories=["b", "a"])
        assert not holds_a.equals(holds_b)

    def test_numeric_like_ops(self):
        """Arithmetic, the listed reductions and ufuncs raise ``TypeError``."""
        frame = DataFrame({"value": np.random.randint(0, 10000, 100)})
        labels = [f"{i} - {i + 499}" for i in range(0, 10000, 500)]
        cat_labels = Categorical(labels, labels)

        frame = frame.sort_values(by=["value"], ascending=True)
        frame["value_group"] = pd.cut(frame.value,
                                      range(0, 10500, 500),
                                      right=False,
                                      labels=cat_labels)

        # (dunder name, regex-escaped operator symbol)
        arithmetic = [
            ("__add__", r"\+"),
            ("__sub__", "-"),
            ("__mul__", r"\*"),
            ("__truediv__", "/"),
        ]

        # numeric ops should not succeed
        for op, str_rep in arithmetic:
            msg = f"Series cannot perform the operation {str_rep}|unsupported operand"
            with pytest.raises(TypeError, match=msg):
                getattr(frame, op)(frame)

        # reduction ops should not succeed (unless specifically defined, e.g.
        # min/max)
        grouped = frame["value_group"]
        for op in ["kurt", "skew", "var", "std", "mean", "sum", "median"]:
            msg = f"Categorical cannot perform the operation {op}"
            with pytest.raises(TypeError, match=msg):
                getattr(grouped, op)(numeric_only=False)

        # mad technically works because it takes always the numeric data

        # numpy ops
        numbers = Series(Categorical([1, 2, 3, 4]))
        with pytest.raises(
                TypeError,
                match="Categorical cannot perform the operation sum"):
            np.sum(numbers)

        # numeric ops on a Series
        for op, str_rep in arithmetic:
            msg = f"Series cannot perform the operation {str_rep}|unsupported operand"
            with pytest.raises(TypeError, match=msg):
                getattr(numbers, op)(2)

        # invalid ufunc
        msg = "Object with dtype category cannot perform the numpy op log"
        with pytest.raises(TypeError, match=msg):
            np.log(numbers)

    def test_contains(self):
        """``in`` checks values (not codes) and NaN only when present."""
        # GH21508
        letters = list("aabbca")
        cat = pd.Categorical(letters, categories=list("cab"))

        assert "b" in cat
        assert "z" not in cat
        assert np.nan not in cat
        with pytest.raises(TypeError, match="unhashable type: 'list'"):
            assert [1] in cat

        # assert codes NOT in index
        assert 0 not in cat
        assert 1 not in cat

        with_missing = pd.Categorical(letters + [np.nan],
                                      categories=list("cab"))
        assert np.nan in with_missing

    @pytest.mark.parametrize(
        "item, expected",
        [
            (pd.Interval(0, 1), True),
            (1.5, True),
            (pd.Interval(0.5, 1.5), False),
            ("a", False),
            (pd.Timestamp(1), False),
            (pd.Timedelta(1), False),
        ],
        ids=str,
    )
    def test_contains_interval(self, item, expected):
        """``in`` on an interval-backed Categorical returns a real bool."""
        # GH 23705
        interval_cat = Categorical(pd.IntervalIndex.from_breaks(range(3)))
        found = item in interval_cat
        assert found is expected

    def test_contains_list(self):
        """``in`` with a list operand raises instead of returning False."""
        # GH#21729
        cat = Categorical([1, 2, 3])

        assert "a" not in cat

        for unhashable in (["a"], ["a", "b"]):
            with pytest.raises(TypeError, match="unhashable type"):
                unhashable in cat
Beispiel #30
0
    def transform(self, df):
        """Encode the retained columns of ``df`` as integer codes.

        Columns other than ``self.column_to_perturb`` and
        ``self.column_to_agg`` are dropped from ``df`` in place. Every
        remaining column is then replaced by integer codes:

        * a column listed in ``self.ranges`` is bucketed into left-closed
          bins of width ``gran`` starting at ``min`` (or, for the
          INCTOT/FTOTINC columns of ipums data under the listed perturb
          types, filled with random bin indices);
        * any other column is stringified, stripped and encoded by its
          sorted category position.

        Side effects: ``df`` is modified in place, and ``self.drop_column``,
        ``self.columns``, ``self.kv_map`` (code -> value) and
        ``self.vk_map`` (value -> code) are updated.

        Returns the encoded frame as a 2-D integer ndarray.
        """
        user_type = self.args.user_type
        if user_type in ('adult', 'bank'):
            if user_type == 'adult':
                # some labels carry a trailing period; normalise them first
                df.replace(
                    {'income': {
                        ' <=50K.': ' <=50K',
                        ' >50K.': ' >50K'
                    }},
                    inplace=True)

            df.replace({'income': {' <=50K': '0', ' >50K': '1'}}, inplace=True)

        keep = set(self.column_to_perturb) | {self.column_to_agg}
        self.drop_column = list(set(self.columns) - keep)
        df.drop(self.drop_column, axis=1, inplace=True)
        self.columns = list(df.columns)

        # loop-invariant part of the "randomise income" condition
        randomize_income = (
            self.args.perturb_type in (
                'ord2cat2', 'ord2cat2q11', 'ord2cat2q20', 'ord2cat2q22',
                'ord2cat2q10', 'ord2cat2q12', 'ord4cat4q11', 'ord4cat4q12',
                'ord4cat4q10', 'ord4cat4q02', 'ord4cat4q01', 'ord4cat4q20',
                'ord4cat4q21')
            and user_type[0:5] == 'ipums')

        for col in self.columns:
            if col in self.ranges:
                spec = self.ranges[col]
                bins = np.round(
                    np.arange(spec['min'], spec['max'], spec['gran']), 2)
                # pd.cut is too slow when there are many bins, so the bin
                # index is computed arithmetically instead.
                start = bins[0]
                step = bins[1] - bins[0]
                if randomize_income and col in ('INCTOT', 'FTOTINC'):
                    df[col] = np.random.randint(0, len(bins), df.shape[0])
                else:
                    df[col] = ((df[col].values - start) / step).astype(
                        np.int16)

                intervals = [
                    pd.Interval(x,
                                np.round(x + spec['gran'], 2),
                                closed='left') for x in bins
                ]
                self.kv_map[col] = dict(enumerate(intervals))
            else:
                cleaned = df[col].apply(str).str.strip()
                df[col] = pd.Categorical(cleaned)
                self.kv_map[col] = dict(enumerate(df[col].cat.categories))
                # The category codes are exactly the value -> code mapping
                # stored in vk_map (no missing values after str()), so use
                # them directly instead of a dict-based replace.
                df[col] = df[col].cat.codes

            self.vk_map[col] = {v: k for k, v in self.kv_map[col].items()}
            logging.info('finish %s', col)

        # np.int was only an alias of the builtin int and was removed in
        # NumPy 1.24; the builtin gives the same dtype.
        return df.values.astype(int)