Example #1
    def test_join_non_int_index(self):
        other = Index([3, 6, 7, 8, 10], dtype=object)

        outer = self.index.join(other, how='outer')
        outer2 = other.join(self.index, how='outer')
        expected = Index([0, 2, 3, 4, 6, 7, 8, 10, 12, 14, 16, 18])
        tm.assert_index_equal(outer, outer2)
        tm.assert_index_equal(outer, expected)

        inner = self.index.join(other, how='inner')
        inner2 = other.join(self.index, how='inner')
        expected = Index([6, 8, 10])
        tm.assert_index_equal(inner, inner2)
        tm.assert_index_equal(inner, expected)

        left = self.index.join(other, how='left')
        tm.assert_index_equal(left, self.index.astype(object))

        left2 = other.join(self.index, how='left')
        tm.assert_index_equal(left2, other)

        right = self.index.join(other, how='right')
        tm.assert_index_equal(right, other)

        right2 = other.join(self.index, how='right')
        tm.assert_index_equal(right2, self.index.astype(object))
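The fixture `self.index` is defined elsewhere in the test class; the expected outputs above imply it is the even integers 0 through 18. A minimal standalone sketch of the same join semantics, under that assumption:

import pandas as pd

index = pd.Index(range(0, 20, 2))                # assumed fixture
other = pd.Index([3, 6, 7, 8, 10], dtype=object)

print(list(index.join(other, how='outer')))      # sorted union of both label sets
print(list(index.join(other, how='inner')))      # [6, 8, 10]
print(list(index.join(other, how='left')))       # keeps the calling index's labels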
Example #2
    def test_join_left(self):
        other = Int64Index([7, 12, 25, 1, 2, 5])
        other_mono = Int64Index([1, 2, 5, 7, 12, 25])

        # not monotonic
        res, lidx, ridx = self.index.join(other, how="left", return_indexers=True)
        eres = self.index
        eridx = np.array([-1, 4, -1, -1, -1, -1, 1, -1, -1, -1], dtype=np.intp)

        tm.assertIsInstance(res, Int64Index)
        self.assert_index_equal(res, eres)
        self.assertIsNone(lidx)
        tm.assert_numpy_array_equal(ridx, eridx)

        # monotonic
        res, lidx, ridx = self.index.join(other_mono, how="left", return_indexers=True)
        eridx = np.array([-1, 1, -1, -1, -1, -1, 4, -1, -1, -1], dtype=np.intp)
        tm.assertIsInstance(res, Int64Index)
        self.assert_index_equal(res, eres)
        self.assertIsNone(lidx)
        tm.assert_numpy_array_equal(ridx, eridx)

        # non-unique
        idx = Index([1, 1, 2, 5])
        idx2 = Index([1, 2, 5, 7, 9])
        res, lidx, ridx = idx2.join(idx, how="left", return_indexers=True)
        eres = Index([1, 1, 2, 5, 7, 9])  # 1 appears twice in idx, so it appears twice in the result
        eridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp)
        elidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp)
        self.assert_index_equal(res, eres)
        tm.assert_numpy_array_equal(lidx, elidx)
        tm.assert_numpy_array_equal(ridx, eridx)
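For readers unfamiliar with `return_indexers`: the extra return values are positional maps from the joined result back into each input, with -1 marking labels absent from that side and `None` meaning no reordering was needed. A small sketch:

import pandas as pd

left = pd.Index([0, 2, 4])
right = pd.Index([2, 9])
res, lidx, ridx = left.join(right, how='left', return_indexers=True)
# res reuses `left` unchanged, so lidx is None; ridx maps each label of
# res to its position in `right` (-1 where the label is missing there).
print(list(res), lidx, ridx.tolist())            # [0, 2, 4] None [-1, 0, -1]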
Example #3
    def test_join_non_int_index(self):
        other = Index([3, 6, 7, 8, 10], dtype=object)

        outer = self.index.join(other, how='outer')
        outer2 = other.join(self.index, how='outer')
        expected = Index([0, 2, 3, 4, 6, 7, 8, 10, 12, 14,
                          16, 18], dtype=object)
        self.assertTrue(outer.equals(outer2))
        self.assertTrue(outer.equals(expected))

        inner = self.index.join(other, how='inner')
        inner2 = other.join(self.index, how='inner')
        expected = Index([6, 8, 10], dtype=object)
        self.assertTrue(inner.equals(inner2))
        self.assertTrue(inner.equals(expected))

        left = self.index.join(other, how='left')
        self.assertTrue(left.equals(self.index))

        left2 = other.join(self.index, how='left')
        self.assertTrue(left2.equals(other))

        right = self.index.join(other, how='right')
        self.assertTrue(right.equals(other))

        right2 = other.join(self.index, how='right')
        self.assertTrue(right2.equals(self.index))
Example #4
    def test_join_non_int_index(self):
        other = Index(2**63 + np.array(
            [1, 5, 7, 10, 20], dtype='uint64'), dtype=object)

        outer = self.index.join(other, how='outer')
        outer2 = other.join(self.index, how='outer')
        expected = Index(2**63 + np.array(
            [0, 1, 5, 7, 10, 15, 20, 25], dtype='uint64'))
        tm.assert_index_equal(outer, outer2)
        tm.assert_index_equal(outer, expected)

        inner = self.index.join(other, how='inner')
        inner2 = other.join(self.index, how='inner')
        expected = Index(2**63 + np.array([10, 20], dtype='uint64'))
        tm.assert_index_equal(inner, inner2)
        tm.assert_index_equal(inner, expected)

        left = self.index.join(other, how='left')
        tm.assert_index_equal(left, self.index.astype(object))

        left2 = other.join(self.index, how='left')
        tm.assert_index_equal(left2, other)

        right = self.index.join(other, how='right')
        tm.assert_index_equal(right, other)

        right2 = other.join(self.index, how='right')
        tm.assert_index_equal(right2, self.index.astype(object))
Example #5
def process_pandas(data, endog_idx=0, exog_idx=None, index_idx=None):
    names = data.columns

    if isinstance(endog_idx, (int, long)):
        endog_name = names[endog_idx]
        endog = data[endog_name].copy()
        if exog_idx is None:
            exog = data.drop([endog_name], axis=1)
        else:
            exog = data[names[exog_idx]].copy()
    else:
        endog = data.loc[:, endog_idx].copy()
        endog_name = list(endog.columns)
        if exog_idx is None:
            exog = data.drop(endog_name, axis=1)
        else:
            # the same lookup works for a scalar or a list-like exog_idx
            exog = data[names[exog_idx]].copy()

    if index_idx is not None:  # NOTE: will have to be improved for dates
        index = Index(data.iloc[:, index_idx])
        endog.index = index
        exog.index = index.copy()
        data = data.set_index(names[index_idx])

    exog_name = list(exog.columns)
    dataset = Dataset(data=data, names=list(names), endog=endog,
                      exog=exog, endog_name=endog_name, exog_name=exog_name)
    return dataset
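`Dataset` here is an internal container class (statsmodels-style); the core of the function is just the endog/exog split. A hedged sketch of the integer-index branch using plain pandas:

import pandas as pd

data = pd.DataFrame({'y': [1.0, 2.0], 'x1': [3.0, 4.0], 'x2': [5.0, 6.0]})
endog_name = data.columns[0]                 # the endog_idx == 0 case
endog = data[endog_name].copy()              # response column
exog = data.drop([endog_name], axis=1)       # remaining columns as regressors
print(endog.name, list(exog.columns))        # y ['x1', 'x2']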
Example #6
    def test_datetime_name_accessors(self, time_locale):
        # Test Monday -> Sunday and January -> December, in that sequence
        if time_locale is None:
            # If the time_locale is None, day_name and month_name should
            # return the English attributes
            expected_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                             'Friday', 'Saturday', 'Sunday']
            expected_months = ['January', 'February', 'March', 'April', 'May',
                               'June', 'July', 'August', 'September',
                               'October', 'November', 'December']
        else:
            with tm.set_locale(time_locale, locale.LC_TIME):
                expected_days = calendar.day_name[:]
                expected_months = calendar.month_name[1:]

        # GH#11128
        dti = pd.date_range(freq='D', start=datetime(1998, 1, 1),
                            periods=365)
        english_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                        'Friday', 'Saturday', 'Sunday']
        for day, name, eng_name in zip(range(4, 11),
                                       expected_days,
                                       english_days):
            name = name.capitalize()
            assert dti.weekday_name[day] == eng_name
            assert dti.day_name(locale=time_locale)[day] == name
            ts = Timestamp(datetime(2016, 4, day))
            with tm.assert_produces_warning(FutureWarning,
                                            check_stacklevel=False):
                assert ts.weekday_name == eng_name
            assert ts.day_name(locale=time_locale) == name
        dti = dti.append(DatetimeIndex([pd.NaT]))
        assert np.isnan(dti.day_name(locale=time_locale)[-1])
        ts = Timestamp(pd.NaT)
        assert np.isnan(ts.day_name(locale=time_locale))

        # GH#12805
        dti = pd.date_range(freq='M', start='2012', end='2013')
        result = dti.month_name(locale=time_locale)
        expected = Index([month.capitalize() for month in expected_months])

        # work around different normalization schemes
        # https://github.com/pandas-dev/pandas/issues/22342
        if not compat.PY2:
            result = result.str.normalize("NFD")
            expected = expected.str.normalize("NFD")

        tm.assert_index_equal(result, expected)

        for date, expected in zip(dti, expected_months):
            result = date.month_name(locale=time_locale)
            expected = expected.capitalize()

            if not compat.PY2:
                result = unicodedata.normalize("NFD", result)
                expected = unicodedata.normalize("NFD", expected)

            assert result == expected
        dti = dti.append(DatetimeIndex([pd.NaT]))
        assert np.isnan(dti.month_name(locale=time_locale)[-1])
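The accessors under test are easy to exercise directly; with no locale argument they return English names (a quick sketch):

import pandas as pd

dti = pd.date_range('2018-01-01', periods=3, freq='D')  # 2018-01-01 is a Monday
print(dti.day_name().tolist())    # ['Monday', 'Tuesday', 'Wednesday']
print(dti.month_name()[0])        # 'January'
print(pd.Timestamp('2018-01-01').day_name())             # 'Monday'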
Example #7
    def test_intersection(self):
        # intersect with Int64Index
        other = Index(np.arange(1, 6))
        result = self.index.intersection(other)
        expected = Index(np.sort(np.intersect1d(self.index.values,
                                                other.values)))
        self.assert_index_equal(result, expected)

        result = other.intersection(self.index)
        expected = Index(np.sort(np.asarray(np.intersect1d(self.index.values,
                                                           other.values))))
        self.assert_index_equal(result, expected)

        # intersect with increasing RangeIndex
        other = RangeIndex(1, 6)
        result = self.index.intersection(other)
        expected = Index(np.sort(np.intersect1d(self.index.values,
                                                other.values)))
        self.assert_index_equal(result, expected)

        # intersect with decreasing RangeIndex
        other = RangeIndex(5, 0, -1)
        result = self.index.intersection(other)
        expected = Index(np.sort(np.intersect1d(self.index.values,
                                                other.values)))
        self.assert_index_equal(result, expected)
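The same check, written standalone under the assumption that the fixture index is the even integers 0..18:

import numpy as np
import pandas as pd

index = pd.Index(np.arange(0, 20, 2))        # assumed fixture
other = pd.RangeIndex(1, 6)                  # 1, 2, 3, 4, 5
result = index.intersection(other)
expected = pd.Index(np.sort(np.intersect1d(index.values, other.values)))
print(result.equals(expected), result.tolist())   # True [2, 4]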
Example #8
    def test_join_right(self):
        other = Int64Index([7, 12, 25, 1, 2, 5])
        other_mono = Int64Index([1, 2, 5, 7, 12, 25])

        # not monotonic
        res, lidx, ridx = self.index.join(other, how='right',
                                          return_indexers=True)
        eres = other
        elidx = np.array([-1, 6, -1, -1, 1, -1], dtype=np.intp)

        assert isinstance(res, Int64Index)
        tm.assert_index_equal(res, eres)
        tm.assert_numpy_array_equal(lidx, elidx)
        assert ridx is None

        # monotonic
        res, lidx, ridx = self.index.join(other_mono, how='right',
                                          return_indexers=True)
        eres = other_mono
        elidx = np.array([-1, 1, -1, -1, 6, -1], dtype=np.intp)
        assert isinstance(res, Int64Index)
        tm.assert_index_equal(res, eres)
        tm.assert_numpy_array_equal(lidx, elidx)
        assert ridx is None

        # non-unique
        idx = Index([1, 1, 2, 5])
        idx2 = Index([1, 2, 5, 7, 9])
        res, lidx, ridx = idx.join(idx2, how='right', return_indexers=True)
        eres = Index([1, 1, 2, 5, 7, 9])  # 1 appears twice in idx, so it appears twice in the result
        elidx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp)
        eridx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp)
        tm.assert_index_equal(res, eres)
        tm.assert_numpy_array_equal(lidx, elidx)
        tm.assert_numpy_array_equal(ridx, eridx)
Example #9
def _maybe_cache(arg, format, cache, convert_listlike):
    """
    Create a cache of unique dates from an array of dates

    Parameters
    ----------
    arg : integer, float, string, datetime, list, tuple, 1-d array, Series
    format : string
        Strftime format to parse time
    cache : boolean
        True attempts to create a cache of converted values
    convert_listlike : function
        Conversion function to apply on dates

    Returns
    -------
    cache_array : Series
        Cache of converted, unique dates. Can be empty
    """
    from pandas import Series
    cache_array = Series()
    if cache:
        # Perform a quicker unique check
        from pandas import Index
        unique_dates = Index(arg).unique()
        if len(unique_dates) < len(arg):
            cache_dates = convert_listlike(unique_dates.to_numpy(),
                                           True, format)
            cache_array = Series(cache_dates, index=unique_dates)
    return cache_array
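The payoff of the unique check is easy to see outside the function; a sketch using the public `pd.to_datetime` in place of the internal `convert_listlike`:

import pandas as pd

arg = ['2019-01-01', '2019-01-01', '2019-01-02']
unique_dates = pd.Index(arg).unique()
if len(unique_dates) < len(arg):             # duplicates exist -> caching pays off
    cache_array = pd.Series(pd.to_datetime(unique_dates), index=unique_dates)
print(cache_array['2019-01-01'])             # Timestamp('2019-01-01 00:00:00')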
Example #10
 def test_to_tuples(self, tuples):
     # GH 18756
     idx = IntervalIndex.from_tuples(tuples)
     result = idx.to_tuples()
     expected = Index(com.asarray_tuplesafe(tuples))
     tm.assert_index_equal(result, expected)
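A round trip through `from_tuples`/`to_tuples` looks like this (a minimal sketch):

import pandas as pd

idx = pd.IntervalIndex.from_tuples([(0, 1), (1, 2)])
print(idx.to_tuples().tolist())              # [(0, 1), (1, 2)]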
Example #11
    def test_get_loc(self):
        # GH 12531
        cidx1 = CategoricalIndex(list("abcde"), categories=list("edabc"))
        idx1 = Index(list("abcde"))
        assert cidx1.get_loc("a") == idx1.get_loc("a")
        assert cidx1.get_loc("e") == idx1.get_loc("e")

        for i in [cidx1, idx1]:
            with pytest.raises(KeyError, match="'NOT-EXIST'"):
                i.get_loc("NOT-EXIST")

        # non-unique
        cidx2 = CategoricalIndex(list("aacded"), categories=list("edabc"))
        idx2 = Index(list("aacded"))

        # results in bool array
        res = cidx2.get_loc("d")
        tm.assert_numpy_array_equal(res, idx2.get_loc("d"))
        tm.assert_numpy_array_equal(
            res, np.array([False, False, False, True, False, True]))
        # unique element results in scalar
        res = cidx2.get_loc("e")
        assert res == idx2.get_loc("e")
        assert res == 4

        for i in [cidx2, idx2]:
            with pytest.raises(KeyError, match="'NOT-EXIST'"):
                i.get_loc("NOT-EXIST")

        # non-unique, sliceable
        cidx3 = CategoricalIndex(list("aabbb"), categories=list("abc"))
        idx3 = Index(list("aabbb"))

        # results in slice
        res = cidx3.get_loc("a")
        assert res == idx3.get_loc("a")
        assert res == slice(0, 2, None)

        res = cidx3.get_loc("b")
        assert res == idx3.get_loc("b")
        assert res == slice(2, 5, None)

        for i in [cidx3, idx3]:
            with pytest.raises(KeyError, match="'c'"):
                i.get_loc("c")
Example #12
    def test_datetime_name_accessors(self, time_locale):
        # Test Monday -> Sunday and January -> December, in that sequence
        if time_locale is None:
            # If the time_locale is None, day_name and month_name should
            # return the English attributes
            expected_days = [
                "Monday",
                "Tuesday",
                "Wednesday",
                "Thursday",
                "Friday",
                "Saturday",
                "Sunday",
            ]
            expected_months = [
                "January",
                "February",
                "March",
                "April",
                "May",
                "June",
                "July",
                "August",
                "September",
                "October",
                "November",
                "December",
            ]
        else:
            with tm.set_locale(time_locale, locale.LC_TIME):
                expected_days = calendar.day_name[:]
                expected_months = calendar.month_name[1:]

        # GH#11128
        dti = pd.date_range(freq="D", start=datetime(1998, 1, 1), periods=365)
        english_days = [
            "Monday",
            "Tuesday",
            "Wednesday",
            "Thursday",
            "Friday",
            "Saturday",
            "Sunday",
        ]
        for day, name, eng_name in zip(range(4, 11), expected_days,
                                       english_days):
            name = name.capitalize()
            assert dti.day_name(locale=time_locale)[day] == name
            ts = Timestamp(datetime(2016, 4, day))
            assert ts.day_name(locale=time_locale) == name
        dti = dti.append(DatetimeIndex([pd.NaT]))
        assert np.isnan(dti.day_name(locale=time_locale)[-1])
        ts = Timestamp(pd.NaT)
        assert np.isnan(ts.day_name(locale=time_locale))

        # GH#12805
        dti = pd.date_range(freq="M", start="2012", end="2013")
        result = dti.month_name(locale=time_locale)
        expected = Index([month.capitalize() for month in expected_months])

        # work around different normalization schemes
        # https://github.com/pandas-dev/pandas/issues/22342
        result = result.str.normalize("NFD")
        expected = expected.str.normalize("NFD")

        tm.assert_index_equal(result, expected)

        for date, expected in zip(dti, expected_months):
            result = date.month_name(locale=time_locale)
            expected = expected.capitalize()

            result = unicodedata.normalize("NFD", result)
            expected = unicodedata.normalize("NFD", expected)

            assert result == expected
        dti = dti.append(DatetimeIndex([pd.NaT]))
        assert np.isnan(dti.month_name(locale=time_locale)[-1])
Example #13
    def test_argminmax(self):
        obj = Index(np.arange(5, dtype="int64"))
        assert obj.argmin() == 0
        assert obj.argmax() == 4

        obj = Index([np.nan, 1, np.nan, 2])
        assert obj.argmin() == 1
        assert obj.argmax() == 3
        assert obj.argmin(skipna=False) == -1
        assert obj.argmax(skipna=False) == -1

        obj = Index([np.nan])
        assert obj.argmin() == -1
        assert obj.argmax() == -1
        assert obj.argmin(skipna=False) == -1
        assert obj.argmax(skipna=False) == -1

        obj = Index([pd.NaT, datetime(2011, 11, 1), datetime(2011, 11, 2), pd.NaT])
        assert obj.argmin() == 1
        assert obj.argmax() == 2
        assert obj.argmin(skipna=False) == -1
        assert obj.argmax(skipna=False) == -1

        obj = Index([pd.NaT])
        assert obj.argmin() == -1
        assert obj.argmax() == -1
        assert obj.argmin(skipna=False) == -1
        assert obj.argmax(skipna=False) == -1
Example #14
    def test_slicing_and_getting_ops(self):

        # systematically test the slicing operations:
        #  for all slicing ops:
        #   - returning a dataframe
        #   - returning a column
        #   - returning a row
        #   - returning a single value

        cats = Categorical(["a", "c", "b", "c", "c", "c", "c"],
                           categories=["a", "b", "c"])
        idx = Index(["h", "i", "j", "k", "l", "m", "n"])
        values = [1, 2, 3, 4, 5, 6, 7]
        df = DataFrame({"cats": cats, "values": values}, index=idx)

        # the expected values
        cats2 = Categorical(["b", "c"], categories=["a", "b", "c"])
        idx2 = Index(["j", "k"])
        values2 = [3, 4]

        # 2:4,: | "j":"k",:
        exp_df = DataFrame({"cats": cats2, "values": values2}, index=idx2)

        # :,"cats" | :,0
        exp_col = Series(cats, index=idx, name='cats')

        # "j",: | 2,:
        exp_row = Series(["b", 3],
                         index=["cats", "values"],
                         dtype="object",
                         name="j")

        # "j","cats | 2,0
        exp_val = "b"

        # iloc
        # frame
        res_df = df.iloc[2:4, :]
        tm.assert_frame_equal(res_df, exp_df)
        assert is_categorical_dtype(res_df["cats"])

        # row
        res_row = df.iloc[2, :]
        tm.assert_series_equal(res_row, exp_row)
        assert isinstance(res_row["cats"], str)

        # col
        res_col = df.iloc[:, 0]
        tm.assert_series_equal(res_col, exp_col)
        assert is_categorical_dtype(res_col)

        # single value
        res_val = df.iloc[2, 0]
        assert res_val == exp_val

        # loc
        # frame
        res_df = df.loc["j":"k", :]
        tm.assert_frame_equal(res_df, exp_df)
        assert is_categorical_dtype(res_df["cats"])

        # row
        res_row = df.loc["j", :]
        tm.assert_series_equal(res_row, exp_row)
        assert isinstance(res_row["cats"], str)

        # col
        res_col = df.loc[:, "cats"]
        tm.assert_series_equal(res_col, exp_col)
        assert is_categorical_dtype(res_col)

        # single value
        res_val = df.loc["j", "cats"]
        assert res_val == exp_val

        # ix (superseded by loc)
        # frame
        # res_df = df.loc["j":"k", [0, 1]]  # fails: loc is label-based, [0, 1] are positions
        res_df = df.loc["j":"k", :]
        tm.assert_frame_equal(res_df, exp_df)
        assert is_categorical_dtype(res_df["cats"])

        # row
        res_row = df.loc["j", :]
        tm.assert_series_equal(res_row, exp_row)
        assert isinstance(res_row["cats"], str)

        # col
        res_col = df.loc[:, "cats"]
        tm.assert_series_equal(res_col, exp_col)
        assert is_categorical_dtype(res_col)

        # single value
        res_val = df.loc["j", df.columns[0]]
        assert res_val == exp_val

        # iat
        res_val = df.iat[2, 0]
        assert res_val == exp_val

        # at
        res_val = df.at["j", "cats"]
        assert res_val == exp_val

        # fancy indexing
        exp_fancy = df.iloc[[2]]

        res_fancy = df[df["cats"] == "b"]
        tm.assert_frame_equal(res_fancy, exp_fancy)
        res_fancy = df[df["values"] == 3]
        tm.assert_frame_equal(res_fancy, exp_fancy)

        # get_value
        res_val = df.at["j", "cats"]
        assert res_val == exp_val

        # iloc also accepts an int, a slice, or a sequence of integers
        res_row = df.iloc[2]
        tm.assert_series_equal(res_row, exp_row)
        assert isinstance(res_row["cats"], str)

        res_df = df.iloc[slice(2, 4)]
        tm.assert_frame_equal(res_df, exp_df)
        assert is_categorical_dtype(res_df["cats"])

        res_df = df.iloc[[2, 3]]
        tm.assert_frame_equal(res_df, exp_df)
        assert is_categorical_dtype(res_df["cats"])

        res_col = df.iloc[:, 0]
        tm.assert_series_equal(res_col, exp_col)
        assert is_categorical_dtype(res_col)

        res_df = df.iloc[:, slice(0, 2)]
        tm.assert_frame_equal(res_df, df)
        assert is_categorical_dtype(res_df["cats"])

        res_df = df.iloc[:, [0, 1]]
        tm.assert_frame_equal(res_df, df)
        assert is_categorical_dtype(res_df["cats"])
Example #15
def assert_series_equal(
    left,
    right,
    check_dtype=True,
    check_index_type="equiv",
    check_series_type=True,
    check_less_precise=no_default,
    check_names=True,
    check_exact=False,
    check_datetimelike_compat=False,
    check_categorical=True,
    check_category_order=True,
    check_freq=True,
    check_flags=True,
    rtol=1.0e-5,
    atol=1.0e-8,
    obj="Series",
    *,
    check_index=True,
):
    """
    Check that left and right Series are equal.

    Parameters
    ----------
    left : Series
    right : Series
    check_dtype : bool, default True
        Whether to check the Series dtype is identical.
    check_index_type : bool or {'equiv'}, default 'equiv'
        Whether to check the Index class, dtype and inferred_type
        are identical.
    check_series_type : bool, default True
         Whether to check the Series class is identical.
    check_less_precise : bool or int, default False
        Specify comparison precision. Only used when check_exact is False.
        5 digits (False) or 3 digits (True) after decimal points are compared.
        If int, then specify the digits to compare.

        When comparing two numbers, if the first number has magnitude less
        than 1e-5, we compare the two numbers directly and check whether
        they are equivalent within the specified precision. Otherwise, we
        compare the **ratio** of the second number to the first number and
        check whether it is equivalent to 1 within the specified precision.

        .. deprecated:: 1.1.0
           Use `rtol` and `atol` instead to define relative/absolute
           tolerance, respectively. Similar to :func:`math.isclose`.
    check_names : bool, default True
        Whether to check the Series and Index names attribute.
    check_exact : bool, default False
        Whether to compare numbers exactly.
    check_datetimelike_compat : bool, default False
        Compare datetime-like values that are comparable, ignoring dtype.
    check_categorical : bool, default True
        Whether to compare internal Categorical exactly.
    check_category_order : bool, default True
        Whether to compare category order of internal Categoricals.

        .. versionadded:: 1.0.2
    check_freq : bool, default True
        Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex.

        .. versionadded:: 1.1.0
    check_flags : bool, default True
        Whether to check the `flags` attribute.

        .. versionadded:: 1.2.0

    rtol : float, default 1e-5
        Relative tolerance. Only used when check_exact is False.

        .. versionadded:: 1.1.0
    atol : float, default 1e-8
        Absolute tolerance. Only used when check_exact is False.

        .. versionadded:: 1.1.0
    obj : str, default 'Series'
        Specify object name being compared, internally used to show appropriate
        assertion message.
    check_index : bool, default True
        Whether to check index equivalence. If False, then compare only values.

        .. versionadded:: 1.3.0

    Examples
    --------
    >>> import pandas as pd
    >>> from pandas import testing as tm
    >>> a = pd.Series([1, 2, 3, 4])
    >>> b = pd.Series([1, 2, 3, 4])
    >>> tm.assert_series_equal(a, b)
    """
    __tracebackhide__ = True

    if check_less_precise is not no_default:
        warnings.warn(
            "The 'check_less_precise' keyword in testing.assert_*_equal "
            "is deprecated and will be removed in a future version. "
            "You can stop passing 'check_less_precise' to silence this warning.",
            FutureWarning,
            stacklevel=2,
        )
        rtol = atol = _get_tol_from_less_precise(check_less_precise)

    # instance validation
    _check_isinstance(left, right, Series)

    if check_series_type:
        assert_class_equal(left, right, obj=obj)

    # length comparison
    if len(left) != len(right):
        msg1 = f"{len(left)}, {left.index}"
        msg2 = f"{len(right)}, {right.index}"
        raise_assert_detail(obj, "Series length are different", msg1, msg2)

    if check_flags:
        assert left.flags == right.flags, f"{repr(left.flags)} != {repr(right.flags)}"

    if check_index:
        # GH #38183
        assert_index_equal(
            left.index,
            right.index,
            exact=check_index_type,
            check_names=check_names,
            check_exact=check_exact,
            check_categorical=check_categorical,
            rtol=rtol,
            atol=atol,
            obj=f"{obj}.index",
        )

    if check_freq and isinstance(left.index, (DatetimeIndex, TimedeltaIndex)):
        lidx = left.index
        ridx = right.index
        assert lidx.freq == ridx.freq, (lidx.freq, ridx.freq)

    if check_dtype:
        # We want to skip exact dtype checking when `check_categorical`
        # is False. We'll still raise if only one is a `Categorical`,
        # regardless of `check_categorical`
        if (is_categorical_dtype(left.dtype)
                and is_categorical_dtype(right.dtype)
                and not check_categorical):
            pass
        else:
            assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}")

    if check_exact and is_numeric_dtype(left.dtype) and is_numeric_dtype(
            right.dtype):
        left_values = left._values
        right_values = right._values
        # Only check exact if dtype is numeric
        if isinstance(left_values, ExtensionArray) and isinstance(
                right_values, ExtensionArray):
            assert_extension_array_equal(
                left_values,
                right_values,
                check_dtype=check_dtype,
                index_values=np.asarray(left.index),
            )
        else:
            assert_numpy_array_equal(
                left_values,
                right_values,
                check_dtype=check_dtype,
                obj=str(obj),
                index_values=np.asarray(left.index),
            )
    elif check_datetimelike_compat and (needs_i8_conversion(left.dtype)
                                        or needs_i8_conversion(right.dtype)):
        # we want to check only if we have compat dtypes
        # e.g. integer and M|m are NOT compat, but we can simply check
        # the values in that case

        # datetimelike may have different objects (e.g. datetime.datetime
        # vs Timestamp) but will compare equal
        if not Index(left._values).equals(Index(right._values)):
            msg = (f"[datetimelike_compat=True] {left._values} "
                   f"is not equal to {right._values}.")
            raise AssertionError(msg)
    elif is_interval_dtype(left.dtype) and is_interval_dtype(right.dtype):
        assert_interval_array_equal(left.array, right.array)
    elif is_categorical_dtype(left.dtype) or is_categorical_dtype(right.dtype):
        _testing.assert_almost_equal(
            left._values,
            right._values,
            rtol=rtol,
            atol=atol,
            check_dtype=check_dtype,
            obj=str(obj),
            index_values=np.asarray(left.index),
        )
    elif is_extension_array_dtype(left.dtype) and is_extension_array_dtype(
            right.dtype):
        assert_extension_array_equal(
            left._values,
            right._values,
            check_dtype=check_dtype,
            index_values=np.asarray(left.index),
        )
    elif is_extension_array_dtype_and_needs_i8_conversion(
            left.dtype,
            right.dtype) or is_extension_array_dtype_and_needs_i8_conversion(
                right.dtype, left.dtype):
        assert_extension_array_equal(
            left._values,
            right._values,
            check_dtype=check_dtype,
            index_values=np.asarray(left.index),
        )
    elif needs_i8_conversion(left.dtype) and needs_i8_conversion(right.dtype):
        # DatetimeArray or TimedeltaArray
        assert_extension_array_equal(
            left._values,
            right._values,
            check_dtype=check_dtype,
            index_values=np.asarray(left.index),
        )
    else:
        _testing.assert_almost_equal(
            left._values,
            right._values,
            rtol=rtol,
            atol=atol,
            check_dtype=check_dtype,
            obj=str(obj),
            index_values=np.asarray(left.index),
        )

    # metadata comparison
    if check_names:
        assert_attr_equal("name", left, right, obj=obj)

    if check_categorical:
        if is_categorical_dtype(left.dtype) or is_categorical_dtype(
                right.dtype):
            assert_categorical_equal(
                left._values,
                right._values,
                obj=f"{obj} category",
                check_category_order=check_category_order,
            )
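A quick illustration of the tolerance and name arguments via the public `pandas.testing` namespace (a sketch; the commented call would raise):

import pandas as pd
from pandas import testing as tm

a = pd.Series([1.0, 2.0])
b = pd.Series([1.0, 2.0000001])
tm.assert_series_equal(a, b)                 # passes: difference within rtol=1e-5
# tm.assert_series_equal(a, b, rtol=1e-9)    # would raise AssertionError
tm.assert_series_equal(a, b.rename('x'), check_names=False)  # names ignored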
Example #16
    assert Index(mi.values).equals(Index(recons.values))


@pytest.mark.parametrize(
    "obj",
    [
        Series([1, 2, 3]),
        Series([1.0, 1.5, 3.2]),
        Series([1.0, 1.5, np.nan]),
        Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
        Series(["a", "b", "c"]),
        Series(["a", np.nan, "c"]),
        Series(["a", None, "c"]),
        Series([True, False, True]),
        Series(dtype=object),
        Index([1, 2, 3]),
        Index([True, False, True]),
        DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}),
        DataFrame(),
        tm.makeMissingDataframe(),
        tm.makeMixedDataFrame(),
        tm.makeTimeDataFrame(),
        tm.makeTimeSeries(),
        tm.makeTimedeltaIndex(),
        tm.makePeriodIndex(),
        Series(tm.makePeriodIndex()),
        Series(pd.date_range("20130101", periods=3, tz="US/Eastern")),
        MultiIndex.from_product(
            [range(5), ["foo", "bar", "baz"], pd.date_range("20130101", periods=2)]
        ),
        MultiIndex.from_product([pd.CategoricalIndex(list("aabc")), range(3)]),
Example #17
    def test_excel_old_index_format(self, read_ext):
        # see gh-4679
        filename = "test_index_name_pre17" + read_ext

        # We detect headers to determine if index names exist, so
        # that "index" name in the "names" version of the data will
        # now be interpreted as rows that include null data.
        data = np.array(
            [
                [None, None, None, None, None],
                ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"],
                ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"],
                ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"],
                ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"],
                ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"],
            ]
        )
        columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"]
        mi = MultiIndex(
            levels=[
                ["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"],
                ["R1", "R_l1_g0", "R_l1_g1", "R_l1_g2", "R_l1_g3", "R_l1_g4"],
            ],
            codes=[[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]],
            names=[None, None],
        )
        si = Index(
            ["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], name=None
        )

        expected = pd.DataFrame(data, index=si, columns=columns)

        actual = pd.read_excel(filename, "single_names", index_col=0)
        tm.assert_frame_equal(actual, expected)

        expected.index = mi

        actual = pd.read_excel(filename, "multi_names", index_col=[0, 1])
        tm.assert_frame_equal(actual, expected)

        # The analogous versions of the "names" data above,
        # but with explicitly no names for the indices.
        data = np.array(
            [
                ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"],
                ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"],
                ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"],
                ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"],
                ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"],
            ]
        )
        columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"]
        mi = MultiIndex(
            levels=[
                ["R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"],
                ["R_l1_g0", "R_l1_g1", "R_l1_g2", "R_l1_g3", "R_l1_g4"],
            ],
            codes=[[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]],
            names=[None, None],
        )
        si = Index(["R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], name=None)

        expected = pd.DataFrame(data, index=si, columns=columns)

        actual = pd.read_excel(filename, "single_no_names", index_col=0)
        tm.assert_frame_equal(actual, expected)

        expected.index = mi

        actual = pd.read_excel(filename, "multi_no_names", index_col=[0, 1])
        tm.assert_frame_equal(actual, expected, check_names=False)
Example #18
    def test_nanops(self):
        # GH#7261
        for opname in ['max', 'min']:
            for klass in [Index, Series]:
                arg_op = 'arg' + opname if klass is Index else 'idx' + opname

                obj = klass([np.nan, 2.0])
                assert getattr(obj, opname)() == 2.0

                obj = klass([np.nan])
                assert pd.isna(getattr(obj, opname)())
                assert pd.isna(getattr(obj, opname)(skipna=False))

                obj = klass([])
                assert pd.isna(getattr(obj, opname)())
                assert pd.isna(getattr(obj, opname)(skipna=False))

                obj = klass([pd.NaT, datetime(2011, 11, 1)])
                # check DatetimeIndex monotonic path
                assert getattr(obj, opname)() == datetime(2011, 11, 1)
                assert getattr(obj, opname)(skipna=False) is pd.NaT

                assert getattr(obj, arg_op)() == 1
                result = getattr(obj, arg_op)(skipna=False)
                if klass is Series:
                    assert np.isnan(result)
                else:
                    assert result == -1

                obj = klass([pd.NaT, datetime(2011, 11, 1), pd.NaT])
                # check DatetimeIndex non-monotonic path
                assert getattr(obj, opname)() == datetime(2011, 11, 1)
                assert getattr(obj, opname)(skipna=False) is pd.NaT

                assert getattr(obj, arg_op)() == 1
                result = getattr(obj, arg_op)(skipna=False)
                if klass is Series:
                    assert np.isnan(result)
                else:
                    assert result == -1

                for dtype in ["M8[ns]", "datetime64[ns, UTC]"]:
                    # cases with empty Series/DatetimeIndex
                    obj = klass([], dtype=dtype)

                    assert getattr(obj, opname)() is pd.NaT
                    assert getattr(obj, opname)(skipna=False) is pd.NaT

                    with pytest.raises(ValueError, match="empty sequence"):
                        getattr(obj, arg_op)()
                    with pytest.raises(ValueError, match="empty sequence"):
                        getattr(obj, arg_op)(skipna=False)

        # argmin/max
        obj = Index(np.arange(5, dtype='int64'))
        assert obj.argmin() == 0
        assert obj.argmax() == 4

        obj = Index([np.nan, 1, np.nan, 2])
        assert obj.argmin() == 1
        assert obj.argmax() == 3
        assert obj.argmin(skipna=False) == -1
        assert obj.argmax(skipna=False) == -1

        obj = Index([np.nan])
        assert obj.argmin() == -1
        assert obj.argmax() == -1
        assert obj.argmin(skipna=False) == -1
        assert obj.argmax(skipna=False) == -1

        obj = Index([pd.NaT, datetime(2011, 11, 1), datetime(2011, 11, 2),
                     pd.NaT])
        assert obj.argmin() == 1
        assert obj.argmax() == 2
        assert obj.argmin(skipna=False) == -1
        assert obj.argmax(skipna=False) == -1

        obj = Index([pd.NaT])
        assert obj.argmin() == -1
        assert obj.argmax() == -1
        assert obj.argmin(skipna=False) == -1
        assert obj.argmax(skipna=False) == -1
Example #19
def _have_same_elements(idx1: pd.Index, idx2: pd.Index) -> bool:
    return idx1.sort_values().equals(idx2.sort_values())
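Usage note: because `equals` compares element-wise after sorting, the helper is order-insensitive but still multiplicity-sensitive (assuming the helper above is importable):

import pandas as pd

print(_have_same_elements(pd.Index([3, 1, 2]), pd.Index([1, 2, 3])))  # True
print(_have_same_elements(pd.Index([1, 2]), pd.Index([1, 2, 2])))     # False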
Example #20
 def test_astype_object(self, index):
     result = index.astype(object)
     expected = Index(index.values, dtype="object")
     tm.assert_index_equal(result, expected)
     assert not result.equals(index)
Example #21
 def test_set_columns(self, float_string_frame):
     cols = Index(np.arange(len(float_string_frame.columns)))
     float_string_frame.columns = cols
     with pytest.raises(ValueError, match="Length mismatch"):
         float_string_frame.columns = cols[::2]
Example #22
    def test_assigning_ops(self):
        # systematically test the assigning operations:
        # for all slicing ops:
        #  for value in categories and value not in categories:

        #   - assign a single value -> exp_single_cats_value

        #   - assign a complete row (mixed values) -> exp_single_row

        # assign multiple rows (mixed values) (-> array) -> exp_multi_row

        # assign a part of a column with dtype == categorical ->
        # exp_parts_cats_col

        # assign a part of a column with dtype != categorical ->
        # exp_parts_cats_col

        cats = Categorical(["a", "a", "a", "a", "a", "a", "a"],
                           categories=["a", "b"])
        idx = Index(["h", "i", "j", "k", "l", "m", "n"])
        values = [1, 1, 1, 1, 1, 1, 1]
        orig = DataFrame({"cats": cats, "values": values}, index=idx)

        # the expected values
        # changed single row
        cats1 = Categorical(["a", "a", "b", "a", "a", "a", "a"],
                            categories=["a", "b"])
        idx1 = Index(["h", "i", "j", "k", "l", "m", "n"])
        values1 = [1, 1, 2, 1, 1, 1, 1]
        exp_single_row = DataFrame({
            "cats": cats1,
            "values": values1
        },
                                   index=idx1)

        # changed multiple rows
        cats2 = Categorical(["a", "a", "b", "b", "a", "a", "a"],
                            categories=["a", "b"])
        idx2 = Index(["h", "i", "j", "k", "l", "m", "n"])
        values2 = [1, 1, 2, 2, 1, 1, 1]
        exp_multi_row = DataFrame({
            "cats": cats2,
            "values": values2
        },
                                  index=idx2)

        # changed part of the cats column
        cats3 = Categorical(["a", "a", "b", "b", "a", "a", "a"],
                            categories=["a", "b"])
        idx3 = Index(["h", "i", "j", "k", "l", "m", "n"])
        values3 = [1, 1, 1, 1, 1, 1, 1]
        exp_parts_cats_col = DataFrame({
            "cats": cats3,
            "values": values3
        },
                                       index=idx3)

        # changed single value in cats col
        cats4 = Categorical(["a", "a", "b", "a", "a", "a", "a"],
                            categories=["a", "b"])
        idx4 = Index(["h", "i", "j", "k", "l", "m", "n"])
        values4 = [1, 1, 1, 1, 1, 1, 1]
        exp_single_cats_value = DataFrame({
            "cats": cats4,
            "values": values4
        },
                                          index=idx4)

        #  iloc
        # ###############
        #   - assign a single value -> exp_single_cats_value
        df = orig.copy()
        df.iloc[2, 0] = "b"
        tm.assert_frame_equal(df, exp_single_cats_value)

        df = orig.copy()
        df.iloc[df.index == "j", 0] = "b"
        tm.assert_frame_equal(df, exp_single_cats_value)

        #   - assign a single value not in the current categories set
        with pytest.raises(ValueError):
            df = orig.copy()
            df.iloc[2, 0] = "c"

        #   - assign a complete row (mixed values) -> exp_single_row
        df = orig.copy()
        df.iloc[2, :] = ["b", 2]
        tm.assert_frame_equal(df, exp_single_row)

        #   - assign a complete row (mixed values) not in categories set
        with pytest.raises(ValueError):
            df = orig.copy()
            df.iloc[2, :] = ["c", 2]

        #   - assign multiple rows (mixed values) -> exp_multi_row
        df = orig.copy()
        df.iloc[2:4, :] = [["b", 2], ["b", 2]]
        tm.assert_frame_equal(df, exp_multi_row)

        with pytest.raises(ValueError):
            df = orig.copy()
            df.iloc[2:4, :] = [["c", 2], ["c", 2]]

        # assign a part of a column with dtype == categorical ->
        # exp_parts_cats_col
        df = orig.copy()
        df.iloc[2:4, 0] = Categorical(["b", "b"], categories=["a", "b"])
        tm.assert_frame_equal(df, exp_parts_cats_col)

        with pytest.raises(ValueError):
            # different categories -> not sure if this should fail or pass
            df = orig.copy()
            df.iloc[2:4, 0] = Categorical(list("bb"), categories=list("abc"))

        with pytest.raises(ValueError):
            # different values
            df = orig.copy()
            df.iloc[2:4, 0] = Categorical(list("cc"), categories=list("abc"))

        # assign a part of a column with dtype != categorical ->
        # exp_parts_cats_col
        df = orig.copy()
        df.iloc[2:4, 0] = ["b", "b"]
        tm.assert_frame_equal(df, exp_parts_cats_col)

        with pytest.raises(ValueError):
            df.iloc[2:4, 0] = ["c", "c"]

        #  loc
        # ##############
        #   - assign a single value -> exp_single_cats_value
        df = orig.copy()
        df.loc["j", "cats"] = "b"
        tm.assert_frame_equal(df, exp_single_cats_value)

        df = orig.copy()
        df.loc[df.index == "j", "cats"] = "b"
        tm.assert_frame_equal(df, exp_single_cats_value)

        #   - assign a single value not in the current categories set
        with pytest.raises(ValueError):
            df = orig.copy()
            df.loc["j", "cats"] = "c"

        #   - assign a complete row (mixed values) -> exp_single_row
        df = orig.copy()
        df.loc["j", :] = ["b", 2]
        tm.assert_frame_equal(df, exp_single_row)

        #   - assign a complete row (mixed values) not in categories set
        with pytest.raises(ValueError):
            df = orig.copy()
            df.loc["j", :] = ["c", 2]

        #   - assign multiple rows (mixed values) -> exp_multi_row
        df = orig.copy()
        df.loc["j":"k", :] = [["b", 2], ["b", 2]]
        tm.assert_frame_equal(df, exp_multi_row)

        with pytest.raises(ValueError):
            df = orig.copy()
            df.loc["j":"k", :] = [["c", 2], ["c", 2]]

        # assign a part of a column with dtype == categorical ->
        # exp_parts_cats_col
        df = orig.copy()
        df.loc["j":"k", "cats"] = Categorical(["b", "b"],
                                              categories=["a", "b"])
        tm.assert_frame_equal(df, exp_parts_cats_col)

        with pytest.raises(ValueError):
            # different categories -> not sure if this should fail or pass
            df = orig.copy()
            df.loc["j":"k", "cats"] = Categorical(["b", "b"],
                                                  categories=["a", "b", "c"])

        with pytest.raises(ValueError):
            # different values
            df = orig.copy()
            df.loc["j":"k", "cats"] = Categorical(["c", "c"],
                                                  categories=["a", "b", "c"])

        # assign a part of a column with dtype != categorical ->
        # exp_parts_cats_col
        df = orig.copy()
        df.loc["j":"k", "cats"] = ["b", "b"]
        tm.assert_frame_equal(df, exp_parts_cats_col)

        with pytest.raises(ValueError):
            df.loc["j":"k", "cats"] = ["c", "c"]

        #  loc with the column selected by position (formerly ix)
        # ##############
        #   - assign a single value -> exp_single_cats_value
        df = orig.copy()
        df.loc["j", df.columns[0]] = "b"
        tm.assert_frame_equal(df, exp_single_cats_value)

        df = orig.copy()
        df.loc[df.index == "j", df.columns[0]] = "b"
        tm.assert_frame_equal(df, exp_single_cats_value)

        #   - assign a single value not in the current categories set
        with pytest.raises(ValueError):
            df = orig.copy()
            df.loc["j", df.columns[0]] = "c"

        #   - assign a complete row (mixed values) -> exp_single_row
        df = orig.copy()
        df.loc["j", :] = ["b", 2]
        tm.assert_frame_equal(df, exp_single_row)

        #   - assign a complete row (mixed values) not in categories set
        with pytest.raises(ValueError):
            df = orig.copy()
            df.loc["j", :] = ["c", 2]

        #   - assign multiple rows (mixed values) -> exp_multi_row
        df = orig.copy()
        df.loc["j":"k", :] = [["b", 2], ["b", 2]]
        tm.assert_frame_equal(df, exp_multi_row)

        with pytest.raises(ValueError):
            df = orig.copy()
            df.loc["j":"k", :] = [["c", 2], ["c", 2]]

        # assign a part of a column with dtype == categorical ->
        # exp_parts_cats_col
        df = orig.copy()
        df.loc["j":"k", df.columns[0]] = Categorical(["b", "b"],
                                                     categories=["a", "b"])
        tm.assert_frame_equal(df, exp_parts_cats_col)

        with pytest.raises(ValueError):
            # different categories -> not sure if this should fail or pass
            df = orig.copy()
            df.loc["j":"k",
                   df.columns[0]] = Categorical(["b", "b"],
                                                categories=["a", "b", "c"])

        with pytest.raises(ValueError):
            # different values
            df = orig.copy()
            df.loc["j":"k",
                   df.columns[0]] = Categorical(["c", "c"],
                                                categories=["a", "b", "c"])

        # assign a part of a column with dtype != categorical ->
        # exp_parts_cats_col
        df = orig.copy()
        df.loc["j":"k", df.columns[0]] = ["b", "b"]
        tm.assert_frame_equal(df, exp_parts_cats_col)

        with pytest.raises(ValueError):
            df.loc["j":"k", df.columns[0]] = ["c", "c"]

        # iat
        df = orig.copy()
        df.iat[2, 0] = "b"
        tm.assert_frame_equal(df, exp_single_cats_value)

        #   - assign a single value not in the current categories set
        with pytest.raises(ValueError):
            df = orig.copy()
            df.iat[2, 0] = "c"

        # at
        #   - assign a single value -> exp_single_cats_value
        df = orig.copy()
        df.at["j", "cats"] = "b"
        tm.assert_frame_equal(df, exp_single_cats_value)

        #   - assign a single value not in the current categories set
        with pytest.raises(ValueError):
            df = orig.copy()
            df.at["j", "cats"] = "c"

        # fancy indexing
        catsf = Categorical(["a", "a", "c", "c", "a", "a", "a"],
                            categories=["a", "b", "c"])
        idxf = Index(["h", "i", "j", "k", "l", "m", "n"])
        valuesf = [1, 1, 3, 3, 1, 1, 1]
        df = DataFrame({"cats": catsf, "values": valuesf}, index=idxf)

        exp_fancy = exp_multi_row.copy()
        exp_fancy["cats"].cat.set_categories(["a", "b", "c"], inplace=True)

        df[df["cats"] == "c"] = ["b", 2]
        # category c is kept in .categories
        tm.assert_frame_equal(df, exp_fancy)

        # set_value
        df = orig.copy()
        df.at["j", "cats"] = "b"
        tm.assert_frame_equal(df, exp_single_cats_value)

        with pytest.raises(ValueError):
            df = orig.copy()
            df.at["j", "cats"] = "c"

        # Assigning a Categorical to part of an int/... column uses the
        # values of the Categorical
        df = DataFrame({"a": [1, 1, 1, 1, 1], "b": list("aaaaa")})
        exp = DataFrame({"a": [1, "b", "b", 1, 1], "b": list("aabba")})
        df.loc[1:2, "a"] = Categorical(["b", "b"], categories=["a", "b"])
        df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"])
        tm.assert_frame_equal(df, exp)
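The rule this test exercises over and over: assigning an existing category is fine, assigning an unseen one raises. A minimal sketch (the exception type varies by pandas version):

import pandas as pd

s = pd.Series(pd.Categorical(['a', 'a'], categories=['a', 'b']))
s.iloc[0] = 'b'                 # OK: 'b' is already a category
# s.iloc[1] = 'c'               # raises: 'c' is not among the categories
print(s.tolist())               # ['b', 'a']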
Example #23
    def __init__(self,
                 objs,
                 axis=0,
                 join='outer',
                 join_axes=None,
                 keys=None,
                 levels=None,
                 names=None,
                 ignore_index=False,
                 verify_integrity=False,
                 copy=True):
        if isinstance(objs, (NDFrame, compat.string_types)):
            raise TypeError('first argument must be an iterable of pandas '
                            'objects, you passed an object of type '
                            '"{name}"'.format(name=type(objs).__name__))

        if join == 'outer':
            self.intersect = False
        elif join == 'inner':
            self.intersect = True
        else:  # pragma: no cover
            raise ValueError('Only can inner (intersect) or outer (union) '
                             'join the other axis')

        if isinstance(objs, dict):
            if keys is None:
                keys = sorted(objs)
            objs = [objs[k] for k in keys]
        else:
            objs = list(objs)

        if len(objs) == 0:
            raise ValueError('No objects to concatenate')

        if keys is None:
            objs = [obj for obj in objs if obj is not None]
        else:
            # #1649
            clean_keys = []
            clean_objs = []
            for k, v in zip(keys, objs):
                if v is None:
                    continue
                clean_keys.append(k)
                clean_objs.append(v)
            objs = clean_objs
            name = getattr(keys, 'name', None)
            keys = Index(clean_keys, name=name)

        if len(objs) == 0:
            raise ValueError('All objects passed were None')

        # consolidate data & figure out what our result ndim is going to be
        ndims = set()
        for obj in objs:
            if not isinstance(obj, NDFrame):
                raise TypeError("cannot concatenate a non-NDFrame object")

            # consolidate
            obj._consolidate(inplace=True)
            ndims.add(obj.ndim)

        # get the sample
        # want the highest ndim that we have, and must be non-empty
        # unless all objs are empty
        sample = None
        if len(ndims) > 1:
            max_ndim = max(ndims)
            for obj in objs:
                if obj.ndim == max_ndim and np.sum(obj.shape):
                    sample = obj
                    break

        else:
            # filter out the empties if we do not have multi-index possibilities
            # note: keep empty Series, as it affects the result columns / name
            non_empties = [
                obj for obj in objs
                if sum(obj.shape) > 0 or isinstance(obj, Series)
            ]

            if (len(non_empties)
                    and (keys is None and names is None and levels is None
                         and join_axes is None and not self.intersect)):
                objs = non_empties
                sample = objs[0]

        if sample is None:
            sample = objs[0]
        self.objs = objs

        # Standardize axis parameter to int
        if isinstance(sample, Series):
            axis = DataFrame()._get_axis_number(axis)
        else:
            axis = sample._get_axis_number(axis)

        # Need to flip BlockManager axis in the DataFrame special case
        self._is_frame = isinstance(sample, DataFrame)
        if self._is_frame:
            axis = 1 if axis == 0 else 0

        self._is_series = isinstance(sample, Series)
        if not 0 <= axis <= sample.ndim:
            raise AssertionError("axis must be between 0 and {ndim}, input was"
                                 " {axis}".format(ndim=sample.ndim, axis=axis))

        # if we have mixed ndims, then convert to highest ndim
        # creating column numbers as needed
        if len(ndims) > 1:
            current_column = 0
            max_ndim = sample.ndim
            self.objs, objs = [], self.objs
            for obj in objs:

                ndim = obj.ndim
                if ndim == max_ndim:
                    pass

                elif ndim != max_ndim - 1:
                    raise ValueError("cannot concatenate unaligned mixed "
                                     "dimensional NDFrame objects")

                else:
                    name = getattr(obj, 'name', None)
                    if ignore_index or name is None:
                        name = current_column
                        current_column += 1

                    # doing a row-wise concatenation so need everything
                    # to line up
                    if self._is_frame and axis == 1:
                        name = 0
                    obj = sample._constructor({name: obj})

                self.objs.append(obj)

        # note: this is the BlockManager axis (since DataFrame is transposed)
        self.axis = axis
        self.join_axes = join_axes
        self.keys = keys
        self.names = names or getattr(keys, 'names', None)
        self.levels = levels

        self.ignore_index = ignore_index
        self.verify_integrity = verify_integrity
        self.copy = copy

        self.new_axes = self._get_new_axes()
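The `intersect` flag set at the top is what `concat`'s `join` argument controls on the non-concatenation axis; a small sketch:

import pandas as pd

a = pd.DataFrame({'x': [1, 2]}, index=[0, 1])
b = pd.DataFrame({'y': [3, 4]}, index=[1, 2])
print(pd.concat([a, b], axis=1, join='outer').index.tolist())  # [0, 1, 2]
print(pd.concat([a, b], axis=1, join='inner').index.tolist())  # [1]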
Example #24
 def test_map_str(self, simple_index):
     # GH 31202
     idx = simple_index
     result = idx.map(str)
     expected = Index([str(x) for x in idx], dtype=object)
     tm.assert_index_equal(result, expected)
Example #25
    def test_construction_index_with_mixed_timezones_with_NaT(self):
        # GH 11488
        result = Index([pd.NaT, Timestamp('2011-01-01'),
                        pd.NaT, Timestamp('2011-01-02')], name='idx')
        exp = DatetimeIndex([pd.NaT, Timestamp('2011-01-01'),
                             pd.NaT, Timestamp('2011-01-02')], name='idx')
        self.assert_index_equal(result, exp, exact=True)
        self.assertTrue(isinstance(result, DatetimeIndex))
        self.assertIsNone(result.tz)

        # same tz results in DatetimeIndex
        result = Index([pd.NaT, Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'),
                        pd.NaT, Timestamp('2011-01-02 10:00',
                                          tz='Asia/Tokyo')],
                       name='idx')
        exp = DatetimeIndex([pd.NaT, Timestamp('2011-01-01 10:00'),
                             pd.NaT, Timestamp('2011-01-02 10:00')],
                            tz='Asia/Tokyo', name='idx')
        self.assert_index_equal(result, exp, exact=True)
        self.assertTrue(isinstance(result, DatetimeIndex))
        self.assertIsNotNone(result.tz)
        self.assertEqual(result.tz, exp.tz)

        # same tz results in DatetimeIndex (DST)
        result = Index([Timestamp('2011-01-01 10:00', tz='US/Eastern'),
                        pd.NaT,
                        Timestamp('2011-08-01 10:00', tz='US/Eastern')],
                       name='idx')
        exp = DatetimeIndex([Timestamp('2011-01-01 10:00'), pd.NaT,
                             Timestamp('2011-08-01 10:00')],
                            tz='US/Eastern', name='idx')
        self.assert_index_equal(result, exp, exact=True)
        self.assertTrue(isinstance(result, DatetimeIndex))
        self.assertIsNotNone(result.tz)
        self.assertEqual(result.tz, exp.tz)

        # different tz results in Index(dtype=object)
        result = Index([pd.NaT, Timestamp('2011-01-01 10:00'),
                        pd.NaT, Timestamp('2011-01-02 10:00',
                                          tz='US/Eastern')],
                       name='idx')
        exp = Index([pd.NaT, Timestamp('2011-01-01 10:00'),
                     pd.NaT, Timestamp('2011-01-02 10:00', tz='US/Eastern')],
                    dtype='object', name='idx')
        self.assert_index_equal(result, exp, exact=True)
        self.assertFalse(isinstance(result, DatetimeIndex))

        result = Index([pd.NaT, Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'),
                        pd.NaT, Timestamp('2011-01-02 10:00',
                                          tz='US/Eastern')], name='idx')
        exp = Index([pd.NaT, Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'),
                     pd.NaT, Timestamp('2011-01-02 10:00', tz='US/Eastern')],
                    dtype='object', name='idx')
        self.assert_index_equal(result, exp, exact=True)
        self.assertFalse(isinstance(result, DatetimeIndex))

        # passing tz results in DatetimeIndex
        result = Index([pd.NaT, Timestamp('2011-01-01 10:00'),
                        pd.NaT, Timestamp('2011-01-02 10:00',
                                          tz='US/Eastern')],
                       tz='Asia/Tokyo', name='idx')
        exp = DatetimeIndex([pd.NaT, Timestamp('2011-01-01 19:00'),
                             pd.NaT, Timestamp('2011-01-03 00:00')],
                            tz='Asia/Tokyo', name='idx')
        self.assert_index_equal(result, exp, exact=True)
        self.assertTrue(isinstance(result, DatetimeIndex))

        # all NaT
        result = Index([pd.NaT, pd.NaT], name='idx')
        exp = DatetimeIndex([pd.NaT, pd.NaT], name='idx')
        self.assert_index_equal(result, exp, exact=True)
        self.assertTrue(isinstance(result, DatetimeIndex))
        self.assertIsNone(result.tz)

        # all NaT with tz
        result = Index([pd.NaT, pd.NaT], tz='Asia/Tokyo', name='idx')
        exp = DatetimeIndex([pd.NaT, pd.NaT], tz='Asia/Tokyo', name='idx')
        self.assert_index_equal(result, exp, exact=True)
        self.assertTrue(isinstance(result, DatetimeIndex))
        self.assertIsNotNone(result.tz)
        self.assertEqual(result.tz, exp.tz)
Ejemplo n.º 26
0
def assert_index_equal(
    left: Index,
    right: Index,
    exact: bool | str = "equiv",
    check_names: bool = True,
    check_less_precise: bool | int | NoDefault = no_default,
    check_exact: bool = True,
    check_categorical: bool = True,
    check_order: bool = True,
    rtol: float = 1.0e-5,
    atol: float = 1.0e-8,
    obj: str = "Index",
) -> None:
    """
    Check that left and right Index are equal.

    Parameters
    ----------
    left : Index
    right : Index
    exact : bool or {'equiv'}, default 'equiv'
        Whether to check the Index class, dtype and inferred_type
        are identical. If 'equiv', then RangeIndex can be substituted for
        Int64Index as well.
    check_names : bool, default True
        Whether to check the names attribute.
    check_less_precise : bool or int, default False
        Specify comparison precision. Only used when check_exact is False.
        5 digits (False) or 3 digits (True) after decimal points are compared.
        If int, then specify the digits to compare.

        .. deprecated:: 1.1.0
           Use `rtol` and `atol` instead to define relative/absolute
           tolerance, respectively. Similar to :func:`math.isclose`.
    check_exact : bool, default True
        Whether to compare numbers exactly.
    check_categorical : bool, default True
        Whether to compare internal Categorical exactly.
    check_order : bool, default True
        Whether to compare the order of index entries as well as their values.
        If True, both indexes must contain the same elements, in the same order.
        If False, both indexes must contain the same elements, but in any order.

        .. versionadded:: 1.2.0
    rtol : float, default 1e-5
        Relative tolerance. Only used when check_exact is False.

        .. versionadded:: 1.1.0
    atol : float, default 1e-8
        Absolute tolerance. Only used when check_exact is False.

        .. versionadded:: 1.1.0
    obj : str, default 'Index'
        Specify object name being compared, internally used to show appropriate
        assertion message.

    Examples
    --------
    >>> from pandas import testing as tm
    >>> a = pd.Index([1, 2, 3])
    >>> b = pd.Index([1, 2, 3])
    >>> tm.assert_index_equal(a, b)
    """
    __tracebackhide__ = True

    def _check_types(left, right, obj="Index") -> None:
        if not exact:
            return

        assert_class_equal(left, right, exact=exact, obj=obj)
        assert_attr_equal("inferred_type", left, right, obj=obj)

        # Skip exact dtype checking when `check_categorical` is False
        if is_categorical_dtype(left.dtype) and is_categorical_dtype(
                right.dtype):
            if check_categorical:
                assert_attr_equal("dtype", left, right, obj=obj)
                assert_index_equal(left.categories,
                                   right.categories,
                                   exact=exact)
            return

        assert_attr_equal("dtype", left, right, obj=obj)

    def _get_ilevel_values(index, level):
        # accept level number only
        unique = index.levels[level]
        level_codes = index.codes[level]
        filled = take_nd(unique._values,
                         level_codes,
                         fill_value=unique._na_value)
        return unique._shallow_copy(filled, name=index.names[level])

    if check_less_precise is not no_default:
        warnings.warn(
            "The 'check_less_precise' keyword in testing.assert_*_equal "
            "is deprecated and will be removed in a future version. "
            "You can stop passing 'check_less_precise' to silence this warning.",
            FutureWarning,
            stacklevel=2,
        )
        # https://github.com/python/mypy/issues/7642
        # error: Argument 1 to "_get_tol_from_less_precise" has incompatible
        # type "Union[bool, int, NoDefault]"; expected "Union[bool, int]"
        rtol = atol = _get_tol_from_less_precise(
            check_less_precise  # type: ignore[arg-type]
        )

    # instance validation
    _check_isinstance(left, right, Index)

    # class / dtype comparison
    _check_types(left, right, obj=obj)

    # level comparison
    if left.nlevels != right.nlevels:
        msg1 = f"{obj} levels are different"
        msg2 = f"{left.nlevels}, {left}"
        msg3 = f"{right.nlevels}, {right}"
        raise_assert_detail(obj, msg1, msg2, msg3)

    # length comparison
    if len(left) != len(right):
        msg1 = f"{obj} length are different"
        msg2 = f"{len(left)}, {left}"
        msg3 = f"{len(right)}, {right}"
        raise_assert_detail(obj, msg1, msg2, msg3)

    # If order doesn't matter then sort the index entries
    if not check_order:
        left = Index(safe_sort(left))
        right = Index(safe_sort(right))

    # MultiIndex special comparison for friendlier error messages
    if left.nlevels > 1:
        left = cast(MultiIndex, left)
        right = cast(MultiIndex, right)

        for level in range(left.nlevels):
            # cannot use get_level_values here because it can change dtype
            llevel = _get_ilevel_values(left, level)
            rlevel = _get_ilevel_values(right, level)

            lobj = f"MultiIndex level [{level}]"
            assert_index_equal(
                llevel,
                rlevel,
                exact=exact,
                check_names=check_names,
                check_exact=check_exact,
                rtol=rtol,
                atol=atol,
                obj=lobj,
            )
            # get_level_values may change dtype
            _check_types(left.levels[level], right.levels[level], obj=obj)

    # skip exact index checking when `check_categorical` is False
    if check_exact and check_categorical:
        if not left.equals(right):
            diff = (np.sum((left._values != right._values).astype(int)) *
                    100.0 / len(left))
            msg = f"{obj} values are different ({np.round(diff, 5)} %)"
            raise_assert_detail(obj, msg, left, right)
    else:

        # if we have "equiv", this becomes True
        exact_bool = bool(exact)
        _testing.assert_almost_equal(
            left.values,
            right.values,
            rtol=rtol,
            atol=atol,
            check_dtype=exact_bool,
            obj=obj,
            lobj=left,
            robj=right,
        )

    # metadata comparison
    if check_names:
        assert_attr_equal("names", left, right, obj=obj)
    if isinstance(left, PeriodIndex) or isinstance(right, PeriodIndex):
        assert_attr_equal("freq", left, right, obj=obj)
    if isinstance(left, IntervalIndex) or isinstance(right, IntervalIndex):
        assert_interval_array_equal(left._values, right._values)

    if check_categorical:
        if is_categorical_dtype(left.dtype) or is_categorical_dtype(
                right.dtype):
            assert_categorical_equal(left._values,
                                     right._values,
                                     obj=f"{obj} category")
Ejemplo n.º 27
0
    def test_order_compat(self):
        def _check_freq(index, expected_index):
            if isinstance(index, PeriodIndex):
                assert index.freq == expected_index.freq

        pidx = PeriodIndex(["2011", "2012", "2013"], name="pidx", freq="A")
        # for compatibility check
        iidx = Index([2011, 2012, 2013], name="idx")
        for idx in [pidx, iidx]:
            ordered = idx.sort_values()
            tm.assert_index_equal(ordered, idx)
            _check_freq(ordered, idx)

            ordered = idx.sort_values(ascending=False)
            tm.assert_index_equal(ordered, idx[::-1])
            _check_freq(ordered, idx[::-1])

            ordered, indexer = idx.sort_values(return_indexer=True)
            tm.assert_index_equal(ordered, idx)
            tm.assert_numpy_array_equal(indexer, np.array([0, 1, 2]), check_dtype=False)
            _check_freq(ordered, idx)

            ordered, indexer = idx.sort_values(return_indexer=True, ascending=False)
            tm.assert_index_equal(ordered, idx[::-1])
            tm.assert_numpy_array_equal(indexer, np.array([2, 1, 0]), check_dtype=False)
            _check_freq(ordered, idx[::-1])

        pidx = PeriodIndex(
            ["2011", "2013", "2015", "2012", "2011"], name="pidx", freq="A"
        )
        pexpected = PeriodIndex(
            ["2011", "2011", "2012", "2013", "2015"], name="pidx", freq="A"
        )
        # for compatibility check
        iidx = Index([2011, 2013, 2015, 2012, 2011], name="idx")
        iexpected = Index([2011, 2011, 2012, 2013, 2015], name="idx")
        for idx, expected in [(pidx, pexpected), (iidx, iexpected)]:
            ordered = idx.sort_values()
            tm.assert_index_equal(ordered, expected)
            _check_freq(ordered, idx)

            ordered = idx.sort_values(ascending=False)
            tm.assert_index_equal(ordered, expected[::-1])
            _check_freq(ordered, idx)

            ordered, indexer = idx.sort_values(return_indexer=True)
            tm.assert_index_equal(ordered, expected)

            exp = np.array([0, 4, 3, 1, 2])
            tm.assert_numpy_array_equal(indexer, exp, check_dtype=False)
            _check_freq(ordered, idx)

            ordered, indexer = idx.sort_values(return_indexer=True, ascending=False)
            tm.assert_index_equal(ordered, expected[::-1])
            _check_freq(ordered, idx)

        pidx = PeriodIndex(["2011", "2013", "NaT", "2011"], name="pidx", freq="D")

        result = pidx.sort_values()
        expected = PeriodIndex(["NaT", "2011", "2011", "2013"], name="pidx", freq="D")
        tm.assert_index_equal(result, expected)
        assert result.freq == "D"

        result = pidx.sort_values(ascending=False)
        expected = PeriodIndex(["2013", "2011", "2011", "NaT"], name="pidx", freq="D")
        tm.assert_index_equal(result, expected)
        assert result.freq == "D"
Ejemplo n.º 28
0
class TestSeriesMisc:
    def test_tab_completion(self):
        # GH 9910
        s = Series(list("abcd"))
        # Series of str values should have .str but not .dt/.cat in __dir__
        assert "str" in dir(s)
        assert "dt" not in dir(s)
        assert "cat" not in dir(s)

    def test_tab_completion_dt(self):
        # similarly for .dt
        s = Series(date_range("1/1/2015", periods=5))
        assert "dt" in dir(s)
        assert "str" not in dir(s)
        assert "cat" not in dir(s)

    def test_tab_completion_cat(self):
        # Similarly for .cat, but with the twist that str and dt should be
        # there if the categories are of that type. First: cat and str.
        s = Series(list("abbcd"), dtype="category")
        assert "cat" in dir(s)
        assert "str" in dir(s)  # as it is a string categorical
        assert "dt" not in dir(s)

    def test_tab_completion_cat_str(self):
        # similar to cat and str
        s = Series(date_range("1/1/2015", periods=5)).astype("category")
        assert "cat" in dir(s)
        assert "str" not in dir(s)
        assert "dt" in dir(s)  # as it is a datetime categorical

    def test_tab_completion_with_categorical(self):
        # test the tab completion display
        ok_for_cat = [
            "categories",
            "codes",
            "ordered",
            "set_categories",
            "add_categories",
            "remove_categories",
            "rename_categories",
            "reorder_categories",
            "remove_unused_categories",
            "as_ordered",
            "as_unordered",
        ]

        s = Series(list("aabbcde")).astype("category")
        results = sorted({r for r in s.cat.__dir__() if not r.startswith("_")})
        tm.assert_almost_equal(results, sorted(set(ok_for_cat)))

    @pytest.mark.parametrize(
        "index",
        [
            tm.makeStringIndex(10),
            tm.makeCategoricalIndex(10),
            Index(["foo", "bar", "baz"] * 2),
            tm.makeDateIndex(10),
            tm.makePeriodIndex(10),
            tm.makeTimedeltaIndex(10),
            tm.makeIntIndex(10),
            tm.makeUIntIndex(10),
            tm.makeRangeIndex(10),
            tm.makeFloatIndex(10),
            Index([True, False]),
            Index([f"a{i}" for i in range(101)]),
            pd.MultiIndex.from_tuples(zip("ABCD", "EFGH")),
            pd.MultiIndex.from_tuples(zip([0, 1, 2, 3], "EFGH")),
        ],
    )
    def test_index_tab_completion(self, index):
        # dir contains string-like values of the Index.
        s = Series(index=index, dtype=object)
        dir_s = dir(s)
        for i, x in enumerate(s.index.unique(level=0)):
            if i < 100:
                assert not isinstance(x, str) or not x.isidentifier() or x in dir_s
            else:
                assert x not in dir_s

    @pytest.mark.parametrize("ser", [Series(dtype=object), Series([1])])
    def test_not_hashable(self, ser):
        msg = "unhashable type: 'Series'"
        with pytest.raises(TypeError, match=msg):
            hash(ser)

    def test_contains(self, datetime_series):
        tm.assert_contains_all(datetime_series.index, datetime_series)

    def test_axis_alias(self):
        s = Series([1, 2, np.nan])
        tm.assert_series_equal(s.dropna(axis="rows"), s.dropna(axis="index"))
        assert s.dropna().sum("rows") == 3
        assert s._get_axis_number("rows") == 0
        assert s._get_axis_name("rows") == "index"

    def test_class_axis(self):
        # https://github.com/pandas-dev/pandas/issues/18147
        # no exception and no empty docstring
        assert pydoc.getdoc(Series.index)

    def test_ndarray_compat(self):

        # test numpy compat with Series as sub-class of NDFrame
        tsdf = DataFrame(
            np.random.randn(1000, 3),
            columns=["A", "B", "C"],
            index=date_range("1/1/2000", periods=1000),
        )

        def f(x):
            return x[x.idxmax()]

        result = tsdf.apply(f)
        expected = tsdf.max()
        tm.assert_series_equal(result, expected)

    def test_ndarray_compat_like_func(self):
        # using an ndarray like function
        s = Series(np.random.randn(10))
        result = Series(np.ones_like(s))
        expected = Series(1, index=range(10), dtype="float64")
        tm.assert_series_equal(result, expected)

    def test_ndarray_compat_ravel(self):
        # ravel
        s = Series(np.random.randn(10))
        tm.assert_almost_equal(s.ravel(order="F"), s.values.ravel(order="F"))

    def test_empty_method(self):
        s_empty = Series(dtype=object)
        assert s_empty.empty

    @pytest.mark.parametrize("dtype", ["int64", object])
    def test_empty_method_full_series(self, dtype):
        full_series = Series(index=[1], dtype=dtype)
        assert not full_series.empty

    @pytest.mark.parametrize("dtype", [None, "Int64"])
    def test_integer_series_size(self, dtype):
        # GH 25580
        s = Series(range(9), dtype=dtype)
        assert s.size == 9

    def test_attrs(self):
        s = Series([0, 1], name="abc")
        assert s.attrs == {}
        s.attrs["version"] = 1
        result = s + 1
        assert result.attrs == {"version": 1}

    @skip_if_no("jinja2")
    def test_inspect_getmembers(self):
        # GH38782
        ser = Series(dtype=object)
        # TODO(2.0): Change to None once is_monotonic deprecation
        # is enforced
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            inspect.getmembers(ser)

    def test_unknown_attribute(self):
        # GH#9680
        tdi = pd.timedelta_range(start=0, periods=10, freq="1s")
        ser = Series(np.random.normal(size=10), index=tdi)
        assert "foo" not in ser.__dict__.keys()
        msg = "'Series' object has no attribute 'foo'"
        with pytest.raises(AttributeError, match=msg):
            ser.foo

    @pytest.mark.parametrize("op", ["year", "day", "second", "weekday"])
    def test_datetime_series_no_datelike_attrs(self, op, datetime_series):
        # GH#7206
        msg = f"'Series' object has no attribute '{op}'"
        with pytest.raises(AttributeError, match=msg):
            getattr(datetime_series, op)

    def test_series_datetimelike_attribute_access(self):
        # attribute access should still work!
        ser = Series({"year": 2000, "month": 1, "day": 10})
        assert ser.year == 2000
        assert ser.month == 1
        assert ser.day == 10

    def test_series_datetimelike_attribute_access_invalid(self):
        ser = Series({"year": 2000, "month": 1, "day": 10})
        msg = "'Series' object has no attribute 'weekday'"
        with pytest.raises(AttributeError, match=msg):
            ser.weekday

    def test_series_iteritems_deprecated(self):
        ser = Series([1])
        with tm.assert_produces_warning(FutureWarning):
            next(ser.iteritems())
Ejemplo n.º 29
0
class TestFloat64Index(Numeric):
    _holder = Float64Index

    @pytest.fixture(
        params=[
            [1.5, 2, 3, 4, 5],
            [0.0, 2.5, 5.0, 7.5, 10.0],
            [5, 4, 3, 2, 1.5],
            [10.0, 7.5, 5.0, 2.5, 0.0],
        ],
        ids=["mixed", "float", "mixed_dec", "float_dec"],
    )
    def index(self, request):
        return Float64Index(request.param)

    @pytest.fixture
    def mixed_index(self):
        return Float64Index([1.5, 2, 3, 4, 5])

    @pytest.fixture
    def float_index(self):
        return Float64Index([0.0, 2.5, 5.0, 7.5, 10.0])

    def create_index(self) -> Float64Index:
        return Float64Index(np.arange(5, dtype="float64"))

    def test_repr_roundtrip(self, index):
        tm.assert_index_equal(eval(repr(index)), index)

    def check_is_index(self, i):
        assert isinstance(i, Index)
        assert not isinstance(i, Float64Index)

    def check_coerce(self, a, b, is_float_index=True):
        assert a.equals(b)
        tm.assert_index_equal(a, b, exact=False)
        if is_float_index:
            assert isinstance(b, Float64Index)
        else:
            self.check_is_index(b)

    def test_constructor(self):

        # explicit construction
        index = Float64Index([1, 2, 3, 4, 5])
        assert isinstance(index, Float64Index)
        expected = np.array([1, 2, 3, 4, 5], dtype="float64")
        tm.assert_numpy_array_equal(index.values, expected)
        index = Float64Index(np.array([1, 2, 3, 4, 5]))
        assert isinstance(index, Float64Index)
        index = Float64Index([1.0, 2, 3, 4, 5])
        assert isinstance(index, Float64Index)
        index = Float64Index(np.array([1.0, 2, 3, 4, 5]))
        assert isinstance(index, Float64Index)
        assert index.dtype == float

        index = Float64Index(np.array([1.0, 2, 3, 4, 5]), dtype=np.float32)
        assert isinstance(index, Float64Index)
        assert index.dtype == np.float64

        index = Float64Index(np.array([1, 2, 3, 4, 5]), dtype=np.float32)
        assert isinstance(index, Float64Index)
        assert index.dtype == np.float64

        # nan handling
        result = Float64Index([np.nan, np.nan])
        assert pd.isna(result.values).all()
        result = Float64Index(np.array([np.nan]))
        assert pd.isna(result.values).all()
        result = Index(np.array([np.nan]))
        assert pd.isna(result.values).all()

    @pytest.mark.parametrize(
        "index, dtype",
        [
            (pd.Int64Index, "float64"),
            (pd.UInt64Index, "categorical"),
            (pd.Float64Index, "datetime64"),
            (pd.RangeIndex, "float64"),
        ],
    )
    def test_invalid_dtype(self, index, dtype):
        # GH 29539
        with pytest.raises(
            ValueError,
            match=rf"Incorrect `dtype` passed: expected \w+(?: \w+)?, received {dtype}",
        ):
            index([1, 2, 3], dtype=dtype)

    def test_constructor_invalid(self):

        # invalid
        msg = (
            r"Float64Index\(\.\.\.\) must be called with a collection of "
            r"some kind, 0\.0 was passed"
        )
        with pytest.raises(TypeError, match=msg):
            Float64Index(0.0)
        msg = (
            "String dtype not supported, "
            "you may need to explicitly cast to a numeric type"
        )
        with pytest.raises(TypeError, match=msg):
            Float64Index(["a", "b", 0.0])
        msg = r"float\(\) argument must be a string or a number, not 'Timestamp'"
        with pytest.raises(TypeError, match=msg):
            Float64Index([Timestamp("20130101")])

    def test_constructor_coerce(self, mixed_index, float_index):

        self.check_coerce(mixed_index, Index([1.5, 2, 3, 4, 5]))
        self.check_coerce(float_index, Index(np.arange(5) * 2.5))
        self.check_coerce(
            float_index, Index(np.array(np.arange(5) * 2.5, dtype=object))
        )

    def test_constructor_explicit(self, mixed_index, float_index):

        # these don't auto convert
        self.check_coerce(
            float_index, Index((np.arange(5) * 2.5), dtype=object), is_float_index=False
        )
        self.check_coerce(
            mixed_index, Index([1.5, 2, 3, 4, 5], dtype=object), is_float_index=False
        )

    def test_type_coercion_fail(self, any_int_dtype):
        # see gh-15832
        msg = "Trying to coerce float values to integers"
        with pytest.raises(ValueError, match=msg):
            Index([1, 2, 3.5], dtype=any_int_dtype)

    def test_type_coercion_valid(self, float_dtype):
        # There is no Float32Index, so we always
        # generate Float64Index.
        i = Index([1, 2, 3.5], dtype=float_dtype)
        tm.assert_index_equal(i, Index([1, 2, 3.5]))

    def test_equals_numeric(self):

        i = Float64Index([1.0, 2.0])
        assert i.equals(i)
        assert i.identical(i)

        i2 = Float64Index([1.0, 2.0])
        assert i.equals(i2)

        i = Float64Index([1.0, np.nan])
        assert i.equals(i)
        assert i.identical(i)

        i2 = Float64Index([1.0, np.nan])
        assert i.equals(i2)

    @pytest.mark.parametrize(
        "other",
        (
            Int64Index([1, 2]),
            Index([1.0, 2.0], dtype=object),
            Index([1, 2], dtype=object),
        ),
    )
    def test_equals_numeric_other_index_type(self, other):
        i = Float64Index([1.0, 2.0])
        assert i.equals(other)
        assert other.equals(i)

    @pytest.mark.parametrize(
        "vals",
        [
            pd.date_range("2016-01-01", periods=3),
            pd.timedelta_range("1 Day", periods=3),
        ],
    )
    def test_lookups_datetimelike_values(self, vals):
        # If we have datetime64 or timedelta64 values, make sure they are
        #  wrapped correctly GH#31163
        ser = pd.Series(vals, index=range(3, 6))
        ser.index = ser.index.astype("float64")

        expected = vals[1]

        with tm.assert_produces_warning(FutureWarning):
            result = ser.index.get_value(ser, 4.0)
        assert isinstance(result, type(expected)) and result == expected
        with tm.assert_produces_warning(FutureWarning):
            result = ser.index.get_value(ser, 4)
        assert isinstance(result, type(expected)) and result == expected

        result = ser[4.0]
        assert isinstance(result, type(expected)) and result == expected
        result = ser[4]
        assert isinstance(result, type(expected)) and result == expected

        result = ser.loc[4.0]
        assert isinstance(result, type(expected)) and result == expected
        result = ser.loc[4]
        assert isinstance(result, type(expected)) and result == expected

        result = ser.at[4.0]
        assert isinstance(result, type(expected)) and result == expected
        # GH#31329 .at[4] should cast to 4.0, matching .loc behavior
        result = ser.at[4]
        assert isinstance(result, type(expected)) and result == expected

        result = ser.iloc[1]
        assert isinstance(result, type(expected)) and result == expected

        result = ser.iat[1]
        assert isinstance(result, type(expected)) and result == expected

    def test_doesnt_contain_all_the_things(self):
        i = Float64Index([np.nan])
        assert not i.isin([0]).item()
        assert not i.isin([1]).item()
        assert i.isin([np.nan]).item()

    def test_nan_multiple_containment(self):
        i = Float64Index([1.0, np.nan])
        tm.assert_numpy_array_equal(i.isin([1.0]), np.array([True, False]))
        tm.assert_numpy_array_equal(i.isin([2.0, np.pi]), np.array([False, False]))
        tm.assert_numpy_array_equal(i.isin([np.nan]), np.array([False, True]))
        tm.assert_numpy_array_equal(i.isin([1.0, np.nan]), np.array([True, True]))
        i = Float64Index([1.0, 2.0])
        tm.assert_numpy_array_equal(i.isin([np.nan]), np.array([False, False]))

    def test_fillna_float64(self):
        # GH 11343
        idx = Index([1.0, np.nan, 3.0], dtype=float, name="x")
        # can't downcast
        exp = Index([1.0, 0.1, 3.0], name="x")
        tm.assert_index_equal(idx.fillna(0.1), exp)

        # downcast
        exp = Float64Index([1.0, 2.0, 3.0], name="x")
        tm.assert_index_equal(idx.fillna(2), exp)

        # object
        exp = Index([1.0, "obj", 3.0], name="x")
        tm.assert_index_equal(idx.fillna("obj"), exp)
Ejemplo n.º 30
0
def union_categoricals(to_union, sort_categories=False, ignore_order=False):
    """
    Combine list-like of Categorical-like, unioning categories. All
    categories must have the same dtype.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    to_union : list-like of Categorical, CategoricalIndex,
               or Series with dtype='category'
    sort_categories : boolean, default False
        If true, resulting categories will be lexsorted, otherwise
        they will be ordered as they appear in the data.
    ignore_order : boolean, default False
        If true, the ordered attribute of the Categoricals will be ignored.
        Results in an unordered categorical.

        .. versionadded:: 0.20.0

    Returns
    -------
    result : Categorical

    Raises
    ------
    TypeError
        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
        - sort_categories=True and Categoricals are ordered
    ValueError
        Empty list of categoricals passed

    Notes
    -----

    To learn more about categories, see `link
    <http://pandas.pydata.org/pandas-docs/stable/categorical.html#unioning>`__

    Examples
    --------

    >>> from pandas.api.types import union_categoricals

    If you want to combine categoricals that do not necessarily have
    the same categories, `union_categoricals` will combine a list-like
    of categoricals. The new categories will be the union of the
    categories being combined.

    >>> a = pd.Categorical(["b", "c"])
    >>> b = pd.Categorical(["a", "b"])
    >>> union_categoricals([a, b])
    [b, c, a, b]
    Categories (3, object): [b, c, a]

    By default, the resulting categories will be ordered as they appear
    in the `categories` of the data. If you want the categories to be
    lexsorted, use the `sort_categories=True` argument.

    >>> union_categoricals([a, b], sort_categories=True)
    [b, c, a, b]
    Categories (3, object): [a, b, c]

    `union_categoricals` also works when combining two categoricals
    that have the same categories and order information (e.g. cases
    where you could otherwise use `append`).

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "a"], ordered=True)
    >>> union_categoricals([a, b])
    [a, b, a, b, a]
    Categories (2, object): [a < b]

    Raises `TypeError` because the categories are ordered and not identical.

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> union_categoricals([a, b])
    TypeError: to union ordered Categoricals, all categories must be the same

    New in version 0.20.0

    Ordered categoricals with different categories or orderings can be
    combined by using the `ignore_order=True` argument.

    >>> a = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> b = pd.Categorical(["c", "b", "a"], ordered=True)
    >>> union_categoricals([a, b], ignore_order=True)
    [a, b, c, c, b, a]
    Categories (3, object): [a, b, c]

    `union_categoricals` also works with a `CategoricalIndex`, or `Series`
    containing categorical data, but note that the resulting array will
    always be a plain `Categorical`

    >>> a = pd.Series(["b", "c"], dtype='category')
    >>> b = pd.Series(["a", "b"], dtype='category')
    >>> union_categoricals([a, b])
    [b, c, a, b]
    Categories (3, object): [b, c, a]
    """
    from pandas import Index, Categorical, CategoricalIndex, Series
    from pandas.core.arrays.categorical import _recode_for_categories

    if len(to_union) == 0:
        raise ValueError('No Categoricals to union')

    def _maybe_unwrap(x):
        if isinstance(x, (CategoricalIndex, Series)):
            return x.values
        elif isinstance(x, Categorical):
            return x
        else:
            raise TypeError("all components to combine must be Categorical")

    to_union = [_maybe_unwrap(x) for x in to_union]
    first = to_union[0]

    if not all(is_dtype_equal(other.categories.dtype, first.categories.dtype)
               for other in to_union[1:]):
        raise TypeError("dtype of categories must be the same")

    ordered = False
    if all(first.is_dtype_equal(other) for other in to_union[1:]):
        # identical categories - fastpath
        categories = first.categories
        ordered = first.ordered

        if all(first.categories.equals(other.categories)
               for other in to_union[1:]):
            new_codes = np.concatenate([c.codes for c in to_union])
        else:
            codes = [first.codes] + [_recode_for_categories(other.codes,
                                                            other.categories,
                                                            first.categories)
                                     for other in to_union[1:]]
            new_codes = np.concatenate(codes)

        if sort_categories and not ignore_order and ordered:
            raise TypeError("Cannot use sort_categories=True with "
                            "ordered Categoricals")

        if sort_categories and not categories.is_monotonic_increasing:
            categories = categories.sort_values()
            indexer = categories.get_indexer(first.categories)

            from pandas.core.algorithms import take_1d
            new_codes = take_1d(indexer, new_codes, fill_value=-1)
    elif ignore_order or all(not c.ordered for c in to_union):
        # different categories - union and recode
        cats = first.categories.append([c.categories for c in to_union[1:]])
        categories = Index(cats.unique())
        if sort_categories:
            categories = categories.sort_values()

        new_codes = []
        for c in to_union:
            new_codes.append(_recode_for_categories(c.codes, c.categories,
                                                    categories))
        new_codes = np.concatenate(new_codes)
    else:
        # ordered - to show a proper error message
        if all(c.ordered for c in to_union):
            msg = ("to union ordered Categoricals, "
                   "all categories must be the same")
            raise TypeError(msg)
        else:
            raise TypeError('Categorical.ordered must be the same')

    if ignore_order:
        ordered = False

    return Categorical(new_codes, categories=categories, ordered=ordered,
                       fastpath=True)
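
A small sketch (not part of the function above) of the recode step that the non-fastpath branch performs: each input's codes are re-expressed against the union of all categories:

import pandas as pd
from pandas.api.types import union_categoricals

a = pd.Categorical(["b", "c"])  # codes [0, 1] against categories ['b', 'c']
b = pd.Categorical(["a", "b"])  # codes [0, 1] against categories ['a', 'b']

out = union_categoricals([a, b])
print(list(out.categories))  # ['b', 'c', 'a']
print(out.codes.tolist())    # [0, 1, 2, 0] -- b's codes recoded to the union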
Ejemplo n.º 31
0
    def test_floating_misc(self):

        # related 236
        # scalar/slicing of a float index
        s = Series(np.arange(5), index=np.arange(5) * 2.5, dtype=np.int64)

        # label based slicing
        result1 = s[1.0:3.0]
        result2 = s.loc[1.0:3.0]
        result3 = s.loc[1.0:3.0]
        tm.assert_series_equal(result1, result2)
        tm.assert_series_equal(result1, result3)

        # exact indexing when found
        result1 = s[5.0]
        result2 = s.loc[5.0]
        result3 = s.loc[5.0]
        assert result1 == result2
        assert result1 == result3

        result1 = s[5]
        result2 = s.loc[5]
        result3 = s.loc[5]
        assert result1 == result2
        assert result1 == result3

        assert s[5.0] == s[5]

        # value not found (and no fallback at all)

        # scalar integers
        with pytest.raises(KeyError, match=r"^4\.0$"):
            s.loc[4]
        with pytest.raises(KeyError, match=r"^4\.0$"):
            s.loc[4]
        with pytest.raises(KeyError, match=r"^4\.0$"):
            s[4]

        # fancy floats/integers create the correct entry (as nan)
        # fancy tests
        expected = Series([2, 0], index=Float64Index([5.0, 0.0]))
        for fancy_idx in [[5.0, 0.0], np.array([5.0, 0.0])]:  # float
            tm.assert_series_equal(s[fancy_idx], expected)
            tm.assert_series_equal(s.loc[fancy_idx], expected)
            tm.assert_series_equal(s.loc[fancy_idx], expected)

        expected = Series([2, 0], index=Index([5, 0], dtype="int64"))
        for fancy_idx in [[5, 0], np.array([5, 0])]:  # int
            tm.assert_series_equal(s[fancy_idx], expected)
            tm.assert_series_equal(s.loc[fancy_idx], expected)
            tm.assert_series_equal(s.loc[fancy_idx], expected)

        # all should return the same as we are slicing 'the same'
        result1 = s.loc[2:5]
        result2 = s.loc[2.0:5.0]
        result3 = s.loc[2.0:5]
        result4 = s.loc[2.1:5]
        tm.assert_series_equal(result1, result2)
        tm.assert_series_equal(result1, result3)
        tm.assert_series_equal(result1, result4)

        # previously this did fallback indexing
        result1 = s[2:5]
        result2 = s[2.0:5.0]
        result3 = s[2.0:5]
        result4 = s[2.1:5]
        tm.assert_series_equal(result1, result2)
        tm.assert_series_equal(result1, result3)
        tm.assert_series_equal(result1, result4)

        result1 = s.loc[2:5]
        result2 = s.loc[2.0:5.0]
        result3 = s.loc[2.0:5]
        result4 = s.loc[2.1:5]
        tm.assert_series_equal(result1, result2)
        tm.assert_series_equal(result1, result3)
        tm.assert_series_equal(result1, result4)

        # combined test
        result1 = s.loc[2:5]
        result2 = s.loc[2:5]
        result3 = s[2:5]

        tm.assert_series_equal(result1, result2)
        tm.assert_series_equal(result1, result3)

        # list selection
        result1 = s[[0.0, 5, 10]]
        result2 = s.loc[[0.0, 5, 10]]
        result3 = s.loc[[0.0, 5, 10]]
        result4 = s.iloc[[0, 2, 4]]
        tm.assert_series_equal(result1, result2)
        tm.assert_series_equal(result1, result3)
        tm.assert_series_equal(result1, result4)

        with pytest.raises(KeyError, match="with any missing labels"):
            s[[1.6, 5, 10]]
        with pytest.raises(KeyError, match="with any missing labels"):
            s.loc[[1.6, 5, 10]]

        with pytest.raises(KeyError, match="with any missing labels"):
            s[[0, 1, 2]]
        with pytest.raises(KeyError, match="with any missing labels"):
            s.loc[[0, 1, 2]]

        result1 = s.loc[[2.5, 5]]
        result2 = s.loc[[2.5, 5]]
        tm.assert_series_equal(result1, result2)
        tm.assert_series_equal(result1, Series([1, 2], index=[2.5, 5.0]))

        result1 = s[[2.5]]
        result2 = s.loc[[2.5]]
        result3 = s.loc[[2.5]]
        tm.assert_series_equal(result1, result2)
        tm.assert_series_equal(result1, result3)
        tm.assert_series_equal(result1, Series([1], index=[2.5]))
Ejemplo n.º 33
0
    def test_identical(self):
        index = self.create_index()
        i = Index(index.copy())
        assert i.identical(index)

        same_values_different_type = Index(i, dtype=object)
        assert not i.identical(same_values_different_type)

        i = index.copy(dtype=object)
        i = i.rename("foo")
        same_values = Index(i, dtype=object)
        assert same_values.identical(i)

        assert not i.identical(index)
        assert Index(same_values, name="foo", dtype=object).identical(i)

        assert not index.copy(dtype=object).identical(index.copy(dtype=self._dtype))
Ejemplo n.º 34
0
def test_arg_passthru():
    # make sure that we are passing thru kwargs
    # to our agg functions

    # GH3668
    # GH5724
    df = pd.DataFrame(
        {
            "group": [1, 1, 2],
            "int": [1, 2, 3],
            "float": [4.0, 5.0, 6.0],
            "string": list("abc"),
            "category_string": pd.Series(list("abc")).astype("category"),
            "category_int": [7, 8, 9],
            "datetime": pd.date_range("20130101", periods=3),
            "datetimetz": pd.date_range("20130101", periods=3,
                                        tz="US/Eastern"),
            "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"),
        },
        columns=[
            "group",
            "int",
            "float",
            "string",
            "category_string",
            "category_int",
            "datetime",
            "datetimetz",
            "timedelta",
        ],
    )

    expected_columns_numeric = Index(["int", "float", "category_int"])

    # mean / median
    expected = pd.DataFrame(
        {
            "category_int": [7.5, 9],
            "float": [4.5, 6.0],
            "timedelta": [pd.Timedelta("1.5s"),
                          pd.Timedelta("3s")],
            "int": [1.5, 3],
            "datetime": [
                pd.Timestamp("2013-01-01 12:00:00"),
                pd.Timestamp("2013-01-03 00:00:00"),
            ],
            "datetimetz": [
                pd.Timestamp("2013-01-01 12:00:00", tz="US/Eastern"),
                pd.Timestamp("2013-01-03 00:00:00", tz="US/Eastern"),
            ],
        },
        index=Index([1, 2], name="group"),
        columns=[
            "int", "float", "category_int", "datetime", "datetimetz",
            "timedelta"
        ],
    )

    for attr in ["mean", "median"]:
        f = getattr(df.groupby("group"), attr)
        result = f()
        tm.assert_index_equal(result.columns, expected_columns_numeric)

        result = f(numeric_only=False)
        tm.assert_frame_equal(result.reindex_like(expected), expected)

    # TODO: min, max *should* handle
    # categorical (ordered) dtype
    expected_columns = Index([
        "int",
        "float",
        "string",
        "category_int",
        "datetime",
        "datetimetz",
        "timedelta",
    ])
    for attr in ["min", "max"]:
        f = getattr(df.groupby("group"), attr)
        result = f()
        tm.assert_index_equal(result.columns, expected_columns)

        result = f(numeric_only=False)
        tm.assert_index_equal(result.columns, expected_columns)

    expected_columns = Index([
        "int",
        "float",
        "string",
        "category_string",
        "category_int",
        "datetime",
        "datetimetz",
        "timedelta",
    ])
    for attr in ["first", "last"]:
        f = getattr(df.groupby("group"), attr)
        result = f()
        tm.assert_index_equal(result.columns, expected_columns)

        result = f(numeric_only=False)
        tm.assert_index_equal(result.columns, expected_columns)

    expected_columns = Index(
        ["int", "float", "string", "category_int", "timedelta"])
    for attr in ["sum"]:
        f = getattr(df.groupby("group"), attr)
        result = f()
        tm.assert_index_equal(result.columns, expected_columns_numeric)

        result = f(numeric_only=False)
        tm.assert_index_equal(result.columns, expected_columns)

    expected_columns = Index(["int", "float", "category_int"])
    for attr in ["prod", "cumprod"]:
        f = getattr(df.groupby("group"), attr)
        result = f()
        tm.assert_index_equal(result.columns, expected_columns_numeric)

        result = f(numeric_only=False)
        tm.assert_index_equal(result.columns, expected_columns)

    # like min, max, but don't include strings
    expected_columns = Index([
        "int", "float", "category_int", "datetime", "datetimetz", "timedelta"
    ])
    for attr in ["cummin", "cummax"]:
        f = getattr(df.groupby("group"), attr)
        result = f()
        # GH 15561: numeric_only=False set by default like min/max
        tm.assert_index_equal(result.columns, expected_columns)

        result = f(numeric_only=False)
        tm.assert_index_equal(result.columns, expected_columns)

    expected_columns = Index(["int", "float", "category_int", "timedelta"])
    for attr in ["cumsum"]:
        f = getattr(df.groupby("group"), attr)
        result = f()
        tm.assert_index_equal(result.columns, expected_columns_numeric)

        result = f(numeric_only=False)
        tm.assert_index_equal(result.columns, expected_columns)
Ejemplo n.º 36
0
    def test_categorical_delegations(self):

        # invalid accessor
        pytest.raises(AttributeError, lambda: Series([1, 2, 3]).cat)
        tm.assert_raises_regex(
            AttributeError,
            r"Can only use .cat accessor with a 'category' dtype",
            lambda: Series([1, 2, 3]).cat)
        pytest.raises(AttributeError, lambda: Series(['a', 'b', 'c']).cat)
        pytest.raises(AttributeError, lambda: Series(np.arange(5.)).cat)
        pytest.raises(AttributeError,
                      lambda: Series([Timestamp('20130101')]).cat)

        # Series should delegate calls to '.categories', '.codes', '.ordered'
        # and the methods '.set_categories()' and '.remove_unused_categories()'
        # to the categorical
        s = Series(Categorical(["a", "b", "c", "a"], ordered=True))
        exp_categories = Index(["a", "b", "c"])
        tm.assert_index_equal(s.cat.categories, exp_categories)
        s.cat.categories = [1, 2, 3]
        exp_categories = Index([1, 2, 3])
        tm.assert_index_equal(s.cat.categories, exp_categories)

        exp_codes = Series([0, 1, 2, 0], dtype='int8')
        tm.assert_series_equal(s.cat.codes, exp_codes)

        assert s.cat.ordered
        s = s.cat.as_unordered()
        assert not s.cat.ordered
        s.cat.as_ordered(inplace=True)
        assert s.cat.ordered

        # reorder
        s = Series(Categorical(["a", "b", "c", "a"], ordered=True))
        exp_categories = Index(["c", "b", "a"])
        exp_values = np.array(["a", "b", "c", "a"], dtype=np.object_)
        s = s.cat.set_categories(["c", "b", "a"])
        tm.assert_index_equal(s.cat.categories, exp_categories)
        tm.assert_numpy_array_equal(s.values.__array__(), exp_values)
        tm.assert_numpy_array_equal(s.__array__(), exp_values)

        # remove unused categories
        s = Series(
            Categorical(["a", "b", "b", "a"], categories=["a", "b", "c"]))
        exp_categories = Index(["a", "b"])
        exp_values = np.array(["a", "b", "b", "a"], dtype=np.object_)
        s = s.cat.remove_unused_categories()
        tm.assert_index_equal(s.cat.categories, exp_categories)
        tm.assert_numpy_array_equal(s.values.__array__(), exp_values)
        tm.assert_numpy_array_equal(s.__array__(), exp_values)

        # This method is likely to be confused, so test that it raises an error
        # on wrong inputs:
        def f():
            s.set_categories([4, 3, 2, 1])

        pytest.raises(Exception, f)
        # right: s.cat.set_categories([4,3,2,1])

        # GH18862 (let Series.cat.rename_categories take callables)
        s = Series(Categorical(["a", "b", "c", "a"], ordered=True))
        result = s.cat.rename_categories(lambda x: x.upper())
        expected = Series(
            Categorical(["A", "B", "C", "A"],
                        categories=["A", "B", "C"],
                        ordered=True))
        tm.assert_series_equal(result, expected)
Ejemplo n.º 37
0
    def test_intersection(self, sort):
        # intersect with Int64Index
        index = RangeIndex(start=0, stop=20, step=2)
        other = Index(np.arange(1, 6))
        result = index.intersection(other, sort=sort)
        expected = Index(np.sort(np.intersect1d(index.values, other.values)))
        tm.assert_index_equal(result, expected)

        result = other.intersection(index, sort=sort)
        expected = Index(
            np.sort(np.asarray(np.intersect1d(index.values, other.values)))
        )
        tm.assert_index_equal(result, expected)

        # intersect with increasing RangeIndex
        other = RangeIndex(1, 6)
        result = index.intersection(other, sort=sort)
        expected = Index(np.sort(np.intersect1d(index.values, other.values)))
        tm.assert_index_equal(result, expected)

        # intersect with decreasing RangeIndex
        other = RangeIndex(5, 0, -1)
        result = index.intersection(other, sort=sort)
        expected = Index(np.sort(np.intersect1d(index.values, other.values)))
        tm.assert_index_equal(result, expected)

        # reversed (GH 17296)
        result = other.intersection(index, sort=sort)
        tm.assert_index_equal(result, expected)

        # GH 17296: intersect two decreasing RangeIndexes
        first = RangeIndex(10, -2, -2)
        other = RangeIndex(5, -4, -1)
        expected = first.astype(int).intersection(other.astype(int), sort=sort)
        result = first.intersection(other, sort=sort).astype(int)
        tm.assert_index_equal(result, expected)

        # reversed
        result = other.intersection(first, sort=sort).astype(int)
        tm.assert_index_equal(result, expected)

        index = RangeIndex(5, name="foo")

        # intersect of non-overlapping indices
        other = RangeIndex(5, 10, 1, name="foo")
        result = index.intersection(other, sort=sort)
        expected = RangeIndex(0, 0, 1, name="foo")
        tm.assert_index_equal(result, expected)

        other = RangeIndex(-1, -5, -1)
        result = index.intersection(other, sort=sort)
        expected = RangeIndex(0, 0, 1)
        tm.assert_index_equal(result, expected)

        # intersection of empty indices
        other = RangeIndex(0, 0, 1)
        result = index.intersection(other, sort=sort)
        expected = RangeIndex(0, 0, 1)
        tm.assert_index_equal(result, expected)

        result = other.intersection(index, sort=sort)
        tm.assert_index_equal(result, expected)
Ejemplo n.º 38
0
    def test_strftime(self):
        # GH 10086
        s = Series(date_range("20130101", periods=5))
        result = s.dt.strftime("%Y/%m/%d")
        expected = Series(
            ["2013/01/01", "2013/01/02", "2013/01/03", "2013/01/04", "2013/01/05"]
        )
        tm.assert_series_equal(result, expected)

        s = Series(date_range("2015-02-03 11:22:33.4567", periods=5))
        result = s.dt.strftime("%Y/%m/%d %H-%M-%S")
        expected = Series(
            [
                "2015/02/03 11-22-33",
                "2015/02/04 11-22-33",
                "2015/02/05 11-22-33",
                "2015/02/06 11-22-33",
                "2015/02/07 11-22-33",
            ]
        )
        tm.assert_series_equal(result, expected)

        s = Series(period_range("20130101", periods=5))
        result = s.dt.strftime("%Y/%m/%d")
        expected = Series(
            ["2013/01/01", "2013/01/02", "2013/01/03", "2013/01/04", "2013/01/05"]
        )
        tm.assert_series_equal(result, expected)

        s = Series(period_range("2015-02-03 11:22:33.4567", periods=5, freq="s"))
        result = s.dt.strftime("%Y/%m/%d %H-%M-%S")
        expected = Series(
            [
                "2015/02/03 11-22-33",
                "2015/02/03 11-22-34",
                "2015/02/03 11-22-35",
                "2015/02/03 11-22-36",
                "2015/02/03 11-22-37",
            ]
        )
        tm.assert_series_equal(result, expected)

        s = Series(date_range("20130101", periods=5))
        s.iloc[0] = pd.NaT
        result = s.dt.strftime("%Y/%m/%d")
        expected = Series(
            [np.nan, "2013/01/02", "2013/01/03", "2013/01/04", "2013/01/05"]
        )
        tm.assert_series_equal(result, expected)

        datetime_index = date_range("20150301", periods=5)
        result = datetime_index.strftime("%Y/%m/%d")

        expected = Index(
            ["2015/03/01", "2015/03/02", "2015/03/03", "2015/03/04", "2015/03/05"],
            dtype=np.object_,
        )
        # dtype may be S10 or U10 depending on python version
        tm.assert_index_equal(result, expected)

        period_index = period_range("20150301", periods=5)
        result = period_index.strftime("%Y/%m/%d")
        expected = Index(
            ["2015/03/01", "2015/03/02", "2015/03/03", "2015/03/04", "2015/03/05"],
            dtype="=U10",
        )
        tm.assert_index_equal(result, expected)

        s = Series([datetime(2013, 1, 1, 2, 32, 59), datetime(2013, 1, 2, 14, 32, 1)])
        result = s.dt.strftime("%Y-%m-%d %H:%M:%S")
        expected = Series(["2013-01-01 02:32:59", "2013-01-02 14:32:01"])
        tm.assert_series_equal(result, expected)

        s = Series(period_range("20130101", periods=4, freq="H"))
        result = s.dt.strftime("%Y/%m/%d %H:%M:%S")
        expected = Series(
            [
                "2013/01/01 00:00:00",
                "2013/01/01 01:00:00",
                "2013/01/01 02:00:00",
                "2013/01/01 03:00:00",
            ]
        )
        tm.assert_series_equal(result, expected)

        s = Series(period_range("20130101", periods=4, freq="L"))
        result = s.dt.strftime("%Y/%m/%d %H:%M:%S.%l")
        expected = Series(
            [
                "2013/01/01 00:00:00.000",
                "2013/01/01 00:00:00.001",
                "2013/01/01 00:00:00.002",
                "2013/01/01 00:00:00.003",
            ]
        )
        tm.assert_series_equal(result, expected)
Ejemplo n.º 39
0
class Grouping(object):
    def __init__(self, index, names=None):
        """
        index : index-like
            Can be a pandas MultiIndex or Index or array-like. If array-like
            and hierarchical (more than one grouping variable),
            groups are expected to be in each row. E.g., [('red', 1),
            ('red', 2), ('green', 1), ('green', 2)]
        names : list or str, optional
            The names to use for the groups. Should be a str if only
            one grouping variable is used.

        Notes
        -----
        If index is already a pandas Index then there is no copy.
        """
        if isinstance(index, (Index, MultiIndex)):
            if names is not None:
                if hasattr(index, 'set_names'):  # newer pandas
                    index.set_names(names, inplace=True)
                else:
                    index.names = names
            self.index = index
        else:  # array_like
            if _is_hierarchical(index):
                self.index = _make_hierarchical_index(index, names)
            else:
                self.index = Index(index, name=names)
            if names is None:
                names = _make_generic_names(self.index)
                if hasattr(self.index, 'set_names'):
                    self.index.set_names(names, inplace=True)
                else:
                    self.index.names = names

        self.nobs = len(self.index)
        self.nlevels = len(self.index.names)
        self.slices = None

    @property
    def index_shape(self):
        if hasattr(self.index, 'levshape'):
            return self.index.levshape
        else:
            return self.index.shape

    @property
    def levels(self):
        if hasattr(self.index, 'levels'):
            return self.index.levels
        else:
            return pd.Categorical(self.index).levels

    @property
    def labels(self):
        # this was index_int, but that's not a very good name...
        codes = getattr(self.index, 'codes', None)
        if codes is None:
            if hasattr(self.index, 'labels'):
                codes = self.index.labels
            else:
                codes = pd.Categorical(self.index).codes[None]
        return codes

    @property
    def group_names(self):
        return self.index.names

    def reindex(self, index=None, names=None):
        """
        Resets the index in-place.
        """
        # NOTE: this isn't of much use if the rest of the data doesn't change
        # This needs to reset cache
        if names is None:
            names = self.group_names
        self.__init__(index, names)

    def get_slices(self, level=0):
        """
        Sets the slices attribute to be a list of indices of the sorted
        groups for the first index level. I.e., self.slices[0] is the
        index where each observation is in the first (sorted) group.
        """
        # TODO: refactor this
        groups = self.index.get_level_values(level).unique()
        groups = np.array(groups)
        groups.sort()
        if isinstance(self.index, MultiIndex):
            self.slices = [
                self.index.get_loc_level(x, level=level)[0] for x in groups
            ]
        else:
            self.slices = [self.index.get_loc(x) for x in groups]

    def count_categories(self, level=0):
        """
        Sets the attribute counts to equal the bincount of the (integer-valued)
        labels.
        """
        # TODO: refactor this not to set an attribute. Why would we do this?
        self.counts = np.bincount(self.labels[level])

    def check_index(self, is_sorted=True, unique=True, index=None):
        """Sanity checks"""
        if index is None:  # ``not index`` is ambiguous for a pandas Index
            index = self.index
        if is_sorted:
            test = pd.DataFrame(list(range(len(index))), index=index)
            test_sorted = test.sort_index()  # DataFrame.sort was removed
            if not test.index.equals(test_sorted.index):
                raise ValueError('Data is not sorted')
        if unique:
            if len(index) != len(index.unique()):
                raise ValueError('Duplicate index entries')

    def sort(self, data, index=None):
        """Applies a (potentially hierarchical) sort operation on a numpy array
        or pandas series/dataframe based on the grouping index or a
        user-supplied index.  Returns an object of the same type as the
        original data as well as the matching (sorted) Pandas index.
        """

        if index is None:
            index = self.index
        if data_util._is_using_ndarray_type(data, None):
            if data.ndim == 1:
                out = pd.Series(data, index=index, copy=True)
                out = out.sort_index()
            else:
                out = pd.DataFrame(data, index=index)
                out = out.sort_index(inplace=False)  # copies
            return np.array(out), out.index
        elif data_util._is_using_pandas(data, None):
            out = data
            out = out.reindex(index)  # copies?
            out = out.sort_index()
            return out, out.index
        else:
            msg = 'data must be a Numpy array or a Pandas Series/DataFrame'
            raise ValueError(msg)
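
    # Sketch: sorting a raw array by the grouping index returns both the
    # sorted values and the sorted index (``x`` is hypothetical data):
    #
    # >>> x = np.arange(grp.nobs)
    # >>> x_sorted, sorted_index = grp.sort(x)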

    def transform_dataframe(self, dataframe, function, level=0, **kwargs):
        """Apply function to each column, by group
        Assumes that the dataframe already has a proper index"""
        if dataframe.shape[0] != self.nobs:
            raise Exception('dataframe does not have the same shape as index')
        out = dataframe.groupby(level=level).apply(function, **kwargs)
        if 1 in out.shape:
            return np.ravel(out)
        else:
            return np.array(out)

    def transform_array(self, array, function, level=0, **kwargs):
        """Apply function to each column, by group
        """
        if array.shape[0] != self.nobs:
            raise ValueError('array does not have the same shape as index')
        dataframe = pd.DataFrame(array, index=self.index)
        return self.transform_dataframe(dataframe,
                                        function,
                                        level=level,
                                        **kwargs)
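
    # Sketch: group-wise demeaning of a 2d array ``x2d`` via transform_array;
    # both ``x2d`` and the demean lambda are illustrative, not part of the
    # module:
    #
    # >>> demean = lambda df: df - df.mean()
    # >>> out = grp.transform_array(x2d, demean, level=0)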

    def transform_slices(self, array, function, level=0, **kwargs):
        """Apply function to each group. Similar to transform_array but does
        not coerce array to a DataFrame and back and only works on a 1D or 2D
        numpy array. function is called function(group, group_idx, **kwargs).
        """
        array = np.asarray(array)
        if array.shape[0] != self.nobs:
            raise ValueError('array does not have the same shape as index')
        # always reset because level is given. need to refactor this.
        self.get_slices(level=level)
        processed = []
        for s in self.slices:
            if array.ndim == 2:
                subset = array[s, :]
            elif array.ndim == 1:
                subset = array[s]
            else:
                raise ValueError('array must be 1d or 2d')
            processed.append(function(subset, s, **kwargs))
        processed = np.array(processed)
        return processed.reshape(-1, processed.shape[-1])
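
    # Sketch: with transform_slices the function receives each raw group
    # block plus its indexer; e.g. per-group column means (``x2d`` is
    # illustrative):
    #
    # >>> group_means = grp.transform_slices(x2d,
    # ...                                    lambda subset, idx: subset.mean(0))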

    # TODO: this isn't general; it needs to be a PanelGrouping object
    def dummies_time(self):
        self.dummy_sparse(level=1)
        return self._dummies

    def dummies_groups(self, level=0):
        self.dummy_sparse(level=level)
        return self._dummies

    def dummy_sparse(self, level=0):
        """create a sparse indicator from a group array with integer labels

        Parameters
        ----------
        groups: ndarray, int, 1d (nobs,) an array of group indicators for each
            observation. Group levels are assumed to be defined as consecutive
            integers, i.e. range(n_groups) where n_groups is the number of
            group levels. A group level with no observations for it will still
            produce a column of zeros.

        Returns
        -------
        indi : ndarray, int8, 2d (nobs, n_groups)
            an indicator array with one row per observation, that has 1 in the
            column of the group level for that observation

        Examples
        --------

        >>> g = np.array([0, 0, 2, 1, 1, 2, 0])
        >>> indi = dummy_sparse(g)
        >>> indi
        <7x3 sparse matrix of type '<type 'numpy.int8'>'
            with 7 stored elements in Compressed Sparse Row format>
        >>> indi.todense()
        matrix([[1, 0, 0],
                [1, 0, 0],
                [0, 0, 1],
                [0, 1, 0],
                [0, 1, 0],
                [0, 0, 1],
                [1, 0, 0]], dtype=int8)


        Current behavior with missing groups:

        >>> g = np.array([0, 0, 2, 0, 2, 0])
        >>> indi = dummy_sparse(g)
        >>> indi.todense()
        matrix([[1, 0, 0],
                [1, 0, 0],
                [0, 0, 1],
                [1, 0, 0],
                [0, 0, 1],
                [1, 0, 0]], dtype=int8)
        """
        indi = dummy_sparse(self.labels[level])
        self._dummies = indi
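
# The ``dummy_sparse`` call above refers to a module-level helper that is not
# shown in this example. A minimal sketch consistent with the docstring,
# assuming consecutive integer labels and that scipy is available:

import numpy as np
from scipy import sparse


def dummy_sparse(groups):
    """Return an (nobs, n_groups) int8 CSR indicator matrix from labels."""
    groups = np.asarray(groups)
    data = np.ones(len(groups), dtype=np.int8)
    # one entry per row: row i holds a single 1 in column groups[i]
    indptr = np.arange(len(groups) + 1)
    return sparse.csr_matrix((data, groups, indptr))


# Usage (matches the doctest in the docstring above):
# >>> dummy_sparse(np.array([0, 0, 2, 1, 1, 2, 0])).todense()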