def test_rank_tie_methods_on_infs_nans(self, method, na_option, ascending):
        """
        Rank runs of -inf, missing and +inf values for one tie method.

        For every (dtype, missing marker, +inf, -inf) combination the input
        holds three copies of each marker.  Expected ranks are assembled
        from a per-method table according to ``na_option`` and flipped when
        ``ascending`` is false.  The ('object', 'first') pairing is not
        checked.
        """
        run = 3
        skipped = {('object', 'first')}
        # (dtype, missing marker, positive infinity, negative infinity)
        cases = [('object', None, Infinity(), NegInfinity()),
                 ('float64', np.nan, np.inf, -np.inf)]
        # Ranks handed to the 1st, 2nd and 3rd run of three tied values.
        tie_ranks = {
            'average': ([2, 2, 2], [5, 5, 5], [8, 8, 8]),
            'min': ([1, 1, 1], [4, 4, 4], [7, 7, 7]),
            'max': ([3, 3, 3], [6, 6, 6], [9, 9, 9]),
            'first': ([1, 2, 3], [4, 5, 6], [7, 8, 9]),
            'dense': ([1, 1, 1], [2, 2, 2], [3, 3, 3])
        }

        def _verify(ser):
            first, second, third = tie_ranks[method]
            # Input layout is (-inf run, missing run, +inf run).
            if na_option == 'top':
                runs = [second, first, third]
            elif na_option == 'bottom':
                runs = [first, third, second]
            else:
                runs = [first, [np.nan] * run, second]
            if not ascending:
                runs = runs[::-1]
            want = Series(list(chain.from_iterable(runs)), dtype='float64')
            got = ser.rank(method=method, na_option=na_option,
                           ascending=ascending)
            tm.assert_series_equal(got, want)

        for dtype, missing, pos, neg in cases:
            ser = Series([neg] * run + [missing] * run + [pos] * run,
                         dtype=dtype)
            if (dtype, method) not in skipped:
                _verify(ser)
Exemple #2
0
    def test_rank_tie_methods_on_infs_nans(self):
        """
        Rank runs of -inf, missing and +inf values for all tie methods.

        Every combination of tie method and ``na_option`` is checked for an
        object and a float64 series, except ('object', 'first').
        """
        run = 3
        skipped = {('object', 'first')}
        # (dtype, missing marker, positive infinity, negative infinity)
        cases = [('object', None, Infinity(), NegInfinity()),
                 ('float64', np.nan, np.inf, -np.inf)]
        # Ranks handed to the 1st, 2nd and 3rd run of three tied values.
        tie_ranks = {
            'average': ([2, 2, 2], [5, 5, 5], [8, 8, 8]),
            'min': ([1, 1, 1], [4, 4, 4], [7, 7, 7]),
            'max': ([3, 3, 3], [6, 6, 6], [9, 9, 9]),
            'first': ([1, 2, 3], [4, 5, 6], [7, 8, 9]),
            'dense': ([1, 1, 1], [2, 2, 2], [3, 3, 3])
        }

        for dtype, missing, pos, neg in cases:
            ser = Series([neg] * run + [missing] * run + [pos] * run,
                         dtype=dtype)
            for method, na_opt in product(tie_ranks,
                                          ('top', 'bottom', 'keep')):
                if (dtype, method) in skipped:
                    continue
                first, second, third = tie_ranks[method]
                # Input layout is (-inf run, missing run, +inf run).
                if na_opt == 'top':
                    want = second + first + third
                elif na_opt == 'bottom':
                    want = first + third + second
                else:
                    want = first + [np.nan] * run + second
                got = ser.rank(method=method, na_option=na_opt)
                tm.assert_series_equal(got, Series(want, dtype='float64'))
class TestSeriesRank(TestData):
    # Shared input for the tie-method tests: small values with repeats
    # (1, 2 and 3 each appear twice) and two missing entries.
    s = Series([1, 3, 4, 2, nan, 2, 1, 5, nan, 3])

    # Expected ranks of ``s`` keyed by the ``method`` argument of ``rank``;
    # missing inputs stay missing in every expected array.
    results = {
        'average': np.array([1.5, 5.5, 7.0, 3.5, nan,
                             3.5, 1.5, 8.0, nan, 5.5]),
        'min': np.array([1, 5, 7, 3, nan, 3, 1, 8, nan, 5]),
        'max': np.array([2, 6, 7, 4, nan, 4, 2, 8, nan, 6]),
        'first': np.array([1, 5, 7, 3, nan, 4, 2, 8, nan, 6]),
        'dense': np.array([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]),
    }

    def test_rank(self):
        """
        Check default ``Series.rank`` against scipy and several edge cases.

        Covers agreement between float and object dtype, scipy's
        ``rankdata`` as the reference, integer input, ``pct=True`` with ties
        and missing values, very small floats, timedeltas containing ``NaT``
        (GH 5968) and a shuffled array of distinct values.

        Skipped when ``scipy.stats`` cannot be imported.
        """
        # importorskip only understands module names.  ``rankdata`` is a
        # function inside ``scipy.stats`` (and ``scipy.stats.special`` does
        # not exist), so asking for those dotted names never imported and
        # silently skipped the whole test.  Import the module, then take the
        # attribute.
        rankdata = pytest.importorskip('scipy.stats').rankdata

        self.ts[::2] = np.nan
        self.ts[:10][::3] = 4.

        ranks = self.ts.rank()
        oranks = self.ts.astype('O').rank()

        assert_series_equal(ranks, oranks)

        # scipy has no notion of missing values: rank NaN as +inf, then
        # blank those positions out again in the expectation.
        mask = np.isnan(self.ts)
        filled = self.ts.fillna(np.inf)

        # rankdata returns a ndarray
        exp = Series(rankdata(filled), index=filled.index, name='ts')
        exp[mask] = np.nan

        tm.assert_series_equal(ranks, exp)

        # integer input ranks like its float counterpart
        iseries = Series(np.arange(5).repeat(2))

        iranks = iseries.rank()
        exp = iseries.astype(float).rank()
        assert_series_equal(iranks, exp)

        # pct=True on distinct values 1..5
        iseries = Series(np.arange(5)) + 1.0
        exp = iseries / 5.0
        iranks = iseries.rank(pct=True)

        assert_series_equal(iranks, exp)

        # pct=True when all 100 values tie
        iseries = Series(np.repeat(1, 100))
        exp = Series(np.repeat(0.505, 100))
        iranks = iseries.rank(pct=True)
        assert_series_equal(iranks, exp)

        # same, with one missing value
        iseries[1] = np.nan
        exp = Series(np.repeat(50.0 / 99.0, 100))
        exp[1] = np.nan
        iranks = iseries.rank(pct=True)
        assert_series_equal(iranks, exp)

        # pct=True, distinct floats with a trailing missing value
        iseries = Series(np.arange(5)) + 1.0
        iseries[4] = np.nan
        exp = iseries / 4.0
        iranks = iseries.rank(pct=True)
        assert_series_equal(iranks, exp)

        # pct=True on an all-missing series stays all-missing
        iseries = Series(np.repeat(np.nan, 100))
        exp = iseries.copy()
        iranks = iseries.rank(pct=True)
        assert_series_equal(iranks, exp)

        # pct=True, series built from integers with a missing value
        iseries = Series(np.arange(5)) + 1
        iseries[4] = np.nan
        exp = iseries / 4.0
        iranks = iseries.rank(pct=True)
        assert_series_equal(iranks, exp)

        # same data on a date index
        rng = date_range('1/1/1990', periods=5)
        iseries = Series(np.arange(5), rng) + 1
        iseries.iloc[4] = np.nan
        exp = iseries / 4.0
        iranks = iseries.rank(pct=True)
        assert_series_equal(iranks, exp)

        # very small magnitudes must still be ordered correctly
        iseries = Series([1e-50, 1e-100, 1e-20, 1e-2, 1e-20 + 1e-30, 1e-1])
        exp = Series([2, 1, 3, 5, 4, 6.0])
        iranks = iseries.rank()
        assert_series_equal(iranks, exp)

        # GH 5968
        iseries = Series(['3 day', '1 day 10m', '-2 day', NaT],
                         dtype='m8[ns]')
        exp = Series([3, 2, 1, np.nan])
        iranks = iseries.rank()
        assert_series_equal(iranks, exp)

        # ``values`` is sorted, so after shuffling with ``random_order`` the
        # rank at each position is its original index + 1.
        values = np.array(
            [-50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, 2, 40
             ], dtype='float64')
        random_order = np.random.permutation(len(values))
        iseries = Series(values[random_order])
        exp = Series(random_order + 1.0, dtype='float64')
        iranks = iseries.rank()
        assert_series_equal(iranks, exp)

    def test_rank_categorical(self):
        """
        Rank categorical series: ordered, unordered, missing values, pct.

        See GH issue #15420 (rank incorrectly ordered ordered categories).
        """
        labels = ['first', 'second', 'third', 'fourth', 'fifth', 'sixth']

        # Ordered categoricals, ascending and descending
        ordered = Series(labels).astype(
            CategoricalDtype(categories=labels, ordered=True))
        assert_series_equal(ordered.rank(),
                            Series([1., 2., 3., 4., 5., 6.]))
        assert_series_equal(ordered.rank(ascending=False),
                            Series([6., 5., 4., 3., 2., 1.]))

        # Unordered categoricals should be ranked as objects
        unordered = Series(labels).astype(
            CategoricalDtype(categories=labels, ordered=False))
        assert_series_equal(unordered.rank(),
                            Series([2., 4., 6., 3., 1., 5.]))

        unordered_ints = Series([1, 2, 3, 4, 5, 6]).astype(
            CategoricalDtype([1, 2, 3, 4, 5, 6], False))
        assert_series_equal(unordered_ints.rank(),
                            Series([1., 2., 3., 4., 5., 6.]))

        # na_option handling, ascending
        na_ser = Series(labels + [np.NaN]).astype(
            CategoricalDtype(labels + ['seventh'], True))
        ascending_cases = [
            ('top', [2., 3., 4., 5., 6., 7., 1.]),
            ('bottom', [1., 2., 3., 4., 5., 6., 7.]),
            ('keep', [1., 2., 3., 4., 5., 6., np.NaN]),
        ]
        for na_option, want in ascending_cases:
            assert_series_equal(na_ser.rank(na_option=na_option),
                                Series(want))

        # na_option handling, descending
        descending_cases = [
            ('top', [7., 6., 5., 4., 3., 2., 1.]),
            ('bottom', [6., 5., 4., 3., 2., 1., 7.]),
            ('keep', [6., 5., 4., 3., 2., 1., np.NaN]),
        ]
        for na_option, want in descending_cases:
            assert_series_equal(
                na_ser.rank(na_option=na_option, ascending=False),
                Series(want))

        # Invalid na_option: a bad string, then a value of the wrong type
        msg = "na_option must be one of 'keep', 'top', or 'bottom'"
        for bad_option in ('bad', True):
            with tm.assert_raises_regex(ValueError, msg):
                na_ser.rank(na_option=bad_option, ascending=False)

        # pct=True
        na_ser = Series(labels[:4] + [np.NaN]).astype(
            CategoricalDtype(labels[:4], True))
        pct_cases = [
            ('top', [0.4, 0.6, 0.8, 1., 0.2]),
            ('bottom', [0.2, 0.4, 0.6, 0.8, 1.]),
            ('keep', [0.25, 0.5, 0.75, 1., np.NaN]),
        ]
        for na_option, want in pct_cases:
            assert_series_equal(na_ser.rank(na_option=na_option, pct=True),
                                Series(want))

    def test_rank_signature(self):
        """
        ``method`` works as a keyword; passed positionally it must raise.

        NOTE(review): presumably the first positional parameter of ``rank``
        is ``axis``, which is why 'average' is rejected - confirm.
        """
        ser = Series([0, 1])
        ser.rank(method='average')
        with pytest.raises(ValueError):
            ser.rank('average')

    @pytest.mark.parametrize('contents,dtype', [
        ([-np.inf, -50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10,
          2, 40, np.inf],
         'float64'),
        ([-np.inf, -50, -1, -1e-20, -1e-25, -1e-45, 0, 1e-40, 1e-20, 1e-10,
          2, 40, np.inf],
         'float32'),
        ([np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max],
         'uint8'),
        pytest.param([np.iinfo(np.int64).min, -100, 0, 1, 9999, 100000,
                      1e10, np.iinfo(np.int64).max],
                     'int64',
                     marks=pytest.mark.xfail(
                         reason="iNaT is equivalent to minimum value of dtype"
                                "int64 pending issue GH#16674",
                         strict=True)),
        ([NegInfinity(), '1', 'A', 'BA', 'Ba', 'C', Infinity()],
         'object')
    ])
    def test_rank_inf(self, contents, dtype):
        """
        Rank sorted ``contents`` (including infinities) after shuffling.

        ``contents`` is given in increasing order, so element ``i`` is
        expected to receive rank ``i + 1``.  For dtypes with a missing-value
        marker, five markers are inserted at random positions first, to
        check that ranking is unaffected when values are interleaved with
        missing entries.
        """
        missing_marker = {
            'float64': np.nan,
            'float32': np.nan,
            'int64': iNaT,
            'object': None
        }
        data = np.array(contents, dtype=dtype)
        want = np.array(range(len(data)), dtype='float64') + 1.0
        if dtype in missing_marker:
            # Missing inputs are expected to rank as NaN.
            spots = np.random.choice(range(len(data)), 5)
            data = np.insert(data, spots, missing_marker[dtype])
            want = np.insert(want, spots, np.nan)
        # Apply one permutation to both the data and the expected ranks.
        shuffle = np.random.permutation(len(data))
        got = Series(data[shuffle]).rank()
        assert_series_equal(got, Series(want[shuffle], dtype='float64'))

    def test_rank_tie_methods(self):
        """
        Check each tie method against the class-level expected ranks.

        Runs on the shared series as-is and cast to object; the
        (object, 'first') pairing is not checked.
        """
        skipped = {(object, 'first')}

        for method, dtype in product(self.results, [None, object]):
            if (dtype, method) in skipped:
                continue
            ser = self.s if dtype is None else self.s.astype(dtype)
            got = ser.rank(method=method)
            tm.assert_series_equal(got, Series(self.results[method]))

    # NOTE(review): nothing in this test references scipy; the skip marker
    # looks unnecessary - confirm before removing.
    @td.skip_if_no_scipy
    @pytest.mark.parametrize('ascending', [True, False])
    @pytest.mark.parametrize('method', ['average', 'min', 'max', 'first',
                                        'dense'])
    @pytest.mark.parametrize('na_option', ['top', 'bottom', 'keep'])
    def test_rank_tie_methods_on_infs_nans(self, method, na_option, ascending):
        """
        Rank runs of -inf, missing and +inf values for one tie method.

        For every (dtype, missing marker, +inf, -inf) combination the input
        holds three copies of each marker.  Expected ranks are assembled
        from a per-method table according to ``na_option`` and flipped when
        ``ascending`` is false.  The ('object', 'first') pairing is not
        checked.
        """
        run = 3
        skipped = {('object', 'first')}
        # (dtype, missing marker, positive infinity, negative infinity)
        cases = [('object', None, Infinity(), NegInfinity()),
                 ('float64', np.nan, np.inf, -np.inf)]
        # Ranks handed to the 1st, 2nd and 3rd run of three tied values.
        tie_ranks = {
            'average': ([2, 2, 2], [5, 5, 5], [8, 8, 8]),
            'min': ([1, 1, 1], [4, 4, 4], [7, 7, 7]),
            'max': ([3, 3, 3], [6, 6, 6], [9, 9, 9]),
            'first': ([1, 2, 3], [4, 5, 6], [7, 8, 9]),
            'dense': ([1, 1, 1], [2, 2, 2], [3, 3, 3])
        }

        def _verify(ser):
            first, second, third = tie_ranks[method]
            # Input layout is (-inf run, missing run, +inf run).
            if na_option == 'top':
                runs = [second, first, third]
            elif na_option == 'bottom':
                runs = [first, third, second]
            else:
                runs = [first, [np.nan] * run, second]
            if not ascending:
                runs = runs[::-1]
            want = Series(list(chain.from_iterable(runs)), dtype='float64')
            got = ser.rank(method=method, na_option=na_option,
                           ascending=ascending)
            tm.assert_series_equal(got, want)

        for dtype, missing, pos, neg in cases:
            ser = Series([neg] * run + [missing] * run + [pos] * run,
                         dtype=dtype)
            if (dtype, method) not in skipped:
                _verify(ser)

    def test_rank_desc_mix_nans_infs(self):
        """Descending rank of a series mixing NaN with both infinities."""
        # GH 19538
        mixed = Series([1, np.nan, np.inf, -np.inf, 25])
        want = Series([3, np.nan, 1, 4, 2], dtype='float64')
        tm.assert_series_equal(mixed.rank(ascending=False), want)

    def test_rank_methods_series(self):
        """
        Compare every tie method with ``scipy.stats.rankdata``.

        Random data with deliberate duplicates is ranked at three different
        magnitudes (as-is, shifted by 1e6, scaled by 1e-6).  pandas' 'first'
        corresponds to scipy's 'ordinal'.

        Skipped when scipy cannot be imported.
        """
        # importorskip only understands module names.  ``rankdata`` is a
        # function inside ``scipy.stats`` (and ``scipy.stats.special`` does
        # not exist), so the previous dotted names never imported and the
        # test was always skipped.
        scipy = pytest.importorskip('scipy')
        rankdata = pytest.importorskip('scipy.stats').rankdata

        xs = np.random.randn(9)
        xs = np.concatenate([xs[i:] for i in range(0, 9, 2)])  # add duplicates
        np.random.shuffle(xs)

        index = [chr(ord('a') + i) for i in range(len(xs))]

        for vals in [xs, xs + 1e6, xs * 1e-6]:
            ts = Series(vals, index=index)

            for m in ['average', 'min', 'max', 'first', 'dense']:
                result = ts.rank(method=m)
                sprank = rankdata(vals, m if m != 'first' else 'ordinal')
                expected = Series(sprank, index=index)

                # scipy from 0.17.0 on gets an explicit cast to float64 so
                # the comparison with the pandas result is like-for-like
                if LooseVersion(scipy.__version__) >= LooseVersion('0.17.0'):
                    expected = expected.astype('float64')
                tm.assert_series_equal(result, expected)

    def test_rank_dense_method(self):
        """Check ``method='dense'`` on small inputs for O, f8 and i8."""
        # (input values, expected dense ranks)
        cases = [([1], [1]),
                 ([2], [1]),
                 ([0], [1]),
                 ([2, 2], [1, 1]),
                 ([1, 2, 3], [1, 2, 3]),
                 ([4, 2, 1], [3, 2, 1],),
                 ([1, 1, 5, 5, 3], [1, 1, 3, 3, 2]),
                 ([-5, -4, -3, -2, -1], [1, 2, 3, 4, 5])]

        for (data, ranks), dtype in product(cases, ['O', 'f8', 'i8']):
            got = Series(data).astype(dtype).rank(method='dense')
            # Compare values only: cast the expectation to the result dtype.
            want = Series(ranks).astype(got.dtype)
            assert_series_equal(got, want)

    def test_rank_descending(self):
        """
        Descending rank must equal ascending rank of ``max - value``.

        Checked for object, float and integer dtype, first with the default
        tie method and then with every method in ``self.results``.  The
        ('first', object) pairing is not checked with an explicit method.
        """
        dtypes = ['O', 'f8', 'i8']

        for dtype, method in product(dtypes, self.results):
            if 'i' in dtype:
                # Integers cannot hold NaN, so drop them before casting.
                # The cast was previously missing, which left the series
                # as float and meant integer dtype was never exercised.
                s = self.s.dropna().astype(dtype)
            else:
                s = self.s.astype(dtype)

            res = s.rank(ascending=False)
            expected = (s.max() - s).rank()
            assert_series_equal(res, expected)

            if method == 'first' and dtype == 'O':
                continue

            expected = (s.max() - s).rank(method=method)
            res2 = s.rank(method=method, ascending=False)
            assert_series_equal(res2, expected)

    def test_rank_int(self):
        """Rank the shared series as int64 for every tie method."""
        ints = self.s.dropna().astype('i8')

        for method, ranks in compat.iteritems(self.results):
            got = ints.rank(method=method)
            # Expected ranks with the missing entries removed, relabelled
            # to line up with the result's index.
            want = Series(ranks).dropna()
            want.index = got.index
            assert_series_equal(got, want)

    def test_rank_object_bug(self):
        """Smoke test: rank an all-NaN object series in both directions."""
        # GH 13445
        for ascending in (True, False):
            Series([np.nan] * 32).astype(object).rank(ascending=ascending)

    def test_rank_modify_inplace(self):
        """Calling ``rank`` must leave the series itself unchanged."""
        # GH 18521
        ser = Series([Timestamp('2017-01-05 10:20:27.569000'), NaT])
        snapshot = ser.copy()

        ser.rank()
        assert_series_equal(ser, snapshot)
Exemple #4
0
class TestRank:
    # Shared input: small values with repeats (1, 2 and 3 each appear
    # twice) and two missing entries.
    s = Series([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3])
    # Two identical columns built from ``s``.
    df = DataFrame({"A": s, "B": s})

    # Expected ranks of ``s`` keyed by the ``method`` argument of ``rank``;
    # missing inputs stay missing in every expected array.
    results = {
        "average":
        np.array([1.5, 5.5, 7.0, 3.5, np.nan, 3.5, 1.5, 8.0, np.nan, 5.5]),
        "min":
        np.array([1, 5, 7, 3, np.nan, 3, 1, 8, np.nan, 5]),
        "max":
        np.array([2, 6, 7, 4, np.nan, 4, 2, 8, np.nan, 6]),
        "first":
        np.array([1, 5, 7, 3, np.nan, 4, 2, 8, np.nan, 6]),
        "dense":
        np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3]),
    }

    @pytest.fixture(params=["average", "min", "max", "first", "dense"])
    def method(self, request):
        """
        Parametrized fixture that yields each rank tie method name in turn.
        """
        tie_method = request.param
        return tie_method

    @td.skip_if_no_scipy
    def test_rank(self, float_frame):
        """
        Compare ``DataFrame.rank`` on both axes with scipy's ``rankdata``.

        Also checks that an integer frame ranks like its float counterpart.
        """
        import scipy.stats  # noqa:F401
        from scipy.stats import rankdata

        # Punch NaNs into each column at a different stride.
        for col, step in zip("ABCD", (2, 3, 4, 5)):
            float_frame[col][::step] = np.nan

        by_col = float_frame.rank()
        by_row = float_frame.rank(1)
        missing = np.isnan(float_frame.values)

        # scipy has no notion of missing values: rank NaN as +inf, then
        # blank those cells out again in the expectation.
        filled = float_frame.fillna(np.inf).values

        want_col = np.apply_along_axis(rankdata, 0, filled)
        want_col[missing] = np.nan

        want_row = np.apply_along_axis(rankdata, 1, filled)
        want_row[missing] = np.nan

        tm.assert_almost_equal(by_col.values, want_col)
        tm.assert_almost_equal(by_row.values, want_row)

        # integers
        ints = DataFrame(np.random.randint(0, 5, size=40).reshape((10, 4)))

        got = ints.rank()
        tm.assert_frame_equal(got, ints.astype(float).rank())

        got = ints.rank(1)
        tm.assert_frame_equal(got, ints.astype(float).rank(1))

    def test_rank2(self):
        """
        Exercise ``DataFrame.rank`` on small hand-written frames.

        Covers ``pct=True`` on both axes, string data with and without a
        missing value (``numeric_only=False``), datetime data in both sort
        directions, and floats too close together to be told apart.
        """
        # pct=True along rows
        frame = DataFrame([[1, 3, 2], [1, 2, 3]])
        want = DataFrame([[1.0, 3.0, 2.0], [1, 2, 3]]) / 3.0
        tm.assert_frame_equal(frame.rank(1, pct=True), want)

        # pct=True along columns
        frame = DataFrame([[1, 3, 2], [1, 2, 3]])
        want = frame.rank(0) / 2.0
        tm.assert_frame_equal(frame.rank(0, pct=True), want)

        # strings
        letters = DataFrame([["b", "c", "a"], ["a", "c", "b"]])
        want = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]])
        tm.assert_frame_equal(letters.rank(1, numeric_only=False), want)

        want = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]])
        tm.assert_frame_equal(letters.rank(0, numeric_only=False), want)

        # strings with a missing value
        letters = DataFrame([["b", np.nan, "a"], ["a", "c", "b"]])
        want = DataFrame([[2.0, np.nan, 1.0], [1.0, 3.0, 2.0]])
        tm.assert_frame_equal(letters.rank(1, numeric_only=False), want)

        want = DataFrame([[2.0, np.nan, 1.0], [1.0, 1.0, 2.0]])
        tm.assert_frame_equal(letters.rank(0, numeric_only=False), want)

        # datetimes (the original author noted this did not work without
        # an extensive workaround)
        stamps = DataFrame([
            [datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)],
            [datetime(2000, 1, 2),
             datetime(2000, 1, 3),
             datetime(2000, 1, 1)],
        ])

        want = DataFrame([[2.0, np.nan, 1.0], [2.0, 3.0, 1.0]])
        got = stamps.rank(1, numeric_only=False, ascending=True)
        tm.assert_frame_equal(got, want)

        want = DataFrame([[1.0, np.nan, 2.0], [2.0, 1.0, 3.0]])
        got = stamps.rank(1, numeric_only=False, ascending=False)
        tm.assert_frame_equal(got, want)

        # 1e-20 and 1e-20 + 1e-40 are expected to tie (rank 3.5 each)
        tiny = DataFrame(
            {"a": [1e-20, -5, 1e-20 + 1e-40, 10, 1e60, 1e80, 1e-30]})
        want = DataFrame({"a": [3.5, 1.0, 3.5, 5.0, 6.0, 7.0, 2.0]})
        tm.assert_frame_equal(tiny.rank(), want)

    def test_rank_does_not_mutate(self):
        """Calling ``rank`` must leave the DataFrame itself unchanged."""
        # GH#18521
        frame = DataFrame(np.random.randn(10, 3), dtype="float64")
        snapshot = frame.copy()
        frame.rank()
        tm.assert_frame_equal(frame, snapshot)

    def test_rank_mixed_frame(self, float_string_frame):
        """
        Rank a frame mixing float, string, datetime and timedelta columns.

        ``numeric_only=None`` and the default call along axis 1 are each
        expected to emit a FutureWarning; the default result must match the
        ``numeric_only=True`` result.
        """
        float_string_frame["datetime"] = datetime.now()
        float_string_frame["timedelta"] = timedelta(days=1, seconds=1)

        deprecated_none = "numeric_only=None"
        nuisance = "Dropping of nuisance"

        with tm.assert_produces_warning(FutureWarning, match=deprecated_none):
            float_string_frame.rank(numeric_only=None)
        with tm.assert_produces_warning(FutureWarning, match=nuisance):
            by_default = float_string_frame.rank(1)

        numeric = float_string_frame.rank(1, numeric_only=True)
        tm.assert_frame_equal(by_default, numeric)

    @td.skip_if_no_scipy
    def test_rank_na_option(self, float_frame):
        """
        Check where ``DataFrame.rank`` places NaN for each ``na_option``.

        Expected ranks come from ``scipy.stats.rankdata`` applied to a copy
        of the frame whose NaNs were replaced by a value that sorts either
        after or before every real value.  Both axes and both sort
        directions are covered, followed by invalid ``na_option`` values.
        """
        import scipy.stats  # noqa:F401
        from scipy.stats import rankdata

        # Punch NaNs into each column at a different stride.
        float_frame["A"][::2] = np.nan
        float_frame["B"][::3] = np.nan
        float_frame["C"][::4] = np.nan
        float_frame["D"][::5] = np.nan

        # ascending, na_option="bottom": NaN filled with +inf ranks last
        ranks0 = float_frame.rank(na_option="bottom")
        ranks1 = float_frame.rank(1, na_option="bottom")

        fvals = float_frame.fillna(np.inf).values

        exp0 = np.apply_along_axis(rankdata, 0, fvals)
        exp1 = np.apply_along_axis(rankdata, 1, fvals)

        tm.assert_almost_equal(ranks0.values, exp0)
        tm.assert_almost_equal(ranks1.values, exp1)

        # ascending, na_option="top": NaN filled with (min - 1) ranks first
        ranks0 = float_frame.rank(na_option="top")
        ranks1 = float_frame.rank(1, na_option="top")

        # column-wise fill for axis 0
        fval0 = float_frame.fillna((float_frame.min() - 1).to_dict()).values
        # row-wise fill for axis 1: transpose, fill per (former) row, and
        # transpose back
        fval1 = float_frame.T
        fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T
        # NOTE(review): presumably mops up NaNs the row-wise fill could not
        # replace (e.g. an all-NaN row) - confirm
        fval1 = fval1.fillna(np.inf).values

        exp0 = np.apply_along_axis(rankdata, 0, fval0)
        exp1 = np.apply_along_axis(rankdata, 1, fval1)

        tm.assert_almost_equal(ranks0.values, exp0)
        tm.assert_almost_equal(ranks1.values, exp1)

        # descending

        # na_option="top": reuse the +inf fill; negating it for the
        # descending comparison makes NaN the smallest value, i.e. first
        ranks0 = float_frame.rank(na_option="top", ascending=False)
        ranks1 = float_frame.rank(1, na_option="top", ascending=False)

        fvals = float_frame.fillna(np.inf).values

        exp0 = np.apply_along_axis(rankdata, 0, -fvals)
        exp1 = np.apply_along_axis(rankdata, 1, -fvals)

        tm.assert_almost_equal(ranks0.values, exp0)
        tm.assert_almost_equal(ranks1.values, exp1)

        # descending

        # na_option="bottom": reuse the (min - 1) fill; negated, it becomes
        # the largest value, i.e. last
        ranks0 = float_frame.rank(na_option="bottom", ascending=False)
        ranks1 = float_frame.rank(1, na_option="bottom", ascending=False)

        fval0 = float_frame.fillna((float_frame.min() - 1).to_dict()).values
        fval1 = float_frame.T
        fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T
        fval1 = fval1.fillna(np.inf).values

        exp0 = np.apply_along_axis(rankdata, 0, -fval0)
        exp1 = np.apply_along_axis(rankdata, 1, -fval1)

        tm.assert_numpy_array_equal(ranks0.values, exp0)
        tm.assert_numpy_array_equal(ranks1.values, exp1)

        # an unknown na_option string must raise
        msg = "na_option must be one of 'keep', 'top', or 'bottom'"

        with pytest.raises(ValueError, match=msg):
            float_frame.rank(na_option="bad", ascending=False)

        # a non-string na_option must raise the same error
        with pytest.raises(ValueError, match=msg):
            float_frame.rank(na_option=True, ascending=False)

    def test_rank_axis(self):
        """Axis names must give the same result as axis numbers."""
        frame = DataFrame([[2, 1], [4, 3]])
        for number, name in ((0, "index"), (1, "columns")):
            tm.assert_frame_equal(frame.rank(axis=number),
                                  frame.rank(axis=name))

    @td.skip_if_no_scipy
    def test_rank_methods_frame(self):
        """
        Compare every tie method on both axes with scipy's ``rankdata``.

        Random data on a coarse grid (so ties occur) is ranked at three
        magnitudes: as-is, shifted by 1e6 and scaled by 1e-6.  pandas'
        "first" corresponds to scipy's "ordinal".
        """
        import scipy.stats  # noqa:F401
        from scipy.stats import rankdata

        grid = (np.random.randint(0, 21, (100, 26)) - 10.0) / 10.0
        cols = [chr(ord("z") - i) for i in range(grid.shape[1])]
        methods = ["average", "min", "max", "first", "dense"]

        for vals in [grid, grid + 1e6, grid * 1e-6]:
            frame = DataFrame(vals, columns=cols)

            for ax in [0, 1]:
                for m in methods:
                    got = frame.rank(axis=ax, method=m)
                    scipy_method = "ordinal" if m == "first" else m
                    reference = np.apply_along_axis(
                        rankdata, ax, vals, scipy_method)
                    want = DataFrame(reference.astype(np.float64),
                                     columns=cols).astype("float64")
                    tm.assert_frame_equal(got, want)

    @pytest.mark.parametrize("dtype", ["O", "f8", "i8"])
    @pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning")
    def test_rank_descending(self, method, dtype):
        """
        Descending rank must equal ascending rank of ``max - value``.

        Checked with the default tie method and with ``method``, using both
        ``numeric_only=True`` (skipped for object dtype) and
        ``numeric_only=False``.
        """
        # Integers cannot hold NaN, so drop them before casting.
        source = self.df.dropna() if "i" in dtype else self.df
        frame = source.astype(dtype)

        got = frame.rank(ascending=False)
        want = (frame.max() - frame).rank()
        tm.assert_frame_equal(got, want)

        want = (frame.max() - frame).rank(method=method)

        if dtype != "O":
            got_numeric = frame.rank(method=method, ascending=False,
                                     numeric_only=True)
            tm.assert_frame_equal(got_numeric, want)

        got_all = frame.rank(method=method, ascending=False,
                             numeric_only=False)
        tm.assert_frame_equal(got_all, want)

    @pytest.mark.parametrize("axis", [0, 1])
    @pytest.mark.parametrize("dtype", [None, object])
    def test_rank_2d_tie_methods(self, method, axis, dtype):
        """
        Check each tie method on the two-column class frame, per axis.

        For ``axis=1`` both the input and the expectation are transposed so
        the same per-column expected ranks apply along rows.
        """
        frame = self.df if dtype is None else self.df.astype(dtype)
        ranks = self.results[method]
        want = DataFrame({"A": ranks, "B": ranks})

        if axis == 1:
            frame = frame.T
            want = want.T

        got = frame.rank(method=method, axis=axis)
        tm.assert_frame_equal(got, want)

    @pytest.mark.parametrize(
        "method,exp",
        [
            ("dense", [[1.0, 1.0, 1.0],
                       [1.0, 0.5, 2.0 / 3],
                       [1.0, 0.5, 1.0 / 3]]),
            ("min", [[1.0 / 3, 1.0, 1.0],
                     [1.0 / 3, 1.0 / 3, 2.0 / 3],
                     [1.0 / 3, 1.0 / 3, 1.0 / 3]]),
            ("max", [[1.0, 1.0, 1.0],
                     [1.0, 2.0 / 3, 2.0 / 3],
                     [1.0, 2.0 / 3, 1.0 / 3]]),
            ("average", [[2.0 / 3, 1.0, 1.0],
                         [2.0 / 3, 0.5, 2.0 / 3],
                         [2.0 / 3, 0.5, 1.0 / 3]]),
            ("first", [[1.0 / 3, 1.0, 1.0],
                       [2.0 / 3, 1.0 / 3, 2.0 / 3],
                       [3.0 / 3, 2.0 / 3, 1.0 / 3]]),
        ],
    )
    def test_rank_pct_true(self, method, exp):
        """Check ``pct=True`` for each tie method on a frame with ties."""
        # see gh-15630.
        frame = DataFrame([[2012, 66, 3], [2012, 65, 2], [2012, 65, 1]])
        got = frame.rank(method=method, pct=True)

        tm.assert_frame_equal(got, DataFrame(exp))

    @pytest.mark.single_cpu
    @pytest.mark.high_memory
    def test_pct_max_many_rows(self):
        """The largest percentile rank must be exactly 1 for 2**24 + 1 rows."""
        # GH 18271
        n_rows = 2**24 + 1
        frame = DataFrame({
            "A": np.arange(n_rows),
            "B": np.arange(n_rows, 0, -1)
        })
        top = frame.rank(pct=True).max()
        assert (top == 1).all()

    @pytest.mark.parametrize(
        "contents,dtype",
        [
            (
                [-np.inf, -50, -1, -1e-20, -1e-25, -1e-50, 0,
                 1e-40, 1e-20, 1e-10, 2, 40, np.inf],
                "float64",
            ),
            (
                [-np.inf, -50, -1, -1e-20, -1e-25, -1e-45, 0,
                 1e-40, 1e-20, 1e-10, 2, 40, np.inf],
                "float32",
            ),
            (
                [np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max],
                "uint8",
            ),
            (
                [np.iinfo(np.int64).min, -100, 0, 1, 9999, 100000, 1e10,
                 np.iinfo(np.int64).max],
                "int64",
            ),
            (
                [NegInfinity(), "1", "A", "BA", "Ba", "C", Infinity()],
                "object",
            ),
            (
                [datetime(2001, 1, 1), datetime(2001, 1, 2),
                 datetime(2001, 1, 5)],
                "datetime64",
            ),
        ],
    )
    def test_rank_inf_and_nan(self, contents, dtype, frame_or_series):
        """
        Rank sorted ``contents`` (including infinities) after shuffling.

        ``contents`` is given in increasing order, so element ``i`` is
        expected to receive rank ``i + 1``.  For dtypes with a missing-value
        marker, five markers are inserted at random positions first, to
        check that ranking is unaffected when values are interleaved with
        missing entries.
        """
        missing_marker = {
            "float64": np.nan,
            "float32": np.nan,
            "object": None,
            "datetime64": np.datetime64("nat"),
        }
        data = np.array(contents, dtype=dtype)
        want = np.array(range(len(data)), dtype="float64") + 1.0
        if dtype in missing_marker:
            # Missing inputs are expected to rank as NaN.
            spots = np.random.choice(range(len(data)), 5)
            data = np.insert(data, spots, missing_marker[dtype])
            want = np.insert(want, spots, np.nan)

        # Apply one permutation to both the data and the expected ranks.
        shuffle = np.random.permutation(len(data))
        obj = frame_or_series(data[shuffle])
        expected = frame_or_series(want[shuffle], dtype="float64")
        tm.assert_equal(obj.rank(), expected)

    def test_df_series_inf_nan_consistency(self):
        """Ranking a frame must match ranking each column on its own."""
        # GH#32593
        frame = DataFrame(
            data={
                "col1": [5, 4, 3, 5, 8, 5, 2, 1, 6, 6],
                "col2": [5, 4, np.nan, 5, 8, 5, np.inf, np.nan, 6, -np.inf],
            },
            index=[5, 4, 3, 2, 1, 6, 7, 8, 9, 10],
            dtype="f8",
        )
        by_frame = frame.rank()

        by_column = frame.copy()
        for col in ("col1", "col2"):
            by_column[col] = frame[col].rank()

        tm.assert_frame_equal(by_frame, by_column)

    def test_rank_both_inf(self):
        """A column holding -inf, 0 and +inf ranks as 1, 2, 3."""
        # GH#32593
        frame = DataFrame({"a": [-np.inf, 0, np.inf]})
        want = DataFrame({"a": [1.0, 2.0, 3.0]})
        tm.assert_frame_equal(frame.rank(), want)

    @pytest.mark.parametrize(
        "na_option,ascending,expected",
        [
            ("top", True, [3.0, 1.0, 2.0]),
            ("top", False, [2.0, 1.0, 3.0]),
            ("bottom", True, [2.0, 3.0, 1.0]),
            ("bottom", False, [1.0, 3.0, 2.0]),
        ],
    )
    def test_rank_inf_nans_na_option(self, frame_or_series, method, na_option,
                                     ascending, expected):
        """Rank [+inf, NaN, -inf] for each tie method and NaN placement."""
        obj = frame_or_series([np.inf, np.nan, -np.inf])
        got = obj.rank(method=method, na_option=na_option,
                       ascending=ascending)
        tm.assert_equal(got, frame_or_series(expected))

    @pytest.mark.parametrize(
        "na_option,ascending,expected",
        [
            ("bottom", True, [1.0, 2.0, 4.0, 3.0]),
            ("bottom", False, [1.0, 2.0, 4.0, 3.0]),
            ("top", True, [2.0, 3.0, 1.0, 4.0]),
            ("top", False, [2.0, 3.0, 1.0, 4.0]),
        ],
    )
    def test_rank_object_first(self, frame_or_series, na_option, ascending,
                               expected):
        """Rank tied strings plus ``None`` with ``method='first'``."""
        obj = frame_or_series(["foo", "foo", None, "foo"])
        got = obj.rank(method="first", na_option=na_option,
                       ascending=ascending)
        tm.assert_equal(got, frame_or_series(expected))

    @pytest.mark.parametrize(
        "data,expected",
        [
            (
                {"a": [1, 2, "a"], "b": [4, 5, 6]},
                DataFrame({"b": [1.0, 2.0, 3.0]}),
            ),
            (
                {"a": [1, 2, "a"]},
                DataFrame(index=range(3)),
            ),
        ],
    )
    def test_rank_mixed_axis_zero(self, data, expected):
        """
        Default ``rank`` on a frame with a mixed int/str column.

        A FutureWarning about dropping nuisance columns is expected, and the
        result must equal ``expected``.
        """
        frame = DataFrame(data)
        with tm.assert_produces_warning(
                FutureWarning, match="Dropping of nuisance columns"):
            got = frame.rank()
        tm.assert_frame_equal(got, expected)
Exemple #5
0
class TestSeriesRank:
    # Shared input for the tie-method tests: 1, 2 and 3 each appear twice
    # (ties) and two entries are NaN.
    s = Series([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3])

    # Expected ranks of ``s`` keyed by the ``method`` argument passed to
    # ``rank``; positions holding NaN in ``s`` stay NaN in every array.
    results = {
        "average":
        np.array([1.5, 5.5, 7.0, 3.5, np.nan, 3.5, 1.5, 8.0, np.nan, 5.5]),
        "min":
        np.array([1, 5, 7, 3, np.nan, 3, 1, 8, np.nan, 5]),
        "max":
        np.array([2, 6, 7, 4, np.nan, 4, 2, 8, np.nan, 6]),
        "first":
        np.array([1, 5, 7, 3, np.nan, 4, 2, 8, np.nan, 6]),
        "dense":
        np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3]),
    }

    def test_rank(self, datetime_series):
        """Exercise ``Series.rank`` against scipy and several hand-built cases.

        Covers float vs. object parity, agreement with
        ``scipy.stats.rankdata``, integer input, ``pct=True`` with and without
        NaN, very small magnitudes, timedelta data and a shuffled float array.

        Skipped when scipy is not installed.
        """
        # ``importorskip`` expects a *module* name.  ``scipy.stats.rankdata``
        # is a function and ``scipy.stats.special`` is not a module, so asking
        # for either one skips this test unconditionally.  Import the real
        # module and take the function from it instead.
        rankdata = pytest.importorskip("scipy.stats").rankdata

        # Inject NaNs and ties into the fixture data.
        datetime_series[::2] = np.nan
        datetime_series[:10][::3] = 4.0

        ranks = datetime_series.rank()
        oranks = datetime_series.astype("O").rank()

        # Object dtype must rank exactly like the float original.
        tm.assert_series_equal(ranks, oranks)

        mask = np.isnan(datetime_series)
        filled = datetime_series.fillna(np.inf)

        # rankdata returns a ndarray, so re-wrap it and blank out the
        # positions that were NaN before filling.
        exp = Series(rankdata(filled), index=filled.index, name="ts")
        exp[mask] = np.nan

        tm.assert_series_equal(ranks, exp)

        # Integer input ranks the same as its float cast.
        iseries = Series(np.arange(5).repeat(2))

        iranks = iseries.rank()
        exp = iseries.astype(float).rank()
        tm.assert_series_equal(iranks, exp)

        # pct=True on 1..5 divides each rank by the number of values.
        iseries = Series(np.arange(5)) + 1.0
        exp = iseries / 5.0
        iranks = iseries.rank(pct=True)

        tm.assert_series_equal(iranks, exp)

        # 100 tied values: average rank 50.5 out of 100.
        iseries = Series(np.repeat(1, 100))
        exp = Series(np.repeat(0.505, 100))
        iranks = iseries.rank(pct=True)
        tm.assert_series_equal(iranks, exp)

        # One NaN leaves 99 tied values: average rank 50 out of 99.
        iseries[1] = np.nan
        exp = Series(np.repeat(50.0 / 99.0, 100))
        exp[1] = np.nan
        iranks = iseries.rank(pct=True)
        tm.assert_series_equal(iranks, exp)

        # A NaN entry is excluded from the pct denominator (4, not 5).
        iseries = Series(np.arange(5)) + 1.0
        iseries[4] = np.nan
        exp = iseries / 4.0
        iranks = iseries.rank(pct=True)
        tm.assert_series_equal(iranks, exp)

        # All-NaN input ranks to all-NaN.
        iseries = Series(np.repeat(np.nan, 100))
        exp = iseries.copy()
        iranks = iseries.rank(pct=True)
        tm.assert_series_equal(iranks, exp)

        # Same as the 1..5 case above, but starting from integer data.
        iseries = Series(np.arange(5)) + 1
        iseries[4] = np.nan
        exp = iseries / 4.0
        iranks = iseries.rank(pct=True)
        tm.assert_series_equal(iranks, exp)

        # ... and with a DatetimeIndex instead of the default index.
        rng = date_range("1/1/1990", periods=5)
        iseries = Series(np.arange(5), rng) + 1
        iseries.iloc[4] = np.nan
        exp = iseries / 4.0
        iranks = iseries.rank(pct=True)
        tm.assert_series_equal(iranks, exp)

        # Values that differ only at very small magnitudes must not be tied.
        iseries = Series([1e-50, 1e-100, 1e-20, 1e-2, 1e-20 + 1e-30, 1e-1])
        exp = Series([2, 1, 3, 5, 4, 6.0])
        iranks = iseries.rank()
        tm.assert_series_equal(iranks, exp)

        # GH 5968
        iseries = Series(["3 day", "1 day 10m", "-2 day", NaT], dtype="m8[ns]")
        exp = Series([3, 2, 1, np.nan])
        iranks = iseries.rank()
        tm.assert_series_equal(iranks, exp)

        # ``values`` is sorted ascending, so after shuffling with
        # ``random_order`` the rank of each element is its source index + 1.
        values = np.array(
            [-50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, 2, 40],
            dtype="float64",
        )
        random_order = np.random.permutation(len(values))
        iseries = Series(values[random_order])
        exp = Series(random_order + 1.0, dtype="float64")
        iranks = iseries.rank()
        tm.assert_series_equal(iranks, exp)

    def test_rank_categorical(self):
        """Rank categorical Series: ordered, unordered, ``na_option``, ``pct``.

        Uses ``np.nan`` rather than the ``np.NaN`` alias, which NumPy 2.0
        removed.
        """
        # GH issue #15420 rank incorrectly orders ordered categories
        labels = ["first", "second", "third", "fourth", "fifth", "sixth"]

        # Test ascending/descending ranking for ordered categoricals
        exp = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
        exp_desc = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0])
        ordered = Series(list(labels)).astype(
            CategoricalDtype(categories=list(labels), ordered=True)
        )
        tm.assert_series_equal(ordered.rank(), exp)
        tm.assert_series_equal(ordered.rank(ascending=False), exp_desc)

        # Unordered categoricals should be ranked as objects, i.e. the
        # expected ranks follow the alphabetical order of the labels.
        unordered = Series(list(labels)).astype(
            CategoricalDtype(categories=list(labels), ordered=False)
        )
        exp_unordered = Series([2.0, 4.0, 6.0, 3.0, 1.0, 5.0])
        res = unordered.rank()
        tm.assert_series_equal(res, exp_unordered)

        unordered1 = Series([1, 2, 3, 4, 5, 6]).astype(
            CategoricalDtype([1, 2, 3, 4, 5, 6], False)
        )
        exp_unordered1 = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
        res1 = unordered1.rank()
        tm.assert_series_equal(res1, exp_unordered1)

        # Test na_option for rank data; "seventh" is a declared category that
        # never occurs in the data.
        na_ser = Series(labels + [np.nan]).astype(
            CategoricalDtype(labels + ["seventh"], True)
        )

        exp_top = Series([2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 1.0])
        exp_bot = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0])
        exp_keep = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, np.nan])

        tm.assert_series_equal(na_ser.rank(na_option="top"), exp_top)
        tm.assert_series_equal(na_ser.rank(na_option="bottom"), exp_bot)
        tm.assert_series_equal(na_ser.rank(na_option="keep"), exp_keep)

        # Test na_option for rank data with ascending False
        exp_top = Series([7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0])
        exp_bot = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 7.0])
        exp_keep = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0, np.nan])

        tm.assert_series_equal(
            na_ser.rank(na_option="top", ascending=False), exp_top
        )
        tm.assert_series_equal(
            na_ser.rank(na_option="bottom", ascending=False), exp_bot
        )
        tm.assert_series_equal(
            na_ser.rank(na_option="keep", ascending=False), exp_keep
        )

        # Test invalid values for na_option
        msg = "na_option must be one of 'keep', 'top', or 'bottom'"

        with pytest.raises(ValueError, match=msg):
            na_ser.rank(na_option="bad", ascending=False)

        # invalid type
        with pytest.raises(ValueError, match=msg):
            na_ser.rank(na_option=True, ascending=False)

        # Test with pct=True
        na_ser = Series(["first", "second", "third", "fourth", np.nan]).astype(
            CategoricalDtype(["first", "second", "third", "fourth"], True)
        )
        exp_top = Series([0.4, 0.6, 0.8, 1.0, 0.2])
        exp_bot = Series([0.2, 0.4, 0.6, 0.8, 1.0])
        exp_keep = Series([0.25, 0.5, 0.75, 1.0, np.nan])

        tm.assert_series_equal(na_ser.rank(na_option="top", pct=True), exp_top)
        tm.assert_series_equal(
            na_ser.rank(na_option="bottom", pct=True), exp_bot
        )
        tm.assert_series_equal(na_ser.rank(na_option="keep", pct=True), exp_keep)

    def test_rank_signature(self):
        """``method`` works as a keyword; positionally the string is rejected."""
        ser = Series([0, 1])

        # Keyword form is accepted.
        ser.rank(method="average")

        # Passed positionally, "average" is reported as an invalid axis name.
        with pytest.raises(
            ValueError, match="No axis named average for object type Series"
        ):
            ser.rank("average")

    @pytest.mark.parametrize(
        "contents,dtype",
        [
            (
                [
                    -np.inf,
                    -50,
                    -1,
                    -1e-20,
                    -1e-25,
                    -1e-50,
                    0,
                    1e-40,
                    1e-20,
                    1e-10,
                    2,
                    40,
                    np.inf,
                ],
                "float64",
            ),
            (
                [
                    -np.inf,
                    -50,
                    -1,
                    -1e-20,
                    -1e-25,
                    -1e-45,
                    0,
                    1e-40,
                    1e-20,
                    1e-10,
                    2,
                    40,
                    np.inf,
                ],
                "float32",
            ),
            ([np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max], "uint8"),
            pytest.param(
                [
                    np.iinfo(np.int64).min,
                    -100,
                    0,
                    1,
                    9999,
                    100000,
                    1e10,
                    np.iinfo(np.int64).max,
                ],
                "int64",
                marks=pytest.mark.xfail(
                    # The two fragments are concatenated, so the first one
                    # must end with a space.
                    reason="iNaT is equivalent to minimum value of dtype "
                    "int64 pending issue GH#16674"
                ),
            ),
            ([NegInfinity(), "1", "A", "BA", "Ba", "C", Infinity()], "object"),
        ],
    )
    def test_rank_inf(self, contents, dtype):
        """Rank strictly increasing data that includes the dtype's extremes.

        ``contents`` is listed in ascending order, so the expected rank of
        each element is its position + 1.  For dtypes that have a missing
        value, NaNs are spliced in at random positions and the whole array is
        shuffled, checking that missing values interleaved with the data do
        not disturb the ranks of the remaining elements.
        """
        dtype_na_map = {
            "float64": np.nan,
            "float32": np.nan,
            "int64": iNaT,
            "object": None,
        }
        # Insert nans at random positions if underlying dtype has missing
        # value. Then adjust the expected order by adding nans accordingly
        # This is for testing whether rank calculation is affected
        # when values are intertwined with nan values.
        values = np.array(contents, dtype=dtype)
        exp_order = np.arange(1, len(values) + 1, dtype="float64")
        # "uint8" has no entry in the map, so nothing is inserted for it.
        if dtype in dtype_na_map:
            na_value = dtype_na_map[dtype]
            nan_indices = np.random.choice(range(len(values)), 5)
            values = np.insert(values, nan_indices, na_value)
            exp_order = np.insert(exp_order, nan_indices, np.nan)
        # shuffle the testing array and expected results in the same way
        random_order = np.random.permutation(len(values))
        iseries = Series(values[random_order])
        exp = Series(exp_order[random_order], dtype="float64")
        iranks = iseries.rank()
        tm.assert_series_equal(iranks, exp)

    def test_rank_tie_methods(self):
        """Check every tie method on ``self.s``, as-is and cast to object."""
        for method, expected in self.results.items():
            for dtype in (None, object):
                # The object / "first" combination is deliberately not run.
                if dtype is object and method == "first":
                    continue
                ser = self.s if dtype is None else self.s.astype(dtype)
                ranked = ser.rank(method=method)
                tm.assert_series_equal(ranked, Series(expected))

    @td.skip_if_no_scipy
    @pytest.mark.parametrize("ascending", [True, False])
    @pytest.mark.parametrize("method",
                             ["average", "min", "max", "first", "dense"])
    @pytest.mark.parametrize("na_option", ["top", "bottom", "keep"])
    def test_rank_tie_methods_on_infs_nans(self, method, na_option, ascending):
        """Rank three tied groups: -inf, missing and +inf, three items each.

        Runs for float64 and for object dtype (using the ``Infinity`` /
        ``NegInfinity`` sentinels); object dtype with ``method="first"`` is
        not run.
        """
        chunk = 3

        # Ranks handed to the lowest, middle and highest group per method.
        low, mid, high = {
            "average": ([2, 2, 2], [5, 5, 5], [8, 8, 8]),
            "min": ([1, 1, 1], [4, 4, 4], [7, 7, 7]),
            "max": ([3, 3, 3], [6, 6, 6], [9, 9, 9]),
            "first": ([1, 2, 3], [4, 5, 6], [7, 8, 9]),
            "dense": ([1, 1, 1], [2, 2, 2], [3, 3, 3]),
        }[method]

        # Input layout is [-inf group, missing group, +inf group].
        if na_option == "top":
            groups = [mid, low, high]
        elif na_option == "bottom":
            groups = [low, high, mid]
        else:
            groups = [low, [np.nan] * chunk, mid]
        if not ascending:
            groups = groups[::-1]
        flat_expected = [rank for group in groups for rank in group]

        cases = [
            ("object", None, Infinity(), NegInfinity()),
            ("float64", np.nan, np.inf, -np.inf),
        ]
        for dtype, na_value, pos_inf, neg_inf in cases:
            if dtype == "object" and method == "first":
                continue
            data = [neg_inf] * chunk + [na_value] * chunk + [pos_inf] * chunk
            ranked = Series(data, dtype=dtype).rank(
                method=method, na_option=na_option, ascending=ascending
            )
            tm.assert_series_equal(
                ranked, Series(flat_expected, dtype="float64")
            )

    def test_rank_desc_mix_nans_infs(self):
        """Descending rank of finite values mixed with NaN and both infinities."""
        # GH 19538
        values = [1, np.nan, np.inf, -np.inf, 25]
        descending_ranks = [3, np.nan, 1, 4, 2]

        ranked = Series(values).rank(ascending=False)
        tm.assert_series_equal(ranked, Series(descending_ranks, dtype="float64"))

    def test_rank_methods_series(self):
        """Compare every tie method with ``scipy.stats.rankdata``.

        Random data with duplicates is ranked as-is, shifted by 1e6 and scaled
        by 1e-6.  Skipped when scipy is not installed.
        """
        # ``importorskip`` expects a *module* name.  ``scipy.stats.rankdata``
        # is a function and ``scipy.stats.special`` is not a module, so asking
        # for either one skips this test unconditionally.  Import the real
        # module and take the function from it instead.
        rankdata = pytest.importorskip("scipy.stats").rankdata

        xs = np.random.randn(9)
        xs = np.concatenate([xs[i:] for i in range(0, 9, 2)])  # add duplicates
        np.random.shuffle(xs)

        index = [chr(ord("a") + i) for i in range(len(xs))]

        for vals in [xs, xs + 1e6, xs * 1e-6]:
            ts = Series(vals, index=index)

            for m in ["average", "min", "max", "first", "dense"]:
                result = ts.rank(method=m)
                # scipy calls the order-of-appearance method "ordinal".
                sprank = rankdata(vals, m if m != "first" else "ordinal")
                expected = Series(sprank, index=index).astype("float64")
                tm.assert_series_equal(result, expected)

    def test_rank_dense_method(self):
        """Dense ranks for small inputs across object, float and int dtypes."""
        cases = [
            ([1], [1]),
            ([2], [1]),
            ([0], [1]),
            ([2, 2], [1, 1]),
            ([1, 2, 3], [1, 2, 3]),
            ([4, 2, 1], [3, 2, 1]),
            ([1, 1, 5, 5, 3], [1, 1, 3, 3, 2]),
            ([-5, -4, -3, -2, -1], [1, 2, 3, 4, 5]),
        ]

        for data, dense in cases:
            for dtype in ("O", "f8", "i8"):
                ranked = Series(data).astype(dtype).rank(method="dense")
                # Compare values only: align the expected dtype to the result.
                tm.assert_series_equal(ranked, Series(dense).astype(ranked.dtype))

    def test_rank_descending(self):
        """Descending ranks of ``s`` must equal ascending ranks of ``max - s``.

        Checked for object, float and integer data, with the default method
        and with every method in ``self.results`` (object dtype with
        ``method="first"`` is not run).
        """
        for dtype in ["O", "f8", "i8"]:
            if "i" in dtype:
                # NaN has no integer representation: drop it first, then
                # actually cast so the integer path is exercised (without the
                # cast this branch silently re-tested float data).
                s = self.s.dropna().astype(dtype)
            else:
                s = self.s.astype(dtype)
            mirrored = s.max() - s

            # Default method; does not depend on the per-method loop below.
            res = s.rank(ascending=False)
            tm.assert_series_equal(res, mirrored.rank())

            for method in self.results:
                if method == "first" and dtype == "O":
                    continue

                expected = mirrored.rank(method=method)
                res2 = s.rank(method=method, ascending=False)
                tm.assert_series_equal(res2, expected)

    def test_rank_int(self):
        """Integer data (NaNs dropped) ranks like the non-NaN float results."""
        ints = self.s.dropna().astype("i8")

        for method, float_ranks in self.results.items():
            ranked = ints.rank(method=method)
            # Drop the NaN slots and adopt the index left by the dropna above.
            wanted = Series(float_ranks).dropna()
            wanted.index = ranked.index
            tm.assert_series_equal(ranked, wanted)

    def test_rank_object_bug(self):
        """Smoke test: ranking an all-NaN object Series must not raise."""
        # GH 13445
        for ascending in (True, False):
            Series([np.nan] * 32).astype(object).rank(ascending=ascending)

    def test_rank_modify_inplace(self):
        """``rank`` must leave the Series it is called on unchanged."""
        # GH 18521
        ser = Series([Timestamp("2017-01-05 10:20:27.569000"), NaT])
        snapshot = ser.copy()

        # The return value is discarded; only the caller's data matters here.
        ser.rank()

        tm.assert_series_equal(ser, snapshot)
Exemple #6
0
class TestSeriesRank:
    @td.skip_if_no_scipy
    def test_rank(self, datetime_series):
        """Check ``Series.rank`` against scipy and a series of fixed cases."""
        from scipy.stats import rankdata

        # Inject NaNs and ties into the fixture data.
        datetime_series[::2] = np.nan
        datetime_series[:10][::3] = 4.0

        float_ranks = datetime_series.rank()
        object_ranks = datetime_series.astype("O").rank()
        tm.assert_series_equal(float_ranks, object_ranks)

        nan_mask = np.isnan(datetime_series)
        inf_filled = datetime_series.fillna(np.inf)

        # rankdata yields a bare ndarray: wrap it, then restore the NaN slots.
        scipy_ranks = Series(rankdata(inf_filled), index=inf_filled.index, name="ts")
        scipy_ranks[nan_mask] = np.nan
        tm.assert_series_equal(float_ranks, scipy_ranks)

        # Integer data ranks like its float cast.
        doubled = Series(np.arange(5).repeat(2))
        got = doubled.rank()
        want = doubled.astype(float).rank()
        tm.assert_series_equal(got, want)

        # pct=True over 1..5.
        ramp = Series(np.arange(5)) + 1.0
        want = ramp / 5.0
        got = ramp.rank(pct=True)
        tm.assert_series_equal(got, want)

        # 100 tied values.
        ties = Series(np.repeat(1, 100))
        want = Series(np.repeat(0.505, 100))
        got = ties.rank(pct=True)
        tm.assert_series_equal(got, want)

        # Same ties with a single NaN punched in.
        ties[1] = np.nan
        want = Series(np.repeat(50.0 / 99.0, 100))
        want[1] = np.nan
        got = ties.rank(pct=True)
        tm.assert_series_equal(got, want)

        # Float ramp whose last entry is NaN.
        ramp = Series(np.arange(5)) + 1.0
        ramp[4] = np.nan
        want = ramp / 4.0
        got = ramp.rank(pct=True)
        tm.assert_series_equal(got, want)

        # Nothing but NaN.
        all_nan = Series(np.repeat(np.nan, 100))
        want = all_nan.copy()
        got = all_nan.rank(pct=True)
        tm.assert_series_equal(got, want)

        # Integer-built ramp whose last entry is NaN.
        ramp = Series(np.arange(5)) + 1
        ramp[4] = np.nan
        want = ramp / 4.0
        got = ramp.rank(pct=True)
        tm.assert_series_equal(got, want)

        # The same ramp on a DatetimeIndex.
        dates = date_range("1/1/1990", periods=5)
        ramp = Series(np.arange(5), dates) + 1
        ramp.iloc[4] = np.nan
        want = ramp / 4.0
        got = ramp.rank(pct=True)
        tm.assert_series_equal(got, want)

        # Very small magnitudes must still be told apart.
        tiny = Series([1e-50, 1e-100, 1e-20, 1e-2, 1e-20 + 1e-30, 1e-1])
        want = Series([2, 1, 3, 5, 4, 6.0])
        got = tiny.rank()
        tm.assert_series_equal(got, want)

        # GH 5968
        deltas = Series(["3 day", "1 day 10m", "-2 day", NaT], dtype="m8[ns]")
        want = Series([3, 2, 1, np.nan])
        got = deltas.rank()
        tm.assert_series_equal(got, want)

        # ``ascending_values`` is sorted, so after the shuffle each element's
        # rank is its source position + 1.
        ascending_values = np.array(
            [-50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, 2, 40],
            dtype="float64",
        )
        shuffle = np.random.permutation(len(ascending_values))
        shuffled = Series(ascending_values[shuffle])
        want = Series(shuffle + 1.0, dtype="float64")
        got = shuffled.rank()
        tm.assert_series_equal(got, want)

    def test_rank_categorical(self):
        """Rank categorical Series: ordered, unordered, ``na_option``, ``pct``.

        Uses ``np.nan`` rather than the ``np.NaN`` alias, which NumPy 2.0
        removed.
        """
        # GH issue #15420 rank incorrectly orders ordered categories

        # Test ascending/descending ranking for ordered categoricals
        exp = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
        exp_desc = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0])
        ordered = Series(
            ["first", "second", "third", "fourth", "fifth", "sixth"]
        ).astype(
            CategoricalDtype(
                categories=["first", "second", "third", "fourth", "fifth", "sixth"],
                ordered=True,
            )
        )
        tm.assert_series_equal(ordered.rank(), exp)
        tm.assert_series_equal(ordered.rank(ascending=False), exp_desc)

        # Unordered categoricals should be ranked as objects, i.e. the
        # expected ranks follow the alphabetical order of the labels.
        unordered = Series(
            ["first", "second", "third", "fourth", "fifth", "sixth"]
        ).astype(
            CategoricalDtype(
                categories=["first", "second", "third", "fourth", "fifth", "sixth"],
                ordered=False,
            )
        )
        exp_unordered = Series([2.0, 4.0, 6.0, 3.0, 1.0, 5.0])
        res = unordered.rank()
        tm.assert_series_equal(res, exp_unordered)

        unordered1 = Series([1, 2, 3, 4, 5, 6]).astype(
            CategoricalDtype([1, 2, 3, 4, 5, 6], False)
        )
        exp_unordered1 = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
        res1 = unordered1.rank()
        tm.assert_series_equal(res1, exp_unordered1)

        # Test na_option for rank data; "seventh" is a declared category that
        # never occurs in the data.
        na_ser = Series(
            ["first", "second", "third", "fourth", "fifth", "sixth", np.nan]
        ).astype(
            CategoricalDtype(
                ["first", "second", "third", "fourth", "fifth", "sixth", "seventh"],
                True,
            )
        )

        exp_top = Series([2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 1.0])
        exp_bot = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0])
        exp_keep = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, np.nan])

        tm.assert_series_equal(na_ser.rank(na_option="top"), exp_top)
        tm.assert_series_equal(na_ser.rank(na_option="bottom"), exp_bot)
        tm.assert_series_equal(na_ser.rank(na_option="keep"), exp_keep)

        # Test na_option for rank data with ascending False
        exp_top = Series([7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0])
        exp_bot = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 7.0])
        exp_keep = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0, np.nan])

        tm.assert_series_equal(na_ser.rank(na_option="top", ascending=False), exp_top)
        tm.assert_series_equal(
            na_ser.rank(na_option="bottom", ascending=False), exp_bot
        )
        tm.assert_series_equal(na_ser.rank(na_option="keep", ascending=False), exp_keep)

        # Test invalid values for na_option
        msg = "na_option must be one of 'keep', 'top', or 'bottom'"

        with pytest.raises(ValueError, match=msg):
            na_ser.rank(na_option="bad", ascending=False)

        # invalid type
        with pytest.raises(ValueError, match=msg):
            na_ser.rank(na_option=True, ascending=False)

        # Test with pct=True
        na_ser = Series(["first", "second", "third", "fourth", np.nan]).astype(
            CategoricalDtype(["first", "second", "third", "fourth"], True)
        )
        exp_top = Series([0.4, 0.6, 0.8, 1.0, 0.2])
        exp_bot = Series([0.2, 0.4, 0.6, 0.8, 1.0])
        exp_keep = Series([0.25, 0.5, 0.75, 1.0, np.nan])

        tm.assert_series_equal(na_ser.rank(na_option="top", pct=True), exp_top)
        tm.assert_series_equal(na_ser.rank(na_option="bottom", pct=True), exp_bot)
        tm.assert_series_equal(na_ser.rank(na_option="keep", pct=True), exp_keep)

    def test_rank_signature(self):
        """``method`` works as a keyword; positionally the string is rejected."""
        ser = Series([0, 1])

        # Keyword form is accepted.
        ser.rank(method="average")

        # Passed positionally, "average" is reported as an invalid axis name.
        with pytest.raises(
            ValueError, match="No axis named average for object type Series"
        ):
            ser.rank("average")

    @pytest.mark.parametrize("dtype", [None, object])
    def test_rank_tie_methods(self, ser, results, dtype):
        """Rank ``ser`` (optionally cast to ``dtype``) with one tie method.

        ``results`` unpacks to the method name and its expected ranks.
        """
        method, expected_values = results
        if dtype is not None:
            ser = ser.astype(dtype)
        ranked = ser.rank(method=method)
        tm.assert_series_equal(ranked, Series(expected_values))

    @td.skip_if_no_scipy
    @pytest.mark.parametrize("ascending", [True, False])
    @pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"])
    @pytest.mark.parametrize("na_option", ["top", "bottom", "keep"])
    @pytest.mark.parametrize(
        "dtype, na_value, pos_inf, neg_inf",
        [
            ("object", None, Infinity(), NegInfinity()),
            ("float64", np.nan, np.inf, -np.inf),
        ],
    )
    def test_rank_tie_methods_on_infs_nans(
        self, method, na_option, ascending, dtype, na_value, pos_inf, neg_inf
    ):
        """Rank three tied groups: -inf, missing and +inf, three items each."""
        chunk = 3

        # Ranks handed to the lowest, middle and highest group per method.
        low, mid, high = {
            "average": ([2, 2, 2], [5, 5, 5], [8, 8, 8]),
            "min": ([1, 1, 1], [4, 4, 4], [7, 7, 7]),
            "max": ([3, 3, 3], [6, 6, 6], [9, 9, 9]),
            "first": ([1, 2, 3], [4, 5, 6], [7, 8, 9]),
            "dense": ([1, 1, 1], [2, 2, 2], [3, 3, 3]),
        }[method]

        # Input layout is [-inf group, missing group, +inf group].
        if na_option == "top":
            groups = [mid, low, high]
        elif na_option == "bottom":
            groups = [low, high, mid]
        else:
            groups = [low, [np.nan] * chunk, mid]
        if not ascending:
            groups = groups[::-1]
        flat_expected = [rank for group in groups for rank in group]

        data = [neg_inf] * chunk + [na_value] * chunk + [pos_inf] * chunk
        ranked = Series(data, dtype=dtype).rank(
            method=method, na_option=na_option, ascending=ascending
        )
        tm.assert_series_equal(ranked, Series(flat_expected, dtype="float64"))

    def test_rank_desc_mix_nans_infs(self):
        """Descending rank of finite values mixed with NaN and both infinities."""
        # GH 19538
        values = [1, np.nan, np.inf, -np.inf, 25]
        descending_ranks = [3, np.nan, 1, 4, 2]

        ranked = Series(values).rank(ascending=False)
        tm.assert_series_equal(ranked, Series(descending_ranks, dtype="float64"))

    @td.skip_if_no_scipy
    @pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"])
    @pytest.mark.parametrize(
        "op, value",
        [
            [operator.add, 0],
            [operator.add, 1e6],
            [operator.mul, 1e-6],
        ],
    )
    def test_rank_methods_series(self, method, op, value):
        """Compare one tie method with ``scipy.stats.rankdata``.

        Random data with duplicates is transformed by ``op(data, value)``
        before being ranked.
        """
        from scipy.stats import rankdata

        base = np.random.randn(9)
        # Concatenating overlapping tails of ``base`` introduces duplicates.
        pool = np.concatenate([base[start:] for start in range(0, 9, 2)])
        np.random.shuffle(pool)

        labels = [chr(ord("a") + offset) for offset in range(len(pool))]
        transformed = op(pool, value)

        ranked = Series(transformed, index=labels).rank(method=method)

        # scipy calls the order-of-appearance method "ordinal".
        scipy_method = "ordinal" if method == "first" else method
        reference = Series(rankdata(transformed, scipy_method), index=labels)
        tm.assert_series_equal(ranked, reference.astype("float64"))

    @pytest.mark.parametrize("dtype", ["O", "f8", "i8"])
    @pytest.mark.parametrize(
        "ser, exp",
        [
            ([1], [1]),
            ([2], [1]),
            ([0], [1]),
            ([2, 2], [1, 1]),
            ([1, 2, 3], [1, 2, 3]),
            ([4, 2, 1], [3, 2, 1]),
            ([1, 1, 5, 5, 3], [1, 1, 3, 3, 2]),
            ([-5, -4, -3, -2, -1], [1, 2, 3, 4, 5]),
        ],
    )
    def test_rank_dense_method(self, dtype, ser, exp):
        """Dense ranks of a small input cast to ``dtype``."""
        dense_ranks = Series(ser).astype(dtype).rank(method="dense")
        # Compare values only: align the expected dtype to the result.
        wanted = Series(exp).astype(dense_ranks.dtype)
        tm.assert_series_equal(dense_ranks, wanted)

    @pytest.mark.parametrize("dtype", ["O", "f8", "i8"])
    def test_rank_descending(self, ser, results, dtype):
        """Descending ranks of ``s`` must equal ascending ranks of ``max - s``.

        ``results`` unpacks to the tie method under test (its expected values
        are not used here).
        """
        method, _ = results
        if "i" in dtype:
            # NaN has no integer representation: drop it first, then actually
            # cast so the integer path is exercised (without the cast this
            # branch silently re-tested float data).
            s = ser.dropna().astype(dtype)
        else:
            s = ser.astype(dtype)
        mirrored = s.max() - s

        # Default method.
        res = s.rank(ascending=False)
        tm.assert_series_equal(res, mirrored.rank())

        # Explicit tie method.
        expected = mirrored.rank(method=method)
        res2 = s.rank(method=method, ascending=False)
        tm.assert_series_equal(res2, expected)

    def test_rank_int(self, ser, results):
        """Integer data (NaNs dropped) ranks like the non-NaN expected values."""
        method, float_ranks = results
        ints = ser.dropna().astype("i8")

        ranked = ints.rank(method=method)
        # Drop the NaN slots and adopt the index left by the dropna above.
        wanted = Series(float_ranks).dropna()
        wanted.index = ranked.index
        tm.assert_series_equal(ranked, wanted)

    def test_rank_object_bug(self):
        """Smoke test: ranking an all-NaN object Series must not raise."""
        # GH 13445
        for ascending in (True, False):
            Series([np.nan] * 32).astype(object).rank(ascending=ascending)

    def test_rank_modify_inplace(self):
        """``rank`` must leave the Series it is called on unchanged."""
        # GH 18521
        ser = Series([Timestamp("2017-01-05 10:20:27.569000"), NaT])
        snapshot = ser.copy()

        # The return value is discarded; only the caller's data matters here.
        ser.rank()

        tm.assert_series_equal(ser, snapshot)