def test_rank_tie_methods_on_infs_nans(self, method, na_option, ascending):
    """Check tie handling for runs of -inf, missing and +inf values.

    Each input series is three negative infinities, three missing values
    and three positive infinities, in that order.  The same expectation is
    checked for an object series and a float64 series; the
    ``('object', 'first')`` combination is listed as disabled and skipped.
    """
    run_len = 3
    skipped = {('object', 'first')}

    # (lowest, middle, highest) rank run for a 3/3/3 layout, per tie method.
    tie_ranks = {
        'average': ([2, 2, 2], [5, 5, 5], [8, 8, 8]),
        'min': ([1, 1, 1], [4, 4, 4], [7, 7, 7]),
        'max': ([3, 3, 3], [6, 6, 6], [9, 9, 9]),
        'first': ([1, 2, 3], [4, 5, 6], [7, 8, 9]),
        'dense': ([1, 1, 1], [2, 2, 2], [3, 3, 3]),
    }
    low, mid, high = tie_ranks[method]

    # Rank runs in input order: [-inf run, missing run, +inf run].
    layouts = {
        'top': [mid, low, high],
        'bottom': [low, high, mid],
    }
    # Any other na_option (i.e. 'keep') leaves the missing run unranked.
    runs = layouts.get(na_option, [low, [np.nan] * run_len, mid])
    if not ascending:
        runs = runs[::-1]
    expected_values = [rank for run in runs for rank in run]

    cases = [
        ('object', None, Infinity(), NegInfinity()),
        ('float64', np.nan, np.inf, -np.inf),
    ]
    for dtype, missing, pos_inf, neg_inf in cases:
        data = [neg_inf] * run_len + [missing] * run_len + [pos_inf] * run_len
        ser = Series(data, dtype=dtype)
        if (dtype, method) in skipped:
            continue
        result = ser.rank(method=method, na_option=na_option,
                          ascending=ascending)
        tm.assert_series_equal(result,
                               Series(expected_values, dtype='float64'))
def test_rank_tie_methods_on_infs_nans(self):
    """Check every tie method and na_option on -inf / missing / +inf runs.

    Each input series is three negative infinities, three missing values
    and three positive infinities, in that order.  Both an object series
    and a float64 series are ranked; the ``('object', 'first')``
    combination is listed as disabled and skipped.
    """
    run_len = 3
    skipped = {('object', 'first')}

    # (lowest, middle, highest) rank run for a 3/3/3 layout, per tie method.
    tie_ranks = {
        'average': ([2, 2, 2], [5, 5, 5], [8, 8, 8]),
        'min': ([1, 1, 1], [4, 4, 4], [7, 7, 7]),
        'max': ([3, 3, 3], [6, 6, 6], [9, 9, 9]),
        'first': ([1, 2, 3], [4, 5, 6], [7, 8, 9]),
        'dense': ([1, 1, 1], [2, 2, 2], [3, 3, 3]),
    }
    cases = [
        ('object', None, Infinity(), NegInfinity()),
        ('float64', np.nan, np.inf, -np.inf),
    ]

    for dtype, missing, pos_inf, neg_inf in cases:
        data = [neg_inf] * run_len + [missing] * run_len + [pos_inf] * run_len
        ser = Series(data, dtype=dtype)

        for method, (low, mid, high) in tie_ranks.items():
            if (dtype, method) in skipped:
                continue

            for na_opt in ('top', 'bottom', 'keep'):
                # Expected ranks in input order: -inf, missing, +inf.
                if na_opt == 'top':
                    expected = mid + low + high
                elif na_opt == 'bottom':
                    expected = low + high + mid
                else:
                    expected = low + [np.nan] * run_len + mid

                result = ser.rank(method=method, na_option=na_opt)
                tm.assert_series_equal(result,
                                       Series(expected, dtype='float64'))
class TestSeriesRank(TestData):
    """Tests for ``Series.rank``."""

    # Shared input: contains ties (1, 2 and 3 each appear twice) and two NaNs.
    s = Series([1, 3, 4, 2, nan, 2, 1, 5, nan, 3])

    # Expected ranks of ``s`` for each tie-breaking method.
    results = {
        'average': np.array([1.5, 5.5, 7.0, 3.5, nan,
                             3.5, 1.5, 8.0, nan, 5.5]),
        'min': np.array([1, 5, 7, 3, nan, 3, 1, 8, nan, 5]),
        'max': np.array([2, 6, 7, 4, nan, 4, 2, 8, nan, 6]),
        'first': np.array([1, 5, 7, 3, nan, 4, 2, 8, nan, 6]),
        'dense': np.array([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]),
    }

    def test_rank(self):
        """Compare default ranking with scipy and check ``pct=True`` cases."""
        # ``rankdata`` is a function, not a module: asking importorskip for
        # 'scipy.stats.rankdata' (or the non-existent 'scipy.stats.special')
        # can never import, so the test was always skipped.  Import the
        # module and take the function from it instead.
        rankdata = pytest.importorskip('scipy.stats').rankdata

        # ``self.ts`` is presumably supplied by TestData -- confirm there.
        self.ts[::2] = np.nan
        # Single slice rather than chained ``[:10][::3]`` so the assignment
        # does not depend on the intermediate slice being a view.
        self.ts[:10:3] = 4.

        ranks = self.ts.rank()
        oranks = self.ts.astype('O').rank()

        assert_series_equal(ranks, oranks)

        mask = np.isnan(self.ts)
        filled = self.ts.fillna(np.inf)

        # rankdata returns a ndarray
        exp = Series(rankdata(filled), index=filled.index, name='ts')
        exp[mask] = np.nan

        tm.assert_series_equal(ranks, exp)

        iseries = Series(np.arange(5).repeat(2))

        iranks = iseries.rank()
        exp = iseries.astype(float).rank()
        assert_series_equal(iranks, exp)
        iseries = Series(np.arange(5)) + 1.0
        exp = iseries / 5.0
        iranks = iseries.rank(pct=True)

        assert_series_equal(iranks, exp)

        iseries = Series(np.repeat(1, 100))
        exp = Series(np.repeat(0.505, 100))
        iranks = iseries.rank(pct=True)
        assert_series_equal(iranks, exp)

        iseries[1] = np.nan
        exp = Series(np.repeat(50.0 / 99.0, 100))
        exp[1] = np.nan
        iranks = iseries.rank(pct=True)
        assert_series_equal(iranks, exp)

        iseries = Series(np.arange(5)) + 1.0
        iseries[4] = np.nan
        exp = iseries / 4.0
        iranks = iseries.rank(pct=True)
        assert_series_equal(iranks, exp)

        iseries = Series(np.repeat(np.nan, 100))
        exp = iseries.copy()
        iranks = iseries.rank(pct=True)
        assert_series_equal(iranks, exp)

        iseries = Series(np.arange(5)) + 1
        iseries[4] = np.nan
        exp = iseries / 4.0
        iranks = iseries.rank(pct=True)
        assert_series_equal(iranks, exp)

        rng = date_range('1/1/1990', periods=5)
        iseries = Series(np.arange(5), rng) + 1
        iseries.iloc[4] = np.nan
        exp = iseries / 4.0
        iranks = iseries.rank(pct=True)
        assert_series_equal(iranks, exp)

        iseries = Series([1e-50, 1e-100, 1e-20, 1e-2, 1e-20 + 1e-30, 1e-1])
        exp = Series([2, 1, 3, 5, 4, 6.0])
        iranks = iseries.rank()
        assert_series_equal(iranks, exp)

        # GH 5968
        iseries = Series(['3 day', '1 day 10m', '-2 day', NaT],
                         dtype='m8[ns]')
        exp = Series([3, 2, 1, np.nan])
        iranks = iseries.rank()
        assert_series_equal(iranks, exp)

        # ``values`` is sorted ascending, so after shuffling the rank of
        # each element is its original position plus one.
        values = np.array(
            [-50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, 2, 40
             ], dtype='float64')
        random_order = np.random.permutation(len(values))
        iseries = Series(values[random_order])
        exp = Series(random_order + 1.0, dtype='float64')
        iranks = iseries.rank()
        assert_series_equal(iranks, exp)

    def test_rank_categorical(self):
        """Rank ordered/unordered categoricals, with na_option and pct."""
        # GH issue #15420 rank incorrectly orders ordered categories
        names = ['first', 'second', 'third', 'fourth', 'fifth', 'sixth']

        # Test ascending/descending ranking for ordered categoricals
        exp = Series([1., 2., 3., 4., 5., 6.])
        exp_desc = Series([6., 5., 4., 3., 2., 1.])
        ordered = Series(names).astype(
            CategoricalDtype(categories=names, ordered=True))
        assert_series_equal(ordered.rank(), exp)
        assert_series_equal(ordered.rank(ascending=False), exp_desc)

        # Unordered categoricals should be ranked as objects
        unordered = Series(names).astype(
            CategoricalDtype(categories=names, ordered=False))
        exp_unordered = Series([2., 4., 6., 3., 1., 5.])
        res = unordered.rank()
        assert_series_equal(res, exp_unordered)

        unordered1 = Series(
            [1, 2, 3, 4, 5, 6],
        ).astype(CategoricalDtype([1, 2, 3, 4, 5, 6], False))
        exp_unordered1 = Series([1., 2., 3., 4., 5., 6.])
        res1 = unordered1.rank()
        assert_series_equal(res1, exp_unordered1)

        # Test na_option for rank data
        # ``np.nan`` instead of the ``np.NaN`` alias (removed in NumPy 2.0).
        na_ser = Series(names + [np.nan]).astype(
            CategoricalDtype(names + ['seventh'], True))

        exp_top = Series([2., 3., 4., 5., 6., 7., 1.])
        exp_bot = Series([1., 2., 3., 4., 5., 6., 7.])
        exp_keep = Series([1., 2., 3., 4., 5., 6., np.nan])

        assert_series_equal(na_ser.rank(na_option='top'), exp_top)
        assert_series_equal(na_ser.rank(na_option='bottom'), exp_bot)
        assert_series_equal(na_ser.rank(na_option='keep'), exp_keep)

        # Test na_option for rank data with ascending False
        exp_top = Series([7., 6., 5., 4., 3., 2., 1.])
        exp_bot = Series([6., 5., 4., 3., 2., 1., 7.])
        exp_keep = Series([6., 5., 4., 3., 2., 1., np.nan])

        assert_series_equal(
            na_ser.rank(na_option='top', ascending=False),
            exp_top
        )
        assert_series_equal(
            na_ser.rank(na_option='bottom', ascending=False),
            exp_bot
        )
        assert_series_equal(
            na_ser.rank(na_option='keep', ascending=False),
            exp_keep
        )

        # Test invalid values for na_option
        msg = "na_option must be one of 'keep', 'top', or 'bottom'"

        with tm.assert_raises_regex(ValueError, msg):
            na_ser.rank(na_option='bad', ascending=False)

        # invalid type
        with tm.assert_raises_regex(ValueError, msg):
            na_ser.rank(na_option=True, ascending=False)

        # Test with pct=True
        na_ser = Series(['first', 'second', 'third', 'fourth',
                         np.nan]).astype(
            CategoricalDtype(['first', 'second', 'third', 'fourth'], True))
        exp_top = Series([0.4, 0.6, 0.8, 1., 0.2])
        exp_bot = Series([0.2, 0.4, 0.6, 0.8, 1.])
        exp_keep = Series([0.25, 0.5, 0.75, 1., np.nan])

        assert_series_equal(na_ser.rank(na_option='top', pct=True), exp_top)
        assert_series_equal(na_ser.rank(na_option='bottom', pct=True),
                            exp_bot)
        assert_series_equal(na_ser.rank(na_option='keep', pct=True),
                            exp_keep)

    def test_rank_signature(self):
        """Passing the method name positionally must raise ValueError."""
        s = Series([0, 1])
        s.rank(method='average')
        pytest.raises(ValueError, s.rank, 'average')

    @pytest.mark.parametrize('contents,dtype', [
        ([-np.inf, -50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10,
          2, 40, np.inf],
         'float64'),
        ([-np.inf, -50, -1, -1e-20, -1e-25, -1e-45, 0, 1e-40, 1e-20, 1e-10,
          2, 40, np.inf],
         'float32'),
        ([np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max],
         'uint8'),
        pytest.param([np.iinfo(np.int64).min, -100, 0, 1, 9999, 100000,
                      1e10, np.iinfo(np.int64).max],
                     'int64',
                     marks=pytest.mark.xfail(
                         # Trailing space added: the two literals are
                         # concatenated and used to read "dtypeint64".
                         reason="iNaT is equivalent to minimum value of dtype "
                                "int64 pending issue GH#16674",
                         strict=True)),
        ([NegInfinity(), '1', 'A', 'BA', 'Ba', 'C', Infinity()],
         'object')
    ])
    def test_rank_inf(self, contents, dtype):
        """Rank sorted ``contents`` after shuffling and inserting NAs."""
        dtype_na_map = {
            'float64': np.nan,
            'float32': np.nan,
            'int64': iNaT,
            'object': None
        }
        # Insert nans at random positions if underlying dtype has missing
        # value. Then adjust the expected order by adding nans accordingly
        # This is for testing whether rank calculation is affected
        # when values are intertwined with nan values.
        values = np.array(contents, dtype=dtype)
        exp_order = np.array(range(len(values)), dtype='float64') + 1.0
        if dtype in dtype_na_map:
            na_value = dtype_na_map[dtype]
            nan_indices = np.random.choice(range(len(values)), 5)
            values = np.insert(values, nan_indices, na_value)
            exp_order = np.insert(exp_order, nan_indices, np.nan)
        # shuffle the testing array and expected results in the same way
        random_order = np.random.permutation(len(values))
        iseries = Series(values[random_order])
        exp = Series(exp_order[random_order], dtype='float64')
        iranks = iseries.rank()
        assert_series_equal(iranks, exp)

    def test_rank_tie_methods(self):
        """Check ``results`` for every method on float and object data."""
        s = self.s

        def _check(s, expected, method='average'):
            result = s.rank(method=method)
            tm.assert_series_equal(result, Series(expected))

        dtypes = [None, object]
        disabled = {(object, 'first')}
        results = self.results

        for method, dtype in product(results, dtypes):
            if (dtype, method) in disabled:
                continue
            series = s if dtype is None else s.astype(dtype)
            _check(series, results[method], method=method)

    # NOTE(review): nothing in this test calls scipy, so the skip below
    # looks unnecessary -- confirm before removing.
    @td.skip_if_no_scipy
    @pytest.mark.parametrize('ascending', [True, False])
    @pytest.mark.parametrize('method', ['average', 'min', 'max', 'first',
                                        'dense'])
    @pytest.mark.parametrize('na_option', ['top', 'bottom', 'keep'])
    def test_rank_tie_methods_on_infs_nans(self, method, na_option,
                                           ascending):
        """Check tie handling on runs of -inf, missing and +inf values."""
        dtypes = [('object', None, Infinity(), NegInfinity()),
                  ('float64', np.nan, np.inf, -np.inf)]
        chunk = 3
        disabled = {('object', 'first')}

        def _check(s, method, na_option, ascending):
            # (lowest, middle, highest) rank run for each tie method.
            exp_ranks = {
                'average': ([2, 2, 2], [5, 5, 5], [8, 8, 8]),
                'min': ([1, 1, 1], [4, 4, 4], [7, 7, 7]),
                'max': ([3, 3, 3], [6, 6, 6], [9, 9, 9]),
                'first': ([1, 2, 3], [4, 5, 6], [7, 8, 9]),
                'dense': ([1, 1, 1], [2, 2, 2], [3, 3, 3])
            }
            ranks = exp_ranks[method]
            # ``order`` follows the input layout: -inf, missing, +inf.
            if na_option == 'top':
                order = [ranks[1], ranks[0], ranks[2]]
            elif na_option == 'bottom':
                order = [ranks[0], ranks[2], ranks[1]]
            else:
                order = [ranks[0], [np.nan] * chunk, ranks[1]]
            expected = order if ascending else order[::-1]
            expected = list(chain.from_iterable(expected))
            result = s.rank(method=method, na_option=na_option,
                            ascending=ascending)
            tm.assert_series_equal(result, Series(expected, dtype='float64'))

        for dtype, na_value, pos_inf, neg_inf in dtypes:
            in_arr = [neg_inf] * chunk + [na_value] * chunk + [pos_inf] * chunk
            iseries = Series(in_arr, dtype=dtype)
            if (dtype, method) in disabled:
                continue
            _check(iseries, method, na_option, ascending)

    def test_rank_desc_mix_nans_infs(self):
        """Descending rank of a series mixing NaN and infinities."""
        # GH 19538
        # check descending ranking when mix nans and infs
        iseries = Series([1, np.nan, np.inf, -np.inf, 25])
        result = iseries.rank(ascending=False)
        exp = Series([3, np.nan, 1, 4, 2], dtype='float64')
        tm.assert_series_equal(result, exp)

    def test_rank_methods_series(self):
        """Compare every tie method against ``scipy.stats.rankdata``."""
        # See test_rank: importorskip needs a module name, so import
        # ``scipy.stats`` and read ``rankdata`` from it; the previous
        # targets could never be imported and always skipped this test.
        rankdata = pytest.importorskip('scipy.stats').rankdata
        import scipy

        xs = np.random.randn(9)
        xs = np.concatenate([xs[i:] for i in range(0, 9, 2)])  # add duplicates
        np.random.shuffle(xs)

        index = [chr(ord('a') + i) for i in range(len(xs))]

        for vals in [xs, xs + 1e6, xs * 1e-6]:
            ts = Series(vals, index=index)

            for m in ['average', 'min', 'max', 'first', 'dense']:
                result = ts.rank(method=m)
                # scipy calls the 'first' tie method 'ordinal'.
                sprank = rankdata(vals, m if m != 'first' else 'ordinal')
                expected = Series(sprank, index=index)

                if LooseVersion(scipy.__version__) >= LooseVersion('0.17.0'):
                    expected = expected.astype('float64')
                tm.assert_series_equal(result, expected)

    def test_rank_dense_method(self):
        """Dense ranks for small inputs across object/float/int dtypes."""
        dtypes = ['O', 'f8', 'i8']
        in_out = [([1], [1]),
                  ([2], [1]),
                  ([0], [1]),
                  ([2, 2], [1, 1]),
                  ([1, 2, 3], [1, 2, 3]),
                  ([4, 2, 1], [3, 2, 1],),
                  ([1, 1, 5, 5, 3], [1, 1, 3, 3, 2]),
                  ([-5, -4, -3, -2, -1], [1, 2, 3, 4, 5])]

        for ser, exp in in_out:
            for dtype in dtypes:
                s = Series(ser).astype(dtype)
                result = s.rank(method='dense')
                expected = Series(exp).astype(result.dtype)
                assert_series_equal(result, expected)

    def test_rank_descending(self):
        """Descending rank must equal ascending rank of ``max - s``."""
        dtypes = ['O', 'f8', 'i8']

        for dtype, method in product(dtypes, self.results):
            if 'i' in dtype:
                # NaN cannot be held by an integer series, so drop it, then
                # cast; without the cast this case only repeated the float
                # one.
                s = self.s.dropna().astype(dtype)
            else:
                s = self.s.astype(dtype)

            res = s.rank(ascending=False)
            expected = (s.max() - s).rank()
            assert_series_equal(res, expected)

            if method == 'first' and dtype == 'O':
                continue

            expected = (s.max() - s).rank(method=method)
            res2 = s.rank(method=method, ascending=False)
            assert_series_equal(res2, expected)

    def test_rank_int(self):
        """Every tie method on the NaN-free, int64 version of ``s``."""
        s = self.s.dropna().astype('i8')

        for method, res in compat.iteritems(self.results):
            result = s.rank(method=method)
            expected = Series(res).dropna()
            expected.index = result.index
            assert_series_equal(result, expected)

    def test_rank_object_bug(self):
        """Ranking an all-NaN object series must not raise."""
        # GH 13445

        # smoke tests
        Series([np.nan] * 32).astype(object).rank(ascending=True)
        Series([np.nan] * 32).astype(object).rank(ascending=False)

    def test_rank_modify_inplace(self):
        """``rank`` must leave the series it is called on unchanged."""
        # GH 18521
        # Check rank does not mutate series
        s = Series([Timestamp('2017-01-05 10:20:27.569000'), NaT])
        expected = s.copy()

        s.rank()
        result = s
        assert_series_equal(result, expected)
class TestRank:
    """Tests for ``DataFrame.rank`` (some also run against ``Series``)."""

    # Shared input: contains ties (1, 2 and 3 each appear twice) and two NaNs.
    s = Series([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3])
    df = DataFrame({"A": s, "B": s})

    # Expected ranks of ``s`` for each tie-breaking method.
    results = {
        "average": np.array([1.5, 5.5, 7.0, 3.5, np.nan, 3.5, 1.5, 8.0, np.nan, 5.5]),
        "min": np.array([1, 5, 7, 3, np.nan, 3, 1, 8, np.nan, 5]),
        "max": np.array([2, 6, 7, 4, np.nan, 4, 2, 8, np.nan, 6]),
        "first": np.array([1, 5, 7, 3, np.nan, 4, 2, 8, np.nan, 6]),
        "dense": np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3]),
    }

    @pytest.fixture(params=["average", "min", "max", "first", "dense"])
    def method(self, request):
        """
        Fixture for trying all rank methods
        """
        return request.param

    @td.skip_if_no_scipy
    def test_rank(self, float_frame):
        """Compare default ranking on both axes with scipy's ``rankdata``."""
        import scipy.stats  # noqa:F401
        from scipy.stats import rankdata

        # Use a single ``.loc`` call: the chained ``frame[col][::k] = ...``
        # form writes to an intermediate object and does not reach the
        # frame under Copy-on-Write.
        float_frame.loc[::2, "A"] = np.nan
        float_frame.loc[::3, "B"] = np.nan
        float_frame.loc[::4, "C"] = np.nan
        float_frame.loc[::5, "D"] = np.nan

        ranks0 = float_frame.rank()
        ranks1 = float_frame.rank(1)
        mask = np.isnan(float_frame.values)

        fvals = float_frame.fillna(np.inf).values

        exp0 = np.apply_along_axis(rankdata, 0, fvals)
        exp0[mask] = np.nan

        exp1 = np.apply_along_axis(rankdata, 1, fvals)
        exp1[mask] = np.nan

        tm.assert_almost_equal(ranks0.values, exp0)
        tm.assert_almost_equal(ranks1.values, exp1)

        # integers
        df = DataFrame(np.random.randint(0, 5, size=40).reshape((10, 4)))

        result = df.rank()
        exp = df.astype(float).rank()
        tm.assert_frame_equal(result, exp)

        result = df.rank(1)
        exp = df.astype(float).rank(1)
        tm.assert_frame_equal(result, exp)

    def test_rank2(self):
        """Rank small literal frames: pct, strings, NaN and datetimes."""
        df = DataFrame([[1, 3, 2], [1, 2, 3]])
        expected = DataFrame([[1.0, 3.0, 2.0], [1, 2, 3]]) / 3.0
        result = df.rank(1, pct=True)
        tm.assert_frame_equal(result, expected)

        df = DataFrame([[1, 3, 2], [1, 2, 3]])
        expected = df.rank(0) / 2.0
        result = df.rank(0, pct=True)
        tm.assert_frame_equal(result, expected)

        df = DataFrame([["b", "c", "a"], ["a", "c", "b"]])
        expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]])
        result = df.rank(1, numeric_only=False)
        tm.assert_frame_equal(result, expected)

        expected = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]])
        result = df.rank(0, numeric_only=False)
        tm.assert_frame_equal(result, expected)

        df = DataFrame([["b", np.nan, "a"], ["a", "c", "b"]])
        expected = DataFrame([[2.0, np.nan, 1.0], [1.0, 3.0, 2.0]])
        result = df.rank(1, numeric_only=False)
        tm.assert_frame_equal(result, expected)

        expected = DataFrame([[2.0, np.nan, 1.0], [1.0, 1.0, 2.0]])
        result = df.rank(0, numeric_only=False)
        tm.assert_frame_equal(result, expected)

        # NOTE: ranking datetimes does not work without an extensive
        # workaround
        data = [
            [datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)],
            [datetime(2000, 1, 2), datetime(2000, 1, 3), datetime(2000, 1, 1)],
        ]
        df = DataFrame(data)

        # check the rank
        expected = DataFrame([[2.0, np.nan, 1.0], [2.0, 3.0, 1.0]])
        result = df.rank(1, numeric_only=False, ascending=True)
        tm.assert_frame_equal(result, expected)

        expected = DataFrame([[1.0, np.nan, 2.0], [2.0, 1.0, 3.0]])
        result = df.rank(1, numeric_only=False, ascending=False)
        tm.assert_frame_equal(result, expected)

        df = DataFrame({"a": [1e-20, -5, 1e-20 + 1e-40, 10, 1e60, 1e80, 1e-30]})
        exp = DataFrame({"a": [3.5, 1.0, 3.5, 5.0, 6.0, 7.0, 2.0]})
        tm.assert_frame_equal(df.rank(), exp)

    def test_rank_does_not_mutate(self):
        """``rank`` must leave the frame it is called on unchanged."""
        # GH#18521
        # Check rank does not mutate DataFrame
        df = DataFrame(np.random.randn(10, 3), dtype="float64")
        expected = df.copy()
        df.rank()
        result = df
        tm.assert_frame_equal(result, expected)

    def test_rank_mixed_frame(self, float_string_frame):
        """Ranking a mixed-dtype frame warns and matches numeric_only=True."""
        float_string_frame["datetime"] = datetime.now()
        float_string_frame["timedelta"] = timedelta(days=1, seconds=1)

        with tm.assert_produces_warning(FutureWarning, match="numeric_only=None"):
            float_string_frame.rank(numeric_only=None)
        with tm.assert_produces_warning(FutureWarning, match="Dropping of nuisance"):
            result = float_string_frame.rank(1)
        expected = float_string_frame.rank(1, numeric_only=True)
        tm.assert_frame_equal(result, expected)

    @td.skip_if_no_scipy
    def test_rank_na_option(self, float_frame):
        """Check ``na_option`` top/bottom, ascending and descending."""
        import scipy.stats  # noqa:F401
        from scipy.stats import rankdata

        # Single ``.loc`` assignments instead of chained indexing, which
        # does not write through to the frame under Copy-on-Write.
        float_frame.loc[::2, "A"] = np.nan
        float_frame.loc[::3, "B"] = np.nan
        float_frame.loc[::4, "C"] = np.nan
        float_frame.loc[::5, "D"] = np.nan

        # ascending, NaNs at the bottom: +inf stands in for NaN
        ranks0 = float_frame.rank(na_option="bottom")
        ranks1 = float_frame.rank(1, na_option="bottom")

        fvals = float_frame.fillna(np.inf).values

        exp0 = np.apply_along_axis(rankdata, 0, fvals)
        exp1 = np.apply_along_axis(rankdata, 1, fvals)

        tm.assert_almost_equal(ranks0.values, exp0)
        tm.assert_almost_equal(ranks1.values, exp1)

        # ascending, NaNs at the top: (min - 1) stands in for NaN
        ranks0 = float_frame.rank(na_option="top")
        ranks1 = float_frame.rank(1, na_option="top")

        fval0 = float_frame.fillna((float_frame.min() - 1).to_dict()).values
        fval1 = float_frame.T
        fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T
        fval1 = fval1.fillna(np.inf).values

        exp0 = np.apply_along_axis(rankdata, 0, fval0)
        exp1 = np.apply_along_axis(rankdata, 1, fval1)

        tm.assert_almost_equal(ranks0.values, exp0)
        tm.assert_almost_equal(ranks1.values, exp1)

        # descending, NaNs at the top: negated +inf fill sorts first
        ranks0 = float_frame.rank(na_option="top", ascending=False)
        ranks1 = float_frame.rank(1, na_option="top", ascending=False)

        fvals = float_frame.fillna(np.inf).values

        exp0 = np.apply_along_axis(rankdata, 0, -fvals)
        exp1 = np.apply_along_axis(rankdata, 1, -fvals)

        tm.assert_almost_equal(ranks0.values, exp0)
        tm.assert_almost_equal(ranks1.values, exp1)

        # descending, NaNs at the bottom: negated (min - 1) fill sorts last
        ranks0 = float_frame.rank(na_option="bottom", ascending=False)
        ranks1 = float_frame.rank(1, na_option="bottom", ascending=False)

        fval0 = float_frame.fillna((float_frame.min() - 1).to_dict()).values
        fval1 = float_frame.T
        fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T
        fval1 = fval1.fillna(np.inf).values

        exp0 = np.apply_along_axis(rankdata, 0, -fval0)
        exp1 = np.apply_along_axis(rankdata, 1, -fval1)

        tm.assert_numpy_array_equal(ranks0.values, exp0)
        tm.assert_numpy_array_equal(ranks1.values, exp1)

        # bad values throw error
        msg = "na_option must be one of 'keep', 'top', or 'bottom'"

        with pytest.raises(ValueError, match=msg):
            float_frame.rank(na_option="bad", ascending=False)

        # invalid type
        with pytest.raises(ValueError, match=msg):
            float_frame.rank(na_option=True, ascending=False)

    def test_rank_axis(self):
        """Axis names and axis numbers must give the same result."""
        # check if using axes' names gives the same result
        df = DataFrame([[2, 1], [4, 3]])
        tm.assert_frame_equal(df.rank(axis=0), df.rank(axis="index"))
        tm.assert_frame_equal(df.rank(axis=1), df.rank(axis="columns"))

    @td.skip_if_no_scipy
    def test_rank_methods_frame(self):
        """Compare every tie method on both axes against scipy."""
        import scipy.stats  # noqa:F401
        from scipy.stats import rankdata

        xs = np.random.randint(0, 21, (100, 26))
        xs = (xs - 10.0) / 10.0
        cols = [chr(ord("z") - i) for i in range(xs.shape[1])]

        for vals in [xs, xs + 1e6, xs * 1e-6]:
            df = DataFrame(vals, columns=cols)

            for ax in [0, 1]:
                for m in ["average", "min", "max", "first", "dense"]:
                    result = df.rank(axis=ax, method=m)
                    # scipy calls the "first" tie method "ordinal".
                    sprank = np.apply_along_axis(
                        rankdata, ax, vals, m if m != "first" else "ordinal"
                    )
                    sprank = sprank.astype(np.float64)
                    expected = DataFrame(sprank, columns=cols).astype("float64")
                    tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("dtype", ["O", "f8", "i8"])
    @pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning")
    def test_rank_descending(self, method, dtype):
        """Descending rank must equal ascending rank of ``max - df``."""
        if "i" in dtype:
            # NaN cannot be held by an integer column, so drop it first.
            df = self.df.dropna().astype(dtype)
        else:
            df = self.df.astype(dtype)

        res = df.rank(ascending=False)
        expected = (df.max() - df).rank()
        tm.assert_frame_equal(res, expected)

        expected = (df.max() - df).rank(method=method)

        if dtype != "O":
            res2 = df.rank(method=method, ascending=False, numeric_only=True)
            tm.assert_frame_equal(res2, expected)

        res3 = df.rank(method=method, ascending=False, numeric_only=False)
        tm.assert_frame_equal(res3, expected)

    @pytest.mark.parametrize("axis", [0, 1])
    @pytest.mark.parametrize("dtype", [None, object])
    def test_rank_2d_tie_methods(self, method, axis, dtype):
        """Check ``results`` column-wise, and row-wise on the transpose."""
        df = self.df

        def _check2d(df, expected, method="average", axis=0):
            exp_df = DataFrame({"A": expected, "B": expected})

            # For axis=1 rank the transposed frame so each row holds ``s``.
            if axis == 1:
                df = df.T
                exp_df = exp_df.T

            result = df.rank(method=method, axis=axis)
            tm.assert_frame_equal(result, exp_df)

        frame = df if dtype is None else df.astype(dtype)
        _check2d(frame, self.results[method], method=method, axis=axis)

    @pytest.mark.parametrize(
        "method,exp",
        [
            ("dense", [[1.0, 1.0, 1.0], [1.0, 0.5, 2.0 / 3], [1.0, 0.5, 1.0 / 3]]),
            (
                "min",
                [
                    [1.0 / 3, 1.0, 1.0],
                    [1.0 / 3, 1.0 / 3, 2.0 / 3],
                    [1.0 / 3, 1.0 / 3, 1.0 / 3],
                ],
            ),
            (
                "max",
                [[1.0, 1.0, 1.0], [1.0, 2.0 / 3, 2.0 / 3], [1.0, 2.0 / 3, 1.0 / 3]],
            ),
            (
                "average",
                [[2.0 / 3, 1.0, 1.0], [2.0 / 3, 0.5, 2.0 / 3], [2.0 / 3, 0.5, 1.0 / 3]],
            ),
            (
                "first",
                [
                    [1.0 / 3, 1.0, 1.0],
                    [2.0 / 3, 1.0 / 3, 2.0 / 3],
                    [3.0 / 3, 2.0 / 3, 1.0 / 3],
                ],
            ),
        ],
    )
    def test_rank_pct_true(self, method, exp):
        """Percentage ranks for each tie method on a frame with ties."""
        # see gh-15630.

        df = DataFrame([[2012, 66, 3], [2012, 65, 2], [2012, 65, 1]])
        result = df.rank(method=method, pct=True)

        expected = DataFrame(exp)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.single_cpu
    @pytest.mark.high_memory
    def test_pct_max_many_rows(self):
        """The largest percentage rank must be exactly 1 for 2**24 + 1 rows."""
        # GH 18271
        df = DataFrame(
            {"A": np.arange(2**24 + 1), "B": np.arange(2**24 + 1, 0, -1)}
        )
        result = df.rank(pct=True).max()
        assert (result == 1).all()

    @pytest.mark.parametrize(
        "contents,dtype",
        [
            (
                [
                    -np.inf,
                    -50,
                    -1,
                    -1e-20,
                    -1e-25,
                    -1e-50,
                    0,
                    1e-40,
                    1e-20,
                    1e-10,
                    2,
                    40,
                    np.inf,
                ],
                "float64",
            ),
            (
                [
                    -np.inf,
                    -50,
                    -1,
                    -1e-20,
                    -1e-25,
                    -1e-45,
                    0,
                    1e-40,
                    1e-20,
                    1e-10,
                    2,
                    40,
                    np.inf,
                ],
                "float32",
            ),
            ([np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max], "uint8"),
            (
                [
                    np.iinfo(np.int64).min,
                    -100,
                    0,
                    1,
                    9999,
                    100000,
                    1e10,
                    np.iinfo(np.int64).max,
                ],
                "int64",
            ),
            ([NegInfinity(), "1", "A", "BA", "Ba", "C", Infinity()], "object"),
            (
                [datetime(2001, 1, 1), datetime(2001, 1, 2), datetime(2001, 1, 5)],
                "datetime64",
            ),
        ],
    )
    def test_rank_inf_and_nan(self, contents, dtype, frame_or_series):
        """Rank sorted ``contents`` after shuffling and inserting NAs."""
        dtype_na_map = {
            "float64": np.nan,
            "float32": np.nan,
            "object": None,
            "datetime64": np.datetime64("nat"),
        }
        # Insert nans at random positions if underlying dtype has missing
        # value. Then adjust the expected order by adding nans accordingly
        # This is for testing whether rank calculation is affected
        # when values are intertwined with nan values.
        values = np.array(contents, dtype=dtype)
        exp_order = np.array(range(len(values)), dtype="float64") + 1.0
        if dtype in dtype_na_map:
            na_value = dtype_na_map[dtype]
            nan_indices = np.random.choice(range(len(values)), 5)
            values = np.insert(values, nan_indices, na_value)
            exp_order = np.insert(exp_order, nan_indices, np.nan)

        # Shuffle the testing array and expected results in the same way
        random_order = np.random.permutation(len(values))
        obj = frame_or_series(values[random_order])
        expected = frame_or_series(exp_order[random_order], dtype="float64")
        result = obj.rank()
        tm.assert_equal(result, expected)

    def test_df_series_inf_nan_consistency(self):
        """Frame ranking must match ranking each column as a Series."""
        # GH#32593
        index = [5, 4, 3, 2, 1, 6, 7, 8, 9, 10]
        col1 = [5, 4, 3, 5, 8, 5, 2, 1, 6, 6]
        col2 = [5, 4, np.nan, 5, 8, 5, np.inf, np.nan, 6, -np.inf]
        df = DataFrame(
            data={
                "col1": col1,
                "col2": col2,
            },
            index=index,
            dtype="f8",
        )
        df_result = df.rank()

        series_result = df.copy()
        series_result["col1"] = df["col1"].rank()
        series_result["col2"] = df["col2"].rank()

        tm.assert_frame_equal(df_result, series_result)

    def test_rank_both_inf(self):
        """-inf and +inf rank below and above a finite value."""
        # GH#32593
        df = DataFrame({"a": [-np.inf, 0, np.inf]})
        expected = DataFrame({"a": [1.0, 2.0, 3.0]})
        result = df.rank()
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "na_option,ascending,expected",
        [
            ("top", True, [3.0, 1.0, 2.0]),
            ("top", False, [2.0, 1.0, 3.0]),
            ("bottom", True, [2.0, 3.0, 1.0]),
            ("bottom", False, [1.0, 3.0, 2.0]),
        ],
    )
    def test_rank_inf_nans_na_option(
        self, frame_or_series, method, na_option, ascending, expected
    ):
        """Rank ``[inf, nan, -inf]`` for each na_option and direction."""
        obj = frame_or_series([np.inf, np.nan, -np.inf])
        result = obj.rank(method=method, na_option=na_option, ascending=ascending)
        expected = frame_or_series(expected)
        tm.assert_equal(result, expected)

    @pytest.mark.parametrize(
        "na_option,ascending,expected",
        [
            ("bottom", True, [1.0, 2.0, 4.0, 3.0]),
            ("bottom", False, [1.0, 2.0, 4.0, 3.0]),
            ("top", True, [2.0, 3.0, 1.0, 4.0]),
            ("top", False, [2.0, 3.0, 1.0, 4.0]),
        ],
    )
    def test_rank_object_first(self, frame_or_series, na_option, ascending, expected):
        """method="first" on object data holding ties and a ``None``."""
        obj = frame_or_series(["foo", "foo", None, "foo"])
        result = obj.rank(method="first", na_option=na_option, ascending=ascending)
        expected = frame_or_series(expected)
        tm.assert_equal(result, expected)

    @pytest.mark.parametrize(
        "data,expected",
        [
            ({"a": [1, 2, "a"], "b": [4, 5, 6]}, DataFrame({"b": [1.0, 2.0, 3.0]})),
            ({"a": [1, 2, "a"]}, DataFrame(index=range(3))),
        ],
    )
    def test_rank_mixed_axis_zero(self, data, expected):
        """A column mixing ints and strings is dropped with a warning."""
        df = DataFrame(data)
        msg = "Dropping of nuisance columns"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            result = df.rank()
        tm.assert_frame_equal(result, expected)
class TestSeriesRank:
    """Tests for ``Series.rank``."""

    # Shared input: contains ties (1, 2 and 3 each appear twice) and two NaNs.
    s = Series([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3])

    # Expected ranks of ``s`` for each tie-breaking method.
    results = {
        "average": np.array([1.5, 5.5, 7.0, 3.5, np.nan, 3.5, 1.5, 8.0, np.nan, 5.5]),
        "min": np.array([1, 5, 7, 3, np.nan, 3, 1, 8, np.nan, 5]),
        "max": np.array([2, 6, 7, 4, np.nan, 4, 2, 8, np.nan, 6]),
        "first": np.array([1, 5, 7, 3, np.nan, 4, 2, 8, np.nan, 6]),
        "dense": np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3]),
    }

    def test_rank(self, datetime_series):
        """Compare default ranking with scipy and check ``pct=True`` cases."""
        # ``rankdata`` is a function, not a module: asking importorskip for
        # "scipy.stats.rankdata" (or the non-existent "scipy.stats.special")
        # can never import, so the test was always skipped.  Import the
        # module and take the function from it instead.
        rankdata = pytest.importorskip("scipy.stats").rankdata

        datetime_series[::2] = np.nan
        # Single slice rather than chained ``[:10][::3]``: the chained form
        # assigns into an intermediate object and does not reach the series
        # under Copy-on-Write.
        datetime_series[:10:3] = 4.0

        ranks = datetime_series.rank()
        oranks = datetime_series.astype("O").rank()

        tm.assert_series_equal(ranks, oranks)

        mask = np.isnan(datetime_series)
        filled = datetime_series.fillna(np.inf)

        # rankdata returns a ndarray
        exp = Series(rankdata(filled), index=filled.index, name="ts")
        exp[mask] = np.nan

        tm.assert_series_equal(ranks, exp)

        iseries = Series(np.arange(5).repeat(2))

        iranks = iseries.rank()
        exp = iseries.astype(float).rank()
        tm.assert_series_equal(iranks, exp)
        iseries = Series(np.arange(5)) + 1.0
        exp = iseries / 5.0
        iranks = iseries.rank(pct=True)

        tm.assert_series_equal(iranks, exp)

        iseries = Series(np.repeat(1, 100))
        exp = Series(np.repeat(0.505, 100))
        iranks = iseries.rank(pct=True)
        tm.assert_series_equal(iranks, exp)

        iseries[1] = np.nan
        exp = Series(np.repeat(50.0 / 99.0, 100))
        exp[1] = np.nan
        iranks = iseries.rank(pct=True)
        tm.assert_series_equal(iranks, exp)

        iseries = Series(np.arange(5)) + 1.0
        iseries[4] = np.nan
        exp = iseries / 4.0
        iranks = iseries.rank(pct=True)
        tm.assert_series_equal(iranks, exp)

        iseries = Series(np.repeat(np.nan, 100))
        exp = iseries.copy()
        iranks = iseries.rank(pct=True)
        tm.assert_series_equal(iranks, exp)

        iseries = Series(np.arange(5)) + 1
        iseries[4] = np.nan
        exp = iseries / 4.0
        iranks = iseries.rank(pct=True)
        tm.assert_series_equal(iranks, exp)

        rng = date_range("1/1/1990", periods=5)
        iseries = Series(np.arange(5), rng) + 1
        iseries.iloc[4] = np.nan
        exp = iseries / 4.0
        iranks = iseries.rank(pct=True)
        tm.assert_series_equal(iranks, exp)

        iseries = Series([1e-50, 1e-100, 1e-20, 1e-2, 1e-20 + 1e-30, 1e-1])
        exp = Series([2, 1, 3, 5, 4, 6.0])
        iranks = iseries.rank()
        tm.assert_series_equal(iranks, exp)

        # GH 5968
        iseries = Series(["3 day", "1 day 10m", "-2 day", NaT], dtype="m8[ns]")
        exp = Series([3, 2, 1, np.nan])
        iranks = iseries.rank()
        tm.assert_series_equal(iranks, exp)

        # ``values`` is sorted ascending, so after shuffling the rank of
        # each element is its original position plus one.
        values = np.array(
            [-50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, 2, 40],
            dtype="float64",
        )
        random_order = np.random.permutation(len(values))
        iseries = Series(values[random_order])
        exp = Series(random_order + 1.0, dtype="float64")
        iranks = iseries.rank()
        tm.assert_series_equal(iranks, exp)

    def test_rank_categorical(self):
        """Rank ordered/unordered categoricals, with na_option and pct."""
        # GH issue #15420 rank incorrectly orders ordered categories
        names = ["first", "second", "third", "fourth", "fifth", "sixth"]

        # Test ascending/descending ranking for ordered categoricals
        exp = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
        exp_desc = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0])
        ordered = Series(names).astype(
            CategoricalDtype(categories=names, ordered=True)
        )
        tm.assert_series_equal(ordered.rank(), exp)
        tm.assert_series_equal(ordered.rank(ascending=False), exp_desc)

        # Unordered categoricals should be ranked as objects
        unordered = Series(names).astype(
            CategoricalDtype(categories=names, ordered=False)
        )
        exp_unordered = Series([2.0, 4.0, 6.0, 3.0, 1.0, 5.0])
        res = unordered.rank()
        tm.assert_series_equal(res, exp_unordered)

        unordered1 = Series([1, 2, 3, 4, 5, 6]).astype(
            CategoricalDtype([1, 2, 3, 4, 5, 6], False)
        )
        exp_unordered1 = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
        res1 = unordered1.rank()
        tm.assert_series_equal(res1, exp_unordered1)

        # Test na_option for rank data
        # ``np.nan`` instead of the ``np.NaN`` alias (removed in NumPy 2.0).
        na_ser = Series(names + [np.nan]).astype(
            CategoricalDtype(names + ["seventh"], True)
        )

        exp_top = Series([2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 1.0])
        exp_bot = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0])
        exp_keep = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, np.nan])

        tm.assert_series_equal(na_ser.rank(na_option="top"), exp_top)
        tm.assert_series_equal(na_ser.rank(na_option="bottom"), exp_bot)
        tm.assert_series_equal(na_ser.rank(na_option="keep"), exp_keep)

        # Test na_option for rank data with ascending False
        exp_top = Series([7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0])
        exp_bot = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 7.0])
        exp_keep = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0, np.nan])

        tm.assert_series_equal(na_ser.rank(na_option="top", ascending=False), exp_top)
        tm.assert_series_equal(
            na_ser.rank(na_option="bottom", ascending=False), exp_bot
        )
        tm.assert_series_equal(na_ser.rank(na_option="keep", ascending=False), exp_keep)

        # Test invalid values for na_option
        msg = "na_option must be one of 'keep', 'top', or 'bottom'"

        with pytest.raises(ValueError, match=msg):
            na_ser.rank(na_option="bad", ascending=False)

        # invalid type
        with pytest.raises(ValueError, match=msg):
            na_ser.rank(na_option=True, ascending=False)

        # Test with pct=True
        na_ser = Series(["first", "second", "third", "fourth", np.nan]).astype(
            CategoricalDtype(["first", "second", "third", "fourth"], True)
        )
        exp_top = Series([0.4, 0.6, 0.8, 1.0, 0.2])
        exp_bot = Series([0.2, 0.4, 0.6, 0.8, 1.0])
        exp_keep = Series([0.25, 0.5, 0.75, 1.0, np.nan])

        tm.assert_series_equal(na_ser.rank(na_option="top", pct=True), exp_top)
        tm.assert_series_equal(na_ser.rank(na_option="bottom", pct=True), exp_bot)
        tm.assert_series_equal(na_ser.rank(na_option="keep", pct=True), exp_keep)

    def test_rank_signature(self):
        """Passing the method name positionally must raise ValueError."""
        s = Series([0, 1])
        s.rank(method="average")
        msg = "No axis named average for object type Series"
        with pytest.raises(ValueError, match=msg):
            s.rank("average")

    @pytest.mark.parametrize(
        "contents,dtype",
        [
            (
                [
                    -np.inf,
                    -50,
                    -1,
                    -1e-20,
                    -1e-25,
                    -1e-50,
                    0,
                    1e-40,
                    1e-20,
                    1e-10,
                    2,
                    40,
                    np.inf,
                ],
                "float64",
            ),
            (
                [
                    -np.inf,
                    -50,
                    -1,
                    -1e-20,
                    -1e-25,
                    -1e-45,
                    0,
                    1e-40,
                    1e-20,
                    1e-10,
                    2,
                    40,
                    np.inf,
                ],
                "float32",
            ),
            ([np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max], "uint8"),
            pytest.param(
                [
                    np.iinfo(np.int64).min,
                    -100,
                    0,
                    1,
                    9999,
                    100000,
                    1e10,
                    np.iinfo(np.int64).max,
                ],
                "int64",
                marks=pytest.mark.xfail(
                    # Trailing space added: the two literals are
                    # concatenated and used to read "dtypeint64".
                    reason="iNaT is equivalent to minimum value of dtype "
                    "int64 pending issue GH#16674"
                ),
            ),
            ([NegInfinity(), "1", "A", "BA", "Ba", "C", Infinity()], "object"),
        ],
    )
    def test_rank_inf(self, contents, dtype):
        """Rank sorted ``contents`` after shuffling and inserting NAs."""
        dtype_na_map = {
            "float64": np.nan,
            "float32": np.nan,
            "int64": iNaT,
            "object": None,
        }
        # Insert nans at random positions if underlying dtype has missing
        # value. Then adjust the expected order by adding nans accordingly
        # This is for testing whether rank calculation is affected
        # when values are intertwined with nan values.
        values = np.array(contents, dtype=dtype)
        exp_order = np.array(range(len(values)), dtype="float64") + 1.0
        if dtype in dtype_na_map:
            na_value = dtype_na_map[dtype]
            nan_indices = np.random.choice(range(len(values)), 5)
            values = np.insert(values, nan_indices, na_value)
            exp_order = np.insert(exp_order, nan_indices, np.nan)
        # shuffle the testing array and expected results in the same way
        random_order = np.random.permutation(len(values))
        iseries = Series(values[random_order])
        exp = Series(exp_order[random_order], dtype="float64")
        iranks = iseries.rank()
        tm.assert_series_equal(iranks, exp)

    def test_rank_tie_methods(self):
        """Check ``results`` for every method on float and object data."""
        s = self.s

        def _check(s, expected, method="average"):
            result = s.rank(method=method)
            tm.assert_series_equal(result, Series(expected))

        dtypes = [None, object]
        disabled = {(object, "first")}
        results = self.results

        for method, dtype in product(results, dtypes):
            if (dtype, method) in disabled:
                continue
            series = s if dtype is None else s.astype(dtype)
            _check(series, results[method], method=method)

    # NOTE(review): nothing in this test calls scipy, so the skip below
    # looks unnecessary -- confirm before removing.
    @td.skip_if_no_scipy
    @pytest.mark.parametrize("ascending", [True, False])
    @pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"])
    @pytest.mark.parametrize("na_option", ["top", "bottom", "keep"])
    def test_rank_tie_methods_on_infs_nans(self, method, na_option, ascending):
        """Check tie handling on runs of -inf, missing and +inf values."""
        dtypes = [
            ("object", None, Infinity(), NegInfinity()),
            ("float64", np.nan, np.inf, -np.inf),
        ]
        chunk = 3
        disabled = {("object", "first")}

        def _check(s, method, na_option, ascending):
            # (lowest, middle, highest) rank run for each tie method.
            exp_ranks = {
                "average": ([2, 2, 2], [5, 5, 5], [8, 8, 8]),
                "min": ([1, 1, 1], [4, 4, 4], [7, 7, 7]),
                "max": ([3, 3, 3], [6, 6, 6], [9, 9, 9]),
                "first": ([1, 2, 3], [4, 5, 6], [7, 8, 9]),
                "dense": ([1, 1, 1], [2, 2, 2], [3, 3, 3]),
            }
            ranks = exp_ranks[method]
            # ``order`` follows the input layout: -inf, missing, +inf.
            if na_option == "top":
                order = [ranks[1], ranks[0], ranks[2]]
            elif na_option == "bottom":
                order = [ranks[0], ranks[2], ranks[1]]
            else:
                order = [ranks[0], [np.nan] * chunk, ranks[1]]
            expected = order if ascending else order[::-1]
            expected = list(chain.from_iterable(expected))
            result = s.rank(method=method, na_option=na_option, ascending=ascending)
            tm.assert_series_equal(result, Series(expected, dtype="float64"))

        for dtype, na_value, pos_inf, neg_inf in dtypes:
            in_arr = [neg_inf] * chunk + [na_value] * chunk + [pos_inf] * chunk
            iseries = Series(in_arr, dtype=dtype)
            if (dtype, method) in disabled:
                continue
            _check(iseries, method, na_option, ascending)

    def test_rank_desc_mix_nans_infs(self):
        """Descending rank of a series mixing NaN and infinities."""
        # GH 19538
        # check descending ranking when mix nans and infs
        iseries = Series([1, np.nan, np.inf, -np.inf, 25])
        result = iseries.rank(ascending=False)
        exp = Series([3, np.nan, 1, 4, 2], dtype="float64")
        tm.assert_series_equal(result, exp)

    def test_rank_methods_series(self):
        """Compare every tie method against ``scipy.stats.rankdata``."""
        # See test_rank: importorskip needs a module name, so import
        # ``scipy.stats`` and read ``rankdata`` from it; the previous
        # targets could never be imported and always skipped this test.
        rankdata = pytest.importorskip("scipy.stats").rankdata

        xs = np.random.randn(9)
        xs = np.concatenate([xs[i:] for i in range(0, 9, 2)])  # add duplicates
        np.random.shuffle(xs)

        index = [chr(ord("a") + i) for i in range(len(xs))]

        for vals in [xs, xs + 1e6, xs * 1e-6]:
            ts = Series(vals, index=index)

            for m in ["average", "min", "max", "first", "dense"]:
                result = ts.rank(method=m)
                # scipy calls the "first" tie method "ordinal".
                sprank = rankdata(vals, m if m != "first" else "ordinal")
                expected = Series(sprank, index=index).astype("float64")
                tm.assert_series_equal(result, expected)

    def test_rank_dense_method(self):
        """Dense ranks for small inputs across object/float/int dtypes."""
        dtypes = ["O", "f8", "i8"]
        in_out = [
            ([1], [1]),
            ([2], [1]),
            ([0], [1]),
            ([2, 2], [1, 1]),
            ([1, 2, 3], [1, 2, 3]),
            ([4, 2, 1], [3, 2, 1]),
            ([1, 1, 5, 5, 3], [1, 1, 3, 3, 2]),
            ([-5, -4, -3, -2, -1], [1, 2, 3, 4, 5]),
        ]

        for ser, exp in in_out:
            for dtype in dtypes:
                s = Series(ser).astype(dtype)
                result = s.rank(method="dense")
                expected = Series(exp).astype(result.dtype)
                tm.assert_series_equal(result, expected)

    def test_rank_descending(self):
        """Descending rank must equal ascending rank of ``max - s``."""
        dtypes = ["O", "f8", "i8"]

        for dtype, method in product(dtypes, self.results):
            if "i" in dtype:
                # NaN cannot be held by an integer series, so drop it, then
                # cast; without the cast this case only repeated the float
                # one.
                s = self.s.dropna().astype(dtype)
            else:
                s = self.s.astype(dtype)

            res = s.rank(ascending=False)
            expected = (s.max() - s).rank()
            tm.assert_series_equal(res, expected)

            if method == "first" and dtype == "O":
                continue

            expected = (s.max() - s).rank(method=method)
            res2 = s.rank(method=method, ascending=False)
            tm.assert_series_equal(res2, expected)

    def test_rank_int(self):
        """Every tie method on the NaN-free, int64 version of ``s``."""
        s = self.s.dropna().astype("i8")

        for method, res in self.results.items():
            result = s.rank(method=method)
            expected = Series(res).dropna()
            expected.index = result.index
            tm.assert_series_equal(result, expected)

    def test_rank_object_bug(self):
        """Ranking an all-NaN object series must not raise."""
        # GH 13445

        # smoke tests
        Series([np.nan] * 32).astype(object).rank(ascending=True)
        Series([np.nan] * 32).astype(object).rank(ascending=False)

    def test_rank_modify_inplace(self):
        """``rank`` must leave the series it is called on unchanged."""
        # GH 18521
        # Check rank does not mutate series
        s = Series([Timestamp("2017-01-05 10:20:27.569000"), NaT])
        expected = s.copy()

        s.rank()
        result = s
        tm.assert_series_equal(result, expected)
class TestSeriesRank:
    """Tests for ``Series.rank``: tie methods, ``na_option``, ``pct`` and dtypes."""

    @td.skip_if_no_scipy
    def test_rank(self, datetime_series):
        """Check default ranking against scipy and a set of hand-built cases."""
        from scipy.stats import rankdata

        datetime_series[::2] = np.nan
        # One positional slice instead of chained ``[:10][::3]``: the chained
        # form assigns into a temporary and is a no-op under Copy-on-Write.
        datetime_series.iloc[:10:3] = 4.0

        ranks = datetime_series.rank()
        oranks = datetime_series.astype("O").rank()

        tm.assert_series_equal(ranks, oranks)

        mask = np.isnan(datetime_series)
        filled = datetime_series.fillna(np.inf)

        # rankdata returns a ndarray
        exp = Series(rankdata(filled), index=filled.index, name="ts")
        exp[mask] = np.nan

        tm.assert_series_equal(ranks, exp)

        iseries = Series(np.arange(5).repeat(2))

        iranks = iseries.rank()
        exp = iseries.astype(float).rank()
        tm.assert_series_equal(iranks, exp)
        iseries = Series(np.arange(5)) + 1.0
        exp = iseries / 5.0
        iranks = iseries.rank(pct=True)

        tm.assert_series_equal(iranks, exp)

        iseries = Series(np.repeat(1, 100))
        exp = Series(np.repeat(0.505, 100))
        iranks = iseries.rank(pct=True)
        tm.assert_series_equal(iranks, exp)

        iseries[1] = np.nan
        exp = Series(np.repeat(50.0 / 99.0, 100))
        exp[1] = np.nan
        iranks = iseries.rank(pct=True)
        tm.assert_series_equal(iranks, exp)

        iseries = Series(np.arange(5)) + 1.0
        iseries[4] = np.nan
        exp = iseries / 4.0
        iranks = iseries.rank(pct=True)
        tm.assert_series_equal(iranks, exp)

        iseries = Series(np.repeat(np.nan, 100))
        exp = iseries.copy()
        iranks = iseries.rank(pct=True)
        tm.assert_series_equal(iranks, exp)

        iseries = Series(np.arange(5)) + 1
        iseries[4] = np.nan
        exp = iseries / 4.0
        iranks = iseries.rank(pct=True)
        tm.assert_series_equal(iranks, exp)

        rng = date_range("1/1/1990", periods=5)
        iseries = Series(np.arange(5), rng) + 1
        iseries.iloc[4] = np.nan
        exp = iseries / 4.0
        iranks = iseries.rank(pct=True)
        tm.assert_series_equal(iranks, exp)

        iseries = Series([1e-50, 1e-100, 1e-20, 1e-2, 1e-20 + 1e-30, 1e-1])
        exp = Series([2, 1, 3, 5, 4, 6.0])
        iranks = iseries.rank()
        tm.assert_series_equal(iranks, exp)

        # GH 5968
        iseries = Series(["3 day", "1 day 10m", "-2 day", NaT], dtype="m8[ns]")
        exp = Series([3, 2, 1, np.nan])
        iranks = iseries.rank()
        tm.assert_series_equal(iranks, exp)

        values = np.array(
            [-50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, 2, 40],
            dtype="float64",
        )
        # ``values`` is sorted ascending, so after shuffling, the element at
        # position i has rank ``random_order[i] + 1``.
        random_order = np.random.permutation(len(values))
        iseries = Series(values[random_order])
        exp = Series(random_order + 1.0, dtype="float64")
        iranks = iseries.rank()
        tm.assert_series_equal(iranks, exp)

    def test_rank_categorical(self):
        # GH issue #15420 rank incorrectly orders ordered categories

        # Test ascending/descending ranking for ordered categoricals
        exp = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
        exp_desc = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0])
        ordered = Series(
            ["first", "second", "third", "fourth", "fifth", "sixth"]
        ).astype(
            CategoricalDtype(
                categories=["first", "second", "third", "fourth", "fifth", "sixth"],
                ordered=True,
            )
        )
        tm.assert_series_equal(ordered.rank(), exp)
        tm.assert_series_equal(ordered.rank(ascending=False), exp_desc)

        # Unordered categoricals should be ranked as objects
        unordered = Series(
            ["first", "second", "third", "fourth", "fifth", "sixth"]
        ).astype(
            CategoricalDtype(
                categories=["first", "second", "third", "fourth", "fifth", "sixth"],
                ordered=False,
            )
        )
        exp_unordered = Series([2.0, 4.0, 6.0, 3.0, 1.0, 5.0])
        res = unordered.rank()
        tm.assert_series_equal(res, exp_unordered)

        unordered1 = Series([1, 2, 3, 4, 5, 6]).astype(
            CategoricalDtype([1, 2, 3, 4, 5, 6], False)
        )
        exp_unordered1 = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
        res1 = unordered1.rank()
        tm.assert_series_equal(res1, exp_unordered1)

        # Test na_option for rank data
        na_ser = Series(
            ["first", "second", "third", "fourth", "fifth", "sixth", np.nan]
        ).astype(
            CategoricalDtype(
                ["first", "second", "third", "fourth", "fifth", "sixth", "seventh"],
                True,
            )
        )

        exp_top = Series([2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 1.0])
        exp_bot = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0])
        exp_keep = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, np.nan])

        tm.assert_series_equal(na_ser.rank(na_option="top"), exp_top)
        tm.assert_series_equal(na_ser.rank(na_option="bottom"), exp_bot)
        tm.assert_series_equal(na_ser.rank(na_option="keep"), exp_keep)

        # Test na_option for rank data with ascending False
        exp_top = Series([7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0])
        exp_bot = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 7.0])
        exp_keep = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0, np.nan])

        tm.assert_series_equal(na_ser.rank(na_option="top", ascending=False), exp_top)
        tm.assert_series_equal(
            na_ser.rank(na_option="bottom", ascending=False), exp_bot
        )
        tm.assert_series_equal(na_ser.rank(na_option="keep", ascending=False), exp_keep)

        # Test invalid values for na_option
        msg = "na_option must be one of 'keep', 'top', or 'bottom'"

        with pytest.raises(ValueError, match=msg):
            na_ser.rank(na_option="bad", ascending=False)

        # invalid type
        with pytest.raises(ValueError, match=msg):
            na_ser.rank(na_option=True, ascending=False)

        # Test with pct=True
        na_ser = Series(["first", "second", "third", "fourth", np.nan]).astype(
            CategoricalDtype(["first", "second", "third", "fourth"], True)
        )
        exp_top = Series([0.4, 0.6, 0.8, 1.0, 0.2])
        exp_bot = Series([0.2, 0.4, 0.6, 0.8, 1.0])
        exp_keep = Series([0.25, 0.5, 0.75, 1.0, np.nan])

        tm.assert_series_equal(na_ser.rank(na_option="top", pct=True), exp_top)
        tm.assert_series_equal(na_ser.rank(na_option="bottom", pct=True), exp_bot)
        tm.assert_series_equal(na_ser.rank(na_option="keep", pct=True), exp_keep)

    def test_rank_signature(self):
        """``method`` works as a keyword; a positional string is read as an axis."""
        s = Series([0, 1])
        s.rank(method="average")
        msg = "No axis named average for object type Series"
        with pytest.raises(ValueError, match=msg):
            s.rank("average")

    @pytest.mark.parametrize("dtype", [None, object])
    def test_rank_tie_methods(self, ser, results, dtype):
        """Rank ``ser`` (optionally cast to ``dtype``) with the fixture's method."""
        method, exp = results
        ser = ser if dtype is None else ser.astype(dtype)
        result = ser.rank(method=method)
        tm.assert_series_equal(result, Series(exp))

    # No scipy skip marker: nothing in this test uses scipy.
    @pytest.mark.parametrize("ascending", [True, False])
    @pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"])
    @pytest.mark.parametrize("na_option", ["top", "bottom", "keep"])
    @pytest.mark.parametrize(
        "dtype, na_value, pos_inf, neg_inf",
        [
            ("object", None, Infinity(), NegInfinity()),
            ("float64", np.nan, np.inf, -np.inf),
        ],
    )
    def test_rank_tie_methods_on_infs_nans(
        self, method, na_option, ascending, dtype, na_value, pos_inf, neg_inf
    ):
        """Rank three tied chunks: negative infinity, missing, positive infinity."""
        chunk = 3

        in_arr = [neg_inf] * chunk + [na_value] * chunk + [pos_inf] * chunk
        iseries = Series(in_arr, dtype=dtype)
        # For each method: the ranks of the lowest, middle and highest chunk
        # when all nine values take part in the ranking.
        exp_ranks = {
            "average": ([2, 2, 2], [5, 5, 5], [8, 8, 8]),
            "min": ([1, 1, 1], [4, 4, 4], [7, 7, 7]),
            "max": ([3, 3, 3], [6, 6, 6], [9, 9, 9]),
            "first": ([1, 2, 3], [4, 5, 6], [7, 8, 9]),
            "dense": ([1, 1, 1], [2, 2, 2], [3, 3, 3]),
        }
        ranks = exp_ranks[method]
        # ``order`` follows the input layout: [neg_inf, na, pos_inf].
        if na_option == "top":
            order = [ranks[1], ranks[0], ranks[2]]
        elif na_option == "bottom":
            order = [ranks[0], ranks[2], ranks[1]]
        else:
            order = [ranks[0], [np.nan] * chunk, ranks[1]]
        # Descending ranking is expected to mirror the chunk order.
        expected = order if ascending else order[::-1]
        expected = list(chain.from_iterable(expected))
        result = iseries.rank(method=method, na_option=na_option, ascending=ascending)
        tm.assert_series_equal(result, Series(expected, dtype="float64"))

    def test_rank_desc_mix_nans_infs(self):
        # GH 19538
        # check descending ranking when mix nans and infs
        iseries = Series([1, np.nan, np.inf, -np.inf, 25])
        result = iseries.rank(ascending=False)
        exp = Series([3, np.nan, 1, 4, 2], dtype="float64")
        tm.assert_series_equal(result, exp)

    @td.skip_if_no_scipy
    @pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"])
    @pytest.mark.parametrize(
        "op, value",
        [
            [operator.add, 0],
            [operator.add, 1e6],
            [operator.mul, 1e-6],
        ],
    )
    def test_rank_methods_series(self, method, op, value):
        """Cross-check ``Series.rank`` against ``scipy.stats.rankdata``."""
        from scipy.stats import rankdata

        xs = np.random.randn(9)
        xs = np.concatenate([xs[i:] for i in range(0, 9, 2)])  # add duplicates
        np.random.shuffle(xs)

        index = [chr(ord("a") + i) for i in range(len(xs))]
        vals = op(xs, value)
        ts = Series(vals, index=index)
        result = ts.rank(method=method)
        # scipy names the "first" tie-breaking method "ordinal".
        sprank = rankdata(vals, method if method != "first" else "ordinal")
        expected = Series(sprank, index=index).astype("float64")
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("dtype", ["O", "f8", "i8"])
    @pytest.mark.parametrize(
        "ser, exp",
        [
            ([1], [1]),
            ([2], [1]),
            ([0], [1]),
            ([2, 2], [1, 1]),
            ([1, 2, 3], [1, 2, 3]),
            ([4, 2, 1], [3, 2, 1]),
            ([1, 1, 5, 5, 3], [1, 1, 3, 3, 2]),
            ([-5, -4, -3, -2, -1], [1, 2, 3, 4, 5]),
        ],
    )
    def test_rank_dense_method(self, dtype, ser, exp):
        """Check ``method="dense"`` on small inputs for three dtypes."""
        s = Series(ser).astype(dtype)
        result = s.rank(method="dense")
        expected = Series(exp).astype(result.dtype)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("dtype", ["O", "f8", "i8"])
    def test_rank_descending(self, ser, results, dtype):
        """Descending rank of ``s`` must equal ascending rank of ``s.max() - s``."""
        method, _ = results
        if "i" in dtype:
            # Drop missing values first, then cast so that the "i8" case
            # really exercises an integer Series.
            s = ser.dropna().astype(dtype)
        else:
            s = ser.astype(dtype)

        res = s.rank(ascending=False)
        expected = (s.max() - s).rank()
        tm.assert_series_equal(res, expected)

        expected = (s.max() - s).rank(method=method)
        res2 = s.rank(method=method, ascending=False)
        tm.assert_series_equal(res2, expected)

    def test_rank_int(self, ser, results):
        """Rank the NaN-free int64 version of ``ser`` with the fixture's method."""
        method, exp = results
        s = ser.dropna().astype("i8")

        result = s.rank(method=method)
        expected = Series(exp).dropna()
        # dropna() on ``s`` left gaps in its index; align before comparing.
        expected.index = result.index
        tm.assert_series_equal(result, expected)

    def test_rank_object_bug(self):
        # GH 13445

        # smoke tests
        Series([np.nan] * 32).astype(object).rank(ascending=True)
        Series([np.nan] * 32).astype(object).rank(ascending=False)

    def test_rank_modify_inplace(self):
        # GH 18521
        # Check rank does not mutate series
        s = Series([Timestamp("2017-01-05 10:20:27.569000"), NaT])
        expected = s.copy()

        s.rank()
        result = s
        tm.assert_series_equal(result, expected)