def test_rank_signature(self):
    """Check that ``method`` has to be passed to ``Series.rank`` by keyword.

    A positional ``'average'`` is rejected with the "No axis named average"
    ``ValueError`` matched below.
    """
    s = Series([0, 1])
    s.rank(method='average')  # keyword spelling must be accepted
    msg = ("No axis named average for object type"
           " <class 'pandas.core.series.Series'>")
    with pytest.raises(ValueError, match=msg):
        s.rank('average')
def test_rank_modify_inplace(self): # GH 18521 # Check rank does not mutate series s = Series([Timestamp('2017-01-05 10:20:27.569000'), NaT]) expected = s.copy() s.rank() result = s assert_series_equal(result, expected)
def test_rank_desc_mix_nans_infs(self): # GH 19538 # check descending ranking when mix nans and infs iseries = Series([1, np.nan, np.inf, -np.inf, 25]) result = iseries.rank(ascending=False) exp = Series([3, np.nan, 1, 4, 2], dtype="float64") tm.assert_series_equal(result, exp)
def test_rank_desc_mix_nans_infs(self): # GH 19538 # check descending ranking when mix nans and infs iseries = Series([1, np.nan, np.inf, -np.inf, 25]) result = iseries.rank(ascending=False) exp = Series([3, np.nan, 1, 4, 2], dtype='float64') tm.assert_series_equal(result, exp)
def test_rank_inf(self):
    """Rank a shuffled float64 vector bounded by -inf and +inf (skipped).

    ``pytest.skip`` raises straight away, so the statements after it record
    the intended check but are never executed.
    """
    pytest.skip('DataFrame.rank does not currently rank '
                'np.inf and -np.inf properly')
    values = np.array(
        [-np.inf, -50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10,
         2, 40, np.inf],
        dtype='float64')
    # ``values`` is ascending, so the element taken from position p must be
    # given rank p + 1 after shuffling.
    random_order = np.random.permutation(len(values))
    iseries = Series(values[random_order])
    exp = Series(random_order + 1.0, dtype='float64')
    iranks = iseries.rank()
    assert_series_equal(iranks, exp)
def test_rank_inf(self):
    """Rank a shuffled float64 vector bounded by -inf and +inf (skipped).

    ``pytest.skip`` raises straight away, so the statements after it record
    the intended check but are never executed.
    """
    pytest.skip('DataFrame.rank does not currently rank '
                'np.inf and -np.inf properly')
    values = np.array([
        -np.inf, -50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10,
        2, 40, np.inf
    ], dtype='float64')
    # ``values`` is ascending, so the element taken from position p must be
    # given rank p + 1 after shuffling.
    random_order = np.random.permutation(len(values))
    iseries = Series(values[random_order])
    exp = Series(random_order + 1.0, dtype='float64')
    iranks = iseries.rank()
    assert_series_equal(iranks, exp)
def test_rank_methods_series(self, method, op, value): from scipy.stats import rankdata xs = np.random.randn(9) xs = np.concatenate([xs[i:] for i in range(0, 9, 2)]) # add duplicates np.random.shuffle(xs) index = [chr(ord("a") + i) for i in range(len(xs))] vals = op(xs, value) ts = Series(vals, index=index) result = ts.rank(method=method) sprank = rankdata(vals, method if method != "first" else "ordinal") expected = Series(sprank, index=index).astype("float64") tm.assert_series_equal(result, expected)
def test_rank_dense_method(self): dtypes = ['O', 'f8', 'i8'] in_out = [([1], [1]), ([2], [1]), ([0], [1]), ([2, 2], [1, 1]), ([1, 2, 3], [1, 2, 3]), ( [4, 2, 1], [3, 2, 1], ), ([1, 1, 5, 5, 3], [1, 1, 3, 3, 2]), ([-5, -4, -3, -2, -1], [1, 2, 3, 4, 5])] for ser, exp in in_out: for dtype in dtypes: s = Series(ser).astype(dtype) result = s.rank(method='dense') expected = Series(exp).astype(result.dtype) assert_series_equal(result, expected)
def gene_ttest(edgeList, one_value, group_value, n_genes, gene2num):
    """Rank genes by the absolute value of their one-sample t statistic.

    :param edgeList: unused here; kept for interface compatibility
    :param one_value: indexable; ``one_value[i]`` is passed to
        ``ttest_onesmaple`` for gene ``i``
    :param group_value: indexable; ``group_value[i]`` is the matching second
        argument
    :param n_genes: number of genes to process (indices ``0..n_genes-1``)
    :param gene2num: unused here; kept for interface compatibility
    :return: ``(rank, statistic_list)`` -- an integer array of ascending
        ranks and the list of absolute statistics (NaN/inf replaced by 0.)
    """
    statistic_list = []
    for i in range(n_genes):
        statistic = ttest_onesmaple(one_value[i], group_value[i])
        # NaN and +/-inf statistics carry no usable signal: score them 0.
        statistic_list.append(
            abs(statistic) if math.isfinite(statistic) else 0.)

    # Explicit float dtype keeps an empty list rankable.
    statistic_series = Series(statistic_list, dtype=float)
    # ``np.int`` no longer exists in NumPy; the builtin ``int`` is the same
    # dtype. Tied (x.5) average ranks are truncated, as before.
    rank = statistic_series.rank().values.astype(int)
    return rank, statistic_list
def _ms_sing(geneset: list, x: pd.Series, norm_method: str, rankup: bool) -> dict:
    """
    Bare-bones version of scsing scoring.

    The full implementation (see scsingscore.py) does a lot more; these are
    the essentials.

    :param geneset: Geneset to score against
    :param x: pd.Series with the gene expression of a single sample,
        one gene per row
    :param norm_method: how to normalize the scores
    :param rankup: direction of ranking, up: True, down: False
    :return: dict with ``total_score`` (normalised mean rank minus 0.5) and
        ``mad_up`` (``statsmodels.robust.scale.mad`` of the ranks)
    """
    assert isinstance(x, pd.Series)
    up_sort = x.rank(method='min', ascending=rankup)

    # Ranks of the signature genes that are present in the sample. Genes
    # missing from the sample are dropped, which also shrinks the effective
    # signature length used for normalisation.
    present = up_sort.index
    su = [up_sort[gene] for gene in geneset if gene in present]
    sig_len_up = len(su)

    # normalise the score for the number of genes in the signature
    score_up = np.mean(su)
    norm_up = si.normalisation(norm_method=norm_method,
                               library_len=len(x.index),
                               score_list=su,
                               score=score_up,
                               sig_len=sig_len_up)
    norm_up = norm_up - 0.5
    mad_up = statsmodels.robust.scale.mad(su)
    total_score = norm_up
    return dict(total_score=total_score, mad_up=mad_up)
def rank(value: Series) -> Series:
    """
    Rank

    <Description>
    Returns the rank of each stock among the KOSPI and KOSDAQ constituents
    for the day. The result lies between 0 and 1; the closer to 1, the
    higher the rank.

    <Usage>
    Pass the value to be ranked as the first argument.
    For example, to rank the 20-day average trading value, write
    'rank(sma(tr_val, 20))' or '순위(단순이동평균(거래대금, 20))'.

    :param value: (data) the values to rank
    :return: percentile rank of each value (``Series.rank(pct=True)``)
    """
    return value.rank(pct=True)
def test_rank_methods_series(self): from scipy.stats import rankdata xs = np.random.randn(9) xs = np.concatenate([xs[i:] for i in range(0, 9, 2)]) # add duplicates np.random.shuffle(xs) index = [chr(ord("a") + i) for i in range(len(xs))] for vals in [xs, xs + 1e6, xs * 1e-6]: ts = Series(vals, index=index) for m in ["average", "min", "max", "first", "dense"]: result = ts.rank(method=m) sprank = rankdata(vals, m if m != "first" else "ordinal") expected = Series(sprank, index=index).astype("float64") tm.assert_series_equal(result, expected)
def test_rank_methods_series(self):
    """Compare every ``Series.rank`` tie method with ``scipy.stats.rankdata``."""
    tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata')
    from scipy.stats import rankdata

    xs = np.random.randn(9)
    xs = np.concatenate([xs[i:] for i in range(0, 9, 2)])  # add duplicates
    np.random.shuffle(xs)

    index = [chr(ord('a') + i) for i in range(len(xs))]

    for vals in [xs, xs + 1e6, xs * 1e-6]:
        ts = Series(vals, index=index)

        for m in ['average', 'min', 'max', 'first', 'dense']:
            # Pass the method by keyword: a positional argument is not
            # guaranteed to bind to ``method``.
            result = ts.rank(method=m)
            # pandas' 'first' is called 'ordinal' by scipy.
            sprank = rankdata(vals, m if m != 'first' else 'ordinal')
            # rankdata may hand back integer ranks; rank() yields float64.
            expected = Series(sprank, index=index).astype('float64')
            tm.assert_series_equal(result, expected)
def test_rank_dense_method(self): dtypes = ['O', 'f8', 'i8'] in_out = [([1], [1]), ([2], [1]), ([0], [1]), ([2, 2], [1, 1]), ([1, 2, 3], [1, 2, 3]), ([4, 2, 1], [3, 2, 1],), ([1, 1, 5, 5, 3], [1, 1, 3, 3, 2]), ([-5, -4, -3, -2, -1], [1, 2, 3, 4, 5])] for ser, exp in in_out: for dtype in dtypes: s = Series(ser).astype(dtype) result = s.rank(method='dense') expected = Series(exp).astype(result.dtype) assert_series_equal(result, expected)
def test_rank_methods_series(self):
    """Compare every ``Series.rank`` tie method with ``scipy.stats.rankdata``."""
    tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata')
    from scipy.stats import rankdata

    xs = np.random.randn(9)
    xs = np.concatenate([xs[i:] for i in range(0, 9, 2)])  # add duplicates
    np.random.shuffle(xs)

    index = [chr(ord('a') + i) for i in range(len(xs))]

    for vals in [xs, xs + 1e6, xs * 1e-6]:
        ts = Series(vals, index=index)

        for m in ['average', 'min', 'max', 'first', 'dense']:
            result = ts.rank(method=m)
            # pandas' 'first' is called 'ordinal' by scipy.
            sprank = rankdata(vals, m if m != 'first' else 'ordinal')
            # rankdata may hand back integer ranks; rank() yields float64.
            expected = Series(sprank, index=index).astype('float64')
            tm.assert_series_equal(result, expected)
def test_rank_methods_series(self):
    """Compare every ``Series.rank`` tie method with ``scipy.stats.rankdata``."""
    # importorskip expects an importable *module* path. ``scipy.stats.rankdata``
    # is a function and ``scipy.stats.special`` is not a module, so the old
    # calls skipped this test unconditionally.
    stats = pytest.importorskip('scipy.stats')
    rankdata = stats.rankdata

    xs = np.random.randn(9)
    xs = np.concatenate([xs[i:] for i in range(0, 9, 2)])  # add duplicates
    np.random.shuffle(xs)

    index = [chr(ord('a') + i) for i in range(len(xs))]

    for vals in [xs, xs + 1e6, xs * 1e-6]:
        ts = Series(vals, index=index)

        for m in ['average', 'min', 'max', 'first', 'dense']:
            result = ts.rank(method=m)
            # pandas' 'first' is called 'ordinal' by scipy.
            sprank = rankdata(vals, m if m != 'first' else 'ordinal')
            expected = Series(sprank, index=index).astype('float64')
            tm.assert_series_equal(result, expected)
def test_rank_dense_method(self): dtypes = ["O", "f8", "i8"] in_out = [ ([1], [1]), ([2], [1]), ([0], [1]), ([2, 2], [1, 1]), ([1, 2, 3], [1, 2, 3]), ([4, 2, 1], [3, 2, 1]), ([1, 1, 5, 5, 3], [1, 1, 3, 3, 2]), ([-5, -4, -3, -2, -1], [1, 2, 3, 4, 5]), ] for ser, exp in in_out: for dtype in dtypes: s = Series(ser).astype(dtype) result = s.rank(method="dense") expected = Series(exp).astype(result.dtype) tm.assert_series_equal(result, expected)
def Ranking(a, b):
    """Print the 'min' rank of row ``a`` among the members of its group.

    The score of every member is obtained from ``ANC_reportRank(member, b)``;
    the group of ``a`` is looked up through the module-level ``RowToSub`` and
    ``check`` objects. ``a`` must be a member of its own group, otherwise the
    ``del`` below raises ``KeyError``.
    """
    # Score of ``a`` goes first so that its rank can be read back at label 0.
    RankInitial = [ANC_reportRank(a, b)]
    c = RowToSub[a]
    RankDict = {}
    total = 0.00
    # NOTE(review): the loop variable is unused and every pass re-reads the
    # same group ``c``, so ``total`` grows by the group size once per item
    # yielded by ``check`` -- confirm whether a single pass was intended.
    for _ in check:
        result = check.get_group(c)
        for row in result.values:
            p = row[0]
            RankDict[p] = ANC_reportRank(p, b)
            total += 1
    del RankDict[a]  # ``a`` is already in RankInitial
    RankInitial.extend(RankDict.values())
    RankFinal = Series(RankInitial).rank(method='min')[0]
    print('Ranked:', RankFinal, 'on a scale of:', total)
def Ranking(a, b, tree):
    """Return the 'min' rank of row ``a`` among the members of its group.

    The score of every member is obtained from
    ``ANC_reportRank(member, b, tree)``; the group of ``a`` is looked up
    through the module-level ``RowToSub`` and ``check`` objects. ``a`` must
    be a member of its own group, otherwise the ``del`` below raises
    ``KeyError``.
    """
    # Score of ``a`` goes first so that its rank can be read back at label 0.
    RankInitial = [ANC_reportRank(a, b, tree)]
    c = RowToSub[a]
    RankDict = {}
    # NOTE(review): the loop variable is unused and every pass re-reads the
    # same group ``c`` -- confirm whether a single pass was intended.
    for _ in check:
        result = check.get_group(c)
        for row in result.values:
            p = row[0]
            RankDict[p] = ANC_reportRank(p, b, tree)
    del RankDict[a]  # ``a`` is already in RankInitial
    RankInitial.extend(RankDict.values())
    RankFinal = Series(RankInitial).rank(method='min')[0]
    return RankFinal
def test_rank_methods_series(self):
    """Compare every ``Series.rank`` tie method with ``scipy.stats.rankdata``."""
    tm.skip_if_no_package('scipy', min_version='0.13',
                          app='scipy.stats.rankdata')
    from scipy.stats import rankdata

    xs = np.random.randn(9)
    xs = np.concatenate([xs[i:] for i in range(0, 9, 2)])  # add duplicates
    np.random.shuffle(xs)

    index = [chr(ord('a') + i) for i in range(len(xs))]

    for vals in [xs, xs + 1e6, xs * 1e-6]:
        ts = Series(vals, index=index)

        for m in ['average', 'min', 'max', 'first', 'dense']:
            result = ts.rank(method=m)
            # pandas' 'first' is called 'ordinal' by scipy.
            sprank = rankdata(vals, m if m != 'first' else 'ordinal')
            # Newer scipy returns integer ranks for some methods. Casting
            # unconditionally is a no-op for float input and avoids
            # comparing version strings.
            expected = Series(sprank, index=index).astype('float64')
            tm.assert_series_equal(result, expected)
def test_rank_inf(self, contents, dtype): dtype_na_map = { 'float64': np.nan, 'float32': np.nan, 'int64': iNaT, 'object': None } # Insert nans at random positions if underlying dtype has missing # value. Then adjust the expected order by adding nans accordingly # This is for testing whether rank calculation is affected # when values are interwined with nan values. values = np.array(contents, dtype=dtype) exp_order = np.array(range(len(values)), dtype='float64') + 1.0 if dtype in dtype_na_map: na_value = dtype_na_map[dtype] nan_indices = np.random.choice(range(len(values)), 5) values = np.insert(values, nan_indices, na_value) exp_order = np.insert(exp_order, nan_indices, np.nan) # shuffle the testing array and expected results in the same way random_order = np.random.permutation(len(values)) iseries = Series(values[random_order]) exp = Series(exp_order[random_order], dtype='float64') iranks = iseries.rank() assert_series_equal(iranks, exp)
# Sort by row labels, descending
print(data.sort_index(ascending=False))
# Sort by column labels, descending
print(data.sort_index(axis=1, ascending=False))

# Sort by the values of one column, ascending
print(data.sort_values(by='a'))  # by: column name
# Sort by the values of several columns, ascending
print(data.sort_values(by=['a', 'b']))  # by: column names

# Rank -- ties share the average rank
print(data.rank())
# Rank -- ties broken by order of appearance
print(data.rank(method='first'))
# Rank descending -- ties share the average rank
print(data.rank(ascending=False))
# Rank descending -- ties broken by order of appearance
print(data.rank(method='first', ascending=False))

# Duplicate index labels are allowed.

# ======================
# Descriptive statistics
# ======================
obj = Series([4, 7, -3, 2])
# Series.order() no longer exists in pandas; sort_values() replaces it.
print(obj.sort_values())
'''
2   -3
3    2
0    4
1    7
dtype: int64
'''

'''
Ranking is closely related to sorting, assigning ranks from one through the
number of valid data points in an array. It is similar to the indirect sort
indices produced by numpy.argsort , except that ties are broken according to
a rule. The rank methods for Series and DataFrame are the place to look; by
default rank breaks ties by assigning each group the mean rank:
'''
obj = Series([7, -5, 7, 4, 2, 0, 4])
print(obj.rank())
'''
0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64
'''
def test_rank_signature(self):
    """Check that ``method`` has to be passed to ``Series.rank`` by keyword.

    A positional ``'average'`` is rejected with the "No axis named average"
    ``ValueError`` matched below.
    """
    s = Series([0, 1])
    s.rank(method='average')  # keyword spelling must be accepted
    # The repr of a type starts with "<class" or "<type" depending on the
    # interpreter, hence the alternation.
    msg = r"No axis named average for object type <(class|type) 'type'>"
    with pytest.raises(ValueError, match=msg):
        s.rank('average')
def test_rank_categorical(self): # GH issue #15420 rank incorrectly orders ordered categories # Test ascending/descending ranking for ordered categoricals exp = Series([1., 2., 3., 4., 5., 6.]) exp_desc = Series([6., 5., 4., 3., 2., 1.]) ordered = Series( ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'] ).astype( 'category', categories=['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], ordered=True ) assert_series_equal(ordered.rank(), exp) assert_series_equal(ordered.rank(ascending=False), exp_desc) # Unordered categoricals should be ranked as objects unordered = Series( ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], ).astype( 'category', categories=['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], ordered=False ) exp_unordered = Series([2., 4., 6., 3., 1., 5.]) res = unordered.rank() assert_series_equal(res, exp_unordered) unordered1 = Series( [1, 2, 3, 4, 5, 6], ).astype( 'category', categories=[1, 2, 3, 4, 5, 6], ordered=False ) exp_unordered1 = Series([1., 2., 3., 4., 5., 6.]) res1 = unordered1.rank() assert_series_equal(res1, exp_unordered1) # Test na_option for rank data na_ser = Series( ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN] ).astype( 'category', categories=[ 'first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'seventh' ], ordered=True ) exp_top = Series([2., 3., 4., 5., 6., 7., 1.]) exp_bot = Series([1., 2., 3., 4., 5., 6., 7.]) exp_keep = Series([1., 2., 3., 4., 5., 6., np.NaN]) assert_series_equal(na_ser.rank(na_option='top'), exp_top) assert_series_equal(na_ser.rank(na_option='bottom'), exp_bot) assert_series_equal(na_ser.rank(na_option='keep'), exp_keep) # Test na_option for rank data with ascending False exp_top = Series([7., 6., 5., 4., 3., 2., 1.]) exp_bot = Series([6., 5., 4., 3., 2., 1., 7.]) exp_keep = Series([6., 5., 4., 3., 2., 1., np.NaN]) assert_series_equal( na_ser.rank(na_option='top', ascending=False), exp_top ) assert_series_equal( na_ser.rank(na_option='bottom', 
ascending=False), exp_bot ) assert_series_equal( na_ser.rank(na_option='keep', ascending=False), exp_keep ) # Test with pct=True na_ser = Series( ['first', 'second', 'third', 'fourth', np.NaN], ).astype( 'category', categories=['first', 'second', 'third', 'fourth'], ordered=True ) exp_top = Series([0.4, 0.6, 0.8, 1., 0.2]) exp_bot = Series([0.2, 0.4, 0.6, 0.8, 1.]) exp_keep = Series([0.25, 0.5, 0.75, 1., np.NaN]) assert_series_equal(na_ser.rank(na_option='top', pct=True), exp_top) assert_series_equal(na_ser.rank(na_option='bottom', pct=True), exp_bot) assert_series_equal(na_ser.rank(na_option='keep', pct=True), exp_keep)
import pandas as pd
from pandas import Series, DataFrame

ser1 = Series(range(3), index=['C', 'A', 'B'])
ser1

# use sort_index to sort by index
ser1.sort_index()

# sort by values (Series.order() no longer exists; sort_values replaces it)
ser1.sort_values()

from numpy.random import randn
ser2 = Series(randn(10))
ser2

# ranking
ser2.sort_values()
ser2.rank()
ser2.sort_values(ascending=False)

ser3 = Series(randn(10))
ser3
ser3.rank()

# Once the values are sorted the ranks simply count up from 1.
ser3 = ser3.sort_values()
ser3.rank()
def test_rank_average_pct(dtype, ser, exp): s = Series(ser).astype(dtype) result = s.rank(method="average", pct=True) expected = Series(exp).astype(result.dtype) tm.assert_series_equal(result, expected)
def test_rank(self, datetime_series):
    """Exercise ``Series.rank`` on NaNs, ties, ``pct=True`` and tiny values.

    :param datetime_series: float Series fixture; it is modified in place
    """
    # importorskip expects an importable *module* path. "scipy.stats.rankdata"
    # is a function and "scipy.stats.special" is not a module, so the old
    # calls skipped this test unconditionally.
    stats = pytest.importorskip("scipy.stats")
    rankdata = stats.rankdata

    datetime_series[::2] = np.nan
    # One positional assignment (positions 0, 3, 6, 9) instead of the chained
    # ``[:10][::3] = ...``, which only works while the first slice is a view.
    datetime_series.iloc[:10:3] = 4.0

    ranks = datetime_series.rank()
    oranks = datetime_series.astype("O").rank()

    tm.assert_series_equal(ranks, oranks)

    mask = np.isnan(datetime_series)
    filled = datetime_series.fillna(np.inf)

    # rankdata returns a ndarray
    exp = Series(rankdata(filled), index=filled.index, name="ts")
    exp[mask] = np.nan

    tm.assert_series_equal(ranks, exp)

    iseries = Series(np.arange(5).repeat(2))

    iranks = iseries.rank()
    exp = iseries.astype(float).rank()
    tm.assert_series_equal(iranks, exp)
    iseries = Series(np.arange(5)) + 1.0
    exp = iseries / 5.0
    iranks = iseries.rank(pct=True)

    tm.assert_series_equal(iranks, exp)

    iseries = Series(np.repeat(1, 100))
    exp = Series(np.repeat(0.505, 100))
    iranks = iseries.rank(pct=True)
    tm.assert_series_equal(iranks, exp)

    iseries[1] = np.nan
    exp = Series(np.repeat(50.0 / 99.0, 100))
    exp[1] = np.nan
    iranks = iseries.rank(pct=True)
    tm.assert_series_equal(iranks, exp)

    iseries = Series(np.arange(5)) + 1.0
    iseries[4] = np.nan
    exp = iseries / 4.0
    iranks = iseries.rank(pct=True)
    tm.assert_series_equal(iranks, exp)

    iseries = Series(np.repeat(np.nan, 100))
    exp = iseries.copy()
    iranks = iseries.rank(pct=True)
    tm.assert_series_equal(iranks, exp)

    iseries = Series(np.arange(5)) + 1
    iseries[4] = np.nan
    exp = iseries / 4.0
    iranks = iseries.rank(pct=True)
    tm.assert_series_equal(iranks, exp)

    rng = date_range("1/1/1990", periods=5)
    iseries = Series(np.arange(5), rng) + 1
    iseries.iloc[4] = np.nan
    exp = iseries / 4.0
    iranks = iseries.rank(pct=True)
    tm.assert_series_equal(iranks, exp)

    iseries = Series([1e-50, 1e-100, 1e-20, 1e-2, 1e-20 + 1e-30, 1e-1])
    exp = Series([2, 1, 3, 5, 4, 6.0])
    iranks = iseries.rank()
    tm.assert_series_equal(iranks, exp)

    # GH 5968
    iseries = Series(["3 day", "1 day 10m", "-2 day", NaT], dtype="m8[ns]")
    exp = Series([3, 2, 1, np.nan])
    iranks = iseries.rank()
    tm.assert_series_equal(iranks, exp)

    values = np.array(
        [-50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, 2, 40],
        dtype="float64",
    )
    # ``values`` is ascending, so the element taken from position p must be
    # given rank p + 1 after shuffling.
    random_order = np.random.permutation(len(values))
    iseries = Series(values[random_order])
    exp = Series(random_order + 1.0, dtype="float64")
    iranks = iseries.rank()
    tm.assert_series_equal(iranks, exp)
print(frame.sort_index(axis=1))
print(frame.sort_index(axis=1, ascending=False))

obj = Series([4, 7, -3, -2])
print(obj.sort_values())

obj = Series([4, np.nan, 7, np.nan, -3, -2])
print(obj.sort_values())

frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
print(frame)
print(frame.sort_values(by='b'))
print(frame.sort_values(by=['a', 'b']))

# rank
obj = Series([7, -5, 7, 4, 2, 0, 4])
print(obj.rank())
print(obj.rank(method='first'))
print(obj.rank(method='max', ascending=False))

frame = DataFrame({'b': [4.3, 7, -3, 2],
                   'a': [0, 1, 0, 1],
                   'c': [-2, 5, 8, -2.5]})
print(frame)
print(frame.rank(axis=1))

''' duplicate index '''
obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
print(obj)
print(obj.index.is_unique)
print(obj['a'])  # a repeated label selects every matching entry
print(obj['c'])
print("根据索引排序,对于DataFrame可以指定轴。")
obj = Series(range(4), index=["d", "a", "b", "c"])
print(obj.sort_index())
frame = DataFrame(np.arange(8).reshape((2, 4)),
                  index=["three", "one"],
                  columns=list("dabc"))
print(frame.sort_index())
print(frame.sort_index(axis=1))  # axis=1 operates on the columns
print(frame.sort_index(axis=1, ascending=False))  # descending
print()

print("根据值排序")
obj = Series([4, 7, -3, 2])
print(obj.sort_values())  # order() is obsolete
print()

print("DataFrame指定列排序")
frame = DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})
print(frame)
print(frame.sort_values(by="b"))  # sort_index(by=...) is obsolete
print(frame.sort_values(by=["a", "b"]))
print()

print("rank,求排名的平均位置(从1开始)")
obj = Series([7, -5, 7, 4, 2, 0, 4])
# Sorted positions: -5(1), 0(2), 2(3), 4(4), 4(5), 7(6), 7(7)
print(obj.rank())
print(obj.rank(method="first"))  # ties ranked by order of appearance, no averaging
print(obj.rank(ascending=False, method="max"))  # descending, ties take the max rank, so -5 gets rank 7
frame = DataFrame({"b": [4.3, 7, -3, 2],
                   "a": [0, 1, 0, 1],
                   "c": [-2, 5, 8, -2.5]})
print(frame)
print(frame.rank(axis=1))
def test_rank_first_pct(dtype, ser, exp): s = Series(ser).astype(dtype) result = s.rank(method='first', pct=True) expected = Series(exp).astype(result.dtype) assert_series_equal(result, expected)
obj.sort_index()
frame = DataFrame(np.arange(8).reshape((2, 4)),
                  index=['three', 'one'],
                  columns=['d', 'a', 'b', 'c'])
frame.sort_index(axis=1, ascending=False)
frame.sort_index()
frame.sort_index(axis=1)

# Ranking
obj = Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()
# rank() assigns ranks in ascending order starting from 1. The default is
# rank(method='average'): tied values get the mean of their min and max rank.
# For the other tie-breaking methods see p. 140.
# ******************************************
# Index labels may be non-unique; check with:
obj.index.is_unique
# ***********************************************
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
df.sum()
# For each random walk, the step at which it first got 30 away from the origin.
print(np.argmax(walkcum[index], axis=1))
print(np.mean(np.argmax(walkcum[index], axis=1)))

# Scratch list of the pandas / NumPy API names covered in these notes.
pd.Index
obj = Series([1, 2, 3])
obj.reindex()
data = DataFrame([[1, 2, 3], [4, 5, 6]])
# drop and argsort need arguments (calling them bare raises), so they are
# only referenced here, like pd.Index above and data.loc below.
data.drop
np.argsort
obj.rank()
obj.sort_values()
data.tail()
data.cov()
data.cov()
data.corr()
data.dropna()
data.loc
frame.sort_values(by=['a', 'b'])  # sort by column a first, then by column b
'''
   a  b
2  0 -3
0  0  4
3  1  2
1  1  7
'''
print()

print('rank:默认升序,排名值从1开始')
obj = Series([4, 2, 0, 4], index=['a', 'b', 'c', 'd'])
# Ranks are assigned from the smallest value up: c:0(1) b:2(2) a:4(3) d:4(4)
print(obj.rank())
'''
a    3.5  求平均值(4+3)/2
b    2.0
c    1.0
d    3.5
'''
print(obj.rank(method='first'))  # rank ties by order of appearance, no averaging
'''
a    3.0
b    2.0
c    1.0
d    4.0
'''
print()
def test_rank_categorical(self):
    """Rank categorical data, covering ``na_option`` and ``pct``.

    GH issue #15420: rank incorrectly ordered ordered categories.
    """
    labels = ["first", "second", "third", "fourth", "fifth", "sixth"]

    # Test ascending/descending ranking for ordered categoricals
    exp = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
    exp_desc = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0])
    ordered = Series(labels).astype(
        CategoricalDtype(categories=labels, ordered=True))
    tm.assert_series_equal(ordered.rank(), exp)
    tm.assert_series_equal(ordered.rank(ascending=False), exp_desc)

    # Unordered categoricals should be ranked as objects
    unordered = Series(labels).astype(
        CategoricalDtype(categories=labels, ordered=False))
    exp_unordered = Series([2.0, 4.0, 6.0, 3.0, 1.0, 5.0])
    res = unordered.rank()
    tm.assert_series_equal(res, exp_unordered)

    unordered1 = Series([1, 2, 3, 4, 5, 6]).astype(
        CategoricalDtype([1, 2, 3, 4, 5, 6], False))
    exp_unordered1 = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
    res1 = unordered1.rank()
    tm.assert_series_equal(res1, exp_unordered1)

    # Test na_option for rank data
    # (``np.NaN`` was removed in NumPy 2.0; ``np.nan`` is the same value.)
    na_ser = Series(labels + [np.nan]).astype(
        CategoricalDtype(labels + ["seventh"], True))

    exp_top = Series([2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 1.0])
    exp_bot = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0])
    exp_keep = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, np.nan])

    tm.assert_series_equal(na_ser.rank(na_option="top"), exp_top)
    tm.assert_series_equal(na_ser.rank(na_option="bottom"), exp_bot)
    tm.assert_series_equal(na_ser.rank(na_option="keep"), exp_keep)

    # Test na_option for rank data with ascending False
    exp_top = Series([7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0])
    exp_bot = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 7.0])
    exp_keep = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0, np.nan])

    tm.assert_series_equal(
        na_ser.rank(na_option="top", ascending=False), exp_top)
    tm.assert_series_equal(
        na_ser.rank(na_option="bottom", ascending=False), exp_bot)
    tm.assert_series_equal(
        na_ser.rank(na_option="keep", ascending=False), exp_keep)

    # Test invalid values for na_option
    msg = "na_option must be one of 'keep', 'top', or 'bottom'"

    with pytest.raises(ValueError, match=msg):
        na_ser.rank(na_option="bad", ascending=False)

    # invalid type
    with pytest.raises(ValueError, match=msg):
        na_ser.rank(na_option=True, ascending=False)

    # Test with pct=True
    pct_labels = ["first", "second", "third", "fourth"]
    na_ser = Series(pct_labels + [np.nan]).astype(
        CategoricalDtype(pct_labels, True))
    exp_top = Series([0.4, 0.6, 0.8, 1.0, 0.2])
    exp_bot = Series([0.2, 0.4, 0.6, 0.8, 1.0])
    exp_keep = Series([0.25, 0.5, 0.75, 1.0, np.nan])

    tm.assert_series_equal(na_ser.rank(na_option="top", pct=True), exp_top)
    tm.assert_series_equal(
        na_ser.rank(na_option="bottom", pct=True), exp_bot)
    tm.assert_series_equal(na_ser.rank(na_option="keep", pct=True), exp_keep)
def test_rank_signature(self):
    """Check that ``method`` has to be passed to ``Series.rank`` by keyword.

    A positional ``"average"`` is rejected with the "No axis named average"
    ``ValueError`` matched below.
    """
    s = Series([0, 1])
    s.rank(method="average")  # keyword spelling must be accepted
    msg = "No axis named average for object type Series"
    with pytest.raises(ValueError, match=msg):
        s.rank("average")
def test_rank_categorical(self):
    """Rank categorical data, covering ``na_option`` and ``pct``.

    GH issue #15420: rank incorrectly ordered ordered categories.
    """
    labels = ['first', 'second', 'third', 'fourth', 'fifth', 'sixth']

    # Test ascending/descending ranking for ordered categoricals
    exp = Series([1., 2., 3., 4., 5., 6.])
    exp_desc = Series([6., 5., 4., 3., 2., 1.])
    ordered = Series(labels).astype(
        CategoricalDtype(categories=labels, ordered=True))
    assert_series_equal(ordered.rank(), exp)
    assert_series_equal(ordered.rank(ascending=False), exp_desc)

    # Unordered categoricals should be ranked as objects
    unordered = Series(labels).astype(
        CategoricalDtype(categories=labels, ordered=False))
    exp_unordered = Series([2., 4., 6., 3., 1., 5.])
    res = unordered.rank()
    assert_series_equal(res, exp_unordered)

    unordered1 = Series([1, 2, 3, 4, 5, 6]).astype(
        CategoricalDtype([1, 2, 3, 4, 5, 6], False))
    exp_unordered1 = Series([1., 2., 3., 4., 5., 6.])
    res1 = unordered1.rank()
    assert_series_equal(res1, exp_unordered1)

    # Test na_option for rank data
    # (``np.NaN`` was removed in NumPy 2.0; ``np.nan`` is the same value.)
    na_ser = Series(labels + [np.nan]).astype(
        CategoricalDtype(labels + ['seventh'], True))

    exp_top = Series([2., 3., 4., 5., 6., 7., 1.])
    exp_bot = Series([1., 2., 3., 4., 5., 6., 7.])
    exp_keep = Series([1., 2., 3., 4., 5., 6., np.nan])

    assert_series_equal(na_ser.rank(na_option='top'), exp_top)
    assert_series_equal(na_ser.rank(na_option='bottom'), exp_bot)
    assert_series_equal(na_ser.rank(na_option='keep'), exp_keep)

    # Test na_option for rank data with ascending False
    exp_top = Series([7., 6., 5., 4., 3., 2., 1.])
    exp_bot = Series([6., 5., 4., 3., 2., 1., 7.])
    exp_keep = Series([6., 5., 4., 3., 2., 1., np.nan])

    assert_series_equal(
        na_ser.rank(na_option='top', ascending=False), exp_top)
    assert_series_equal(
        na_ser.rank(na_option='bottom', ascending=False), exp_bot)
    assert_series_equal(
        na_ser.rank(na_option='keep', ascending=False), exp_keep)

    # Test invalid values for na_option
    msg = "na_option must be one of 'keep', 'top', or 'bottom'"

    with tm.assert_raises_regex(ValueError, msg):
        na_ser.rank(na_option='bad', ascending=False)

    # invalid type
    with tm.assert_raises_regex(ValueError, msg):
        na_ser.rank(na_option=True, ascending=False)

    # Test with pct=True
    pct_labels = ['first', 'second', 'third', 'fourth']
    na_ser = Series(pct_labels + [np.nan]).astype(
        CategoricalDtype(pct_labels, True))
    exp_top = Series([0.4, 0.6, 0.8, 1., 0.2])
    exp_bot = Series([0.2, 0.4, 0.6, 0.8, 1.])
    exp_keep = Series([0.25, 0.5, 0.75, 1., np.nan])

    assert_series_equal(na_ser.rank(na_option='top', pct=True), exp_top)
    assert_series_equal(na_ser.rank(na_option='bottom', pct=True), exp_bot)
    assert_series_equal(na_ser.rank(na_option='keep', pct=True), exp_keep)
def test_pct_max_many_rows(n_rows=2**24 + 1):
    """Check that the largest percentile rank is exactly 1 (GH 18271).

    :param n_rows: length of the ranked Series. The default is the row count
        used by the regression test; pytest ignores defaulted parameters, so
        the test is still collected and run unchanged.
    """
    s = Series(np.arange(n_rows))
    result = s.rank(pct=True).max()
    assert result == 1
# Values in ascending order; the output of one sample run follows.
print(ser2.sort_values())
'''
5   -2.682719
8   -1.789567
2   -0.991176
9   -0.547529
3    0.144296
4    0.481288
6    0.593906
7    0.815368
0    1.290655
1    1.383484
dtype: float64
'''

# Rank of every element: the smallest value above (label 5) gets rank 1.
print(ser2.rank())
'''
0     9.0
1    10.0
2     3.0
3     5.0
4     6.0
5     1.0
6     7.0
7     8.0
8     2.0
9     4.0
dtype: float64
'''
s.unique()

# count(*) group by non-NaN value, get a Series
s.value_counts()

# aggregation and statistic
s.max()
s.mean()
s.var()

# location of the max element
s.idxmax()

# rank
s = Series([4, 1, 2, 5])
s.rank()  # return [3,1,2,4]

# plot
s.plot()
plt.show()

# translate

##################################################
# sort
new_s1 = s.sort_index()  # sort by index
new_s2 = s.sort_values()  # sort by values

# reindex includes the following steps:
# 1. Reordering existing data to match a set of labels.
# 2. Inserting NaN markers where no data exists for a label.
# 3. Possibly, filling missing data for a label using some type
#    of logic
class MySeries:
    """Wrapper around ``pandas.Series`` whose operations return ``MySeries``.

    The wrapped Series is kept in ``x``; ``values`` and ``index`` are copied
    from it when the wrapper is built. Every method forwards its arguments
    to the pandas operation of the same name and wraps the result again.
    """

    def __init__(self, *args, **kwargs):
        """Build the wrapped Series from ``pandas.Series`` arguments."""
        self.x = Series(*args, **kwargs)
        self.values = self.x.values
        self.index = self.x.index

    def _rolling(self, stat, args, kwargs, **stat_kwargs):
        """Wrap ``self.x.rolling(*args, **kwargs).<stat>(**stat_kwargs)``."""
        window = self.x.rolling(*args, **kwargs)
        return MySeries(getattr(window, stat)(**stat_kwargs))

    # -- moving-window statistics -------------------------------------------
    # The module-level ``pd.rolling_*`` functions no longer exist in pandas;
    # ``Series.rolling(...)`` is their replacement. ``window`` and
    # ``min_periods`` map one-to-one; pass anything else by keyword.

    def rolling_mean(self, *args, **kwargs):
        return self._rolling('mean', args, kwargs)

    def rolling_count(self, *args, **kwargs):
        # min_periods=0 keeps the counts for the leading, incomplete windows.
        kwargs.setdefault('min_periods', 0)
        return self._rolling('count', args, kwargs)

    def rolling_sum(self, *args, **kwargs):
        return self._rolling('sum', args, kwargs)

    def rolling_median(self, *args, **kwargs):
        return self._rolling('median', args, kwargs)

    def rolling_min(self, *args, **kwargs):
        return self._rolling('min', args, kwargs)

    def rolling_max(self, *args, **kwargs):
        return self._rolling('max', args, kwargs)

    def rolling_std(self, *args, **kwargs):
        # ddof belongs to the statistic, not to the window definition.
        ddof = kwargs.pop('ddof', 1)
        return self._rolling('std', args, kwargs, ddof=ddof)

    def rolling_var(self, *args, **kwargs):
        ddof = kwargs.pop('ddof', 1)
        return self._rolling('var', args, kwargs, ddof=ddof)

    def rolling_skew(self, *args, **kwargs):
        return self._rolling('skew', args, kwargs)

    def rolling_kurtosis(self, *args, **kwargs):
        return self._rolling('kurt', args, kwargs)

    def rolling_window(self, *args, **kwargs):
        # NOTE(review): pd.rolling_window took ``mean=True`` to choose between
        # the weighted mean and the weighted sum -- confirm against callers.
        stat = 'mean' if kwargs.pop('mean', True) else 'sum'
        return self._rolling(stat, args, kwargs)

    # -- plain delegations --------------------------------------------------

    def cumprod(self, *args, **kwargs):
        return MySeries(self.x.cumprod(*args, **kwargs))

    def cumsum(self, *args, **kwargs):
        return MySeries(self.x.cumsum(*args, **kwargs))

    def diff(self, *args, **kwargs):
        return MySeries(self.x.diff(*args, **kwargs))

    def div(self, *args, **kwargs):
        return MySeries(self.x.div(*args, **kwargs))

    def mul(self, *args, **kwargs):
        return MySeries(self.x.mul(*args, **kwargs))

    def add(self, *args, **kwargs):
        return MySeries(self.x.add(*args, **kwargs))

    def dropna(self, *args, **kwargs):
        return MySeries(self.x.dropna(*args, **kwargs))

    def fillna(self, *args, **kwargs):
        return MySeries(self.x.fillna(*args, **kwargs))

    def floordiv(self, *args, **kwargs):
        return MySeries(self.x.floordiv(*args, **kwargs))

    def mod(self, *args, **kwargs):
        return MySeries(self.x.mod(*args, **kwargs))

    def nlargest(self, *args, **kwargs):
        return MySeries(self.x.nlargest(*args, **kwargs))

    def nonzero(self, *args, **kwargs):
        # Series.nonzero() no longer exists; the ndarray method returns the
        # same tuple of index arrays.
        return MySeries(self.x.values.nonzero(*args, **kwargs))

    def nsmallest(self, *args, **kwargs):
        return MySeries(self.x.nsmallest(*args, **kwargs))

    def pow(self, *args, **kwargs):
        return MySeries(self.x.pow(*args, **kwargs))

    def rank(self, *args, **kwargs):
        return MySeries(self.x.rank(*args, **kwargs))

    def round(self, *args, **kwargs):
        return MySeries(self.x.round(*args, **kwargs))

    def shift(self, *args, **kwargs):
        return MySeries(self.x.shift(*args, **kwargs))

    def sub(self, *args, **kwargs):
        return MySeries(self.x.sub(*args, **kwargs))

    def abs(self, *args, **kwargs):
        return MySeries(self.x.abs(*args, **kwargs))

    def clip(self, *args, **kwargs):
        return MySeries(self.x.clip(*args, **kwargs))

    def clip_lower(self, threshold, axis=None, **kwargs):
        # Series.clip_lower no longer exists; clip(lower=...) is equivalent.
        return MySeries(self.x.clip(lower=threshold, axis=axis, **kwargs))

    def clip_upper(self, threshold, axis=None, **kwargs):
        # Series.clip_upper no longer exists; clip(upper=...) is equivalent.
        return MySeries(self.x.clip(upper=threshold, axis=axis, **kwargs))

    def interpolate(self, *args, **kwargs):
        return MySeries(self.x.interpolate(*args, **kwargs))

    def resample(self, *args, **kwargs):
        # NOTE(review): current pandas returns a Resampler here, not a
        # Series, so wrapping it may not work -- confirm what callers expect.
        return MySeries(self.x.resample(*args, **kwargs))

    def replace(self, *args, **kwargs):
        return MySeries(self.x.replace(*args, **kwargs))
def test_rank_signature(self): s = Series([0, 1]) s.rank(method='average') self.assertRaises(ValueError, s.rank, 'average')
# now shift gears and sort by values
obj = Series([4, 7, -3, 2])
# obj.order() is the spelling from the old edition of the book; it raises
# on current pandas, where sort_values() replaces it.
obj.sort_values()

obj = Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()

frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame
# frame.sort_index(by='b') is the old-edition spelling and no longer works.
frame.sort_values(by='b')  # you can make this a list if you like

# ranking
obj = Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()
obj.rank(method='first')
obj.rank(ascending=False, method='max')

frame = DataFrame({
    'b': [4.3, 7, -3, 2],
    'a': [0, 1, 0, 1],
    'c': [-2, 5, 8, -2.5]
})
frame
frame.rank(axis=1)

# duplicate indices
obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
frame.sort_index()
# Sort by column name
frame.sort_index(axis=1)
# Descending
frame.sort_index(axis=1, ascending=False)

# Sort by value (Series.order() is gone; sort_values() replaces it)
obj = Series([4, 7, -3, 2])
obj.sort_values()
# Missing values always end up last when sorting

# Sort by several columns
frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
# sort_index(by=...) and DataFrame.order(by=...) do not exist in current
# pandas; sort_values(by=...) is the replacement.
frame.sort_values(by=['a', 'b'])

# Ranking
obj = Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()
# Equal values are ranked by order of appearance
obj.rank(method='first')
# Descending
obj.rank(ascending=False, method='max')
# Rank across the columns of each row
frame = DataFrame({'b': [4.3, 7, -3, 2],
                   'a': [0, 1, 0, 1],
                   'c': [-2, 5, 8, -2.5]})
frame.rank(axis=1)

## Axis indexes with duplicate labels
obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
# Check whether the labels are unique
obj.index.is_unique
# A label that occurs several times selects every matching value.
obj['a']
def main():
    """Walk through basic Series/DataFrame operations, printing each result.

    Sections, in order: reindexing, dropping entries, indexing and slicing,
    arithmetic with index alignment, DataFrame/Series arithmetic, apply and
    map, sorting, ranking, and axes with duplicate labels.

    Takes no arguments and returns nothing; everything is written to
    standard output with Python 2 ``print`` statements.
    """
    # reindex
    obj = Series(range(4), index="a b c d".split(" ")[::-1])
    print obj
    obj2 = obj.reindex("a b c d e".split(" "))
    print obj2
    # Change NaN: fill labels missing from the original with 0
    print obj.reindex("a b c d e".split(" "), fill_value=0)
    colors = ["blue", "purple", "yellow"]
    index = [0, 2, 4]
    obj3 = Series(colors, index=index)
    print obj3.reindex(range(6))
    print obj3.reindex(range(6), method="ffill")  # not found forward fill
    print obj3.reindex(range(6), method="backfill")  # bfill
    # DataFrame
    states = ["Ohio", "Texas", "California"]
    frame = DataFrame(np.arange(9).reshape((3, 3)), index="a b c".split(" "), columns=["Ohio", "Texas", "California"])
    print frame
    frame2 = frame.reindex("a b c d".split(" "))
    print frame2
    # Rework the column list: put "Utah" first, then swap the first two
    # entries, so the reindex below asks for a column the frame lacks.
    states[0] = "Utah"
    states[1], states[0] = states[:2]
    print frame.reindex(columns=states)
    # fill
    print frame.reindex("a b c d".split(" "), method="ffill", columns=states)
    print frame.ix["a b c d".split(" ")]
    print frame.ix["a b c d".split(" "), states]
    # Delete column
    print "", ""
    obj = Series(range(5), index="a b c d e".split(" "))
    new_obj = obj.drop("c")
    print new_obj
    print obj
    # Index reference
    print "", ""
    obj = Series(np.arange(4.0), index="a b c d".split(" "))
    print obj["b"]
    print obj[1]  # same
    print obj[2:4]
    print obj[["b", "a", "c"]]
    print obj[[1, 3]]
    print obj[obj < 2]
    # Slice with label
    print obj["b":"c"]  # include 'c'
    obj["b":"c"] = 5
    print obj
    data = DataFrame(
        np.arange(16).reshape((4, 4)),
        index=["Ohio", "Colorado", "Utah", "New York"],
        columns=["one", "two", "three", "four"],
    )
    print data
    # column
    print data["two"]
    print data[["three", "one"]]
    # row
    print data[:2]
    print data[data["three"] > 5]
    # all values
    print data < 5
    data[data < 5] = 0
    print data
    # row and column
    print data.ix[["Colorado"], ["two", "three"]]
    print data.ix[["Colorado", "Utah"], [3, 0, 1]]
    # row
    print data.ix[2]
    # label row and column, return column
    print data.ix[:"Utah", "two"]
    # xs
    # row
    print data.xs("Utah")
    print data.xs("Utah", axis=0)
    # rows
    print data.xs("two", axis=1)
    # icol/irow i is index
    print data.icol(1)
    print data.irow(1)
    # Union
    print "", ""
    s1 = Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
    s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=["a", "c", "e", "f", "g"])
    print s1
    print s2
    # index is union, but d, f, g are NaN
    print s1 + s2
    df1 = DataFrame(np.arange(9.0).reshape((3, 3)), columns=list("bcd"), index=["Ohio", "Texas", "Colorado"])
    df2 = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
    print df1
    print df2
    print df1 + df2
    # arithmetic method
    print "", ""
    df1 = DataFrame(np.arange(12.0).reshape((3, 4)), columns=list("abcd"))
    df2 = DataFrame(np.arange(20.0).reshape((4, 5)), columns=list("abcde"))
    print df1
    print df2
    print df1.add(df2, fill_value=0)
    # reindex has fill_value argument
    # other arithmetic method are sub/div/mul(ti)
    # Calculation in a DataFrame and Series
    print "", ""
    # subtract from each row (broadcast)
    arr = np.arange(12.0).reshape((3, 4))
    print arr
    print arr[0]
    print arr - arr[0]
    frame = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
    series = frame.ix[0]
    print frame
    print series
    print frame - series
    series2 = Series(range(3), index=list("bef"))
    print frame + series2
    series3 = frame["d"]
    series4 = frame.ix[0]
    print frame
    print series3
    print series4
    print frame.sub(series3, axis=0)
    print frame.sub(series4, axis=1)
    # apply function and mapping
    print "", ""
    frame = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
    print frame
    f = lambda x: x.max() - x.min()
    print frame.apply(f)
    print frame.apply(f, axis=1)
    f = lambda x: Series([x.min(), x.max()], index=["min", "max"])
    print frame.apply(f)
    # NOTE: this local name shadows the built-in ``format`` for the rest of
    # the function.
    format = lambda x: "{0:.2f}".format(x)
    print frame.applymap(format)  # frame
    print frame["e"].map(format)  # series
    # sort and rank
    print "", ""
    obj = Series(range(4), index=list("dabc"))
    print obj
    print obj.sort_index()
    frame = DataFrame(np.arange(8).reshape((2, 4)), index=["three", "one"], columns=list("dabc"))
    print frame
    print frame.sort_index()
    print frame.sort_index(axis=1)
    print frame.sort_index(axis=1, ascending=False)
    # Sorting series
    print "", ""
    obj = Series([4, 7, -3, 2])
    print obj.order()
    obj = Series([4, np.nan, 7, np.nan, -3, 2])
    print obj.order()
    print obj.order(ascending=False)
    # order by multi columns
    print "", ""
    frame = DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})
    print frame.sort_index(by=["a", "b"])
    # rank
    print "", ""
    obj = Series([7, -5, 7, 4, 2, 0, 4])
    print obj.rank()  # method is average
    print obj.rank(method="first")  # No Duplicates
    print obj.rank(ascending=False, method="min")
    print obj.rank(ascending=False, method="max")
    f1 = DataFrame(obj, columns=["data"])
    f2 = DataFrame(obj.rank(), columns=["rank"])
    # merge by each index
    print pd.merge(f1, f2, left_index=True, right_index=True)
    # Index of the axis with duplicate values
    print "", ""
    obj = Series(range(5), index=list("aaabc"))
    print obj
    print obj.index.is_unique
    print obj["a"]
    print obj["c"]
    df = DataFrame(np.arange(12.0).reshape((4, 3)), index=list("aabb"), columns=list("ccd"))
    print df
    print df.ix["b"]
    print df["c"]
def test_rank(self):
    """Exercise ``Series.rank`` against scipy and hand-computed values.

    Covers: agreement between float and object dtype, agreement with
    ``scipy.stats.rankdata`` when NaNs are masked, percentage ranks with
    and without missing values, very small magnitudes, timedelta data
    (GH 5968) and a shuffled array of distinct floats.
    """
    tm._skip_if_no_scipy()
    from scipy.stats import rankdata

    # Introduce missing values and ties into the fixture series.
    self.ts[::2] = np.nan
    self.ts[:10][::3] = 4.

    float_ranks = self.ts.rank()
    object_ranks = self.ts.astype('O').rank()
    assert_series_equal(float_ranks, object_ranks)

    nan_mask = np.isnan(self.ts)
    inf_filled = self.ts.fillna(np.inf)

    # rankdata hands back a plain ndarray, so wrap it in a Series and
    # blank out the positions that were NaN in the fixture.
    expected = Series(rankdata(inf_filled), index=inf_filled.index,
                      name='ts')
    expected[nan_mask] = np.nan
    tm.assert_series_equal(float_ranks, expected)

    # Integer input ranks the same as its float counterpart.
    ser = Series(np.arange(5).repeat(2))
    assert_series_equal(ser.rank(), ser.astype(float).rank())

    # Percentage ranks of 1..5.
    ser = Series(np.arange(5)) + 1.0
    assert_series_equal(ser.rank(pct=True), ser / 5.0)

    # All-equal values share a single percentage rank.
    ser = Series(np.repeat(1, 100))
    expected = Series(np.repeat(0.505, 100))
    assert_series_equal(ser.rank(pct=True), expected)

    # Same series with one missing value.
    ser[1] = np.nan
    expected = Series(np.repeat(50.0 / 99.0, 100))
    expected[1] = np.nan
    assert_series_equal(ser.rank(pct=True), expected)

    ser = Series(np.arange(5)) + 1.0
    ser[4] = np.nan
    assert_series_equal(ser.rank(pct=True), ser / 4.0)

    # An all-NaN series ranks to itself.
    ser = Series(np.repeat(np.nan, 100))
    assert_series_equal(ser.rank(pct=True), ser.copy())

    ser = Series(np.arange(5)) + 1
    ser[4] = np.nan
    assert_series_equal(ser.rank(pct=True), ser / 4.0)

    # Same check with a datetime index.
    dates = date_range('1/1/1990', periods=5)
    ser = Series(np.arange(5), dates) + 1
    ser.iloc[4] = np.nan
    assert_series_equal(ser.rank(pct=True), ser / 4.0)

    # Very small magnitudes must still be told apart.
    ser = Series([1e-50, 1e-100, 1e-20, 1e-2, 1e-20 + 1e-30, 1e-1])
    expected = Series([2, 1, 3, 5, 4, 6.0])
    assert_series_equal(ser.rank(), expected)

    # GH 5968
    ser = Series(['3 day', '1 day 10m', '-2 day', NaT], dtype='m8[ns]')
    expected = Series([3, 2, 1, np.nan])
    assert_series_equal(ser.rank(), expected)

    # Shuffle a sorted array of distinct floats; the permutation itself
    # (shifted to 1-based) is then the expected rank.
    sorted_vals = np.array(
        [-50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, 2, 40],
        dtype='float64')
    shuffle = np.random.permutation(len(sorted_vals))
    ser = Series(sorted_vals[shuffle])
    expected = Series(shuffle + 1.0, dtype='float64')
    assert_series_equal(ser.rank(), expected)
#!/usr/bin/env python3 from pandas import Series, DataFrame import pandas as pd import numpy as np print('-----------------Series排名排序----------------------') obj = Series(range(4), index=['a', 'c', 'b', 'd']) print(obj.sort_index()) #Series根据索引排序 print(obj.sort_values()) #Series根据值排序 obj = Series([7, -5, 7, 3, 4, 2]) print(obj.rank()) #rank:排名值 method:排名时用于破坏平级关系的选项 print(obj.rank(method='first')) print(obj.rank(method='max')) print(obj.rank(method='min')) #first按值在原始数据中出现顺序分配排名 #max使用整个分组的最大排名 #min使用整个分组的最小排名 #average 默认:在相等分组中,为各个值分配平均排名 print('-----------------Dataframe排名排序----------------------') frame = DataFrame(np.arange(8).reshape(2, 4), index=['three', 'one'], columns=['d', 'a', 'b', 'c']) print(frame) print(frame.sort_index()) #根据行索引排序 print(frame.sort_index(axis=1)) #根据列索引排序 print(frame.sort_index(axis=1, ascending=False)) #倒序 df = DataFrame({'a': [4, 7, -3, 2], 'b': [0, 1, 0, 1]}) print(df) print(df.sort_values(by=['a'], ascending=False)) #根据列值进行排序 print(df.rank(axis=1))
def test_rank_signature(self):
    """Check that ``method`` must be passed to ``Series.rank`` by keyword.

    The first positional argument of ``rank`` is the axis, so a method
    name passed positionally is expected to raise ``ValueError``.
    """
    ser = Series([0, 1])

    # Keyword form is the supported spelling and must not raise.
    ser.rank(method='average')

    # Positional form: 'average' lands in the axis slot.
    with pytest.raises(ValueError):
        ser.rank('average')
def CombinedReport(id, interval, tree):
    """Build the combined ANC / PVC / Deliv report tuple for one id.

    Args:
        id: key used to look up rows in ``tree['ANC']``, ``tree['PVC']``
            and ``tree['Deliv']`` and in the module-level ``RowToSub``
            (example from the original docstring: ``'cK5zkZIUFsN'``).
        interval: key into the module-level ``Months`` and ``Mappings``
            tables; the original docstring describes it as the shorthand
            month name, e.g. ``'jan'``.
        tree: mapping with at least the keys ``'ANC'``, ``'PVC'`` and
            ``'Deliv'``, each indexable by ``id``.

    Returns:
        The tuple ``(A, B, C, D, E, F, G, H, I, L)``:

        * ``A``, ``D``, ``G``: sum of the NaN-zeroed values at the
          positions ``Months[interval]`` divided by 3.00, for ANC, PVC and
          Deliv respectively.
        * ``B``, ``E``, ``H``: the same, at the positions
          ``Months[Mappings[interval]]``.
        * ``C``, ``F``, ``I``: number of non-NaN values at the positions
          ``Months[interval]``, divided by 3.00 and multiplied by 100.
        * ``L``: the tuple ``(ord(J), ':of', K)`` built from the 'min'
          rank ``J`` and the counter ``K``.

        For ANC only every second element of the stored sequence
        (positions 0, 2, 4, ...) is used.
    """
    # --- ANC: average over the requested interval -----------------------
    mytype = tree['ANC'][id]
    mynewtype = []
    x = 0
    # keep every second element of the ANC sequence
    for i in range(0, len(mytype), 2):
        mynewtype.append(mytype[i])
    remapped = np.nan_to_num(mynewtype)  # NaN -> 0 so it does not poison the sum
    for i in Months[interval]:
        x += remapped[i]
    A = x / 3.00
    # start of comparison report bit of the function
    # (same computation, for the interval that Mappings pairs with `interval`)
    mytypeComp = tree['ANC'][id]
    mynewtypeComp = []
    xComp = 0
    for i in range(0, len(mytypeComp), 2):
        mynewtypeComp.append(mytypeComp[i])
    remapped = np.nan_to_num(mynewtypeComp)
    for i in Months[Mappings[interval]]:
        xComp += remapped[i]
    B = xComp / 3.00
    # start of completeness report for ANC
    mytype = tree['ANC'][id]
    mynewtype = []
    truetest = []
    x = 0
    for i in range(0, len(mytype), 2):
        mynewtype.append(mytype[i])
    remapped = np.isnan(mynewtype)
    for i in Months[interval]:
        truetest.append(remapped[i])
    # entries equal to False are the positions that hold a real value
    remappednum = truetest.count(False)
    C = (remappednum / 3.00) * 100
    # --- PVC: average over the requested interval -----------------------
    mytype = tree['PVC'][id]
    x = 0
    remapped = np.nan_to_num(mytype)
    for i in Months[interval]:
        x += remapped[i]
    D = x / 3.00
    # start of comparison of PVC past Month's report
    mytype = tree['PVC'][id]
    x = 0
    remapped = np.nan_to_num(mytype)
    for i in Months[Mappings[interval]]:
        x += remapped[i]
    E = x / 3.00
    # start of comparison of Completeness of PVC past Month's report
    mytype = tree['PVC'][id]
    truetest = []
    x = 0.0000
    remapped = np.isnan(mytype)
    remapped = list(remapped)
    for i in Months[interval]:
        truetest.append(remapped[i])
    x = truetest.count(False)
    F = (x / 3.00) * 100
    # --- Deliv: average over the requested interval ---------------------
    mytype = tree['Deliv'][id]
    x = 0
    remapped = np.nan_to_num(mytype)
    for i in Months[interval]:
        x += remapped[i]
    G = x / 3.00
    # start of comparison for the Past Months Deliveries report
    mytype = tree['Deliv'][id]
    x = 0
    remapped = np.nan_to_num(mytype)
    for i in Months[Mappings[interval]]:
        x += remapped[i]
    H = x / 3.00
    # start of comparison of Completeness of Deliveries for Month's report
    mytype = tree['Deliv'][id]
    truetest = []
    x = 0.0000
    remapped = np.isnan(mytype)
    remapped = list(remapped)
    for i in Months[interval]:
        truetest.append(remapped[i])
    x = truetest.count(False)
    I = (x / 3.00) * 100
    # --- Ranking --------------------------------------------------------
    # Position 0 of RankInitial is always this id's own score, so the rank
    # read back at label 0 below belongs to `id`.
    RankDict = {}
    RankInitial = []
    RankInitial.append(ANC_reportRank(id, interval, tree))
    c = RowToSub[id]
    total = 0
    # NOTE(review): the loop variable `i` is unused and every pass fetches
    # the same group `c`, so `total` grows on each pass -- confirm that
    # iterating over `check` here is intended.
    for i in check:
        result = check.get_group(c)
        k = result.shape[0]
        for j in range(k):
            p = result.values[j][0]
            RankDict[p] = ANC_reportRank(p, interval, tree)
            total += 1
    # drop this id's own entry; its score is already at position 0
    del RankDict[id]
    RankList = RankDict.values()
    for x in RankList:
        RankInitial.append(x)
    RankPosition = Series(RankInitial)
    RankFinal = RankPosition.rank(method='min')[0]
    J = RankFinal
    K = total
    # NOTE(review): `J` comes from Series.rank; the built-in ord() only
    # accepts a one-character string -- presumably `ord` is redefined
    # elsewhere in this module, TODO confirm.
    L = ord(J), ':of', K
    return A, B, C, D, E, F, G, H, I, L
def test_rank(self):
    """Exercise ``Series.rank`` against scipy and hand-computed values.

    Skipped when scipy is not installed.  Covers: agreement between float
    and object dtype, agreement with ``scipy.stats.rankdata`` when NaNs
    are masked, percentage ranks with and without missing values, very
    small magnitudes, timedelta data (GH 5968) and a shuffled array of
    distinct floats.
    """
    # importorskip imports a *module* and returns it.  The previous
    # arguments ('scipy.stats.special', 'scipy.stats.rankdata') are not
    # importable modules, so the test was skipped unconditionally.
    scipy_stats = pytest.importorskip('scipy.stats')
    rankdata = scipy_stats.rankdata

    # Introduce missing values and ties into the fixture series.
    self.ts[::2] = np.nan
    self.ts[:10][::3] = 4.

    ranks = self.ts.rank()
    oranks = self.ts.astype('O').rank()

    assert_series_equal(ranks, oranks)

    mask = np.isnan(self.ts)
    filled = self.ts.fillna(np.inf)

    # rankdata returns a ndarray
    exp = Series(rankdata(filled), index=filled.index, name='ts')
    exp[mask] = np.nan

    tm.assert_series_equal(ranks, exp)

    # Integer input ranks the same as its float counterpart.
    iseries = Series(np.arange(5).repeat(2))
    iranks = iseries.rank()
    exp = iseries.astype(float).rank()
    assert_series_equal(iranks, exp)

    # Percentage ranks of 1..5.
    iseries = Series(np.arange(5)) + 1.0
    exp = iseries / 5.0
    iranks = iseries.rank(pct=True)
    assert_series_equal(iranks, exp)

    # All-equal values share a single percentage rank.
    iseries = Series(np.repeat(1, 100))
    exp = Series(np.repeat(0.505, 100))
    iranks = iseries.rank(pct=True)
    assert_series_equal(iranks, exp)

    # Same series with one missing value.
    iseries[1] = np.nan
    exp = Series(np.repeat(50.0 / 99.0, 100))
    exp[1] = np.nan
    iranks = iseries.rank(pct=True)
    assert_series_equal(iranks, exp)

    iseries = Series(np.arange(5)) + 1.0
    iseries[4] = np.nan
    exp = iseries / 4.0
    iranks = iseries.rank(pct=True)
    assert_series_equal(iranks, exp)

    # An all-NaN series ranks to itself.
    iseries = Series(np.repeat(np.nan, 100))
    exp = iseries.copy()
    iranks = iseries.rank(pct=True)
    assert_series_equal(iranks, exp)

    iseries = Series(np.arange(5)) + 1
    iseries[4] = np.nan
    exp = iseries / 4.0
    iranks = iseries.rank(pct=True)
    assert_series_equal(iranks, exp)

    # Same check with a datetime index.
    rng = date_range('1/1/1990', periods=5)
    iseries = Series(np.arange(5), rng) + 1
    iseries.iloc[4] = np.nan
    exp = iseries / 4.0
    iranks = iseries.rank(pct=True)
    assert_series_equal(iranks, exp)

    # Very small magnitudes must still be told apart.
    iseries = Series([1e-50, 1e-100, 1e-20, 1e-2, 1e-20 + 1e-30, 1e-1])
    exp = Series([2, 1, 3, 5, 4, 6.0])
    iranks = iseries.rank()
    assert_series_equal(iranks, exp)

    # GH 5968
    iseries = Series(['3 day', '1 day 10m', '-2 day', NaT],
                     dtype='m8[ns]')
    exp = Series([3, 2, 1, np.nan])
    iranks = iseries.rank()
    assert_series_equal(iranks, exp)

    # Shuffle a sorted array of distinct floats; the permutation itself
    # (shifted to 1-based) is then the expected rank.
    values = np.array(
        [-50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, 2, 40
         ], dtype='float64')
    random_order = np.random.permutation(len(values))
    iseries = Series(values[random_order])
    exp = Series(random_order + 1.0, dtype='float64')
    iranks = iseries.rank()
    assert_series_equal(iranks, exp)
# Demo script: sorting and ranking a pandas Series.
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from numpy.random import random

ser1 = Series([500, 1000, 1500], index=['a', 'c', 'b'])
print(ser1)

# sorting by index
print(ser1.sort_index())

# sort by values
print(ser1.sort_values())
print(ser1.rank())

# ranking of series
ser2 = Series(random(10))
print(ser2)
print(ser2.rank())

# After sorting by value the ranks come out in increasing order.
ser2 = ser2.sort_values()
# The call parentheses were missing here, so the bound method object was
# printed instead of the ranks.
print(ser2.rank())
def panda_basci_function():
    """Tutorial walk-through of basic pandas Series/DataFrame features.

    The parts cover reindexing, dropping entries, indexing/selection,
    arithmetic with alignment, function application, sorting/ranking and
    duplicate axis labels.  Most demonstration ``print`` calls are
    commented out; only the ranking and duplicate-label examples at the
    end print anything.  Takes no arguments and returns nothing.
    """
    # Basic functionality
    # PART1: reindexing with reindex
    obj = Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
    obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])  # reindex rearranges by the new index; labels that do not exist get NaN
    obj3 = obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)
    # print(obj2)
    # print(obj3)
    obj3 = Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
    # print(obj3)
    # print(obj3.reindex(range(6), method='ffill'))
    # print(obj3.reindex(range(6), method='bfill'))
    # method: ffill/pad -- fill/carry forward; bfill/backfill -- fill/carry backward
    # For a DataFrame, reindex can change the row index, the columns, or both;
    # when only one sequence is passed, the rows are reindexed.
    frame = DataFrame(np.arange(9).reshape((3, 3)), columns=['Ohio', 'Texas', 'California'], index=['a', 'c', 'd'])
    frame2 = frame.reindex(['a', 'b', 'c', 'd'])  # reindex the rows only
    # print(frame2)
    states = ['Texas', 'Utah', 'California']
    # print(frame.reindex(columns=states))  # reindex the columns
    # print(frame.reindex(index=['a', 'b', 'c', 'd'], columns=states))  # reindex rows and columns together
    # print(frame.ix[['a', 'b', 'c', 'd'], states])
    # PART2: dropping entries from an axis
    obj = Series(np.arange(5), index=['a', 'b', 'c', 'd', 'e'])
    new_obj = obj.drop('c')  # drop label 'c'
    # print(new_obj)
    # print(obj.drop(['c', 'd']))
    data = DataFrame(np.arange(16).reshape((4, 4)), index=['Ohio', 'Colorado', 'Utah', 'New York'], columns=['one', 'two', 'three', 'four'])
    # print(data.drop(['Colorado', 'Ohio']))  # drop row labels
    # print(data.drop('two', axis=1))  # axis=1 drops labels from the columns
    # print(data.drop(['two', 'four'], axis=1))
    # PART3: indexing, selection and filtering
    obj = Series(np.arange(4), index=['a', 'b', 'c', 'd'])
    # print(obj['b'])  # label indexing
    # print(obj[1])  # integer indexing
    # print(obj[2:4])
    # print(obj[['b', 'c']])
    # print(obj['b': 'c'])  # label slicing; the end point is included
    # print(data['two'])  # indexing a DataFrame retrieves one or more columns
    # print(data[['three', 'one']])
    # print(data[:2])  # slicing selects rows
    # print(data[data['three'] > 5])  # a boolean array selects rows
    # print(data.ix[:2])
    # print(data.ix['Colorado'])  # select a row by its label
    # print(data.ix['Colorado', ['two', 'three']])  # row label & column labels
    # print(data.ix[:, 'two'])  # column 'two'
    # The bare string below is a no-op statement kept from the original.
    # It is a cheat sheet (in Chinese) of the selection forms:
    # obj[val] selects columns, obj.ix[val] selects rows, obj.ix[:, val]
    # selects columns, obj.ix[val1, val2] selects both; loc indexes by
    # row label, iloc by row number, ix by either.
    """
    obj[val]: 选取df的一个或一组列
    obj.ix[val]: 选取dc的单个行或一组行
    obj.ix[:, val]: 选取单个列或列子集
    obj.ix[val1, val2]: 同时选取行和列
    loc:通过行标签索引数据
    iloc:通过行号索引行数据
    ix:通过行标签或行号索引数据(基于loc和iloc的混合)
    df.ix[0] -- 行号索引
    dc.ix['a'] -- 行标签索引
    """
    # PART4: arithmetic and data alignment
    s1 = Series([1, 2, 3, 4], index=['a', 'c', 'd', 'e'])
    s2 = Series([5, 6, 7, 8, 9], index=['a', 'c', 'e', 'f', 'g'])
    # print(s1 + s2)  # the result index is the union of both; alignment introduces NaN where labels do not overlap
    # df1 = DataFrame(np.arange(9).reshape((3, 3)),
    #                 columns=list('bcd'),
    #                 index=['Ohio', 'Texas', 'Colorado'])
    # df2 = DataFrame(np.arange(12).reshape((4, 3)),
    #                 columns=list('bde'),
    #                 index=['Utah', 'Ohio', 'Texas', 'Oregon'])
    # print(df1 + df2)  # DataFrame alignment happens on rows and columns at once
    # Fill values in arithmetic methods: add(), sub(), div(), mul()
    df1 = DataFrame(np.arange(12).reshape((3, 4)), columns=list('abcd'))
    df2 = DataFrame(np.arange(20).reshape((4, 5)), columns=list('abcde'))
    # print(df1.add(df2, fill_value=0))  # call the DataFrame's add()
    # print(df1.reindex(columns=df2.columns, fill_value=0))  # reindex
    # Operations between DataFrame and Series
    arr = np.arange(12).reshape((3, 4))
    # print(arr - arr[0])  # broadcasting
    frame = DataFrame(np.arange(12).reshape((4, 3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
    # series = frame.ix[0]
    # print(frame - series)  # the Series index is matched to the DataFrame columns, then broadcast down the rows
    # Subtracting a column: match on the rows and broadcast across the columns
    series = frame['d']
    # print(frame.sub(series, axis=0))  # axis is the axis to match on: rows are axis 0, columns are axis 1
    # PART5: function application and mapping
    frame = DataFrame(np.random.randn(4, 3), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
    # print(np.abs(frame))  # NumPy element-wise functions can be used directly
    f = lambda x: x.max() - x.min()
    # print(frame.apply(f))  # apply() applies the function to each row or column
    # print(frame.apply(f, axis=1))
    # NOTE: this local name shadows the built-in ``format`` inside the function.
    format = lambda x: '%.2f' % x
    # print(frame.applymap(format))  # element-wise Python functions go through applymap()
    # print(frame[0].map(format))
    # PART6: sorting and ranking
    obj = Series(range(4), index=['d', 'a', 'b', 'c'])
    # print(obj.sort_index())  # sort by the row index
    # print(obj.sort_values())  # sort by the values
    frame = DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'], columns=['d', 'a', 'b', 'c'])
    # print(frame.sort_index())  # sort along the row axis
    # print(frame.sort_index(axis=1))  # sort along the column axis
    # print(frame.sort_index(by='b'))  # sort by one column
    # Ranking is closely related to sorting: it assigns a rank value (from 1 up
    # to the number of valid data points), similar to NumPy's argsort(), except
    # that ranking can break ties according to a rule.
    obj = Series([1, 2, 3, 4])
    print(obj.rank())  # each value now says where the value at index 0-3 of the original obj ranks within the series
    obj = Series([1, 1, 2, 2, 3, 4])
    print(obj.rank())
    # Index 0 and index 1 both hold the value 1; by the description above they
    # would take ranks 1 and 2, so which of the two is first?
    # By default rank() gives tied values the mean of the ranks they occupy
    # (here 1st and 2nd), so the value 1 at index 0 and index 1 is ranked
    # (1+2)/2 = 1.5 in both places.
    # method    meaning
    # average   default: give each value in a tied group the mean rank
    # min       use the lowest rank of the whole group (two tied for 1st place, the next is 3rd)
    # max       use the highest rank of the whole group (two tied for 2nd place, the next is 3rd)
    # first     assign ranks in the order the values appear in the original data
    # PART7: axis indexes with duplicate labels
    obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
    print(obj)
    print(obj.index.is_unique)  # whether the index labels are unique
    print(obj['a'])  # label 'a' maps to 2 values, so a Series is returned
    print(obj['c'])  # label 'c' maps to 1 value, so a scalar is returned
def pd_05():
    """Print ranking variants and summary statistics of a small Series.

    Prints, in order: the default (average) ranks, the ranks with ties
    broken by order of appearance, the same in descending order, and the
    output of ``describe()``.  Takes no arguments and returns nothing.
    """
    obj = Series([7, -5, 7, 4, 2, 0, 4])
    # Single-argument print(...) behaves the same as the Python 2 print
    # statement and is also valid on Python 3, where the statement form
    # is a SyntaxError.
    print(obj.rank())
    print(obj.rank(method='first'))
    print(obj.rank(ascending=False, method='first'))
    print(obj.describe())