Example #1
0
 def test_rank_signature(self):
     """``rank`` accepts ``method`` by keyword; a positional string is taken as the axis."""
     ser = Series([0, 1])

     # The keyword form must succeed.
     ser.rank(method='average')

     # The positional form is rejected with an axis-related error.
     expected_error = ("No axis named average for object type"
                       " <class 'pandas.core.series.Series'>")
     with pytest.raises(ValueError, match=expected_error):
         ser.rank('average')
Example #2
0
    def test_rank_modify_inplace(self):
        # GH 18521
        # Check rank does not mutate series
        s = Series([Timestamp('2017-01-05 10:20:27.569000'), NaT])
        expected = s.copy()

        s.rank()
        result = s
        assert_series_equal(result, expected)
Example #3
0
 def test_rank_desc_mix_nans_infs(self):
     # GH 19538
     # check descending ranking when mix nans and infs
     iseries = Series([1, np.nan, np.inf, -np.inf, 25])
     result = iseries.rank(ascending=False)
     exp = Series([3, np.nan, 1, 4, 2], dtype="float64")
     tm.assert_series_equal(result, exp)
Example #4
0
 def test_rank_desc_mix_nans_infs(self):
     # GH 19538
     # check descending ranking when mix nans and infs
     iseries = Series([1, np.nan, np.inf, -np.inf, 25])
     result = iseries.rank(ascending=False)
     exp = Series([3, np.nan, 1, 4, 2], dtype='float64')
     tm.assert_series_equal(result, exp)
Example #5
0
    def test_rank_inf(self):
        """Rank a shuffled increasing float array bracketed by -inf and +inf.

        The test is skipped unconditionally; the code after the skip records
        the intended check.
        """
        pytest.skip('DataFrame.rank does not currently rank '
                    'np.inf and -np.inf properly')

        # Listed in ascending order, so the value at position k has rank k + 1.
        ascending_vals = np.array(
            [-np.inf, -50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10,
             2, 40, np.inf], dtype='float64')
        perm = np.random.permutation(len(ascending_vals))
        shuffled = Series(ascending_vals[perm])
        wanted = Series(perm + 1.0, dtype='float64')
        assert_series_equal(shuffled.rank(), wanted)
Example #6
0
    def test_rank_inf(self):
        """Intended check that +/-inf are ranked at the extremes (always skipped)."""
        pytest.skip('DataFrame.rank does not currently rank '
                    'np.inf and -np.inf properly')

        # Ascending literal: position k corresponds to rank k + 1.
        ordered_floats = [
            -np.inf, -50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10,
            2, 40, np.inf,
        ]
        values = np.array(ordered_floats, dtype='float64')
        order = np.random.permutation(len(values))
        result = Series(values[order]).rank()
        expected = Series(order + 1.0, dtype='float64')
        assert_series_equal(result, expected)
Example #7
0
    def test_rank_methods_series(self, method, op, value):
        from scipy.stats import rankdata

        xs = np.random.randn(9)
        xs = np.concatenate([xs[i:] for i in range(0, 9, 2)])  # add duplicates
        np.random.shuffle(xs)

        index = [chr(ord("a") + i) for i in range(len(xs))]
        vals = op(xs, value)
        ts = Series(vals, index=index)
        result = ts.rank(method=method)
        sprank = rankdata(vals, method if method != "first" else "ordinal")
        expected = Series(sprank, index=index).astype("float64")
        tm.assert_series_equal(result, expected)
Example #8
0
    def test_rank_dense_method(self):
        dtypes = ['O', 'f8', 'i8']
        in_out = [([1], [1]), ([2], [1]), ([0], [1]), ([2, 2], [1, 1]),
                  ([1, 2, 3], [1, 2, 3]), (
                      [4, 2, 1],
                      [3, 2, 1],
                  ), ([1, 1, 5, 5, 3], [1, 1, 3, 3, 2]),
                  ([-5, -4, -3, -2, -1], [1, 2, 3, 4, 5])]

        for ser, exp in in_out:
            for dtype in dtypes:
                s = Series(ser).astype(dtype)
                result = s.rank(method='dense')
                expected = Series(exp).astype(result.dtype)
                assert_series_equal(result, expected)
Example #9
0
def gene_ttest(edgeList, one_value, group_value, n_genes, gene2num):
    """Rank genes by the absolute value of their ``ttest_onesmaple`` statistic.

    :param edgeList: not used by this function; kept so callers need not change
    :param one_value: indexable by gene number; element ``i`` is passed as the
        first argument of ``ttest_onesmaple``
    :param group_value: indexable by gene number; element ``i`` is passed as
        the second argument of ``ttest_onesmaple``
    :param n_genes: number of genes to score (indices ``0 .. n_genes - 1``)
    :param gene2num: not used by this function; kept so callers need not change
    :return: tuple ``(rank, statistic_list)`` where ``rank`` is an integer
        array of ascending ranks of the absolute statistics and
        ``statistic_list`` holds those absolute statistics
    """
    statistic_list = []
    for i in range(n_genes):
        statistic = ttest_onesmaple(one_value[i], group_value[i])
        # NaN and +/-inf statistics are scored as 0.
        if math.isnan(statistic) or math.isinf(statistic):
            statistic_list.append(0.)
        else:
            statistic_list.append(abs(statistic))

    # ``np.int`` was removed from NumPy; the builtin ``int`` is the same dtype.
    # Ties receive fractional average ranks, which the cast truncates.
    rank = Series(statistic_list).rank().values.astype(int)

    return rank, statistic_list
Example #10
0
def _ms_sing(geneset: list, x: pd.Series, norm_method: str,
             rankup: bool) -> dict:
    """
    bare bones version of scsing scoring. Their function (see scsingscore.py)
    does a ton of stuff, here's the essentials

    :param geneset: Geneset to score against
    :param x: pd.Series with the gene expression of a single sample. One gene per row
    :param norm_method: how to normalize the scores
    :param rankup: direction of ranking, up: True, down: False
    :return: dict with ``total_score`` (the normalised score minus 0.5) and
        ``mad_up`` (``statsmodels.robust.scale.mad`` of the collected ranks)
    """
    assert isinstance(x, pd.Series)
    up_sort = x.rank(method='min', ascending=rankup)

    # Collect the rank of every signature gene present in the sample. Genes
    # missing from the sample are dropped, so they do not count towards the
    # signature length used for normalisation.
    su = [up_sort[gene] for gene in geneset if gene in up_sort.index]
    sig_len_up = len(su)

    # normalise the score for the number of genes in the signature
    score_up = np.mean(su)
    norm_up = si.normalisation(norm_method=norm_method,
                               library_len=len(x.index),
                               score_list=su,
                               score=score_up,
                               sig_len=sig_len_up)
    total_score = norm_up - 0.5
    mad_up = statsmodels.robust.scale.mad(su)
    return dict(total_score=total_score, mad_up=mad_up)
Example #11
0
def rank(value: Series) -> Series:
    """
    Rank

    <Description>
    Returns the rank of each stock among the KOSPI and KOSDAQ constituents
    for the day. The result lies between 0 and 1; the closer it is to 1,
    the higher the rank.

    <Usage>
    Pass the value to be ranked as the first argument.
    For example, to rank the 20-day average trading value, write
    'rank(sma(tr_val, 20))' or, with the Korean aliases,
    '순위(단순이동평균(거래대금, 20))'.

    :param value: (data) the value to be ranked
    :return: percentile ranks of ``value``
    """
    percentile_rank = value.rank(pct=True)
    return percentile_rank
Example #12
0
    def test_rank_methods_series(self):
        from scipy.stats import rankdata

        xs = np.random.randn(9)
        xs = np.concatenate([xs[i:] for i in range(0, 9, 2)])  # add duplicates
        np.random.shuffle(xs)

        index = [chr(ord("a") + i) for i in range(len(xs))]

        for vals in [xs, xs + 1e6, xs * 1e-6]:
            ts = Series(vals, index=index)

            for m in ["average", "min", "max", "first", "dense"]:
                result = ts.rank(method=m)
                sprank = rankdata(vals, m if m != "first" else "ordinal")
                expected = Series(sprank, index=index).astype("float64")
                tm.assert_series_equal(result, expected)
Example #13
0
    def test_rank_methods_series(self):
        """Compare ``Series.rank`` with ``scipy.stats.rankdata`` for every method.

        The same random data (with duplicates) is ranked as-is, shifted by
        1e6 and scaled by 1e-6.
        """
        tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata')
        from scipy.stats import rankdata

        xs = np.random.randn(9)
        xs = np.concatenate([xs[i:] for i in range(0, 9, 2)])  # add duplicates
        np.random.shuffle(xs)

        index = [chr(ord('a') + i) for i in range(len(xs))]

        for vals in [xs, xs + 1e6, xs * 1e-6]:
            ts = Series(vals, index=index)

            for m in ['average', 'min', 'max', 'first', 'dense']:
                # Pass the method by keyword: given positionally, the string
                # is read as ``axis`` and rejected.
                result = ts.rank(method=m)
                # "first" is passed to rankdata as "ordinal".
                sprank = rankdata(vals, m if m != 'first' else 'ordinal')
                # rankdata may return integers; cast so the dtypes compare equal.
                expected = Series(sprank, index=index).astype('float64')
                tm.assert_series_equal(result, expected)
Example #14
0
    def test_rank_dense_method(self):
        dtypes = ['O', 'f8', 'i8']
        in_out = [([1], [1]),
                  ([2], [1]),
                  ([0], [1]),
                  ([2, 2], [1, 1]),
                  ([1, 2, 3], [1, 2, 3]),
                  ([4, 2, 1], [3, 2, 1],),
                  ([1, 1, 5, 5, 3], [1, 1, 3, 3, 2]),
                  ([-5, -4, -3, -2, -1], [1, 2, 3, 4, 5])]

        for ser, exp in in_out:
            for dtype in dtypes:
                s = Series(ser).astype(dtype)
                result = s.rank(method='dense')
                expected = Series(exp).astype(result.dtype)
                assert_series_equal(result, expected)
Example #15
0
    def test_rank_methods_series(self):
        """Compare ``Series.rank`` with ``scipy.stats.rankdata`` for every method.

        The same random data (with duplicates) is ranked as-is, shifted by
        1e6 and scaled by 1e-6.
        """
        tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata')
        from scipy.stats import rankdata

        xs = np.random.randn(9)
        xs = np.concatenate([xs[i:] for i in range(0, 9, 2)])  # add duplicates
        np.random.shuffle(xs)

        index = [chr(ord('a') + i) for i in range(len(xs))]

        for vals in [xs, xs + 1e6, xs * 1e-6]:
            ts = Series(vals, index=index)

            for m in ['average', 'min', 'max', 'first', 'dense']:
                result = ts.rank(method=m)
                # "first" is passed to rankdata as "ordinal".
                sprank = rankdata(vals, m if m != 'first' else 'ordinal')
                # rankdata may return integers; cast so the dtypes compare
                # equal regardless of the installed scipy.
                expected = Series(sprank, index=index).astype('float64')
                tm.assert_series_equal(result, expected)
Example #16
0
    def test_rank_methods_series(self):
        """Compare ``Series.rank`` with ``scipy.stats.rankdata`` for every method.

        Skipped when scipy is not installed.
        """
        # importorskip imports a *module*. ``scipy.stats.rankdata`` is a
        # function, so asking for it as a module never succeeds and the test
        # was skipped every time. Import the module and take the attribute.
        sp_stats = pytest.importorskip('scipy.stats')
        rankdata = sp_stats.rankdata

        xs = np.random.randn(9)
        xs = np.concatenate([xs[i:] for i in range(0, 9, 2)])  # add duplicates
        np.random.shuffle(xs)

        index = [chr(ord('a') + i) for i in range(len(xs))]

        for vals in [xs, xs + 1e6, xs * 1e-6]:
            ts = Series(vals, index=index)

            for m in ['average', 'min', 'max', 'first', 'dense']:
                result = ts.rank(method=m)
                # "first" is passed to rankdata as "ordinal".
                sprank = rankdata(vals, m if m != 'first' else 'ordinal')
                expected = Series(sprank, index=index).astype('float64')
                tm.assert_series_equal(result, expected)
Example #17
0
    def test_rank_methods_series(self):
        """Check each ranking method of ``Series.rank`` against scipy's ``rankdata``.

        Skipped when scipy is not installed.
        """
        # ``pytest.importorskip`` only imports modules. Requesting the
        # function ``scipy.stats.rankdata`` as if it were a module fails on
        # every run, which silently skipped this test.
        sp_stats = pytest.importorskip('scipy.stats')
        rankdata = sp_stats.rankdata

        xs = np.random.randn(9)
        xs = np.concatenate([xs[i:] for i in range(0, 9, 2)])  # add duplicates
        np.random.shuffle(xs)

        index = [chr(ord('a') + i) for i in range(len(xs))]

        for vals in [xs, xs + 1e6, xs * 1e-6]:
            ts = Series(vals, index=index)

            for m in ['average', 'min', 'max', 'first', 'dense']:
                result = ts.rank(method=m)
                # "first" is passed to rankdata as "ordinal".
                sprank = rankdata(vals, m if m != 'first' else 'ordinal')
                expected = Series(sprank, index=index).astype('float64')
                tm.assert_series_equal(result, expected)
Example #18
0
    def test_rank_dense_method(self):
        dtypes = ["O", "f8", "i8"]
        in_out = [
            ([1], [1]),
            ([2], [1]),
            ([0], [1]),
            ([2, 2], [1, 1]),
            ([1, 2, 3], [1, 2, 3]),
            ([4, 2, 1], [3, 2, 1]),
            ([1, 1, 5, 5, 3], [1, 1, 3, 3, 2]),
            ([-5, -4, -3, -2, -1], [1, 2, 3, 4, 5]),
        ]

        for ser, exp in in_out:
            for dtype in dtypes:
                s = Series(ser).astype(dtype)
                result = s.rank(method="dense")
                expected = Series(exp).astype(result.dtype)
                tm.assert_series_equal(result, expected)
Example #19
0
def Ranking(a, b):
    """Print the rank of ``a`` among the members of its group.

    The group is looked up through the module-level ``RowToSub`` mapping and
    the ``check`` group-by object. Every member is scored with
    ``ANC_reportRank(member, b)`` and ``a`` is ranked (ties take the lowest
    rank) against the other members.

    :param a: key into ``RowToSub``; presumably also a value in the first
        column of its group -- a ``KeyError`` is raised otherwise
    :param b: forwarded unchanged to ``ANC_reportRank``
    """
    # Score ``a`` first so that it ends up at position 0 of the ranked Series.
    RankInitial = [ANC_reportRank(a, b)]

    # Every pass of the original loop over ``check`` fetched the same group;
    # fetch it once.
    result = check.get_group(RowToSub[a])

    RankDict = {}
    total = 0.00
    for row in result.values:
        p = row[0]
        RankDict[p] = ANC_reportRank(p, b)
        total += 1

    # ``a`` is already at position 0; drop its duplicate from the group.
    del RankDict[a]
    RankInitial.extend(RankDict.values())

    RankFinal = Series(RankInitial).rank(method='min')[0]
    # A single formatted string prints the same text on Python 2 and 3.
    print('Ranked: %s on a scale of: %s' % (RankFinal, total))
Example #20
0
def Ranking(a, b, tree):
    """Return the rank of ``a`` among the members of its group.

    The group is looked up through the module-level ``RowToSub`` mapping and
    the ``check`` group-by object. Every member is scored with
    ``ANC_reportRank(member, b, tree)`` and ``a`` is ranked (ties take the
    lowest rank) against the other members.

    :param a: key into ``RowToSub``; presumably also a value in the first
        column of its group -- a ``KeyError`` is raised otherwise
    :param b: forwarded unchanged to ``ANC_reportRank``
    :param tree: forwarded unchanged to ``ANC_reportRank``
    :return: rank of ``a`` within its group
    """
    # Score ``a`` first so that it ends up at position 0 of the ranked Series.
    RankInitial = [ANC_reportRank(a, b, tree)]

    # Every pass of the original loop over ``check`` fetched the same group;
    # fetch it once.
    result = check.get_group(RowToSub[a])

    RankDict = {}
    for row in result.values:
        p = row[0]
        RankDict[p] = ANC_reportRank(p, b, tree)

    # ``a`` is already at position 0; drop its duplicate from the group.
    del RankDict[a]
    RankInitial.extend(RankDict.values())

    RankFinal = Series(RankInitial).rank(method='min')[0]
    return RankFinal
Example #21
0
    def test_rank_methods_series(self):
        """Validate all ranking methods against ``scipy.stats.rankdata``.

        For scipy 0.17.0 and later the expectation is cast to float64.
        """
        tm.skip_if_no_package('scipy', min_version='0.13',
                              app='scipy.stats.rankdata')
        import scipy
        from scipy.stats import rankdata

        base = np.random.randn(9)
        base = np.concatenate([base[k:] for k in range(0, 9, 2)])  # add duplicates
        np.random.shuffle(base)

        first_code = ord('a')
        labels = [chr(code) for code in range(first_code, first_code + len(base))]

        # The version check does not depend on the loop variables.
        cast_expected = LooseVersion(scipy.__version__) >= '0.17.0'

        for vals in (base, base + 1e6, base * 1e-6):
            ts = Series(vals, index=labels)

            for m in ('average', 'min', 'max', 'first', 'dense'):
                result = ts.rank(method=m)
                # "first" is passed to rankdata as "ordinal".
                scipy_name = 'ordinal' if m == 'first' else m
                expected = Series(rankdata(vals, scipy_name), index=labels)
                if cast_expected:
                    expected = expected.astype('float64')
                tm.assert_series_equal(result, expected)
Example #22
0
 def test_rank_inf(self, contents, dtype):
     dtype_na_map = {
         'float64': np.nan,
         'float32': np.nan,
         'int64': iNaT,
         'object': None
     }
     # Insert nans at random positions if underlying dtype has missing
     # value. Then adjust the expected order by adding nans accordingly
     # This is for testing whether rank calculation is affected
     # when values are interwined with nan values.
     values = np.array(contents, dtype=dtype)
     exp_order = np.array(range(len(values)), dtype='float64') + 1.0
     if dtype in dtype_na_map:
         na_value = dtype_na_map[dtype]
         nan_indices = np.random.choice(range(len(values)), 5)
         values = np.insert(values, nan_indices, na_value)
         exp_order = np.insert(exp_order, nan_indices, np.nan)
     # shuffle the testing array and expected results in the same way
     random_order = np.random.permutation(len(values))
     iseries = Series(values[random_order])
     exp = Series(exp_order[random_order], dtype='float64')
     iranks = iseries.rank()
     assert_series_equal(iranks, exp)
Example #23
0
    def test_rank_methods_series(self):
        """Check pandas' ranking methods against scipy's ``rankdata``.

        For scipy 0.17.0 and later the expectation is cast to float64.
        """
        tm.skip_if_no_package('scipy',
                              min_version='0.13',
                              app='scipy.stats.rankdata')
        import scipy
        from scipy.stats import rankdata

        sample = np.random.randn(9)
        # Overlapping tails give repeated values, i.e. ties to break.
        sample = np.concatenate([sample[start:] for start in range(0, 9, 2)])
        np.random.shuffle(sample)

        index = [chr(ord('a') + offset) for offset, _ in enumerate(sample)]

        # The version check does not depend on the loop variables.
        scipy_is_recent = LooseVersion(scipy.__version__) >= '0.17.0'

        for vals in [sample, sample + 1e6, sample * 1e-6]:
            ts = Series(vals, index=index)

            for m in ['average', 'min', 'max', 'first', 'dense']:
                # "first" is passed to rankdata as "ordinal".
                if m == 'first':
                    sprank = rankdata(vals, 'ordinal')
                else:
                    sprank = rankdata(vals, m)
                expected = Series(sprank, index=index)
                if scipy_is_recent:
                    expected = expected.astype('float64')

                tm.assert_series_equal(ts.rank(method=m), expected)
Example #24
0
 def test_rank_inf(self, contents, dtype):
     dtype_na_map = {
         'float64': np.nan,
         'float32': np.nan,
         'int64': iNaT,
         'object': None
     }
     # Insert nans at random positions if underlying dtype has missing
     # value. Then adjust the expected order by adding nans accordingly
     # This is for testing whether rank calculation is affected
     # when values are interwined with nan values.
     values = np.array(contents, dtype=dtype)
     exp_order = np.array(range(len(values)), dtype='float64') + 1.0
     if dtype in dtype_na_map:
         na_value = dtype_na_map[dtype]
         nan_indices = np.random.choice(range(len(values)), 5)
         values = np.insert(values, nan_indices, na_value)
         exp_order = np.insert(exp_order, nan_indices, np.nan)
     # shuffle the testing array and expected results in the same way
     random_order = np.random.permutation(len(values))
     iseries = Series(values[random_order])
     exp = Series(exp_order[random_order], dtype='float64')
     iranks = iseries.rank()
     assert_series_equal(iranks, exp)
Example #25
0

# Sort by row labels (index), descending
print(data.sort_index(ascending=False))

# Sort by column labels, descending
print(data.sort_index(axis=1, ascending=False))

# Sort ascending by the values of one column
print(data.sort_values(by='a'))  # by = column name

# Sort ascending by the values of several columns
print(data.sort_values(by=['a', 'b']))  # by = column names

# Rank ascending; tied values share a rank
print(data.rank())

# Rank ascending; ties are broken by order of appearance in the data
print(data.rank(method='first'))

# Rank descending; tied values share a rank
print(data.rank(ascending=False))

# Rank descending; ties are broken by order of appearance in the data
print(data.rank(method='first', ascending=False))

# Duplicate index labels are allowed.

#=========
# Descriptive statistics
#=========
obj = Series([4, 7, -3, 2])
# Series.order() no longer exists in pandas; sort_values() is its replacement.
print(obj.sort_values())
'''
2   -3
3    2
0    4
1    7
dtype: int64
'''


'''
Ranking is closely related to sorting, assigning ranks from one through the number of
valid data points in an array. It is similar to the indirect sort indices produced by
numpy.argsort , except that ties are broken according to a rule. The rank methods for
Series and DataFrame are the place to look; by default rank breaks ties by assigning
each group the mean rank:
'''
obj = Series([7, -5, 7, 4, 2, 0, 4])
print(obj.rank())
'''
0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64
'''
Example #27
0
 def test_rank_signature(self):
     """A tie-breaking method passed positionally is rejected as an invalid axis."""
     ser = Series([0, 1])

     # The keyword form is the supported spelling and must not raise.
     ser.rank(method='average')

     axis_error = r"No axis named average for object type <(class|type) 'type'>"
     with pytest.raises(ValueError, match=axis_error):
         ser.rank('average')
Example #28
0
    def test_rank_categorical(self):
        # GH issue #15420 rank incorrectly orders ordered categories

        # Test ascending/descending ranking for ordered categoricals
        exp = Series([1., 2., 3., 4., 5., 6.])
        exp_desc = Series([6., 5., 4., 3., 2., 1.])
        ordered = Series(
            ['first', 'second', 'third', 'fourth', 'fifth', 'sixth']
        ).astype(
            'category',
            categories=['first', 'second', 'third',
                        'fourth', 'fifth', 'sixth'],
            ordered=True
        )
        assert_series_equal(ordered.rank(), exp)
        assert_series_equal(ordered.rank(ascending=False), exp_desc)

        # Unordered categoricals should be ranked as objects
        unordered = Series(
            ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'],
        ).astype(
            'category',
            categories=['first', 'second', 'third',
                        'fourth', 'fifth', 'sixth'],
            ordered=False
        )
        exp_unordered = Series([2., 4., 6., 3., 1., 5.])
        res = unordered.rank()
        assert_series_equal(res, exp_unordered)

        unordered1 = Series(
            [1, 2, 3, 4, 5, 6],
        ).astype(
            'category',
            categories=[1, 2, 3, 4, 5, 6],
            ordered=False
        )
        exp_unordered1 = Series([1., 2., 3., 4., 5., 6.])
        res1 = unordered1.rank()
        assert_series_equal(res1, exp_unordered1)

        # Test na_option for rank data
        na_ser = Series(
            ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN]
        ).astype(
            'category',
            categories=[
                'first', 'second', 'third', 'fourth',
                'fifth', 'sixth', 'seventh'
            ],
            ordered=True
        )

        exp_top = Series([2., 3., 4., 5., 6., 7., 1.])
        exp_bot = Series([1., 2., 3., 4., 5., 6., 7.])
        exp_keep = Series([1., 2., 3., 4., 5., 6., np.NaN])

        assert_series_equal(na_ser.rank(na_option='top'), exp_top)
        assert_series_equal(na_ser.rank(na_option='bottom'), exp_bot)
        assert_series_equal(na_ser.rank(na_option='keep'), exp_keep)

        # Test na_option for rank data with ascending False
        exp_top = Series([7., 6., 5., 4., 3., 2., 1.])
        exp_bot = Series([6., 5., 4., 3., 2., 1., 7.])
        exp_keep = Series([6., 5., 4., 3., 2., 1., np.NaN])

        assert_series_equal(
            na_ser.rank(na_option='top', ascending=False),
            exp_top
        )
        assert_series_equal(
            na_ser.rank(na_option='bottom', ascending=False),
            exp_bot
        )
        assert_series_equal(
            na_ser.rank(na_option='keep', ascending=False),
            exp_keep
        )

        # Test with pct=True
        na_ser = Series(
            ['first', 'second', 'third', 'fourth', np.NaN],
        ).astype(
            'category',
            categories=['first', 'second', 'third', 'fourth'],
            ordered=True
        )
        exp_top = Series([0.4, 0.6, 0.8, 1., 0.2])
        exp_bot = Series([0.2, 0.4, 0.6, 0.8, 1.])
        exp_keep = Series([0.25, 0.5, 0.75, 1., np.NaN])

        assert_series_equal(na_ser.rank(na_option='top', pct=True), exp_top)
        assert_series_equal(na_ser.rank(na_option='bottom', pct=True), exp_bot)
        assert_series_equal(na_ser.rank(na_option='keep', pct=True), exp_keep)
Example #29
0
import pandas as pd
from pandas import Series, DataFrame

ser1 = Series(range(3), index=['C', 'A', 'B'])
ser1

# use sort_index to sort by index
ser1.sort_index()

# sort by values; Series.order() no longer exists, sort_values() replaces it
ser1.sort_values()

from numpy.random import randn
ser2 = Series(randn(10))
ser2

# ranking
ser2.sort_values()

ser2.rank()

ser2.sort_values(ascending=False)

ser3 = Series(randn(10))
ser3

# rank before and after sorting: once sorted, the ranks read 1..n in order
ser3.rank()
ser3 = ser3.sort_values()
ser3.rank()

Example #30
0
def test_rank_average_pct(dtype, ser, exp):
    s = Series(ser).astype(dtype)
    result = s.rank(method="average", pct=True)
    expected = Series(exp).astype(result.dtype)
    tm.assert_series_equal(result, expected)
Example #31
0
    def test_rank(self, datetime_series):
        """General ``Series.rank`` checks: NaN handling, pct ranks, tiny floats, timedeltas.

        Skipped when scipy is not installed.
        """
        # importorskip imports a *module*. ``scipy.stats.rankdata`` is a
        # function, so asking for it as a module never succeeds and the test
        # was skipped every time. Import the module and take the attribute.
        sp_stats = pytest.importorskip("scipy.stats")
        rankdata = sp_stats.rankdata

        datetime_series[::2] = np.nan
        # Single-step slice assignment: positions 0, 3, 6, 9, the same ones
        # the chained ``[:10][::3]`` form addressed.
        datetime_series[:10:3] = 4.0

        ranks = datetime_series.rank()
        oranks = datetime_series.astype("O").rank()

        tm.assert_series_equal(ranks, oranks)

        mask = np.isnan(datetime_series)
        filled = datetime_series.fillna(np.inf)

        # rankdata returns a ndarray
        exp = Series(rankdata(filled), index=filled.index, name="ts")
        exp[mask] = np.nan

        tm.assert_series_equal(ranks, exp)

        # integer input ranks the same as its float equivalent
        iseries = Series(np.arange(5).repeat(2))

        iranks = iseries.rank()
        exp = iseries.astype(float).rank()
        tm.assert_series_equal(iranks, exp)

        # pct=True divides the ranks by the number of valid observations
        iseries = Series(np.arange(5)) + 1.0
        exp = iseries / 5.0
        iranks = iseries.rank(pct=True)

        tm.assert_series_equal(iranks, exp)

        # all values tied: every element gets the average rank 50.5 / 100
        iseries = Series(np.repeat(1, 100))
        exp = Series(np.repeat(0.505, 100))
        iranks = iseries.rank(pct=True)
        tm.assert_series_equal(iranks, exp)

        # one NaN leaves 99 tied observations with average rank 50 / 99
        iseries[1] = np.nan
        exp = Series(np.repeat(50.0 / 99.0, 100))
        exp[1] = np.nan
        iranks = iseries.rank(pct=True)
        tm.assert_series_equal(iranks, exp)

        iseries = Series(np.arange(5)) + 1.0
        iseries[4] = np.nan
        exp = iseries / 4.0
        iranks = iseries.rank(pct=True)
        tm.assert_series_equal(iranks, exp)

        # all-NaN input stays all-NaN
        iseries = Series(np.repeat(np.nan, 100))
        exp = iseries.copy()
        iranks = iseries.rank(pct=True)
        tm.assert_series_equal(iranks, exp)

        iseries = Series(np.arange(5)) + 1
        iseries[4] = np.nan
        exp = iseries / 4.0
        iranks = iseries.rank(pct=True)
        tm.assert_series_equal(iranks, exp)

        # same check with a DatetimeIndex
        rng = date_range("1/1/1990", periods=5)
        iseries = Series(np.arange(5), rng) + 1
        iseries.iloc[4] = np.nan
        exp = iseries / 4.0
        iranks = iseries.rank(pct=True)
        tm.assert_series_equal(iranks, exp)

        # very small, closely spaced floats must still be distinguished
        iseries = Series([1e-50, 1e-100, 1e-20, 1e-2, 1e-20 + 1e-30, 1e-1])
        exp = Series([2, 1, 3, 5, 4, 6.0])
        iranks = iseries.rank()
        tm.assert_series_equal(iranks, exp)

        # GH 5968
        iseries = Series(["3 day", "1 day 10m", "-2 day", NaT], dtype="m8[ns]")
        exp = Series([3, 2, 1, np.nan])
        iranks = iseries.rank()
        tm.assert_series_equal(iranks, exp)

        # ascending values shuffled: the value at sorted position k has rank k + 1
        values = np.array(
            [-50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, 2, 40],
            dtype="float64",
        )
        random_order = np.random.permutation(len(values))
        iseries = Series(values[random_order])
        exp = Series(random_order + 1.0, dtype="float64")
        iranks = iseries.rank()
        tm.assert_series_equal(iranks, exp)
Example #32
0
# Sort the columns of ``frame`` by label, ascending and then descending.
print(frame.sort_index(axis=1))
print(frame.sort_index(axis=1, ascending=False))

# Sort a Series by its values; the second Series also contains NaN entries.
obj = Series([4, 7, -3, -2])
print(obj.sort_values())
obj = Series([4, np.nan, 7, np.nan, -3, -2])
print(obj.sort_values())

# Sort a DataFrame by one column, then by two columns.
frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
print(frame)
print(frame.sort_values(by='b'))
print(frame.sort_values(by=['a', 'b']))

# rank
obj = Series([7, -5, 7, 4, 2, 0, 4])
print(obj.rank())
print(obj.rank(method='first'))
print(obj.rank(method='max', ascending=False))

# axis=1 ranks the values within each row, across the columns.
frame = DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1], 'c': [-2, 5, 8, -2.5]})
print(frame)
print(frame.rank(axis=1))

'''
duplicate index
'''
# 'a' and 'b' each appear twice in the index; 'c' appears once.
obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
print(obj)
print(obj.index.is_unique)
print(obj['a'])
print(obj['c'])
# Python 2 print statements rewritten as single-argument print() calls,
# which produce the same output on Python 2 and Python 3.
print("根据索引排序,对于DataFrame可以指定轴。")
obj = Series(range(4), index=["d", "a", "b", "c"])
print(obj.sort_index())
frame = DataFrame(np.arange(8).reshape((2, 4)), index=["three", "one"], columns=list("dabc"))
print(frame.sort_index())
print(frame.sort_index(axis=1))  # axis=1 operates on the columns
print(frame.sort_index(axis=1, ascending=False))  # descending
print("")

print("根据值排序")
obj = Series([4, 7, -3, 2])
print(obj.sort_values())  # order() is obsolete
print("")

print("DataFrame指定列排序")
frame = DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})
print(frame)
print(frame.sort_values(by="b"))  # sort_index(by=...) is obsolete
print(frame.sort_values(by=["a", "b"]))
print("")

print("rank,求排名的平均位置(从1开始)")
obj = Series([7, -5, 7, 4, 2, 0, 4])
# Corresponding ranks: -5(1), 0(2), 2(3), 4(4), 4(5), 7(6), 7(7)
print(obj.rank())
print(obj.rank(method="first"))  # ties ranked by first appearance, no averaging
print(obj.rank(ascending=False, method="max"))  # descending, ties take the max rank, so -5 gets rank 7
frame = DataFrame({"b": [4.3, 7, -3, 2], "a": [0, 1, 0, 1], "c": [-2, 5, 8, -2.5]})
print(frame)
print(frame.rank(axis=1))
Example #34
0
def test_rank_first_pct(dtype, ser, exp):
        s = Series(ser).astype(dtype)
        result = s.rank(method='first', pct=True)
        expected = Series(exp).astype(result.dtype)
        assert_series_equal(result, expected)
# Sort ``obj`` by its index labels.
obj.sort_index()

frame=DataFrame(np.arange(8).reshape((2,4)),index=['three','one'],columns=['d','a','b','c'])

# Sort the columns by label, descending.
frame.sort_index(axis=1,ascending=False)

# Sort the rows by index label.
frame.sort_index()

# Sort the columns by label, ascending.
frame.sort_index(axis=1)

# Ranking

obj=Series([7,-5,7,4,2,0,4])

obj.rank()  # ranks ascending, starting from 1; rank() means rank(method='average'): tied values get the mean of their min and max ranks

# Several other tie-breaking methods exist; see p. 140

#******************************************
# Index labels are not necessarily unique; the following checks for duplicates

obj.index.is_unique


#***********************************************

df=DataFrame([[1.4,np.nan],[7.1,-4.5],[np.nan,np.nan],[0.75,-1.3]],index=['a','b','c','d'],columns=['one','two'])

# Column sums.
df.sum()
# For each random walk, the step at which it first reached a distance of 30
# from the origin.
# print() with a single argument prints the same text on Python 2 and 3.
print(np.argmax(walkcum[index], axis=1))
print(np.mean(np.argmax(walkcum[index], axis=1)))
# NOTE(review): this looks like a scratch list of pandas/NumPy API names
# rather than a runnable script -- several calls below omit arguments.
pd.Index

obj = Series([1,2,3])

obj.reindex()

data = DataFrame([[1,2,3],[4,5,6]])
data.drop()  # NOTE(review): drop() is called without labels/index/columns -- confirm intent

np.argsort()  # NOTE(review): argsort() is called without an array argument -- confirm intent

obj.rank()

obj.sort_values()


data.tail()

data.cov()

data.cov()

data.corr()

data.dropna()

data.loc
# Each ``print`` below had been split from its argument onto a separate line,
# so nothing shown in the sample outputs was actually printed. The calls are
# rejoined as print(...), which behaves the same on Python 2 and Python 3.
print(frame.sort_values(by=['a', 'b']))  # sort rows by column a first, then by column b
'''
   a  b
2  0 -3
0  0  4
3  1  2
1  1  7
'''
print("")

print('rank:默认升序,排名值从1开始')
obj = Series([4, 2, 0, 4], index=['a', 'b', 'c', 'd'])
# Ranks are assigned from the smallest value up: c:0(1) b:2(2) a:4(3) d:4(4)
print(obj.rank())
'''
a    3.5  求平均值(4+3)/2
b    2.0
c    1.0
d    3.5
'''
print(obj.rank(method='first'))  # rank ties by order of appearance, no averaging
'''
a    3.0
b    2.0
c    1.0
d    4.0
'''
print("")
Example #38
0
    def test_rank_categorical(self):
        """Rank ordered and unordered categoricals, including NaN placement.

        GH issue #15420: rank incorrectly orders ordered categories.
        """
        labels = ["first", "second", "third", "fourth", "fifth", "sixth"]

        # Test ascending/descending ranking for ordered categoricals
        exp = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
        exp_desc = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0])
        ordered = Series(labels).astype(
            CategoricalDtype(categories=labels, ordered=True)
        )
        tm.assert_series_equal(ordered.rank(), exp)
        tm.assert_series_equal(ordered.rank(ascending=False), exp_desc)

        # Unordered categoricals should be ranked as objects
        unordered = Series(labels).astype(
            CategoricalDtype(categories=labels, ordered=False)
        )
        exp_unordered = Series([2.0, 4.0, 6.0, 3.0, 1.0, 5.0])
        res = unordered.rank()
        tm.assert_series_equal(res, exp_unordered)

        unordered1 = Series([1, 2, 3, 4, 5, 6]).astype(
            CategoricalDtype([1, 2, 3, 4, 5, 6], False)
        )
        exp_unordered1 = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
        res1 = unordered1.rank()
        tm.assert_series_equal(res1, exp_unordered1)

        # Test na_option for rank data
        # (``np.NaN`` was removed from NumPy; ``np.nan`` is the same value.)
        na_ser = Series(labels + [np.nan]).astype(
            CategoricalDtype(labels + ["seventh"], True)
        )

        exp_top = Series([2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 1.0])
        exp_bot = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0])
        exp_keep = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, np.nan])

        tm.assert_series_equal(na_ser.rank(na_option="top"), exp_top)
        tm.assert_series_equal(na_ser.rank(na_option="bottom"), exp_bot)
        tm.assert_series_equal(na_ser.rank(na_option="keep"), exp_keep)

        # Test na_option for rank data with ascending False
        exp_top = Series([7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0])
        exp_bot = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 7.0])
        exp_keep = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0, np.nan])

        tm.assert_series_equal(
            na_ser.rank(na_option="top", ascending=False), exp_top
        )
        tm.assert_series_equal(
            na_ser.rank(na_option="bottom", ascending=False), exp_bot
        )
        tm.assert_series_equal(
            na_ser.rank(na_option="keep", ascending=False), exp_keep
        )

        # Test invalid values for na_option
        msg = "na_option must be one of 'keep', 'top', or 'bottom'"

        with pytest.raises(ValueError, match=msg):
            na_ser.rank(na_option="bad", ascending=False)

        # invalid type
        with pytest.raises(ValueError, match=msg):
            na_ser.rank(na_option=True, ascending=False)

        # Test with pct=True
        na_ser = Series(["first", "second", "third", "fourth", np.nan]).astype(
            CategoricalDtype(["first", "second", "third", "fourth"], True)
        )
        exp_top = Series([0.4, 0.6, 0.8, 1.0, 0.2])
        exp_bot = Series([0.2, 0.4, 0.6, 0.8, 1.0])
        exp_keep = Series([0.25, 0.5, 0.75, 1.0, np.nan])

        tm.assert_series_equal(na_ser.rank(na_option="top", pct=True), exp_top)
        tm.assert_series_equal(
            na_ser.rank(na_option="bottom", pct=True), exp_bot
        )
        tm.assert_series_equal(
            na_ser.rank(na_option="keep", pct=True), exp_keep
        )
Example #39
0
 def test_rank_signature(self):
     """``rank`` accepts ``method`` by keyword; a positional string is read as an axis."""
     ser = Series([0, 1])
     # Keyword form must be accepted without error.
     ser.rank(method="average")
     # Positional form lands in the axis slot and is rejected with this message.
     expected_error = "No axis named average for object type Series"
     with pytest.raises(ValueError, match=expected_error):
         ser.rank("average")
Example #40
0
    def test_rank_categorical(self):
        # GH issue #15420 rank incorrectly orders ordered categories

        # Test ascending/descending ranking for ordered categoricals
        exp = Series([1., 2., 3., 4., 5., 6.])
        exp_desc = Series([6., 5., 4., 3., 2., 1.])
        ordered = Series(
            ['first', 'second', 'third', 'fourth', 'fifth', 'sixth']
        ).astype(CategoricalDtype(categories=['first', 'second', 'third',
                                              'fourth', 'fifth', 'sixth'],
                                  ordered=True))
        assert_series_equal(ordered.rank(), exp)
        assert_series_equal(ordered.rank(ascending=False), exp_desc)

        # Unordered categoricals should be ranked as objects
        unordered = Series(['first', 'second', 'third', 'fourth',
                            'fifth', 'sixth']).astype(
            CategoricalDtype(categories=['first', 'second', 'third',
                                         'fourth', 'fifth', 'sixth'],
                             ordered=False))
        exp_unordered = Series([2., 4., 6., 3., 1., 5.])
        res = unordered.rank()
        assert_series_equal(res, exp_unordered)

        unordered1 = Series(
            [1, 2, 3, 4, 5, 6],
        ).astype(CategoricalDtype([1, 2, 3, 4, 5, 6], False))
        exp_unordered1 = Series([1., 2., 3., 4., 5., 6.])
        res1 = unordered1.rank()
        assert_series_equal(res1, exp_unordered1)

        # Test na_option for rank data
        na_ser = Series(
            ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN]
        ).astype(CategoricalDtype(['first', 'second', 'third', 'fourth',
                                   'fifth', 'sixth', 'seventh'], True))

        exp_top = Series([2., 3., 4., 5., 6., 7., 1.])
        exp_bot = Series([1., 2., 3., 4., 5., 6., 7.])
        exp_keep = Series([1., 2., 3., 4., 5., 6., np.NaN])

        assert_series_equal(na_ser.rank(na_option='top'), exp_top)
        assert_series_equal(na_ser.rank(na_option='bottom'), exp_bot)
        assert_series_equal(na_ser.rank(na_option='keep'), exp_keep)

        # Test na_option for rank data with ascending False
        exp_top = Series([7., 6., 5., 4., 3., 2., 1.])
        exp_bot = Series([6., 5., 4., 3., 2., 1., 7.])
        exp_keep = Series([6., 5., 4., 3., 2., 1., np.NaN])

        assert_series_equal(
            na_ser.rank(na_option='top', ascending=False),
            exp_top
        )
        assert_series_equal(
            na_ser.rank(na_option='bottom', ascending=False),
            exp_bot
        )
        assert_series_equal(
            na_ser.rank(na_option='keep', ascending=False),
            exp_keep
        )

        # Test invalid values for na_option
        msg = "na_option must be one of 'keep', 'top', or 'bottom'"

        with tm.assert_raises_regex(ValueError, msg):
            na_ser.rank(na_option='bad', ascending=False)

        # invalid type
        with tm.assert_raises_regex(ValueError, msg):
            na_ser.rank(na_option=True, ascending=False)

        # Test with pct=True
        na_ser = Series(['first', 'second', 'third', 'fourth', np.NaN]).astype(
            CategoricalDtype(['first', 'second', 'third', 'fourth'], True))
        exp_top = Series([0.4, 0.6, 0.8, 1., 0.2])
        exp_bot = Series([0.2, 0.4, 0.6, 0.8, 1.])
        exp_keep = Series([0.25, 0.5, 0.75, 1., np.NaN])

        assert_series_equal(na_ser.rank(na_option='top', pct=True), exp_top)
        assert_series_equal(na_ser.rank(na_option='bottom', pct=True), exp_bot)
        assert_series_equal(na_ser.rank(na_option='keep', pct=True), exp_keep)
Example #41
0
def test_pct_max_many_rows():
    """The largest percentile rank of a 2**24 + 1 row Series must be exactly 1."""
    # GH 18271
    n_rows = 2**24 + 1
    pct_ranks = Series(np.arange(n_rows)).rank(pct=True)
    assert pct_ranks.max() == 1
# Show ser2 ordered by value; each value keeps its original index label.
# (ser2 is defined outside this fragment.)
print(ser2.sort_values())
# The bare string below is a no-op expression that records the output
# printed by the call above.
'''
5   -2.682719
8   -1.789567
2   -0.991176
9   -0.547529
3    0.144296
4    0.481288
6    0.593906
7    0.815368
0    1.290655
1    1.383484
dtype: float64
'''

# Show the rank of every element in original index order; the recorded
# output agrees with the sorted listing (e.g. label 5 is smallest -> rank 1.0).
print(ser2.rank())
'''
0     9.0
1    10.0
2     3.0
3     5.0
4     6.0
5     1.0
6     7.0
7     8.0
8     2.0
9     4.0
dtype: float64
'''
Example #43
0
def test_pct_max_many_rows():
    """Percentile ranks over 2**24 + 1 increasing integers must top out at 1."""
    # GH 18271
    values = np.arange(2**24 + 1)
    largest_pct = Series(values).rank(pct=True).max()
    assert largest_pct == 1
Example #44
0
# distinct values of s (s is defined outside this fragment)
s.unique()

# frequency of each distinct non-NaN value (like SQL COUNT(*) ... GROUP BY),
# returned as a Series
s.value_counts()

# aggregation and statistics
s.max()
s.mean()
s.var()

# index label of the maximum element
s.idxmax()

# rank
s = Series([4, 1, 2, 5])
s.rank()                     # returns the ranks 3, 1, 2, 4 (as floats)

# plot
s.plot()
plt.show()

# transform / reorder ##########################################
# sort
new_s1 = s.sort_index()       # sort by index
new_s2 = s.sort_values()      # sort by values

# reindex includes the following steps:
# 1. Reordering existing data to match a set of labels.
# 2. Inserting NaN markers where no data exists for a label.
# 3. Possibly, filling missing data for a label using some type
#    of logic
Example #45
0
def test_rank_first_pct(dtype, ser, exp):
    s = Series(ser).astype(dtype)
    result = s.rank(method='first', pct=True)
    expected = Series(exp).astype(result.dtype)
    assert_series_equal(result, expected)
class MySeries:
    """Wrapper around a pandas ``Series`` whose operations return ``MySeries``.

    The wrapped Series is stored as ``x``; ``values`` and ``index`` are copied
    from it at construction time. Every public method forwards its arguments
    unchanged, either to a top-level ``pd.<name>`` function (the ``rolling_*``
    family) or to the same-named method of the wrapped Series, and wraps the
    result in a new ``MySeries``.

    NOTE(review): the ``pd.rolling_*`` functions and ``Series.nonzero`` /
    ``clip_lower`` / ``clip_upper`` are believed to be absent from recent
    pandas releases -- confirm which pandas version this class targets.
    """

    def __init__(self, *args, **kwargs):
        wrapped = Series(*args, **kwargs)
        self.x = wrapped
        self.values = wrapped.values
        self.index = wrapped.index

    def _via_module(self, func_name, args, kwargs):
        """Call ``pd.<func_name>(self.x, *args, **kwargs)`` and wrap the result."""
        func = getattr(pd, func_name)
        return MySeries(func(self.x, *args, **kwargs))

    def _via_series(self, method_name, args, kwargs):
        """Call ``self.x.<method_name>(*args, **kwargs)`` and wrap the result."""
        bound = getattr(self.x, method_name)
        return MySeries(bound(*args, **kwargs))

    # --- moving-window statistics, delegated to module-level functions ---

    def rolling_mean(self, *args, **kwargs):
        return self._via_module("rolling_mean", args, kwargs)

    def rolling_count(self, *args, **kwargs):
        return self._via_module("rolling_count", args, kwargs)

    def rolling_sum(self, *args, **kwargs):
        return self._via_module("rolling_sum", args, kwargs)

    def rolling_median(self, *args, **kwargs):
        return self._via_module("rolling_median", args, kwargs)

    def rolling_min(self, *args, **kwargs):
        return self._via_module("rolling_min", args, kwargs)

    def rolling_max(self, *args, **kwargs):
        return self._via_module("rolling_max", args, kwargs)

    def rolling_std(self, *args, **kwargs):
        return self._via_module("rolling_std", args, kwargs)

    def rolling_var(self, *args, **kwargs):
        return self._via_module("rolling_var", args, kwargs)

    def rolling_skew(self, *args, **kwargs):
        return self._via_module("rolling_skew", args, kwargs)

    def rolling_kurtosis(self, *args, **kwargs):
        return self._via_module("rolling_kurtosis", args, kwargs)

    def rolling_window(self, *args, **kwargs):
        return self._via_module("rolling_window", args, kwargs)

    # --- operations delegated to the wrapped Series' own methods ---

    def cumprod(self, *args, **kwargs):
        return self._via_series("cumprod", args, kwargs)

    def cumsum(self, *args, **kwargs):
        return self._via_series("cumsum", args, kwargs)

    def diff(self, *args, **kwargs):
        return self._via_series("diff", args, kwargs)

    def div(self, *args, **kwargs):
        return self._via_series("div", args, kwargs)

    def mul(self, *args, **kwargs):
        return self._via_series("mul", args, kwargs)

    def add(self, *args, **kwargs):
        return self._via_series("add", args, kwargs)

    def dropna(self, *args, **kwargs):
        return self._via_series("dropna", args, kwargs)

    def fillna(self, *args, **kwargs):
        return self._via_series("fillna", args, kwargs)

    def floordiv(self, *args, **kwargs):
        return self._via_series("floordiv", args, kwargs)

    def mod(self, *args, **kwargs):
        return self._via_series("mod", args, kwargs)

    def nlargest(self, *args, **kwargs):
        return self._via_series("nlargest", args, kwargs)

    def nonzero(self, *args, **kwargs):
        return self._via_series("nonzero", args, kwargs)

    def nsmallest(self, *args, **kwargs):
        return self._via_series("nsmallest", args, kwargs)

    def pow(self, *args, **kwargs):
        return self._via_series("pow", args, kwargs)

    def rank(self, *args, **kwargs):
        return self._via_series("rank", args, kwargs)

    def round(self, *args, **kwargs):
        return self._via_series("round", args, kwargs)

    def shift(self, *args, **kwargs):
        return self._via_series("shift", args, kwargs)

    def sub(self, *args, **kwargs):
        return self._via_series("sub", args, kwargs)

    def abs(self, *args, **kwargs):
        return self._via_series("abs", args, kwargs)

    def clip(self, *args, **kwargs):
        return self._via_series("clip", args, kwargs)

    def clip_lower(self, *args, **kwargs):
        return self._via_series("clip_lower", args, kwargs)

    def clip_upper(self, *args, **kwargs):
        return self._via_series("clip_upper", args, kwargs)

    def interpolate(self, *args, **kwargs):
        return self._via_series("interpolate", args, kwargs)

    def resample(self, *args, **kwargs):
        return self._via_series("resample", args, kwargs)

    def replace(self, *args, **kwargs):
        return self._via_series("replace", args, kwargs)
Example #47
0
 def test_rank_signature(self):
     s = Series([0, 1])
     s.rank(method='average')
     self.assertRaises(ValueError, s.rank, 'average')
Example #48
0
# now shift gears and sort by values
obj = Series([4, 7, -3, 2])
obj.order()  # error -- kept to show the failing call; sort_values() below is the working form
obj.sort_values()

# NaN entries are present here to show how sorting treats missing values
obj = Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()

frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame
frame.sort_index(by='b')  # from old version of book
frame.sort_values(by='b')  # you can make this a list if you like

# ranking (this Series contains ties: 7 twice and 4 twice)
obj = Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()

# break ties by order of appearance
obj.rank(method='first')

# rank in descending order, giving tied values the largest rank of their group
obj.rank(ascending=False, method='max')

frame = DataFrame({
    'b': [4.3, 7, -3, 2],
    'a': [0, 1, 0, 1],
    'c': [-2, 5, 8, -2.5]
})
frame
# rank along axis=1, i.e. compare the values within each row
frame.rank(axis=1)

# duplicate index labels
obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
Example #49
0
# sort by the row index (frame is defined outside this fragment)
frame.sort_index()
# sort by column names
frame.sort_index(axis=1)
# descending order
frame.sort_index(axis=1, ascending=False)
# sort by values; this form only works on a Series
obj = Series([4,7,-3,2])
obj.order()
# when sorting, missing values are always placed at the end
# sort by multiple columns
frame = DataFrame({'b':[4,7,-3,2],'a':[0,1,0,1]})
frame.sort_index(by=['a','b'])
frame.order(by=['a','b'])
# ranking
obj = Series([7,-5,7,4,2,0,4])
obj.rank()
# for equal values, rank by order of appearance
obj.rank(method='first')
# descending order
obj.rank(ascending=False,method='max')
# compute ranks along axis=1
frame = DataFrame({'b':[4.3,7,-3,2],'a':[0,1,0,1],'c':[-2,5,8,-2.5]})
frame.rank(axis=1)

## axis indexes with duplicate labels
obj = Series(range(5), index=['a','a','b','b','c'])
# check whether the index labels are unique
obj.index.is_unique
# when one label maps to several entries, indexing by it returns all of them
obj['a']
def main():
    """Walk through basic pandas Series/DataFrame operations, printing each result.

    Written with Python 2 ``print`` statements. Sections, in order:
    reindexing, dropping entries, indexing and slicing, arithmetic with
    index alignment, DataFrame/Series broadcasting, apply/map, sorting,
    ranking, and indexes with duplicate labels.
    """
    # reindex
    obj = Series(range(4), index="a b c d".split(" ")[::-1])
    print obj

    obj2 = obj.reindex("a b c d e".split(" "))
    print obj2

    # Replace the NaN introduced for new labels with a fill value
    print obj.reindex("a b c d e".split(" "), fill_value=0)
    colors = ["blue", "purple", "yellow"]
    index = [0, 2, 4]
    obj3 = Series(colors, index=index)
    print obj3.reindex(range(6))
    print obj3.reindex(range(6), method="ffill")  # forward fill for labels not found
    print obj3.reindex(range(6), method="backfill")  # bfill

    # DataFrame
    states = ["Ohio", "Texas", "California"]
    frame = DataFrame(np.arange(9).reshape((3, 3)), index="a b c".split(" "), columns=["Ohio", "Texas", "California"])
    print frame
    frame2 = frame.reindex("a b c d".split(" "))
    print frame2
    states[0] = "Utah"
    # after this swap states is ["Texas", "Utah", "California"]
    states[1], states[0] = states[:2]
    print frame.reindex(columns=states)
    # fill
    print frame.reindex("a b c d".split(" "), method="ffill", columns=states)
    print frame.ix["a b c d".split(" ")]
    print frame.ix["a b c d".split(" "), states]

    # Drop an entry by label (drop returns a new object; obj is printed again below)
    print "", ""
    obj = Series(range(5), index="a b c d e".split(" "))
    new_obj = obj.drop("c")
    print new_obj
    print obj

    # Index reference
    print "", ""
    obj = Series(np.arange(4.0), index="a b c d".split(" "))
    print obj["b"]
    print obj[1]  # same
    print obj[2:4]
    print obj[["b", "a", "c"]]
    print obj[[1, 3]]
    print obj[obj < 2]
    # Slice with label
    print obj["b":"c"]  # include 'c'
    obj["b":"c"] = 5
    print obj

    data = DataFrame(
        np.arange(16).reshape((4, 4)),
        index=["Ohio", "Colorado", "Utah", "New York"],
        columns=["one", "two", "three", "four"],
    )
    print data
    # column
    print data["two"]
    print data[["three", "one"]]
    # row
    print data[:2]
    print data[data["three"] > 5]
    # all values
    print data < 5
    data[data < 5] = 0
    print data
    # row and column
    print data.ix[["Colorado"], ["two", "three"]]
    print data.ix[["Colorado", "Utah"], [3, 0, 1]]
    # row
    print data.ix[2]
    # label row and column, return column
    print data.ix[:"Utah", "two"]
    # xs
    # row
    print data.xs("Utah")
    print data.xs("Utah", axis=0)
    # column (axis=1 selects by column label)
    print data.xs("two", axis=1)
    # icol/irow i is index
    print data.icol(1)
    print data.irow(1)

    # Union
    print "", ""
    s1 = Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
    s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=["a", "c", "e", "f", "g"])
    print s1
    print s2
    # index is union, but d, f, g are NaN
    print s1 + s2
    df1 = DataFrame(np.arange(9.0).reshape((3, 3)), columns=list("bcd"), index=["Ohio", "Texas", "Colorado"])
    df2 = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
    print df1
    print df2
    print df1 + df2

    # arithmetic method
    print "", ""
    df1 = DataFrame(np.arange(12.0).reshape((3, 4)), columns=list("abcd"))
    df2 = DataFrame(np.arange(20.0).reshape((4, 5)), columns=list("abcde"))
    print df1
    print df2
    print df1.add(df2, fill_value=0)
    # reindex has fill_value argument
    # other arithmetic methods are sub/div/mul

    # Calculation in a DataFrame and Series
    print "", ""
    # subtract the first row from each row (broadcast)
    arr = np.arange(12.0).reshape((3, 4))
    print arr
    print arr[0]
    print arr - arr[0]
    frame = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
    series = frame.ix[0]
    print frame
    print series
    print frame - series

    # series2 has label "f", which is not one of frame's columns ("b", "d", "e")
    series2 = Series(range(3), index=list("bef"))
    print frame + series2

    series3 = frame["d"]
    series4 = frame.ix[0]
    print frame
    print series3
    print series4
    print frame.sub(series3, axis=0)
    print frame.sub(series4, axis=1)

    # apply function and mapping
    print "", ""
    frame = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
    print frame
    # range (max - min) of whatever slice apply() passes in
    f = lambda x: x.max() - x.min()
    print frame.apply(f)
    print frame.apply(f, axis=1)

    # returning a Series from the applied function yields one labelled row per entry
    f = lambda x: Series([x.min(), x.max()], index=["min", "max"])
    print frame.apply(f)

    # NOTE: this local name shadows the builtin format() inside main()
    format = lambda x: "{0:.2f}".format(x)
    print frame.applymap(format)  # frame
    print frame["e"].map(format)  # series

    # sort and rank
    print "", ""
    obj = Series(range(4), index=list("dabc"))
    print obj
    print obj.sort_index()

    frame = DataFrame(np.arange(8).reshape((2, 4)), index=["three", "one"], columns=list("dabc"))
    print frame
    print frame.sort_index()
    print frame.sort_index(axis=1)
    print frame.sort_index(axis=1, ascending=False)

    # Sorting series
    print "", ""
    obj = Series([4, 7, -3, 2])
    print obj.order()
    obj = Series([4, np.nan, 7, np.nan, -3, 2])
    print obj.order()
    print obj.order(ascending=False)

    # order by multi columns
    print "", ""
    frame = DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})
    print frame.sort_index(by=["a", "b"])

    # rank
    print "", ""
    obj = Series([7, -5, 7, 4, 2, 0, 4])
    print obj.rank()  # method is average
    print obj.rank(method="first")  # No Duplicates
    print obj.rank(ascending=False, method="min")
    print obj.rank(ascending=False, method="max")
    f1 = DataFrame(obj, columns=["data"])
    f2 = DataFrame(obj.rank(), columns=["rank"])
    # merge by each index
    print pd.merge(f1, f2, left_index=True, right_index=True)

    # Index of the axis with duplicate values
    print "", ""
    obj = Series(range(5), index=list("aaabc"))
    print obj
    print obj.index.is_unique
    print obj["a"]
    print obj["c"]

    df = DataFrame(np.arange(12.0).reshape((4, 3)), index=list("aabb"), columns=list("ccd"))
    print df
    print df.ix["b"]
    print df["c"]
Example #51
0
    def test_rank(self):
        """Check ``Series.rank`` against scipy's ``rankdata`` and fixed expectations.

        Uses the ``self.ts`` fixture (mutated in place below) and then a
        sequence of small hand-built Series covering ties, ``pct=True``,
        NaN handling, timedelta values and closely spaced floats.
        """
        tm._skip_if_no_scipy()
        from scipy.stats import rankdata

        # Introduce missing values (every other element) and write 4. into
        # every third of the first ten elements.
        self.ts[::2] = np.nan
        self.ts[:10][::3] = 4.

        # Ranking must not depend on whether the data is float or object dtype.
        ranks = self.ts.rank()
        oranks = self.ts.astype('O').rank()

        assert_series_equal(ranks, oranks)

        # Build the reference with scipy: NaNs are replaced by +inf before
        # calling rankdata, and those positions are set back to NaN afterwards.
        mask = np.isnan(self.ts)
        filled = self.ts.fillna(np.inf)

        # rankdata returns a ndarray
        exp = Series(rankdata(filled), index=filled.index, name='ts')
        exp[mask] = np.nan

        tm.assert_series_equal(ranks, exp)

        # Integer input with ties must rank the same as its float equivalent.
        iseries = Series(np.arange(5).repeat(2))

        iranks = iseries.rank()
        exp = iseries.astype(float).rank()
        assert_series_equal(iranks, exp)
        # pct=True: values 1.0..5.0 are expected to map to value / 5.
        iseries = Series(np.arange(5)) + 1.0
        exp = iseries / 5.0
        iranks = iseries.rank(pct=True)

        assert_series_equal(iranks, exp)

        # 100 identical values: every element is expected to get 0.505.
        iseries = Series(np.repeat(1, 100))
        exp = Series(np.repeat(0.505, 100))
        iranks = iseries.rank(pct=True)
        assert_series_equal(iranks, exp)

        # One NaN among them: the other 99 are expected to get 50 / 99,
        # and the NaN position stays NaN.
        iseries[1] = np.nan
        exp = Series(np.repeat(50.0 / 99.0, 100))
        exp[1] = np.nan
        iranks = iseries.rank(pct=True)
        assert_series_equal(iranks, exp)

        # Float values with a trailing NaN: expected is value / 4 (NaN stays NaN).
        iseries = Series(np.arange(5)) + 1.0
        iseries[4] = np.nan
        exp = iseries / 4.0
        iranks = iseries.rank(pct=True)
        assert_series_equal(iranks, exp)

        # All-NaN input ranks to all-NaN.
        iseries = Series(np.repeat(np.nan, 100))
        exp = iseries.copy()
        iranks = iseries.rank(pct=True)
        assert_series_equal(iranks, exp)

        # Same as the trailing-NaN case, starting from integer data.
        iseries = Series(np.arange(5)) + 1
        iseries[4] = np.nan
        exp = iseries / 4.0
        iranks = iseries.rank(pct=True)
        assert_series_equal(iranks, exp)

        # Same again with a date_range index.
        rng = date_range('1/1/1990', periods=5)
        iseries = Series(np.arange(5), rng) + 1
        iseries.iloc[4] = np.nan
        exp = iseries / 4.0
        iranks = iseries.rank(pct=True)
        assert_series_equal(iranks, exp)

        # Very small magnitudes, including two values that differ only by 1e-30.
        iseries = Series([1e-50, 1e-100, 1e-20, 1e-2, 1e-20 + 1e-30, 1e-1])
        exp = Series([2, 1, 3, 5, 4, 6.0])
        iranks = iseries.rank()
        assert_series_equal(iranks, exp)

        # GH 5968
        # timedelta64 data containing NaT; the NaT position is expected to rank NaN.
        iseries = Series(['3 day', '1 day 10m', '-2 day', NaT],
                         dtype='m8[ns]')
        exp = Series([3, 2, 1, np.nan])
        iranks = iseries.rank()
        assert_series_equal(iranks, exp)

        # `values` is listed in ascending order, so after shuffling with
        # random_order the expected rank of element i is random_order[i] + 1.
        values = np.array(
            [-50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, 2, 40
             ], dtype='float64')
        random_order = np.random.permutation(len(values))
        iseries = Series(values[random_order])
        exp = Series(random_order + 1.0, dtype='float64')
        iranks = iseries.rank()
        assert_series_equal(iranks, exp)
Example #52
0
#!/usr/bin/env python3

from pandas import Series, DataFrame
import pandas as pd
import numpy as np

# ---- Series: sorting and ranking ----
print('-----------------Series排名排序----------------------')
obj = Series(range(4), index=['a', 'c', 'b', 'd'])
print(obj.sort_index())  # sort the Series by its index
print(obj.sort_values())  # sort the Series by its values
obj = Series([7, -5, 7, 3, 4, 2])
print(obj.rank())  # rank: rank values; method: the option used to break ties when ranking
print(obj.rank(method='first'))
print(obj.rank(method='max'))
print(obj.rank(method='min'))
# first: assign ranks in the order the values appear in the original data
# max: use the largest rank of the whole tie group
# min: use the smallest rank of the whole tie group
# average (default): give every value in a tie group the mean rank
# ---- DataFrame: sorting and ranking ----
print('-----------------Dataframe排名排序----------------------')
frame = DataFrame(np.arange(8).reshape(2, 4),
                  index=['three', 'one'],
                  columns=['d', 'a', 'b', 'c'])
print(frame)
print(frame.sort_index())  # sort by row index
print(frame.sort_index(axis=1))  # sort by column index
print(frame.sort_index(axis=1, ascending=False))  # reverse order
df = DataFrame({'a': [4, 7, -3, 2], 'b': [0, 1, 0, 1]})
print(df)
print(df.sort_values(by=['a'], ascending=False))  # sort by the values of a column
print(df.rank(axis=1))  # rank along axis=1
Example #53
0
 def test_rank_signature(self):
     """``rank`` takes ``method`` by keyword; passing it positionally raises ValueError."""
     ser = Series([0, 1])
     # Keyword form must succeed.
     ser.rank(method='average')
     # Positional form must be rejected.
     with pytest.raises(ValueError):
         ser.rank('average')
Example #54
0
def CombinedReport(id, interval, tree):
    """Build the combined ANC / PVC / Deliv report figures for one id.

    Example: ``CombinedReport('cK5zkZIUFsN', 'jan', tree)``.

    Args:
        id: key looked up in ``tree['ANC']``, ``tree['PVC']``,
            ``tree['Deliv']`` and in ``RowToSub``.
        interval: key into ``Months`` (and, through ``Mappings``, the key of
            the interval used for comparison); per the original note, a
            shorthand month name such as ``'jan'``.
        tree: mapping with ``'ANC'``, ``'PVC'`` and ``'Deliv'`` entries, each
            indexable by ``id``.

    Returns:
        Tuple ``(A, B, C, D, E, F, G, H, I, L)``:
        A/D/G are the sums over ``Months[interval]`` divided by 3 for
        ANC/PVC/Deliv (NaN counted as 0); B/E/H are the same over
        ``Months[Mappings[interval]]``; C/F/I are the share (in percent, out
        of 3) of non-NaN entries over ``Months[interval]``; L is the tuple
        ``(ord(J), ':of', K)`` built from the rank and group size.

    ``Months``, ``Mappings``, ``RowToSub``, ``check`` and ``ANC_reportRank``
    are names defined outside this function.
    """
    # --- ANC, selected interval: only every second entry of the ANC data is used ---
    mytype = tree['ANC'][id]
    mynewtype = []
    x = 0
    for i in range(0, len(mytype), 2):
        mynewtype.append(mytype[i])
    remapped = np.nan_to_num(mynewtype)
    for i in Months[interval]:
        x += remapped[i]
    # the divisor 3.00 is hard-coded; presumably each interval spans three entries -- TODO confirm
    A = x / 3.00
    # --- ANC, comparison interval (Months[Mappings[interval]]) ---
    mytypeComp = tree['ANC'][id]
    mynewtypeComp = []
    xComp = 0
    for i in range(0, len(mytypeComp), 2):
        mynewtypeComp.append(mytypeComp[i])
    remapped = np.nan_to_num(mynewtypeComp)
    for i in Months[Mappings[interval]]:
        xComp += remapped[i]
    B = xComp / 3.00
    # --- ANC completeness: percentage of non-NaN entries in the selected interval ---
    mytype = tree['ANC'][id]
    mynewtype = []
    truetest = []
    x = 0
    for i in range(0, len(mytype), 2):
        mynewtype.append(mytype[i])
    remapped = np.isnan(mynewtype)
    for i in Months[interval]:
        truetest.append(remapped[i])
    # isnan() is False for entries that hold a value
    remappednum = truetest.count(False)
    C = (remappednum / 3.00) * 100
    # --- PVC, selected interval (all entries are used, no stride) ---
    mytype = tree['PVC'][id]
    x = 0
    remapped = np.nan_to_num(mytype)
    for i in Months[interval]:
        x += remapped[i]
    D = x / 3.00
    # --- PVC, comparison interval ---
    mytype = tree['PVC'][id]
    x = 0
    remapped = np.nan_to_num(mytype)
    for i in Months[Mappings[interval]]:
        x += remapped[i]
    E = x / 3.00
    # --- PVC completeness for the selected interval (uses Months[interval]) ---
    mytype = tree['PVC'][id]
    truetest = []
    x = 0.0000
    remapped = np.isnan(mytype)
    remapped = list(remapped)
    for i in Months[interval]:
        truetest.append(remapped[i])
    x = truetest.count(False)
    F = (x / 3.00) * 100
    # --- Deliveries, selected interval ---
    mytype = tree['Deliv'][id]
    x = 0
    remapped = np.nan_to_num(mytype)
    for i in Months[interval]:
        x += remapped[i]
    G = x / 3.00
    # --- Deliveries, comparison interval ---
    mytype = tree['Deliv'][id]
    x = 0
    remapped = np.nan_to_num(mytype)
    for i in Months[Mappings[interval]]:
        x += remapped[i]
    H = x / 3.00
    # --- Deliveries completeness for the selected interval ---
    mytype = tree['Deliv'][id]
    truetest = []
    x = 0.0000
    remapped = np.isnan(mytype)
    remapped = list(remapped)
    for i in Months[interval]:
        truetest.append(remapped[i])
    x = truetest.count(False)
    I = (x / 3.00) * 100
    # --- Rank of this id within its group ---
    # RankInitial starts with this id's own value so that its rank can be
    # read back at position 0 after ranking.
    RankDict = {}
    RankInitial = []
    RankInitial.append(ANC_reportRank(id, interval, tree))
    c = RowToSub[id]
    total = 0
    # NOTE(review): the loop variable is unused and every iteration fetches
    # the same group; `result` is only defined if `check` yields at least
    # one item.
    for i in check:
        result = check.get_group(c)
    k = result.shape[0]
    for j in range(k):
        # first column of each row is used as the member id
        p = result.values[j][0]
        RankDict[p] = ANC_reportRank(p, interval, tree)
        total += 1
    # remove this id's own entry (already placed first in RankInitial);
    # raises KeyError if id was not among the group's rows
    del RankDict[id]
    RankList = RankDict.values()
    for x in RankList:
        RankInitial.append(x)
    RankPosition = Series(RankInitial)
    # method='min': tied values share the lowest rank of their group
    RankFinal = RankPosition.rank(method='min')[0]
    J = RankFinal
    # total counts every row of the group, including this id's own row
    K = total
    # NOTE(review): the builtin ord() expects a one-character string, while J
    # is a rank value -- verify whether `ord` is rebound elsewhere in the file.
    L = ord(J), ':of', K
    return A, B, C, D, E, F, G, H, I, L
Example #55
0
    def test_rank(self):
        """Check ``Series.rank`` against scipy's ``rankdata`` and fixed expectations.

        Skipped when scipy is not installed. Uses the ``self.ts`` fixture
        (mutated in place below) and then a sequence of small hand-built
        Series covering ties, ``pct=True``, NaN handling, timedelta values
        and closely spaced floats.
        """
        # importorskip imports a *module* by dotted name and returns it.
        # ``rankdata`` is a function inside ``scipy.stats``, not a module, so
        # import the package and read the attribute from it; asking
        # importorskip for a non-module name always fails and would skip the
        # test unconditionally.
        stats = pytest.importorskip('scipy.stats')
        rankdata = stats.rankdata

        # Introduce missing values (every other element) and write 4. into
        # every third of the first ten elements.
        self.ts[::2] = np.nan
        self.ts[:10][::3] = 4.

        # Ranking must not depend on whether the data is float or object dtype.
        ranks = self.ts.rank()
        oranks = self.ts.astype('O').rank()

        assert_series_equal(ranks, oranks)

        # Build the reference with scipy: NaNs are replaced by +inf before
        # calling rankdata, and those positions are set back to NaN afterwards.
        mask = np.isnan(self.ts)
        filled = self.ts.fillna(np.inf)

        # rankdata returns a ndarray
        exp = Series(rankdata(filled), index=filled.index, name='ts')
        exp[mask] = np.nan

        tm.assert_series_equal(ranks, exp)

        # Integer input with ties must rank the same as its float equivalent.
        iseries = Series(np.arange(5).repeat(2))

        iranks = iseries.rank()
        exp = iseries.astype(float).rank()
        assert_series_equal(iranks, exp)
        # pct=True: values 1.0..5.0 are expected to map to value / 5.
        iseries = Series(np.arange(5)) + 1.0
        exp = iseries / 5.0
        iranks = iseries.rank(pct=True)

        assert_series_equal(iranks, exp)

        # 100 identical values: every element is expected to get 0.505.
        iseries = Series(np.repeat(1, 100))
        exp = Series(np.repeat(0.505, 100))
        iranks = iseries.rank(pct=True)
        assert_series_equal(iranks, exp)

        # One NaN among them: the other 99 are expected to get 50 / 99,
        # and the NaN position stays NaN.
        iseries[1] = np.nan
        exp = Series(np.repeat(50.0 / 99.0, 100))
        exp[1] = np.nan
        iranks = iseries.rank(pct=True)
        assert_series_equal(iranks, exp)

        # Float values with a trailing NaN: expected is value / 4 (NaN stays NaN).
        iseries = Series(np.arange(5)) + 1.0
        iseries[4] = np.nan
        exp = iseries / 4.0
        iranks = iseries.rank(pct=True)
        assert_series_equal(iranks, exp)

        # All-NaN input ranks to all-NaN.
        iseries = Series(np.repeat(np.nan, 100))
        exp = iseries.copy()
        iranks = iseries.rank(pct=True)
        assert_series_equal(iranks, exp)

        # Same as the trailing-NaN case, starting from integer data.
        iseries = Series(np.arange(5)) + 1
        iseries[4] = np.nan
        exp = iseries / 4.0
        iranks = iseries.rank(pct=True)
        assert_series_equal(iranks, exp)

        # Same again with a date_range index.
        rng = date_range('1/1/1990', periods=5)
        iseries = Series(np.arange(5), rng) + 1
        iseries.iloc[4] = np.nan
        exp = iseries / 4.0
        iranks = iseries.rank(pct=True)
        assert_series_equal(iranks, exp)

        # Very small magnitudes, including two values that differ only by 1e-30.
        iseries = Series([1e-50, 1e-100, 1e-20, 1e-2, 1e-20 + 1e-30, 1e-1])
        exp = Series([2, 1, 3, 5, 4, 6.0])
        iranks = iseries.rank()
        assert_series_equal(iranks, exp)

        # GH 5968
        # timedelta64 data containing NaT; the NaT position is expected to rank NaN.
        iseries = Series(['3 day', '1 day 10m', '-2 day', NaT],
                         dtype='m8[ns]')
        exp = Series([3, 2, 1, np.nan])
        iranks = iseries.rank()
        assert_series_equal(iranks, exp)

        # `values` is listed in ascending order, so after shuffling with
        # random_order the expected rank of element i is random_order[i] + 1.
        values = np.array(
            [-50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, 2, 40
             ], dtype='float64')
        random_order = np.random.permutation(len(values))
        iseries = Series(values[random_order])
        exp = Series(random_order + 1.0, dtype='float64')
        iranks = iseries.rank()
        assert_series_equal(iranks, exp)
Example #56
0
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from numpy.random import random
# Small labelled Series whose index is deliberately out of alphabetical order.
ser1 = Series([500, 1000, 1500], index=['a', 'c', 'b'])
print(ser1)
# sorting by index
print(ser1.sort_index())

# sort by values
print(ser1.sort_values())
print(ser1.rank())

# ranking of series
ser2 = Series(random(10))
print(ser2)

print(ser2.rank())

# After sorting by value the ranks come out in increasing order.
ser2 = ser2.sort_values()
# Call rank(); printing `ser2.rank` without parentheses would only show the
# bound method's repr instead of the ranks.
print(ser2.rank())
def panda_basci_function():
    """Tour of basic pandas functionality, organised in seven parts.

    Parts: reindexing, dropping entries, indexing/selection/filtering,
    arithmetic and alignment, function application and mapping, sorting and
    ranking, and indexes with duplicate labels. Most demonstration prints are
    commented out; only the ranking examples (part 6) and the duplicate-label
    examples (part 7) actually print.
    """
    # Basic functionality
    # PART1: reindexing (reindex)
    obj = Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
    obj2 = obj.reindex(['a', 'b', 'c', 'd',
                        'e'])  # reindex rearranges the data to follow the new index; labels that did not exist get NaN
    obj3 = obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)
    # print(obj2)
    # print(obj3)

    obj3 = Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
    # print(obj3)
    # print(obj3.reindex(range(6), method='ffill'))
    # print(obj3.reindex(range(6), method='bfill'))
    # method: ffill/pad -- forward fill/carry;  bfill/backfill -- backward fill/carry

    # For a DataFrame, reindex can change the row index, the columns, or both;
    # when only one sequence is passed, the rows are reindexed
    frame = DataFrame(np.arange(9).reshape((3, 3)),
                      columns=['Ohio', 'Texas', 'California'],
                      index=['a', 'c', 'd'])
    frame2 = frame.reindex(['a', 'b', 'c', 'd'])  # reindex the rows only
    # print(frame2)

    states = ['Texas', 'Utah', 'California']
    # print(frame.reindex(columns=states))  # reindex the columns

    # print(frame.reindex(index=['a', 'b', 'c', 'd'], columns=states))  # reindex rows and columns together
    # print(frame.ix[['a', 'b', 'c', 'd'], states])

    # PART2: dropping entries from an axis
    obj = Series(np.arange(5), index=['a', 'b', 'c', 'd', 'e'])
    new_obj = obj.drop('c')  # drop label 'c'
    # print(new_obj)
    # print(obj.drop(['c', 'd']))

    data = DataFrame(np.arange(16).reshape((4, 4)),
                     index=['Ohio', 'Colorado', 'Utah', 'New York'],
                     columns=['one', 'two', 'three', 'four'])
    # print(data.drop(['Colorado', 'Ohio']))  # drop row labels
    # print(data.drop('two', axis=1))  # axis=1 drops column labels
    # print(data.drop(['two', 'four'], axis=1))

    # PART3: indexing, selection and filtering
    obj = Series(np.arange(4), index=['a', 'b', 'c', 'd'])
    # print(obj['b'])  # label indexing
    # print(obj[1])    # integer (positional) indexing
    # print(obj[2:4])
    # print(obj[['b', 'c']])

    # print(obj['b': 'c'])  # label slice; the end point is included

    # print(data['two'])  # indexing a DataFrame retrieves one or more columns
    # print(data[['three', 'one']])
    # print(data[:2])     # a slice selects rows
    # print(data[data['three'] > 5])  # a boolean array selects rows
    # print(data.ix[:2])
    # print(data.ix['Colorado'])  # select a row by its label
    # print(data.ix['Colorado', ['two', 'three']])  # row label & column labels
    # print(data.ix[:, 'two'])  # column 'two'
    # The bare string below is a no-op expression (not a docstring) holding
    # notes on obj[val], obj.ix[...], loc and iloc selection; it is left as
    # written.
    """
    obj[val]: 选取df的一个或一组列
    obj.ix[val]: 选取dc的单个行或一组行
    obj.ix[:, val]: 选取单个列或列子集
    obj.ix[val1, val2]: 同时选取行和列
    loc:通过行标签索引数据
    iloc:通过行号索引行数据
    ix:通过行标签或行号索引数据(基于loc和iloc的混合) 
        df.ix[0] -- 行号索引
        dc.ix['a'] -- 行标签索引
    """

    # PART4: arithmetic and data alignment
    s1 = Series([1, 2, 3, 4], index=['a', 'c', 'd', 'e'])
    s2 = Series([5, 6, 7, 8, 9], index=['a', 'c', 'e', 'f', 'g'])
    # print(s1 + s2)  # the result's index is the union of both indexes; alignment introduces NaN at labels not shared by both

    # df1 = DataFrame(np.arange(9).reshape((3, 3)),
    #                 columns=list('bcd'),
    #                 index=['Ohio', 'Texas', 'Colorado'])
    # df2 = DataFrame(np.arange(12).reshape((4, 3)),
    #                 columns=list('bde'),
    #                 index=['Utah', 'Ohio', 'Texas', 'Oregon'])
    # print(df1 + df2)  # DataFrame alignment happens on both rows and columns

    # Fill values in the arithmetic methods: add(), sub(), div(), mul()
    df1 = DataFrame(np.arange(12).reshape((3, 4)), columns=list('abcd'))
    df2 = DataFrame(np.arange(20).reshape((4, 5)), columns=list('abcde'))
    # print(df1.add(df2, fill_value=0))  # call the DataFrame's add()
    # print(df1.reindex(columns=df2.columns, fill_value=0))  # reindex

    # Operations between a DataFrame and a Series
    arr = np.arange(12).reshape((3, 4))
    # print(arr - arr[0])  # broadcasting

    frame = DataFrame(np.arange(12).reshape((4, 3)),
                      columns=list('bde'),
                      index=['Utah', 'Ohio', 'Texas', 'Oregon'])
    # series = frame.ix[0]
    # print(frame - series)  # the Series index is matched to the DataFrame's columns, then broadcast down the rows

    # Subtracting a column: match on the rows and broadcast across the columns
    series = frame['d']
    # print(frame.sub(series, axis=0))  # axis is the axis to match on: rows are axis 0, columns are axis 1

    # PART5: function application and mapping
    frame = DataFrame(np.random.randn(4, 3),
                      columns=list('bde'),
                      index=['Utah', 'Ohio', 'Texas', 'Oregon'])
    # print(np.abs(frame))  # NumPy element-wise functions can be used directly
    f = lambda x: x.max() - x.min()
    # print(frame.apply(f))  # apply() applies the function to each column or row
    # print(frame.apply(f, axis=1))
    # NOTE: this local name shadows the builtin format() inside the function
    format = lambda x: '%.2f' % x
    # print(frame.applymap(format))  # element-wise Python functions go through applymap()
    # print(frame[0].map(format))

    # PART6: sorting and ranking
    obj = Series(range(4), index=['d', 'a', 'b', 'c'])
    # print(obj.sort_index())   # sort by the index labels
    # print(obj.sort_values())  # sort by the values

    frame = DataFrame(np.arange(8).reshape((2, 4)),
                      index=['three', 'one'],
                      columns=['d', 'a', 'b', 'c'])
    # print(frame.sort_index())  # sort by the row labels
    # print(frame.sort_index(axis=1))  # sort by the column labels
    # print(frame.sort_index(by='b'))  # sort by the values of one column

    # Ranking is closely related to sorting: it assigns rank values (from 1 up to the
    # number of valid data points), much like numpy's argsort(), except that
    # ranking can break ties according to a chosen rule
    obj = Series([1, 2, 3, 4])
    print(obj.rank())  # each value now tells where the element at that index (0-3) places within the original sequence

    obj = Series([1, 1, 2, 2, 3, 4])
    print(obj.rank())
    # The values at index 0 and index 1 are both 1, so they would occupy ranks 1 and 2 -- but which one is first?
    # By default rank() gives tied values the mean of the ranks they span, so both values here get (1+2)/2 = 1.5.
    # method      meaning
    # average     default: give every value in a tie group the mean rank
    # min         use the smallest rank of the whole group (two tied for 1st place, the next one is 3rd)
    # max         use the largest rank of the whole group (two tied for 2nd place, the next one is 3rd)
    # first       assign ranks in the order the values appear in the original data

    # PART7: axis indexes with duplicate labels
    obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
    print(obj)
    print(obj.index.is_unique)  # whether the index labels are unique
    print(obj['a'])  # label 'a' maps to 2 values, so a Series is returned
    print(obj['c'])  # label 'c' maps to 1 value, so a scalar is returned
Example #58
0
def pd_05():
    obj=Series([7,-5,7,4,2,0,4])
    print obj.rank()
    print obj.rank(method='first')
    print obj.rank(ascending=False,method='first')
    print obj.describe()