Example #1
0
    def test_rank(self, float_frame):
        """Check ``DataFrame.rank`` against ``scipy.stats.rankdata``.

        NaNs are planted in columns A-D of ``float_frame``, the frame is
        ranked along both axes and compared with scipy's ranks (with the NaN
        positions masked back to NaN).  An integer frame is then checked to
        rank exactly like its float cast.
        """
        # ``importorskip`` imports *modules*.  ``rankdata`` is a function, so
        # asking for 'scipy.stats.rankdata' can never succeed and the test
        # was silently skipped every time.  Import the module and take the
        # attribute instead.
        rankdata = pytest.importorskip('scipy.stats').rankdata

        float_frame['A'][::2] = np.nan
        float_frame['B'][::3] = np.nan
        float_frame['C'][::4] = np.nan
        float_frame['D'][::5] = np.nan

        ranks0 = float_frame.rank()
        ranks1 = float_frame.rank(1)
        mask = np.isnan(float_frame.values)

        # scipy has no NaN handling here: fill with +inf so the missing
        # values rank last, then blank those positions out again below.
        fvals = float_frame.fillna(np.inf).values

        exp0 = np.apply_along_axis(rankdata, 0, fvals)
        exp0[mask] = np.nan

        exp1 = np.apply_along_axis(rankdata, 1, fvals)
        exp1[mask] = np.nan

        tm.assert_almost_equal(ranks0.values, exp0)
        tm.assert_almost_equal(ranks1.values, exp1)

        # integers
        df = DataFrame(np.random.randint(0, 5, size=40).reshape((10, 4)))

        result = df.rank()
        exp = df.astype(float).rank()
        tm.assert_frame_equal(result, exp)

        result = df.rank(1)
        exp = df.astype(float).rank(1)
        tm.assert_frame_equal(result, exp)
Example #2
0
    def test_rank(self):
        """Compare ``DataFrame.rank`` with scipy's ``rankdata`` on both axes."""
        tm._skip_if_no_scipy()
        from scipy.stats import rankdata

        # blank out every 2nd / 3rd / 4th / 5th value of columns A-D
        for column, step in (('A', 2), ('B', 3), ('C', 4), ('D', 5)):
            self.frame[column][::step] = np.nan

        by_index = self.frame.rank()
        by_columns = self.frame.rank(1)
        nan_mask = np.isnan(self.frame.values)

        # +inf makes the missing values rank last for scipy; those slots are
        # masked back to NaN before comparing
        filled = self.frame.fillna(np.inf).values

        reference_index = np.apply_along_axis(rankdata, 0, filled)
        reference_index[nan_mask] = np.nan
        reference_columns = np.apply_along_axis(rankdata, 1, filled)
        reference_columns[nan_mask] = np.nan

        for ranked, reference in ((by_index, reference_index),
                                  (by_columns, reference_columns)):
            tm.assert_almost_equal(ranked.values, reference)

        # integers
        int_frame = DataFrame(
            np.random.randint(0, 5, size=40).reshape((10, 4)))

        ranked = int_frame.rank()
        tm.assert_frame_equal(ranked, int_frame.astype(float).rank())

        ranked = int_frame.rank(1)
        tm.assert_frame_equal(ranked, int_frame.astype(float).rank(1))
Example #3
0
    def test_rank2(self):
        """Rank frames holding ints, strings, datetimes and mixed dtypes."""

        def check(frame, wanted, *args, **kwargs):
            # rank `frame` with the given arguments and compare to `wanted`
            tm.assert_frame_equal(frame.rank(*args, **kwargs), wanted)

        # percentage ranks, first across the columns, then down the index
        ints = DataFrame([[1, 3, 2], [1, 2, 3]])
        check(ints, DataFrame([[1.0, 3.0, 2.0], [1, 2, 3]]) / 3.0,
              1, pct=True)

        ints = DataFrame([[1, 3, 2], [1, 2, 3]])
        check(ints, ints.rank(0) / 2.0, 0, pct=True)

        # strings
        letters = DataFrame([['b', 'c', 'a'], ['a', 'c', 'b']])
        check(letters, DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]]),
              1, numeric_only=False)
        check(letters, DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]]),
              0, numeric_only=False)

        # strings with a missing value
        holed = DataFrame([['b', np.nan, 'a'], ['a', 'c', 'b']])
        check(holed, DataFrame([[2.0, nan, 1.0], [1.0, 3.0, 2.0]]),
              1, numeric_only=False)
        check(holed, DataFrame([[2.0, nan, 1.0], [1.0, 1.0, 2.0]]),
              0, numeric_only=False)

        # datetimes (noted by the original author as not working without an
        # extensive workaround)
        stamps = DataFrame([
            [datetime(2001, 1, 5), nan, datetime(2001, 1, 2)],
            [datetime(2000, 1, 2), datetime(2000, 1, 3),
             datetime(2000, 1, 1)],
        ])

        # check the rank
        check(stamps,
              DataFrame([[2., nan, 1.],
                         [2., 3., 1.]]),
              1, numeric_only=False, ascending=True)
        check(stamps,
              DataFrame([[1., nan, 2.],
                         [2., 1., 3.]]),
              1, numeric_only=False, ascending=False)

        # mixed-type frames
        self.mixed_frame['datetime'] = datetime.now()
        self.mixed_frame['timedelta'] = timedelta(days=1, seconds=1)

        ranked_all = self.mixed_frame.rank(1)
        ranked_numeric = self.mixed_frame.rank(1, numeric_only=True)
        tm.assert_frame_equal(ranked_all, ranked_numeric)

        # values spanning many orders of magnitude
        extremes = DataFrame({"a": [1e-20, -5, 1e-20 + 1e-40, 10,
                                    1e60, 1e80, 1e-30]})
        check(extremes, DataFrame({"a": [3.5, 1., 3.5, 5., 6., 7., 2.]}))
Example #4
0
    def test_rank_methods_frame(self):
        """Check every ranking method on both axes against scipy's rankdata."""
        tm.skip_if_no_package('scipy', min_version='0.13',
                              app='scipy.stats.rankdata')
        import scipy
        from scipy.stats import rankdata

        base = (np.random.randint(0, 21, (100, 26)) - 10.0) / 10.0
        names = [chr(ord('z') - offset) for offset in range(base.shape[1])]

        # same values at three very different magnitudes
        for values in (base, base + 1e6, base * 1e-6):
            frame = DataFrame(values, columns=names)

            for axis in (0, 1):
                for method in ('average', 'min', 'max', 'first', 'dense'):
                    ranked = frame.rank(axis=axis, method=method)

                    # scipy calls pandas' 'first' method 'ordinal'
                    if method == 'first':
                        scipy_method = 'ordinal'
                    else:
                        scipy_method = method
                    reference = np.apply_along_axis(
                        rankdata, axis, values, scipy_method)
                    wanted = DataFrame(reference.astype(np.float64),
                                       columns=names)

                    if LooseVersion(scipy.__version__) >= '0.17.0':
                        wanted = wanted.astype('float64')
                    tm.assert_frame_equal(ranked, wanted)
Example #5
0
    def test_rank_pct_true(self, method, exp):
        # see gh-15630.

        df = DataFrame([[2012, 66, 3], [2012, 65, 2], [2012, 65, 1]])
        result = df.rank(method=method, pct=True)

        expected = DataFrame(exp)
        tm.assert_frame_equal(result, expected)
Example #6
0
def rank(prefix, target, links, vectors, theta, k=2):
    """Rank ``target`` among the candidates that are not part of the prefix.

    Every label of ``vectors.index`` that differs from the first ``k``
    entries of ``prefix`` is scored with ``probability2``; labels found in
    the prefix keep a missing score.  The scores are then ranked in
    descending order and the resulting frame is printed.

    Parameters
    ----------
    prefix : sequence
        Indexable by position; only ``prefix[0] .. prefix[k - 1]`` are read.
    target : label
        Label of ``vectors.index`` whose rank is returned.
    links, vectors, theta
        Forwarded unchanged to ``probability2``.  ``vectors`` must expose an
        ``index`` whose labels are the candidates.
    k : int, optional
        Number of leading ``prefix`` entries excluded from scoring.

    Returns
    -------
    r
        Rank of ``target`` in the ``"p"`` column, where rank 1 is the
        highest score (missing when ``target`` was never scored).
    """
    res = DataFrame(columns=["p"], index=vectors.index)
    for t in vectors.index:
        # candidates already present in the first k prefix entries are left
        # unscored
        if any(t == prefix[j] for j in range(k)):
            continue
        res.loc[t] = probability2(prefix, t, links, vectors, theta, k)
    # call form with a single argument prints the same on Python 2 and 3;
    # the former ``print res`` statement is a syntax error on Python 3
    print(res)
    return res.rank(ascending=False)['p'][target]
Example #7
0
    def test_rank_methods_frame(self):
        """Check every ranking method on both axes against scipy's rankdata.

        The same random values are tested at three magnitudes (as is,
        shifted by 1e6 and scaled by 1e-6).
        """
        # ``importorskip`` needs importable *module* names.  Neither
        # 'scipy.stats.special' nor 'scipy.stats.rankdata' is a module, so
        # the test used to be skipped unconditionally.
        # NOTE(review): 'scipy.special' is assumed to be the guard that was
        # meant by 'scipy.stats.special' -- confirm.
        pytest.importorskip('scipy.special')
        rankdata = pytest.importorskip('scipy.stats').rankdata

        xs = np.random.randint(0, 21, (100, 26))
        xs = (xs - 10.0) / 10.0
        cols = [chr(ord('z') - i) for i in range(xs.shape[1])]

        for vals in [xs, xs + 1e6, xs * 1e-6]:
            df = DataFrame(vals, columns=cols)

            for ax in [0, 1]:
                for m in ['average', 'min', 'max', 'first', 'dense']:
                    result = df.rank(axis=ax, method=m)
                    # scipy names pandas' 'first' method 'ordinal'
                    sprank = np.apply_along_axis(
                        rankdata, ax, vals,
                        m if m != 'first' else 'ordinal')
                    # pandas always returns float ranks
                    sprank = sprank.astype(np.float64)
                    expected = DataFrame(sprank, columns=cols)
                    tm.assert_frame_equal(result, expected)
Example #8
0
class Scores(object):
    """Scores for (segment, track, label) triples.

    Values are stored in a DataFrame (`dataframe_`) with one row per
    (segment, track) pair and one column per label.  The tracks themselves
    are mirrored in an `Annotation` (`annotation_`); `_reindexIfNeeded`
    realigns the rows on that annotation whenever `hasChanged_` is set.

    Parameters
    ----------
    uri : str, optional

    modality : str, optional

    Returns
    -------
    scores : `Scores`

    Examples
    --------

        >>> s = Scores(uri='video', modality='speaker')
        >>> s[Segment(0,1), 's1', 'A'] = 0.1
        >>> s[Segment(0,1), 's1', 'B'] = 0.2
        >>> s[Segment(0,1), 's1', 'C'] = 0.3
        >>> s[Segment(0,1), 's2', 'A'] = 0.4
        >>> s[Segment(0,1), 's2', 'B'] = 0.3
        >>> s[Segment(0,1), 's2', 'C'] = 0.2
        >>> s[Segment(2,3), 's1', 'A'] = 0.2
        >>> s[Segment(2,3), 's1', 'B'] = 0.1
        >>> s[Segment(2,3), 's1', 'C'] = 0.3

    """
    @classmethod
    def from_df(
        cls, df,
        uri=None, modality=None, aggfunc=np.mean
    ):
        """Build scores from a long-format DataFrame

        Parameters
        ----------
        df : DataFrame
            Must contain the following columns:
            'segment', 'track', 'label' and 'value'
        uri : str, optional
            Resource identifier
        modality : str, optional
            Modality
        aggfunc : func
            Value aggregation function in case of duplicate (segment, track,
            label) tuples

        Returns
        -------
        scores : `Scores`

        """
        # one row per (segment, track), one column per label
        dataframe = pivot_table(
            df, values=PYANNOTE_SCORE,
            index=[PYANNOTE_SEGMENT, PYANNOTE_TRACK], columns=PYANNOTE_LABEL,
            aggfunc=aggfunc
        )

        # register every (segment, track) row as a track with an empty label
        annotation = Annotation(uri=uri, modality=modality)
        for index, _ in dataframe.iterrows():
            segment = Segment(*index[0])
            track = index[1]
            annotation[segment, track] = ''

        labels = dataframe.columns

        return cls(uri=uri, modality=modality,
                   annotation=annotation, labels=labels,
                   values=dataframe.values)

    def __init__(self, uri=None, modality=None,
                 annotation=None, labels=None,
                 values=None, dtype=None):

        super(Scores, self).__init__()

        # work on a private copy; a missing or empty annotation yields a
        # fresh empty one
        if annotation:
            annotation = annotation.copy()
        else:
            annotation = Annotation(uri=uri, modality=modality)
        index = self._track_index(annotation)

        self.annotation_ = annotation
        columns = None if labels is None else list(labels)
        data = None if values is None else np.array(values)
        # NOTE(review): the `dtype` argument has always been overwritten
        # here and is therefore ignored -- confirm whether it should be
        # honoured when provided.
        # `float` is what the removed `np.float` alias stood for; reading
        # the dtype from `data` also accepts plain sequences as `values`.
        dtype = float if data is None else data.dtype

        self.dataframe_ = DataFrame(data=data, dtype=dtype,
                                    index=index, columns=columns)

        self.hasChanged_ = True

        self.modality = modality
        self.uri = uri

    @staticmethod
    def _index_names():
        """Names of the row-index levels: one per Segment field, then track"""
        return [PYANNOTE_SEGMENT + '_' + field
                for field in Segment._fields] + [PYANNOTE_TRACK]

    @classmethod
    def _track_index(cls, annotation):
        """Row index with one entry per (segment, track) of `annotation`

        Built through the explicit `MultiIndex` constructors: recent pandas
        releases reject both `MultiIndex(labels=...)` and a list-valued
        `Index(name=...)`, which is what this class relied on before.
        """
        names = cls._index_names()
        rows = [s + (t, ) for s, t in annotation.itertracks()]
        if not rows:
            # number of levels cannot be inferred from an empty list
            return MultiIndex.from_arrays([[] for _ in names], names=names)
        return MultiIndex.from_tuples(rows, names=names)

    def copy(self):
        """Return a copy with its own dataframe and annotation"""
        self._reindexIfNeeded()
        copied = self.__class__(uri=self.uri, modality=self.modality)
        copied.dataframe_ = self.dataframe_.copy()
        copied.annotation_ = self.annotation_.copy()
        copied.hasChanged_ = self.hasChanged_
        return copied

    # del scores[segment]
    # del scores[segment, :]
    # del scores[segment, track]
    def __delitem__(self, key):

        if isinstance(key, Segment):
            segment = key
            self.dataframe_.drop(tuple(segment), axis=0, inplace=True)
            del self.annotation_[segment]
            self.hasChanged_ = True

        elif isinstance(key, tuple) and len(key) == 2:
            segment, track = key
            self.dataframe_.drop(tuple(segment) + (track, ),
                                 axis=0, inplace=True)
            del self.annotation_[segment, track]
            self.hasChanged_ = True

        else:
            raise KeyError(
                'expected a Segment or a (segment, track) pair, '
                'got %r' % (key, ))

    # value = scores[segment, track, label]
    def __getitem__(self, key):

        # scores[segment, label] is shorthand for scores[segment, '_', label]
        if len(key) == 2:
            key = (key[0], '_', key[1])

        segment, track, label = key
        return self.dataframe_.at[tuple(segment) + (track, ), label]

    # scores[segment, track, label] = value
    # scores[segment, label] ==== scores[segment, '_', label]
    def __setitem__(self, key, value):

        if len(key) == 2:
            key = (key[0], '_', key[1])

        segment, track, label = key

        # do not add empty track
        if not segment:
            return

        self.dataframe_.at[tuple(segment) + (track,), label] = value
        self.annotation_[segment, track] = label
        self.hasChanged_ = True

    def __len__(self):
        """Number of annotated segments"""
        return len(self.annotation_)

    def __nonzero__(self):
        # Python 2 spelling of __bool__
        return self.__bool__()

    def __bool__(self):
        """False if annotation is empty"""
        return bool(self.annotation_)

    def __contains__(self, included):
        """Check if segments are annotated

        Parameters
        ----------
        included : `Segment` or `Timeline`

        Returns
        -------
        contains : bool
            True if every segment in `included` is annotated, False otherwise.
        """
        return included in self.annotation_

    def __iter__(self):
        """Iterate over sorted segments"""
        return iter(self.annotation_.get_timeline())

    def __reversed__(self):
        """Reverse iterate over sorted segments"""
        return reversed(self.annotation_.get_timeline())

    def itersegments(self):
        """Iterate over sorted segments (same as `iter(self)`)"""
        return iter(self)

    def tracks(self, segment):
        """Set of tracks for query segment

        Parameters
        ----------
        segment : `Segment`
            Query segment

        Returns
        -------
        tracks : set
            Set of tracks for query segment
        """
        return self.annotation_.get_tracks(segment)

    def has_track(self, segment, track):
        """Check whether a given track exists

        Parameters
        ----------
        segment : `Segment`
            Query segment
        track :
            Query track

        Returns
        -------
        exists : bool
            True if track exists for segment
        """
        return self.annotation_.has_track(segment, track)

    def get_track_by_name(self, track):
        """Get all tracks with given name

        Parameters
        ----------
        track : any valid track name
            Requested name track

        Returns
        -------
        tracks : list
            List of (segment, track) tuples
        """
        return self.annotation_.get_track_by_name(track)

    def new_track(self, segment, candidate=None, prefix=None):
        """Track name generator

        Parameters
        ----------
        segment : Segment
        prefix : str, optional
        candidate : any valid track name


        Returns
        -------
        track : str
            New track name
        """

        # forward the caller's arguments (they used to be replaced by None)
        return self.annotation_.new_track(segment, candidate=candidate,
                                          prefix=prefix)

    def itertracks(self):
        """Iterate over annotation as (segment, track) tuple"""
        return self.annotation_.itertracks()

    def itervalues(self):
        """Iterate over scores as (segment, track, label, value) tuple"""

        # make sure segment/track pairs are sorted
        self._reindexIfNeeded()

        labels = self.labels()

        # yield one (segment, track, label) tuple per loop
        for index, columns in self.dataframe_.iterrows():
            segment = Segment(*index[:-1])
            track = index[-1]
            for label in labels:
                value = columns[label]
                # missing scores are skipped
                if not np.isnan(value):
                    yield segment, track, label, value

    def get_track_scores(self, segment, track):
        """Get all scores for a given track.

        Parameters
        ----------
        segment : Segment
        track : hashable
            segment, track must be a valid track

        Returns
        -------
        scores : dict
            {label: score} dictionary
        """
        return dict(self.dataframe_.xs(tuple(segment) + (track, )))

    def labels(self, unknown=True):
        """List of labels

        Parameters
        ----------
        unknown : bool, optional
            When False, do not return Unknown instances
            When True, return any label (even Unknown instances)

        Returns
        -------
        labels : list
            Sorted list of existing labels

        Remarks
        -------
            Labels are sorted based on their string representation.
        """
        labels = sorted(self.dataframe_.columns, key=str)
        if unknown:
            return labels
        return [label for label in labels if not isinstance(label, Unknown)]

    def _reindexIfNeeded(self):
        """Realign dataframe rows on the annotation tracks, if flagged"""

        if not self.hasChanged_:
            return

        new_index = self._track_index(self.annotation_)
        self.dataframe_ = self.dataframe_.reindex(new_index)

        self.hasChanged_ = False

        return

    def retrack(self):
        """Return a copy whose tracks come from `Annotation.retrack`

        Rows keep their order and values; only the row index is replaced by
        the tracks of the retracked annotation.
        """

        self._reindexIfNeeded()
        retracked = self.copy()

        annotation = self.annotation_.retrack()
        retracked.annotation_ = annotation
        retracked.dataframe_.index = self._track_index(annotation)

        return retracked

    def apply(self, func, axis=0):
        """Return a copy whose dataframe is `dataframe_.apply(func, axis)`"""

        applied = self.copy()
        applied.dataframe_ = self.dataframe_.apply(func, axis=axis)
        applied.hasChanged_ = True

        return applied

    def rank(self, ascending=False):
        """Rank labels within each track

        Parameters
        ----------
        ascending : boolean, default False
            False for ranks by high (0) to low (N-1)

        Returns
        -------
        rank : `Scores`

        """

        ranked = self.copy()
        # pandas ranks start at 1; shift them so that the best rank is 0
        ranked.dataframe_ = -1 + self.dataframe_.rank(axis=1,
                                                      ascending=ascending)
        ranked.hasChanged_ = True
        return ranked

    def nbest(self, n, ascending=False):
        """Keep the n best scores of each track

        Parameters
        ----------
        n : int
            Size of n-best list
        ascending : boolean, default False
            False for ranks by high (0) to low (N-1)

        Returns
        -------
        nbest : `Scores`
            New scores where only n-best are kept.

        """

        filtered = self.copy()
        # zero-based ranks within each row
        ranked_ = -1 + self.dataframe_.rank(axis=1, ascending=ascending)
        filtered.dataframe_ = filtered.dataframe_.where(ranked_ < n,
                                                        other=np.nan)
        filtered.hasChanged_ = True
        return filtered

    def subset(self, labels, invert=False):
        """Scores subset

        Extract scores subset based on labels

        Parameters
        ----------
        labels : set
            Set of labels
        invert : bool, optional
            If invert is True, extract all but requested `labels`

        Returns
        -------
        subset : `Scores`
            Scores subset.

        Raises
        ------
        TypeError
            If `labels` is not a set.
        """

        self._reindexIfNeeded()

        if not isinstance(labels, set):
            raise TypeError('labels must be provided as a set of labels.')

        if invert:
            labels = set(self.labels()) - labels
        else:
            labels = labels & set(self.labels())

        subset = self.__class__(uri=self.uri, modality=self.modality)
        # copy, so that editing the subset never alters the tracks of `self`
        subset.annotation_ = self.annotation_.copy()
        subset.dataframe_ = self.dataframe_[list(labels)]

        return subset

    def to_annotation(self, threshold=-np.inf, posterior=False):
        """Annotate each track with its best-scoring label

        Parameters
        ----------
        threshold : float, optional
            Each track is annotated with the label with the highest score.
            Yet, if the latter is smaller than `threshold`, label is replaced
            with an `Unknown` instance.
        posterior : bool, optional
            If True, scores are posterior probabilities in open-set
            identification. If top model posterior is higher than unknown
            posterior, it is selected. Otherwise, label is replaced with an
            `Unknown` instance.

        Returns
        -------
        annotation : `Annotation`
        """

        if not self:
            return Annotation(uri=self.uri, modality=self.modality)

        best = self.nbest(1, ascending=False)
        large_enough = best.copy()

        if posterior:
            # probability mass not attributed to any label
            unknown_posterior = 1. - self.dataframe_.sum(axis=1)

            large_enough.dataframe_ = (
                ((best.dataframe_.T > unknown_posterior) &
                 (best.dataframe_.T > threshold)).T
            )

        else:

            large_enough.dataframe_ = (
                (best.dataframe_.T > threshold).T
            )

        # only the best label of each track keeps a (boolean) value
        large_enough.dataframe_ = large_enough.dataframe_.where(
            best.dataframe_.notnull(), other=np.nan)

        annotation = Annotation(uri=self.uri, modality=self.modality)
        for segment, track, label, value in large_enough.itervalues():
            label = label if value else Unknown()
            annotation[segment, track] = label

        return annotation

    def map(self, func):
        """Apply function to all values"""

        mapped = self.copy()
        mapped.dataframe_ = self.dataframe_.applymap(func)
        mapped.hasChanged_ = True
        return mapped

    def crop(self, focus, mode='strict'):
        """Crop on focus

        Parameters
        ----------
        focus : `Segment` or `Timeline`

        mode : {'strict', 'loose', 'intersection'}
            In 'strict' mode, only segments fully included in focus coverage
            are kept. In 'loose' mode, any intersecting segment is kept
            unchanged. In 'intersection' mode, only intersecting segments are
            kept and replaced by their actual intersection with the focus.

        Returns
        -------
        cropped : same type as caller
            Cropped version of the caller containing only tracks matching
            the provided focus and mode.

        Raises
        ------
        NotImplementedError
            In 'intersection' mode, which is not available yet.
        ValueError
            If `mode` is none of the documented values.

        Remarks
        -------
        In 'intersection' mode, the best is done to keep the track names
        unchanged. However, in some cases where two original segments are
        cropped into the same resulting segments, conflicting track names are
        modified to make sure no track is lost.

        """

        if isinstance(focus, Segment):
            return self.crop(Timeline([focus], uri=self.uri), mode=mode)

        if mode in ['intersection']:
            # TODO: crop every segment to its intersection with the focus,
            # keeping original track names where possible and generating new
            # ones (see `new_track`) when two segments collapse into one.
            raise NotImplementedError('')

        if mode not in ['strict', 'loose']:
            # used to fall through and silently return None
            raise ValueError('unsupported crop mode: %r' % (mode, ))

        self._reindexIfNeeded()
        cropped = self.copy()

        new_annotation = self.annotation_.crop(focus, mode=mode)
        # boolean row mask: keep the tracks that survived the crop
        keep = [new_annotation.has_track(segment, track)
                for segment, track in self.itertracks()]
        cropped.dataframe_ = self.dataframe_[keep]
        cropped.annotation_ = new_annotation
        cropped.hasChanged_ = True

        return cropped

    def __str__(self):
        """Human-friendly representation"""
        if self:
            self._reindexIfNeeded()
            return str(self.dataframe_)
        else:
            return ""

    def _repr_png_(self):
        # imported lazily, only when a PNG representation is requested
        from .notebook import repr_scores
        return repr_scores(self)
Example #9
0
import numpy as np
randn = np.random.randn
import pandas as pd
from pandas import Series, DataFrame

# Demo of DataFrame.apply, element-wise mapping, sorting and ranking.
# print is called with a single argument everywhere, which behaves the same
# on Python 2 and Python 3.
frame = DataFrame(np.random.randn(4, 3), columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(frame)

# spread (max - min): per column by default, per row with axis=1
f = lambda x: x.max() - x.min()
print(frame.apply(f))
print(frame.apply(f, axis=1))

print('fancy f(x)---------')


def f(x):
    """Return the minimum and maximum of ``x`` as a two-entry Series."""
    return Series([x.min(), x.max()], index=['min', 'max'])


print(frame.apply(f))
# NOTE: this name shadows the builtin ``format``; kept for compatibility
format = lambda x: '%.2f' % x
# element-wise formatting; mapping each column gives the same result as
# ``applymap`` without depending on that deprecated method
print(frame.apply(lambda column: column.map(format)))

print('sorting-------------')
# sort_values replaces the removed ``sort_index(by=...)`` spelling
print(frame.sort_values(by='b'))
print(frame.rank(method='max', axis=1))
Example #10
0
 def test_rank_axis(self):
     # check if using axes' names gives the same result
     df = DataFrame([[2, 1], [4, 3]])
     tm.assert_frame_equal(df.rank(axis=0), df.rank(axis='index'))
     tm.assert_frame_equal(df.rank(axis=1), df.rank(axis='columns'))
Example #11
0
# Series.order() was removed from pandas; sort_values() is its replacement.
obj.sort_values()
# When sorting, missing values are always placed at the end.
# Sort by several columns (replaces the removed ``sort_index(by=...)``
# spelling; ``DataFrame.order`` never existed).
frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame.sort_values(by=['a', 'b'])
# Ranking
obj = Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()
# Equal values are ranked in order of appearance
obj.rank(method='first')
# Descending order
obj.rank(ascending=False, method='max')
# Rank across the columns (within each row)
frame = DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                   'c': [-2, 5, 8, -2.5]})
frame.rank(axis=1)

## Axis indexes with duplicate values
obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
# Check whether the index labels are unique
obj.index.is_unique
# A label that occurs several times returns several values.
obj['a']

## Summarizing and computing descriptive statistics
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'], columns=['one', 'two'])
# Per column
df.sum()
# Per row
df.sum(axis=1)
d1.sort_index()
#%% Sort by columns (DataFrame.sort was removed; use sort_values)
d1.sort_values(['b', 'c'], ascending=[True, False])
#%%
d1.sort_values(['c', 'b'], ascending=[True, False])
#%% Formerly written as sort_index(by=...), which was removed as well
d1.sort_values(by='c', ascending=False)
#%%
d1.sort_values(by=['b', 'c'], ascending=[False, True])

#%% Sort by column names
d1.sort_index(axis=1, ascending=False)
#%% Give the columns an explicit order
d1[['c', 'b', 'a']]

#%% Order the columns by the values found in one given row
# (.loc replaces the removed .ix indexer for label lookups)
d1.reindex(columns=d1.loc['j'].sort_values().index)

#%% Give the rows an explicit order
d1.loc[['b', 'c', 'a']]

#%% Sort by the levels of a multi-level index
# (sort_index(level=...) replaces the removed sortlevel)
d2.sort_index(level=0, ascending=False)
#%%
d2.sort_index(level=0, ascending=False).sort_index(level=1)

#%% Rank the elements
d1.rank()
#%% Use the lowest rank for ties, in descending order
d1.rank(method='min', ascending=False)
Example #13
0
 def test_pct_max_many_rows(self):
     """Percentage ranks of a 2**24 + 1 row frame must top out at 1.

     See GH 18271.
     """
     n_rows = 2**24 + 1
     frame = DataFrame({'A': np.arange(n_rows),
                        'B': np.arange(n_rows, 0, -1)})
     column_maxima = frame.rank(pct=True).max()
     assert (column_maxima == 1).all()
d    2.0
'''
frame = DataFrame({'b': [4.3, 7, -3, 2],
                   'a': [0, 1, 0, 1],
                   'c': [-2, 5, 8, -2.5]})
# ``print`` and its operand had been split onto separate lines, so nothing
# was printed; the call form restores the intended output.
print(frame)
'''
   a    b    c
0  0  4.3 -2.0
1  1  7.0  5.0
2  0 -3.0  8.0
3  1  2.0 -2.5
'''
print(frame.rank(axis=1))  # rank within each row, ascending by default
'''
     a    b    c
0  2.0  3.0  1.0
1  1.0  3.0  2.0
2  2.0  1.0  3.0
3  2.0  3.0  1.0
'''

print('重复索引:进行两层索引')
obj = Series([0, 1, 2, 3, 4], index=['a', 'a', 'b', 'b', 'c'])
print(obj.index.is_unique)  # check whether the index has duplicate labels
# False
print
Example #15
0
print(obj.sort_values())

# sort a frame by one column, then by several
frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
print(frame)
print(frame.sort_values(by='b'))
print(frame.sort_values(by=['a', 'b']))

# rank
obj = Series([7, -5, 7, 4, 2, 0, 4])
print(obj.rank())
print(obj.rank(method='first'))
print(obj.rank(method='max', ascending=False))

# ranks computed within each row
frame = DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                   'c': [-2, 5, 8, -2.5]})
print(frame)
print(frame.rank(axis=1))

'''
duplicate index
'''
obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
print(obj)
print(obj.index.is_unique)
# a repeated label selects every matching entry, a unique one a scalar
print(obj['a'])
print(obj['c'])

df = DataFrame(np.random.randn(4, 3),
               index=['a', 'a', 'b', 'b'])
print(df)
# .loc replaces the removed .ix indexer for label-based row selection
print(df.loc['b'])
# Every print is a single-argument call, which behaves the same on Python 2
# and Python 3; print("") stands in for the bare ``print`` statement that
# emitted an empty line.
print("根据索引排序,对于DataFrame可以指定轴。")
obj = Series(range(4), index=["d", "a", "b", "c"])
print(obj.sort_index())
frame = DataFrame(np.arange(8).reshape((2, 4)), index=["three", "one"],
                  columns=list("dabc"))
print(frame.sort_index())
print(frame.sort_index(axis=1))  # axis=1 operates on the columns
print(frame.sort_index(axis=1, ascending=False))  # descending order
print("")

print("根据值排序")
obj = Series([4, 7, -3, 2])
print(obj.sort_values())  # order() is obsolete
print("")

print("DataFrame指定列排序")
frame = DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})
print(frame)
print(frame.sort_values(by="b"))  # sort_index(by=...) is obsolete
print(frame.sort_values(by=["a", "b"]))
print("")

print("rank,求排名的平均位置(从1开始)")
obj = Series([7, -5, 7, 4, 2, 0, 4])
# Sorted positions: -5(1), 0(2), 2(3), 4(4), 4(5), 7(6), 7(7)
print(obj.rank())
# ties are ranked by first appearance instead of being averaged
print(obj.rank(method="first"))
# descending, taking the highest rank of each tie group; -5 therefore gets 7
print(obj.rank(ascending=False, method="max"))
frame = DataFrame({"b": [4.3, 7, -3, 2], "a": [0, 1, 0, 1],
                   "c": [-2, 5, 8, -2.5]})
print(frame)
print(frame.rank(axis=1))