Ejemplo n.º 1
0
    def test_rank(self):
        """Compare DataFrame.rank against scipy.stats.rankdata on both axes."""
        tm._skip_if_no_scipy()
        from scipy.stats import rankdata

        # Assign NaNs through .iloc on the frame itself instead of chained
        # indexing (``self.frame['A'][::2] = ...``), which may silently write
        # to a temporary copy depending on the pandas version.
        for offset, col in enumerate(['A', 'B', 'C', 'D']):
            self.frame.iloc[::offset + 2,
                            self.frame.columns.get_loc(col)] = np.nan

        ranks0 = self.frame.rank()
        ranks1 = self.frame.rank(1)
        mask = np.isnan(self.frame.values)

        # +inf sorts last, so masked cells do not disturb the other ranks.
        fvals = self.frame.fillna(np.inf).values

        exp0 = np.apply_along_axis(rankdata, 0, fvals)
        exp0[mask] = np.nan

        exp1 = np.apply_along_axis(rankdata, 1, fvals)
        exp1[mask] = np.nan

        tm.assert_almost_equal(ranks0.values, exp0)
        tm.assert_almost_equal(ranks1.values, exp1)

        # integers must rank exactly like their float conversion
        df = DataFrame(np.random.randint(0, 5, size=40).reshape((10, 4)))

        result = df.rank()
        exp = df.astype(float).rank()
        tm.assert_frame_equal(result, exp)

        result = df.rank(1)
        exp = df.astype(float).rank(1)
        tm.assert_frame_equal(result, exp)
Ejemplo n.º 2
0
    def test_rank2(self):
        """Spot-check DataFrame.rank on pct, string, NaN, datetime,
        mixed-type and extreme-float inputs (expected values hard-coded)."""
        # pct=True divides each rank by the per-row group size (3 columns).
        df = DataFrame([[1, 3, 2], [1, 2, 3]])
        expected = DataFrame([[1.0, 3.0, 2.0], [1, 2, 3]]) / 3.0
        result = df.rank(1, pct=True)
        tm.assert_frame_equal(result, expected)

        df = DataFrame([[1, 3, 2], [1, 2, 3]])
        expected = df.rank(0) / 2.0
        result = df.rank(0, pct=True)
        tm.assert_frame_equal(result, expected)

        # Strings rank lexicographically once numeric_only is disabled.
        df = DataFrame([['b', 'c', 'a'], ['a', 'c', 'b']])
        expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]])
        result = df.rank(1, numeric_only=False)
        tm.assert_frame_equal(result, expected)

        # Ties ('c' and 'c' in column 1) share the average rank 1.5.
        expected = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]])
        result = df.rank(0, numeric_only=False)
        tm.assert_frame_equal(result, expected)

        # NaN cells stay NaN in the ranked output.
        # NOTE(review): bare ``nan`` is presumably numpy's nan imported at
        # module level — confirm against the file header.
        df = DataFrame([['b', np.nan, 'a'], ['a', 'c', 'b']])
        expected = DataFrame([[2.0, nan, 1.0], [1.0, 3.0, 2.0]])
        result = df.rank(1, numeric_only=False)
        tm.assert_frame_equal(result, expected)

        expected = DataFrame([[2.0, nan, 1.0], [1.0, 1.0, 2.0]])
        result = df.rank(0, numeric_only=False)
        tm.assert_frame_equal(result, expected)

        # f7u12, this does not work without extensive workaround
        data = [[datetime(2001, 1, 5), nan,
                 datetime(2001, 1, 2)],
                [
                    datetime(2000, 1, 2),
                    datetime(2000, 1, 3),
                    datetime(2000, 1, 1)
                ]]
        df = DataFrame(data)

        # check the rank
        expected = DataFrame([[2., nan, 1.], [2., 3., 1.]])
        result = df.rank(1, numeric_only=False, ascending=True)
        tm.assert_frame_equal(result, expected)

        expected = DataFrame([[1., nan, 2.], [2., 1., 3.]])
        result = df.rank(1, numeric_only=False, ascending=False)
        tm.assert_frame_equal(result, expected)

        # mixed-type frames: the asserts below establish that the default
        # rank(1) matches rank(1, numeric_only=True).
        self.mixed_frame['datetime'] = datetime.now()
        self.mixed_frame['timedelta'] = timedelta(days=1, seconds=1)

        result = self.mixed_frame.rank(1)
        expected = self.mixed_frame.rank(1, numeric_only=True)
        tm.assert_frame_equal(result, expected)

        # 1e-20 and 1e-20 + 1e-40 are closer than double precision can
        # separate, so they tie and share the average rank 3.5.
        df = DataFrame(
            {"a": [1e-20, -5, 1e-20 + 1e-40, 10, 1e60, 1e80, 1e-30]})
        exp = DataFrame({"a": [3.5, 1., 3.5, 5., 6., 7., 2.]})
        tm.assert_frame_equal(df.rank(), exp)
Ejemplo n.º 3
0
    def test_rank(self):
        """Compare DataFrame.rank with scipy.stats.rankdata on both axes."""
        tm._skip_if_no_scipy()
        from scipy.stats import rankdata

        # Knock out a different stride of each column with NaN.
        self.frame['A'][::2] = np.nan
        self.frame['B'][::3] = np.nan
        self.frame['C'][::4] = np.nan
        self.frame['D'][::5] = np.nan

        nan_mask = np.isnan(self.frame.values)
        # NaN becomes +inf so scipy ranks it last; masked out again below.
        filled = self.frame.fillna(np.inf).values

        for axis, got in ((0, self.frame.rank()), (1, self.frame.rank(1))):
            want = np.apply_along_axis(rankdata, axis, filled)
            want[nan_mask] = np.nan
            tm.assert_almost_equal(got.values, want)

        # Integer input must rank exactly like its float conversion.
        int_df = DataFrame(np.random.randint(0, 5, size=40).reshape((10, 4)))
        for axis in (0, 1):
            tm.assert_frame_equal(int_df.rank(axis),
                                  int_df.astype(float).rank(axis))
Ejemplo n.º 4
0
    def test_rank(self, float_frame):
        """Compare DataFrame.rank with scipy.stats.rankdata on both axes."""
        # ``importorskip`` takes a *module* name; "scipy.stats.rankdata" is a
        # function, so the original call always raised ImportError and
        # silently skipped this test on every run.
        sp_stats = pytest.importorskip("scipy.stats")
        rankdata = sp_stats.rankdata

        # Set NaNs through .iloc on the frame itself rather than chained
        # indexing, which can assign to a temporary copy.
        for offset, col in enumerate(["A", "B", "C", "D"]):
            float_frame.iloc[::offset + 2,
                             float_frame.columns.get_loc(col)] = np.nan

        ranks0 = float_frame.rank()
        ranks1 = float_frame.rank(1)
        mask = np.isnan(float_frame.values)

        # +inf sorts last, so masked cells do not disturb the other ranks.
        fvals = float_frame.fillna(np.inf).values

        exp0 = np.apply_along_axis(rankdata, 0, fvals)
        exp0[mask] = np.nan

        exp1 = np.apply_along_axis(rankdata, 1, fvals)
        exp1[mask] = np.nan

        tm.assert_almost_equal(ranks0.values, exp0)
        tm.assert_almost_equal(ranks1.values, exp1)

        # integers must rank exactly like their float conversion
        df = DataFrame(np.random.randint(0, 5, size=40).reshape((10, 4)))

        result = df.rank()
        exp = df.astype(float).rank()
        tm.assert_frame_equal(result, exp)

        result = df.rank(1)
        exp = df.astype(float).rank(1)
        tm.assert_frame_equal(result, exp)
Ejemplo n.º 5
0
    def test_rank(self, float_frame):
        """Compare DataFrame.rank with scipy.stats.rankdata on both axes."""
        # 'scipy.stats.rankdata' is a function, not a module, so
        # ``pytest.importorskip`` could never import it and skipped this
        # test unconditionally.  Import the module and grab the function.
        stats = pytest.importorskip('scipy.stats')
        rankdata = stats.rankdata

        # Assign NaNs through .iloc on the frame itself; chained indexing
        # (``float_frame['A'][::2] = ...``) may write to a temporary copy.
        for offset, col in enumerate(['A', 'B', 'C', 'D']):
            float_frame.iloc[::offset + 2,
                             float_frame.columns.get_loc(col)] = np.nan

        ranks0 = float_frame.rank()
        ranks1 = float_frame.rank(1)
        mask = np.isnan(float_frame.values)

        # +inf sorts last, so masked cells do not disturb the other ranks.
        fvals = float_frame.fillna(np.inf).values

        exp0 = np.apply_along_axis(rankdata, 0, fvals)
        exp0[mask] = np.nan

        exp1 = np.apply_along_axis(rankdata, 1, fvals)
        exp1[mask] = np.nan

        tm.assert_almost_equal(ranks0.values, exp0)
        tm.assert_almost_equal(ranks1.values, exp1)

        # integers must rank exactly like their float conversion
        df = DataFrame(np.random.randint(0, 5, size=40).reshape((10, 4)))

        result = df.rank()
        exp = df.astype(float).rank()
        tm.assert_frame_equal(result, exp)

        result = df.rank(1)
        exp = df.astype(float).rank(1)
        tm.assert_frame_equal(result, exp)
    def calculate_prcc(self, parameter, result, plot=False):
        """
        Calculate a partial rank correlation coefficient (PRCC) value for uncertain parameters against the output.

        Partial correlation characterizes the linear relationship between an input and an output after the linear
        effects of the remaining inputs on the output are discounted. This is calculated by constructing a linear
        regression model between the input and the remaining inputs, and calculating the residuals between the input
        values and the model. Similarly, a linear regression model between the output and the remaining input is
        created and residuals are calculated. The PRCC is then the correlation coefficient between the input residuals
        and the output residuals (Note all data is rank transformed).
        :param parameter: name of the uncertain parameter of interest
        :param result: key of the output column to correlate against
        :param plot: accepted for API compatibility; unused in this body
        :return: tuple (PRCC value, p-value) from ``stats.pearsonr``
        """
        # Linear regression model (re-fitted twice below)
        regr = linear_model.LinearRegression()

        df = self.dataframe_aggregated()

        # Rank-transform inputs and outputs.  Instance-method call instead
        # of the unbound ``DataFrame.rank(...)`` form — identical behavior.
        ranked_params = df[self.uncertain_parameters()].rank()
        ranked_results = df[self._result_keys].rank()

        # Turn data into numpy arrays
        all_params_data = numpy.asarray(ranked_params)
        result_data = numpy.asarray(ranked_results[result]).reshape(
            (len(ranked_results), 1))

        # Number of parameters
        k = all_params_data.shape[1]

        # Boolean mask: True at the column of the parameter of interest,
        # False everywhere else.
        param_col_truth_table = [
            ranked_params.columns[j] == parameter for j in range(k)]

        # Numpy array of just the parameter
        param_data = all_params_data[:, param_col_truth_table]
        # Numpy array of all other parameters
        remaining_param_data = all_params_data[
            :, numpy.logical_not(param_col_truth_table)]

        # Residuals of the parameter after removing the linear effect of
        # the remaining parameters.
        regr.fit(remaining_param_data, param_data)
        linreg_param = regr.predict(remaining_param_data)
        param_resid = param_data - linreg_param

        # Residuals of the result after removing the same linear effect.
        regr.fit(remaining_param_data, result_data)
        linreg_result = regr.predict(remaining_param_data)
        result_resid = result_data - linreg_result

        # PRCC = Pearson correlation between the two residual series.
        corr, p = stats.pearsonr(param_resid, result_resid)

        return corr, p
Ejemplo n.º 7
0
 def test_rank_does_not_mutate(self):
     # GH#18521
     # Check rank does not mutate DataFrame
     df = DataFrame(np.random.randn(10, 3), dtype="float64")
     expected = df.copy()
     df.rank()
     result = df
     tm.assert_frame_equal(result, expected)
Ejemplo n.º 8
0
    def test_rank2(self):
        """Exercise DataFrame.rank on pct, string, NaN, datetime,
        mixed-type and extreme-float inputs against hard-coded answers."""
        # pct=True divides each rank by the per-row group size.
        df = DataFrame([[1, 3, 2], [1, 2, 3]])
        expected = DataFrame([[1.0, 3.0, 2.0], [1, 2, 3]]) / 3.0
        result = df.rank(1, pct=True)
        tm.assert_frame_equal(result, expected)

        df = DataFrame([[1, 3, 2], [1, 2, 3]])
        expected = df.rank(0) / 2.0
        result = df.rank(0, pct=True)
        tm.assert_frame_equal(result, expected)

        # Strings rank lexicographically when numeric_only is disabled.
        df = DataFrame([['b', 'c', 'a'], ['a', 'c', 'b']])
        expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]])
        result = df.rank(1, numeric_only=False)
        tm.assert_frame_equal(result, expected)

        # Tied 'c' values share the average rank 1.5 down column 1.
        expected = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]])
        result = df.rank(0, numeric_only=False)
        tm.assert_frame_equal(result, expected)

        # NaN cells stay NaN in the result.
        # NOTE(review): bare ``nan`` is presumably numpy's nan imported at
        # module level — confirm against the file header.
        df = DataFrame([['b', np.nan, 'a'], ['a', 'c', 'b']])
        expected = DataFrame([[2.0, nan, 1.0], [1.0, 3.0, 2.0]])
        result = df.rank(1, numeric_only=False)
        tm.assert_frame_equal(result, expected)

        expected = DataFrame([[2.0, nan, 1.0], [1.0, 1.0, 2.0]])
        result = df.rank(0, numeric_only=False)
        tm.assert_frame_equal(result, expected)

        # f7u12, this does not work without extensive workaround
        data = [[datetime(2001, 1, 5), nan, datetime(2001, 1, 2)],
                [datetime(2000, 1, 2), datetime(2000, 1, 3),
                 datetime(2000, 1, 1)]]
        df = DataFrame(data)

        # check the rank
        expected = DataFrame([[2., nan, 1.],
                              [2., 3., 1.]])
        result = df.rank(1, numeric_only=False, ascending=True)
        tm.assert_frame_equal(result, expected)

        expected = DataFrame([[1., nan, 2.],
                              [2., 1., 3.]])
        result = df.rank(1, numeric_only=False, ascending=False)
        tm.assert_frame_equal(result, expected)

        # mixed-type frames: the asserts establish that the default
        # rank(1) matches rank(1, numeric_only=True).
        self.mixed_frame['datetime'] = datetime.now()
        self.mixed_frame['timedelta'] = timedelta(days=1, seconds=1)

        result = self.mixed_frame.rank(1)
        expected = self.mixed_frame.rank(1, numeric_only=True)
        tm.assert_frame_equal(result, expected)

        # 1e-20 and 1e-20 + 1e-40 are indistinguishable in double
        # precision, so they tie on the average rank 3.5.
        df = DataFrame({"a": [1e-20, -5, 1e-20 + 1e-40, 10,
                              1e60, 1e80, 1e-30]})
        exp = DataFrame({"a": [3.5, 1., 3.5, 5., 6., 7., 2.]})
        tm.assert_frame_equal(df.rank(), exp)
Ejemplo n.º 9
0
    def test_rank2(self):
        df = DataFrame([[1, 3, 2], [1, 2, 3]])
        expected = DataFrame([[1.0, 3.0, 2.0], [1, 2, 3]]) / 3.0
        result = df.rank(1, pct=True)
        tm.assert_frame_equal(result, expected)

        df = DataFrame([[1, 3, 2], [1, 2, 3]])
        expected = df.rank(0) / 2.0
        result = df.rank(0, pct=True)
        tm.assert_frame_equal(result, expected)

        df = DataFrame([["b", "c", "a"], ["a", "c", "b"]])
        expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]])
        result = df.rank(1, numeric_only=False)
        tm.assert_frame_equal(result, expected)

        expected = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]])
        result = df.rank(0, numeric_only=False)
        tm.assert_frame_equal(result, expected)

        df = DataFrame([["b", np.nan, "a"], ["a", "c", "b"]])
        expected = DataFrame([[2.0, np.nan, 1.0], [1.0, 3.0, 2.0]])
        result = df.rank(1, numeric_only=False)
        tm.assert_frame_equal(result, expected)

        expected = DataFrame([[2.0, np.nan, 1.0], [1.0, 1.0, 2.0]])
        result = df.rank(0, numeric_only=False)
        tm.assert_frame_equal(result, expected)

        # f7u12, this does not work without extensive workaround
        data = [
            [datetime(2001, 1, 5), np.nan,
             datetime(2001, 1, 2)],
            [datetime(2000, 1, 2),
             datetime(2000, 1, 3),
             datetime(2000, 1, 1)],
        ]
        df = DataFrame(data)

        # check the rank
        expected = DataFrame([[2.0, np.nan, 1.0], [2.0, 3.0, 1.0]])
        result = df.rank(1, numeric_only=False, ascending=True)
        tm.assert_frame_equal(result, expected)

        expected = DataFrame([[1.0, np.nan, 2.0], [2.0, 1.0, 3.0]])
        result = df.rank(1, numeric_only=False, ascending=False)
        tm.assert_frame_equal(result, expected)

        df = DataFrame(
            {"a": [1e-20, -5, 1e-20 + 1e-40, 10, 1e60, 1e80, 1e-30]})
        exp = DataFrame({"a": [3.5, 1.0, 3.5, 5.0, 6.0, 7.0, 2.0]})
        tm.assert_frame_equal(df.rank(), exp)
Ejemplo n.º 10
0
class Rank:
    """ASV-style benchmark: time DataFrame.rank over several dtypes."""

    param_names = ["dtype"]
    params = [
        ["int", "uint", "float", "object"],
    ]

    def setup(self, dtype):
        # 10k x 10 random frame cast to the dtype under test.
        values = np.random.randn(10000, 10)
        self.df = DataFrame(values, columns=range(10), dtype=dtype)

    def time_rank(self, dtype):
        self.df.rank()
Ejemplo n.º 11
0
 def test_pct_max_many_rows(self):
     # GH 18271: pct ranks must top out at exactly 1.0 even for frames
     # with more than 2**24 rows, where float32 precision would break.
     n = 2 ** 24 + 1
     df = DataFrame({"A": np.arange(n), "B": np.arange(n, 0, -1)})
     assert (df.rank(pct=True).max() == 1).all()
Ejemplo n.º 12
0
    def test_rank_methods_frame(self):
        """Each rank method must agree with scipy.stats.rankdata."""
        tm.skip_if_no_package('scipy', min_version='0.13',
                              app='scipy.stats.rankdata')
        import scipy
        from scipy.stats import rankdata

        raw = (np.random.randint(0, 21, (100, 26)) - 10.0) / 10.0
        cols = [chr(ord('z') - i) for i in range(raw.shape[1])]

        # Try the raw values plus shifted/scaled variants to stress ties.
        for vals in (raw, raw + 1e6, raw * 1e-6):
            frame = DataFrame(vals, columns=cols)
            for ax in (0, 1):
                for method in ('average', 'min', 'max', 'first', 'dense'):
                    # scipy calls pandas' "first" method "ordinal".
                    sp_method = 'ordinal' if method == 'first' else method
                    sprank = np.apply_along_axis(rankdata, ax, vals,
                                                 sp_method)
                    expected = DataFrame(sprank.astype(np.float64),
                                         columns=cols)
                    if LooseVersion(scipy.__version__) >= '0.17.0':
                        expected = expected.astype('float64')
                    tm.assert_frame_equal(frame.rank(axis=ax, method=method),
                                          expected)
Ejemplo n.º 13
0
def determine_rank_invariance(
        df: pd.DataFrame,
        nri_threshold: float = 0.5) -> Tuple[pd.Index, pd.Index]:
    """
    Determines rank invariance along axis 0, meaning columns should be samples, rows genes or proteins.
    The rank invariant (RI) and near rank invariant (NRI) indices can give information about how problematic a
    Quantile Normalization might be, since biological variance would be lost.

    Parameters
    ----------
    df
        input dataframe with samples as columns, genes/proteins as rows
    nri_threshold
        the percentage threshold at which near rank invariance condition is fulfilled. Should be between 0 and 1.

    Returns
    -------
    (ri, nri) where ``ri`` are the rows whose rank is identical in every
    sample and ``nri`` are the remaining rows whose most common rank occurs
    in at least ``nri_threshold`` of the samples.
    """
    df_rank = df.rank()
    # For every row, find how often its most common rank occurs across
    # samples.  The reduction must run along axis 1: the original ``.max()``
    # reduced over rows (axis 0), yielding one value per *rank value*
    # instead of one per gene, so the boolean masks below were misaligned
    # with ``df.index``.
    max_rank_percentages = df_rank.apply(pd.Series.value_counts,
                                         axis=1).max(axis=1) / df.shape[1]
    ri = df.index[max_rank_percentages == 1]
    nri = df.index[max_rank_percentages >= nri_threshold]
    nri = nri.difference(ri)
    return ri, nri
Ejemplo n.º 14
0
    def test_rank_methods_frame(self):
        """DataFrame.rank must agree with scipy.stats.rankdata for every
        tie-breaking method, on both axes and at several magnitudes."""
        tm.skip_if_no_package('scipy', min_version='0.13',
                              app='scipy.stats.rankdata')
        import scipy
        from scipy.stats import rankdata

        base = (np.random.randint(0, 21, (100, 26)) - 10.0) / 10.0
        cols = [chr(ord('z') - i) for i in range(base.shape[1])]
        # pandas' 'first' corresponds to scipy's 'ordinal'.
        method_map = {'average': 'average', 'min': 'min', 'max': 'max',
                      'first': 'ordinal', 'dense': 'dense'}

        for vals in (base, base + 1e6, base * 1e-6):
            frame = DataFrame(vals, columns=cols)
            for ax in (0, 1):
                for pd_method, sp_method in method_map.items():
                    got = frame.rank(axis=ax, method=pd_method)
                    ref = np.apply_along_axis(rankdata, ax, vals, sp_method)
                    want = DataFrame(ref.astype(np.float64), columns=cols)
                    if LooseVersion(scipy.__version__) >= '0.17.0':
                        want = want.astype('float64')
                    tm.assert_frame_equal(got, want)
Ejemplo n.º 15
0
    def generate_defaults_discretized(self, df: pd.DataFrame, num_defaults: int,
                                      minimize: bool, aggregate: typing.Callable,
                                      config_space: ConfigSpace.ConfigurationSpace,
                                      raise_no_improvement: bool) \
            -> typing.Tuple[typing.List, typing.Dict[str, typing.Any]]:
        """
        Takes a data frame with a discretized set of defaults and returns the
        average rank. The data frame should be structured as follows: each
        column represents a task, each row represents a configuration. As such,
        each cell represents the evaluation of that configuration on that task.
        The sum of the selected configurations across tasks is to be minimized
        or maximized

        Parameters
        ----------
        df: pd.DataFrame
            The data frame as described above

        num_defaults: int
            The number of configurations to be selected

        minimize: bool
            Will minimize the objective function iff this is true.

        aggregate: callable
            function to aggregate per task results

        config_space: ConfigurationSpace
            the configuration space object corresponding to the defaults. Will
            be used casting defaults to the right data type.

        raise_no_improvement: bool
            if true, an error will be raised if no improvement is obtained
            before the correct number of default was obtained

        Returns
        -------
        selected_indices: List[int]
            List of indices, as given by the dataframe
        results_dict: Dict[str, Any]
            Additional meta-information. Containing at least the key 'run_time',
            but potentially more information

        NOTE(review): ``aggregate``, ``config_space`` and
        ``raise_no_improvement`` are accepted but never used in this body —
        confirm whether subclasses or callers rely on them.
        NOTE(review): ``selected_indices`` contains *every* row of ``df``
        sorted by average rank, not just the first ``num_defaults`` —
        presumably the caller truncates; verify.
        """
        logging.info('Started %s, dimensions config frame %s' %
                     (self.name, str(df.shape)))
        # num_defaults is only validated here; the ranking below ignores it.
        if num_defaults < 1:
            raise ValueError()
        start_time = time.time()

        # note that we deliberately inverse the average ranks (to work with np.argsort) The TODO rank is now the
        # best to go with
        inv_avg_ranks = df.rank(axis=0, method='average',
                                ascending=minimize).sum(axis=1) / df.shape[1]
        selected_indices = list(np.argsort(inv_avg_ranks.values))

        results_dict = {
            'run_time': time.time() - start_time,
        }
        return selected_indices, results_dict
Ejemplo n.º 16
0
    def test_rank_pct_true(self, method, exp):
        # see gh-15630.

        df = DataFrame([[2012, 66, 3], [2012, 65, 2], [2012, 65, 1]])
        result = df.rank(method=method, pct=True)

        expected = DataFrame(exp)
        tm.assert_frame_equal(result, expected)
Ejemplo n.º 17
0
 def _discretize_based_on_quantile(self, data: pd.DataFrame) -> pd.DataFrame:
     # The higher the values, the higher the rank
     rank = data.rank(method="first", axis=1, ascending=True)
     labels = range(1, self.bins + 1)
     discretized = rank.apply(
         lambda x: pd.qcut(x, self.bins, labels), axis=1, raw=True
     )
     return discretized
Ejemplo n.º 18
0
    def test_rank_pct_true(self, method, exp):
        # see gh-15630.

        df = DataFrame([[2012, 66, 3], [2012, 65, 2], [2012, 65, 1]])
        result = df.rank(method=method, pct=True)

        expected = DataFrame(exp)
        tm.assert_frame_equal(result, expected)
Ejemplo n.º 19
0
    def execute_scoring(self, labels: DataFrame, prediction: DataFrame) -> float:
        """
        Scores the correlation as defined by the Numerai tournament rules.

        Arguments:
            labels: The real labels of the output
            prediction: The predicted labels

        Returns:
            The correlation coefficient
        """
        # Numerai scores against rank-percentile predictions, not raw values.
        pct_ranks = prediction.rank(pct=True, method="first")
        correlation_matrix = np.corrcoef(labels, pct_ranks, rowvar=False)
        return correlation_matrix[0, 1]
Ejemplo n.º 20
0
def dataFrameUtilityMethodTest():
    """Demonstrate common DataFrame utility methods on 'mlg.csv'."""
    # Load a DataFrame from a CSV file (the dead ``df = DataFrame()``
    # placeholder that was immediately overwritten has been removed).
    df = pd.read_csv('mlg.csv')
    df = df.copy()  # copy a DataFrame
    df = df.rank()  # rank each col (default)
    df = df.sort_values(by='maker')
    print(df)
    df = df.sort_values(by=['maker', 'modelyear'])
    print(df)
    df = df.sort_index()
    df = df.astype(int)  # type conversion
    print(df)
Ejemplo n.º 21
0
def rank(prefix, target, links, vectors, theta, k=2):
    """Rank ``target`` among all non-prefix nodes by predicted probability.

    Computes a probability for every node of ``vectors`` that is not one of
    the first ``k`` prefix entries and returns the descending rank of
    ``target`` within the 'p' column.
    """
    res = DataFrame(columns=["p"], index=vectors.index)
    for node in vectors.index:
        # Skip nodes that are already part of the prefix.
        if node not in prefix[:k]:
            res.loc[node] = probability2(prefix, node, links, vectors, theta, k)
    # ``print res`` was Python 2 statement syntax — a SyntaxError under
    # Python 3; the function form works on both.
    print(res)
    return res.rank(ascending=False)['p'][target]
Ejemplo n.º 22
0
def rank(prefix, target, links, vectors, theta, k=2):
    """Return the descending rank of ``target``'s predicted probability.

    Probabilities are computed for every node of ``vectors`` that is not
    already one of the first ``k`` prefix entries.
    """
    res = DataFrame(columns=["p"], index=vectors.index)
    for idx in range(vectors.shape[0]):
        node = vectors.index[idx]
        # Nodes already inside the prefix are left as NaN.
        in_prefix = any(node == prefix[j] for j in range(k))
        if not in_prefix:
            res.loc[node] = probability2(prefix, node, links, vectors, theta, k)
    # ``print res`` was Python 2 syntax; the call form is valid on both.
    print(res)
    return res.rank(ascending=False)['p'][target]
Ejemplo n.º 23
0
    def _infer_index_from_coords(self, coords: pd.DataFrame) -> pd.DataFrame:
        """
        Discrete tile index. Using TILE_INDEX_STR as header.

        Returns:
            (pd.DataFrame): these are used as the index
        """
        logger.warning(
            "infer index by raw coordinates may lead to unwanted error")

        # Dense ranking maps equal coordinates to the same tile; shifting
        # by one turns the 1-based ranks into a 0-based integer index.
        dense_ranks = coords.rank(axis="index", method="dense", ascending=True)
        return dense_ranks.astype(int) - 1
Ejemplo n.º 24
0
def quantile_normalize(df: pd.DataFrame) -> pd.DataFrame:
    """
    Quantile-normalize the numerical columns of ``df``.

    Every column is forced onto the same distribution: the mean of the
    i-th smallest values across columns becomes the output value for
    rank i.

    input: dataframe with numerical columns
    output: dataframe with quantile normalized values
    """
    # Sort each column independently; float32 keeps the temporary small.
    df_sorted = pd.DataFrame(np.sort(df.values, axis=0),
                             index=df.index,
                             columns=df.columns,
                             dtype=np.float32)
    # Mean of each sorted row = target value for that (1-based) rank.
    df_mean = df_sorted.mean(axis=1)
    # Release the temporary before building the result (the original
    # rebound it to 0 for the same purpose); the debug print("peak")
    # statement has been removed.
    del df_sorted
    df_mean.index = np.arange(1, len(df_mean) + 1)
    # Map every cell's (min-method) rank to the corresponding row mean.
    df_qn = df.rank(method="min").stack().astype(int).map(df_mean).unstack()
    return df_qn
Ejemplo n.º 25
0
 def rank(self, prefix, target, theta, k=2):
     """Return the descending rank of ``target`` among candidate nodes.

     Every node of ``self.vectors`` that is not one of the first ``k``
     prefix entries gets a probability; prefix nodes stay NaN.
     """
     vectors = self.vectors
     res = DataFrame(columns=["p"], index=vectors.index)
     for node in vectors.index:
         # Nodes already inside the prefix are skipped (left as NaN).
         if node not in prefix[:k]:
             res.loc[node] = self.probability(prefix, node, theta, k)
     return res.rank(ascending=False)['p'][target]
 def _winsorize_data(self, ranked_df: pd.DataFrame):
     if self.winsorizing_fraction:
         count_non_nan_s = ranked_df.count(axis=1)
         rank_number_s = round(count_non_nan_s * self.winsorizing_fraction)
         winsorizing_array = np.where(ranked_df.le(rank_number_s, axis=0),
                                      np.nan, 1)
     elif self.winsorizing_number:
         winsorizing_array = np.where(ranked_df <= self.winsorizing_number,
                                      np.nan, 1)
     else:
         return ranked_df
     ranked_df *= winsorizing_array
     return ranked_df.rank(axis='columns',
                           method='first',
                           ascending=True,
                           numeric_only=True)
Ejemplo n.º 27
0
 def transform(self, data: pd.DataFrame):
     """Map each value to the replacement learned for its rank in ``fit``."""
     if not self.rank_replace:
         raise ValueError("Please call fit first or use fit_transform")
     na_mask = None
     if self.missing_value_handler is not None:
         # Remember where the NaNs were so they can be restored afterwards.
         na_mask = data.notna()
         data = self.missing_value_handler(data)
     result = data.rank()
     if na_mask is not None:
         result = result[na_mask]
     # Translate every rank into its fitted replacement value.
     result = result.apply(pd.Series.map, arg=self.rank_replace)
     if self.output_scale == "normal":
         result = np.exp2(result)
     if self.col_name_prefix is not None:
         result = result.rename(
             lambda col: f"{self.col_name_prefix} {col}", axis=1)
     return result
Ejemplo n.º 28
0
    def _split_long_short(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        split data into long(1) and short(-1)
        for certain percentage (ls_percentage)
        of each long and short
        """
        # The higher the values, the higher the rank
        rank = data.rank(method="first", axis=1, ascending=True)

        def split_long_short(row):
            count = int(len(row.dropna()) * self.ls_percentage)
            long = row.nlargest(count).where(row.isnull(), 1)
            short = row.nsmallest(count).where(row.isnull(), -1)
            return pd.concat([long, short])

        splited = rank.apply(split_long_short, axis=1)
        return splited
Ejemplo n.º 29
0
 def grouping(data: pd.DataFrame, n):
     """Assign each row's values to ``n`` ascending groups by dense rank.

     (Translated from the original Chinese docstring:)
     1. With M valid samples split into N groups, the first N-1 groups each
        hold int(M/N) samples and the last group holds the remainder;
     2. invalid (NaN) samples are excluded;
     3. equal values share a group;
     4. dense ranking: the next distinct value takes the next rank;
     5. sorted ascending.
     :param data: input frame, ranked row-wise
     :param n: number of groups
     :return: frame of group numbers in [1, n]
     """
     dense = data.rank(axis=1, ascending=True, method='dense')
     # Valid sample count per row = the highest dense rank in that row.
     per_group = dense.max(axis=1) // n
     # Integer part plus a +1 "carry" whenever the rank does not divide
     # evenly into the group size.
     groups = dense.floordiv(per_group, axis=0) + np.sign(
         dense.mod(per_group, axis=0))
     # The remainder samples all land in the last group.
     groups[groups > n] = n
     return groups
Ejemplo n.º 30
0
def rank_filter(df: pd.DataFrame, ascending: bool, inclusive: bool,
                rank_threshold: {int, float}):
    """
    Returns a DataFrame with 1 if the element has passed the specified ranking
    filter, else nan. The ranking threshold can be a fraction of the numeric
    elements per row (float in (0, 1]) or an absolute count (int).
    :param df: DataFrame
    :param ascending: bool, rank direction
    :param inclusive: bool, if True elements within the threshold pass
    :param rank_threshold: float or int, strictly positive
    :return: DataFrame of 1 / nan
    """
    # check the inputs — isinstance instead of ``type(x) == float`` so
    # float subclasses (e.g. numpy.float64) take the fractional branch
    # rather than silently being treated as absolute counts
    if rank_threshold <= 0:
        raise ValueError(
            "'rank_threshold' needs to be an int or float strictly larger than 0"
        )
    elif isinstance(rank_threshold, float) and rank_threshold > 1:
        raise ValueError(
            "if 'rank_threshold' is a float, it needs to be less than or equal to 1"
        )

    # rank the elements in the DataFrame row-wise
    ranked_df = df.rank(axis='columns',
                        method='first',
                        ascending=ascending,
                        numeric_only=True)

    # set DataFrame to 1 if the ranking filter is passed, else nan
    if isinstance(rank_threshold, float):
        # fraction: convert to a per-row absolute count of numeric cells
        num_numeric_per_row = ranked_df.count(axis=1)
        per_row_threshold = round(num_numeric_per_row * rank_threshold)
        _filter = np.where(ranked_df.le(per_row_threshold, axis=0), inclusive,
                           not inclusive)
    else:
        _filter = np.where(ranked_df <= rank_threshold, inclusive,
                           not inclusive)
    filter_df = pd.DataFrame(index=df.index, columns=df.columns,
                             data=_filter)  # store result in a DataFrame
    filter_df *= 1  # convert True to 1 and False to 0
    filter_df *= np.where(~df.isnull(), 1, np.nan)  # nan in case data is nan
    filter_df.replace(0, np.nan, inplace=True)
    return filter_df
Ejemplo n.º 31
0
    def calculate_ranks(self, df: pd.DataFrame) -> pd.DataFrame:
        """ Calculate expanded dataframe to match length of animation

        Returns:
            typing.Tuple[pd.DataFrame,pd.DataFrame]: df_values contains interpolated values, df_rank contains interpolated rank
        """
        # Rank per period; anything beyond the visible window is clipped
        # to n_visible + 1 so it animates just off-screen.
        df_rank = df.rank(axis=1, method="first", ascending=False)
        df_rank = df_rank.clip(upper=self.n_visible + 1)

        flip = ((self.sort == "desc" and self.orientation == "h")
                or (self.sort == "asc" and self.orientation == "v"))
        if flip:
            # This flips all rankings, eg if n_visible = 5 then score 1
            # in the table becomes (6 - 1 = 5)
            df_rank = self.n_visible + 1 - df_rank

        return self.get_interpolated_df(df_rank, self.steps_per_period,
                                        self.interpolate_period)
Ejemplo n.º 32
0
    def test_rank_methods_frame(self):
        """Validate every rank method against scipy.stats.rankdata."""
        # ``importorskip`` needs a module path; neither 'scipy.stats.special'
        # nor 'scipy.stats.rankdata' is a module, so the original calls made
        # pytest skip this test unconditionally.  Import the module and take
        # the function from it.
        sp_stats = pytest.importorskip('scipy.stats')
        rankdata = sp_stats.rankdata

        xs = np.random.randint(0, 21, (100, 26))
        xs = (xs - 10.0) / 10.0
        cols = [chr(ord('z') - i) for i in range(xs.shape[1])]

        for vals in [xs, xs + 1e6, xs * 1e-6]:
            df = DataFrame(vals, columns=cols)

            for ax in [0, 1]:
                for m in ['average', 'min', 'max', 'first', 'dense']:
                    result = df.rank(axis=ax, method=m)
                    # scipy's name for pandas' 'first' is 'ordinal'
                    sprank = np.apply_along_axis(
                        rankdata, ax, vals, m if m != 'first' else 'ordinal')
                    sprank = sprank.astype(np.float64)
                    expected = DataFrame(sprank,
                                         columns=cols).astype('float64')
                    tm.assert_frame_equal(result, expected)
Ejemplo n.º 33
0
    def test_rank_methods_frame(self):
        """Each rank method must match scipy.stats.rankdata."""
        # Neither "scipy.stats.special" nor "scipy.stats.rankdata" is an
        # importable module, so both original importorskip calls raised and
        # pytest skipped this test on every run.  Import the parent module
        # and pull the function from it instead.
        sp_stats = pytest.importorskip("scipy.stats")
        rankdata = sp_stats.rankdata

        xs = np.random.randint(0, 21, (100, 26))
        xs = (xs - 10.0) / 10.0
        cols = [chr(ord("z") - i) for i in range(xs.shape[1])]

        for vals in [xs, xs + 1e6, xs * 1e-6]:
            df = DataFrame(vals, columns=cols)

            for ax in [0, 1]:
                for m in ["average", "min", "max", "first", "dense"]:
                    result = df.rank(axis=ax, method=m)
                    # scipy's name for pandas' "first" is "ordinal"
                    sprank = np.apply_along_axis(
                        rankdata, ax, vals, m if m != "first" else "ordinal")
                    sprank = sprank.astype(np.float64)
                    expected = DataFrame(sprank,
                                         columns=cols).astype("float64")
                    tm.assert_frame_equal(result, expected)
Ejemplo n.º 34
0
    def test_df_series_inf_nan_consistency(self):
        # GH#32593
        index = [5, 4, 3, 2, 1, 6, 7, 8, 9, 10]
        col1 = [5, 4, 3, 5, 8, 5, 2, 1, 6, 6]
        col2 = [5, 4, np.nan, 5, 8, 5, np.inf, np.nan, 6, -np.inf]
        df = DataFrame(
            data={
                "col1": col1,
                "col2": col2,
            },
            index=index,
            dtype="f8",
        )
        df_result = df.rank()

        series_result = df.copy()
        series_result["col1"] = df["col1"].rank()
        series_result["col2"] = df["col2"].rank()

        tm.assert_frame_equal(df_result, series_result)
    def __init__(self, daily_return: pd.DataFrame, num_of_partner: int,
                 num_take_account: int):
        """Store inputs and precompute ranked returns plus their Spearman
        correlation matrix.

        Parameters
        ----------
        daily_return : pd.DataFrame
            Daily returns, one column per ticker.
        num_of_partner : int
            Number of partner stocks per cohort.
        num_take_account : int
            Number of candidates taken into account downstream.
        """
        self.daily_return = daily_return
        self.ticker_list = list(daily_return.columns)
        self.num_of_partner = num_of_partner
        self.num_take_account = num_take_account

        # result containers filled in by later steps
        self.potential_partner_dict = {}
        self.potential_partner_combinations_dict = {}
        self.approach_result_dict = {}

        # A cohort is the target stock together with its partners.
        self.d = num_of_partner + 1
        # Scalar needed for the extended_approach implementation.
        self.h_d = (self.d + 1) / (2 ** self.d - self.d - 1)

        # Percentile ranks per column; NaNs are kept and do not affect ranks.
        self.ranked_daily_return = daily_return.rank(
            axis=0, pct=True, ascending=True, na_option='keep')
        self.pairwise_spearman_corr = self.ranked_daily_return.corr()
Ejemplo n.º 36
0
 def _perform_ranking_on_dataframe(
         self, data_to_be_ranked: pd.DataFrame) -> pd.DataFrame:
     """Rank each row across its columns and turn the ranks into a 0/1
     inclusion signal.

     Rows are ranked with 'first' tie-breaking (ascending unless
     ``self.descending``). Positions whose rank falls within the configured
     cutoff receive ``self.include``; all others receive its negation.
     """
     row_ranks = data_to_be_ranked.rank(axis='columns',
                                        method='first',
                                        ascending=not self.descending,
                                        numeric_only=True)
     if self.rank_number is None:
         # Per-row cutoff: a fraction of each row's non-NaN count.
         non_nan_counts = row_ranks.count(axis=1)
         cutoff_per_row = round(non_nan_counts * self.rank_fraction)
         # True where a row's rank is less than or equal to its own cutoff.
         selected = np.where(row_ranks.le(cutoff_per_row, axis=0),
                             self.include, not self.include)
     else:
         # Fixed cutoff shared by every row.
         selected = np.where(row_ranks <= self.rank_number,
                             self.include, not self.include)
     signal_df = pd.DataFrame(data=selected,
                              index=data_to_be_ranked.index,
                              columns=data_to_be_ranked.columns)
     return signal_df * 1  # booleans -> 0/1 integers
Ejemplo n.º 37
0
    def test_rank_methods_frame(self):
        pytest.importorskip('scipy.stats.special')
        rankdata = pytest.importorskip('scipy.stats.rankdata')

        xs = np.random.randint(0, 21, (100, 26))
        xs = (xs - 10.0) / 10.0
        cols = [chr(ord('z') - i) for i in range(xs.shape[1])]

        for vals in [xs, xs + 1e6, xs * 1e-6]:
            df = DataFrame(vals, columns=cols)

            for ax in [0, 1]:
                for m in ['average', 'min', 'max', 'first', 'dense']:
                    result = df.rank(axis=ax, method=m)
                    sprank = np.apply_along_axis(
                        rankdata, ax, vals,
                        m if m != 'first' else 'ordinal')
                    sprank = sprank.astype(np.float64)
                    expected = DataFrame(sprank,
                                         columns=cols).astype('float64')
                    tm.assert_frame_equal(result, expected)
Ejemplo n.º 38
0
class Scores(object):
    """Table of scores indexed by (segment, track, label) triplets.

    Parameters
    ----------
    uri : str, optional

    modality : str, optional

    Returns
    -------
    scores : `Scores`

    Examples
    --------

        >>> s = Scores(uri='video', modality='speaker')
        >>> s[Segment(0,1), 's1', 'A'] = 0.1
        >>> s[Segment(0,1), 's1', 'B'] = 0.2
        >>> s[Segment(0,1), 's1', 'C'] = 0.3
        >>> s[Segment(0,1), 's2', 'A'] = 0.4
        >>> s[Segment(0,1), 's2', 'B'] = 0.3
        >>> s[Segment(0,1), 's2', 'C'] = 0.2
        >>> s[Segment(2,3), 's1', 'A'] = 0.2
        >>> s[Segment(2,3), 's1', 'B'] = 0.1
        >>> s[Segment(2,3), 's1', 'C'] = 0.3

    """
    @classmethod
    def from_df(
        cls, df,
        uri=None, modality=None, aggfunc=np.mean
    ):
        """Build a `Scores` instance from a long-format DataFrame.

        Parameters
        ----------
        df : DataFrame
            Must contain the following columns:
            'segment', 'track', 'label' and 'value'
        uri : str, optional
            Resource identifier
        modality : str, optional
            Modality
        aggfunc : func
            Value aggregation function in case of duplicate (segment, track,
            label) tuples

        Returns
        -------

        """
        # pivot to wide format: one row per (segment, track),
        # one column per label
        dataframe = pivot_table(
            df, values=PYANNOTE_SCORE,
            index=[PYANNOTE_SEGMENT, PYANNOTE_TRACK], columns=PYANNOTE_LABEL,
            aggfunc=aggfunc
        )

        # rebuild the companion annotation from the pivoted index
        annotation = Annotation(uri=uri, modality=modality)
        for index, _ in dataframe.iterrows():
            segment = Segment(*index[0])
            track = index[1]
            annotation[segment, track] = ''

        labels = dataframe.columns

        return cls(uri=uri, modality=modality,
                   annotation=annotation, labels=labels,
                   values=dataframe.values)

    def __init__(self, uri=None, modality=None,
                 annotation=None, labels=None,
                 values=None, dtype=None):

        super(Scores, self).__init__()

        # index level names: one per Segment field, plus the track name
        names = [PYANNOTE_SEGMENT + '_' + field
                 for field in Segment._fields] + [PYANNOTE_TRACK]

        if annotation:
            annotation = annotation.copy()
            index = Index(
                [s + (t, ) for s, t in annotation.itertracks()],
                name=names)

        else:
            annotation = Annotation(uri=uri, modality=modality)
            # empty MultiIndex; 'codes' replaces the 'labels' keyword
            # that was removed from pandas
            index = MultiIndex(levels=[list() for name in names],
                               codes=[list() for name in names],
                               names=names)

        self.annotation_ = annotation
        columns = None if labels is None else list(labels)
        data = None if values is None else np.array(values)
        # np.float was removed from NumPy; np.float64 is the same dtype
        dtype = np.float64 if values is None else values.dtype

        self.dataframe_ = DataFrame(data=data, dtype=dtype,
                                    index=index, columns=columns)

        # lazily re-synchronized with annotation_ by _reindexIfNeeded()
        self.hasChanged_ = True

        self.modality = modality
        self.uri = uri

    def copy(self):
        """Return a deep copy of this `Scores` instance."""
        self._reindexIfNeeded()
        copied = self.__class__(uri=self.uri, modality=self.modality)
        copied.dataframe_ = self.dataframe_.copy()
        copied.annotation_ = self.annotation_.copy()
        copied.hasChanged_ = self.hasChanged_
        return copied

    # del scores[segment]
    # del scores[segment, :]
    # del scores[segment, track]
    def __delitem__(self, key):

        if isinstance(key, Segment):
            segment = key
            self.dataframe_.drop(tuple(segment), axis=0, inplace=True)
            del self.annotation_[segment]
            self.hasChanged_ = True

        elif isinstance(key, tuple) and len(key) == 2:
            segment, track = key
            self.dataframe_.drop(tuple(segment) + (track, ),
                                 axis=0, inplace=True)
            del self.annotation_[segment, track]
            self.hasChanged_ = True

        else:
            raise KeyError('')

    # value = scores[segment, track, label]
    def __getitem__(self, key):

        # 2-element keys default the track name to '_'
        if len(key) == 2:
            key = (key[0], '_', key[1])

        segment, track, label = key
        return self.dataframe_.at[tuple(segment) + (track, ), label]

    # scores[segment, track, label] = value
    # scores[segment, label] ==== scores[segment, '_', label]
    def __setitem__(self, key, value):

        # 2-element keys default the track name to '_'
        if len(key) == 2:
            key = (key[0], '_', key[1])

        segment, track, label = key

        # do not add empty track
        if not segment:
            return

        self.dataframe_.at[tuple(segment) + (track,), label] = value
        self.annotation_[segment, track] = label
        self.hasChanged_ = True

    def __len__(self):
        """Number of annotated segments"""
        return len(self.annotation_)

    def __nonzero__(self):
        # Python 2 truthiness hook; delegates to __bool__
        return self.__bool__()

    def __bool__(self):
        """False if annotation is empty"""
        return True if self.annotation_ else False

    def __contains__(self, included):
        """Check if segments are annotated

        Parameters
        ----------
        included : `Segment` or `Timeline`

        Returns
        -------
        contains : bool
            True if every segment in `included` is annotated, False otherwise.
        """
        return included in self.annotation_

    def __iter__(self):
        """Iterate over sorted segments"""
        return iter(self.annotation_.get_timeline())

    def __reversed__(self):
        """Reverse iterate over sorted segments"""
        return reversed(self.annotation_.get_timeline())

    def itersegments(self):
        """Alias for iterating over sorted segments"""
        return iter(self)

    def tracks(self, segment):
        """Set of tracks for query segment

        Parameters
        ----------
        segment : `Segment`
            Query segment

        Returns
        -------
        tracks : set
            Set of tracks for query segment
        """
        return self.annotation_.get_tracks(segment)

    def has_track(self, segment, track):
        """Check whether a given track exists

        Parameters
        ----------
        segment : `Segment`
            Query segment
        track :
            Query track

        Returns
        -------
        exists : bool
            True if track exists for segment
        """
        return self.annotation_.has_track(segment, track)

    def get_track_by_name(self, track):
        """Get all tracks with given name

        Parameters
        ----------
        track : any valid track name
            Requested name track

        Returns
        -------
        tracks : list
            List of (segment, track) tuples
        """
        return self.annotation_.get_track_by_name(track)

    def new_track(self, segment, candidate=None, prefix=None):
        """Track name generator

        Parameters
        ----------
        segment : Segment
        prefix : str, optional
        candidate : any valid track name


        Returns
        -------
        track : str
            New track name
        """

        # forward the caller's candidate/prefix (the previous version
        # hard-coded None for both, silently ignoring the arguments)
        return self.annotation_.new_track(segment, candidate=candidate,
                                          prefix=prefix)

    def itertracks(self):
        """Iterate over annotation as (segment, track) tuple"""
        return self.annotation_.itertracks()

    def itervalues(self):
        """Iterate over scores as (segment, track, label, value) tuple"""

        # make sure segment/track pairs are sorted
        self._reindexIfNeeded()

        labels = self.labels()

        # yield one (segment, track, label) tuple per loop
        for index, columns in self.dataframe_.iterrows():
            segment = Segment(*index[:-1])
            track = index[-1]
            for label in labels:
                value = columns[label]
                # skip missing scores
                if not np.isnan(value):
                    yield segment, track, label, value

    def get_track_scores(self, segment, track):
        """Get all scores for a given track.

        Parameters
        ----------
        segment : Segment
        track : hashable
            segment, track must be a valid track

        Returns
        -------
        scores : dict
            {label: score} dictionary
        """
        return dict(self.dataframe_.xs(tuple(segment) + (track, )))

    def labels(self, unknown=True):
        """List of labels

        Parameters
        ----------
        unknown : bool, optional
            When False, do not return Unknown instances
            When True, return any label (even Unknown instances)

        Returns
        -------
        labels : list
            Sorted list of existing labels

        Remarks
        -------
            Labels are sorted based on their string representation.
        """
        labels = sorted(self.dataframe_.columns, key=str)
        if unknown:
            return labels
        else:
            return [l for l in labels if not isinstance(l, Unknown)]

    def _reindexIfNeeded(self):
        """Re-align the internal DataFrame index with the annotation,
        but only when something changed since the last call."""

        if not self.hasChanged_:
            return

        names = [PYANNOTE_SEGMENT + '_' + field
                 for field in Segment._fields] + [PYANNOTE_TRACK]

        new_index = Index(
            [s + (t, ) for s, t in self.annotation_.itertracks()],
            name=names)

        self.dataframe_ = self.dataframe_.reindex(new_index)

        self.hasChanged_ = False

        return

    def retrack(self):
        """Return a copy with standardized track names.
        """

        self._reindexIfNeeded()
        retracked = self.copy()

        annotation = self.annotation_.retrack()
        retracked.annotation_ = annotation

        names = [PYANNOTE_SEGMENT + '_' + field
                 for field in Segment._fields] + [PYANNOTE_TRACK]
        new_index = Index(
            [s + (t, ) for s, t in annotation.itertracks()],
            name=names)
        retracked.dataframe_.index = new_index

        return retracked

    def apply(self, func, axis=0):
        """Return a copy with `func` applied along `axis` of the scores."""

        applied = self.copy()
        applied.dataframe_ = self.dataframe_.apply(func, axis=axis)
        applied.hasChanged_ = True

        return applied

    def rank(self, ascending=False):
        """

        Parameters
        ----------
        ascending : boolean, default False
            False for ranks by high (0) to low (N-1)

        Returns
        -------
        rank : `Scores`

        """

        # shift pandas' 1-based ranks to 0-based
        ranked = self.copy()
        ranked.dataframe_ = -1 + self.dataframe_.rank(axis=1,
                                                      ascending=ascending)
        ranked.hasChanged_ = True
        return ranked

    def nbest(self, n, ascending=False):
        """

        Parameters
        ----------
        n : int
            Size of n-best list
        ascending : boolean, default False
            False for ranks by high (0) to low (N-1)

        Returns
        -------
        nbest : `Scores`
            New scores where only n-best are kept.

        """

        filtered = self.copy()
        ranked_ = -1 + self.dataframe_.rank(axis=1, ascending=ascending)
        # np.NaN was removed in NumPy 2.0; np.nan is the canonical spelling
        filtered.dataframe_ = filtered.dataframe_.where(ranked_ < n,
                                                        other=np.nan)
        filtered.hasChanged_ = True
        return filtered

    def subset(self, labels, invert=False):
        """Scores subset

        Extract scores subset based on labels

        Parameters
        ----------
        labels : set
            Set of labels
        invert : bool, optional
            If invert is True, extract all but requested `labels`

        Returns
        -------
        subset : `Scores`
            Scores subset.
        """

        self._reindexIfNeeded()

        if not isinstance(labels, set):
            raise TypeError('labels must be provided as a set of labels.')

        if invert:
            labels = set(self.labels()) - labels
        else:
            labels = labels & set(self.labels())

        subset = Scores(uri=self.uri, modality=self.modality)
        subset.annotation_ = self.annotation_
        subset.dataframe_ = self.dataframe_[list(labels)]

        return subset

    def to_annotation(self, threshold=-np.inf, posterior=False):
        """

        Parameters
        ----------
        threshold : float, optional
            Each track is annotated with the label with the highest score.
            Yet, if the latter is smaller than `threshold`, label is replaced
            with an `Unknown` instance.
        posterior : bool, optional
            If True, scores are posterior probabilities in open-set
            identification. If top model posterior is higher than unknown
            posterior, it is selected. Otherwise, label is replaced with an
            `Unknown` instance.
        """

        if not self:
            return Annotation(uri=self.uri, modality=self.modality)

        best = self.nbest(1, ascending=False)
        large_enough = best.copy()

        if posterior:
            # probability mass not assigned to any known label
            unknown_posterior = 1. - self.dataframe_.sum(axis=1)

            large_enough.dataframe_ = (
                ((best.dataframe_.T > unknown_posterior) &
                 (best.dataframe_.T > threshold)).T
            )

        else:

            large_enough.dataframe_ = (
                (best.dataframe_.T > threshold).T
            )

        # np.NaN was removed in NumPy 2.0; np.nan is the canonical spelling
        large_enough.dataframe_.where(best.dataframe_.notnull(),
                                      inplace=True, other=np.nan)

        annotation = Annotation(uri=self.uri, modality=self.modality)
        for segment, track, label, value in large_enough.itervalues():
            label = label if value else Unknown()
            annotation[segment, track] = label

        return annotation

    def map(self, func):
        """Apply function to all values"""

        mapped = self.copy()
        mapped.dataframe_ = self.dataframe_.applymap(func)
        mapped.hasChanged_ = True
        return mapped

    def crop(self, focus, mode='strict'):
        """Crop on focus

        Parameters
        ----------
        focus : `Segment` or `Timeline`

        mode : {'strict', 'loose', 'intersection'}
            In 'strict' mode, only segments fully included in focus coverage
            are kept. In 'loose' mode, any intersecting segment is kept
            unchanged. In 'intersection' mode, only intersecting segments are
            kept and replaced by their actual intersection with the focus.

        Returns
        -------
        cropped : same type as caller
            Cropped version of the caller containing only tracks matching
            the provided focus and mode.

        Remarks
        -------
        In 'intersection' mode, the best is done to keep the track names
        unchanged. However, in some cases where two original segments are
        cropped into the same resulting segments, conflicting track names are
        modified to make sure no track is lost.

        """

        if isinstance(focus, Segment):
            return self.crop(Timeline([focus], uri=self.uri), mode=mode)

        self._reindexIfNeeded()
        cropped = self.copy()

        if mode in ['strict', 'loose']:

            new_annotation = self.annotation_.crop(focus, mode=mode)
            keep = [new_annotation.has_track(segment, track)
                    for segment, track in self.itertracks()]
            cropped.dataframe_ = self.dataframe_[keep]
            cropped.annotation_ = new_annotation
            cropped.hasChanged_ = True

            return cropped

        elif mode in ['intersection']:

            raise NotImplementedError('')

            # # two original segments might be cropped into the same resulting
            # # segment -- therefore, we keep track of the mapping
            # intersection, mapping = timeline.crop(coverage,
            #                                       mode=mode, mapping=True)
            #
            # # create new empty annotation
            # A = self.__class__(uri=self.uri, modality=self.modality)
            #
            # for cropped in intersection:
            #     for original in mapping[cropped]:
            #         for track in self.tracks(original):
            #             # try to use original track name (candidate)
            #             # if it already exists, create a brand new one
            #             new_track = A.new_track(cropped, candidate=track)
            #             # copy each value, column by column
            #             for label in self.dataframe_.columns:
            #                 value = self.dataframe_.get_value((original, track),
            #                                            label)
            #                 A.dataframe_ = A.dataframe_.set_value((cropped, new_track),
            #                                         label, value)
            #
            # return A

    def __str__(self):
        """Human-friendly representation"""
        if self:
            self._reindexIfNeeded()
            return str(self.dataframe_)
        else:
            return ""

    def _repr_png_(self):
        # IPython rich display hook
        from .notebook import repr_scores
        return repr_scores(self)
d1.sort_index()
#%% Sort by column values (DataFrame.sort() was removed; use sort_values)
d1.sort_values(['b', 'c'], ascending=[True, False])
#%%
d1.sort_values(['c', 'b'], ascending=[True, False])
#%% sort_values replaces the old sort_index(by=...) spelling
d1.sort_values(by='c', ascending=False)
#%%
d1.sort_values(by=['b', 'c'], ascending=[False, True])

#%% Sort the column labels
d1.sort_index(axis=1, ascending=False)
#%% Select columns in an explicit order
d1[['c', 'b', 'a']]

#%% Order the columns by the values of one row
# (.ix and Series.order() were removed; use .loc and sort_values)
d1.reindex(columns=d1.loc['j'].sort_values().index)

#%% Select rows in an explicit order
d1.loc[['b', 'c', 'a']]

#%% Sort by one level of a MultiIndex (sortlevel() was removed)
d2.sort_index(level=0, ascending=False)
#%%
d2.sort_index(level=0, ascending=False).sort_index(level=1)

#%% Rank the elements
d1.rank()
#%% Ties take the smallest rank; descending order
d1.rank(method='min', ascending=False)
# Python 2 print statements converted to print() calls.
print("根据索引排序,对于DataFrame可以指定轴。")
obj = Series(range(4), index=["d", "a", "b", "c"])
print(obj.sort_index())
frame = DataFrame(np.arange(8).reshape((2, 4)), index=["three", "one"], columns=list("dabc"))
print(frame.sort_index())
print(frame.sort_index(axis=1))  # axis=1: operate on the column labels
print(frame.sort_index(axis=1, ascending=False))  # descending
print()

print("根据值排序")
obj = Series([4, 7, -3, 2])
print(obj.sort_values())  # Series.order() was removed in favour of sort_values()
print()

print("DataFrame指定列排序")
frame = DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})
print(frame)
print(frame.sort_values(by="b"))  # sort_index(by=...) was removed
print(frame.sort_values(by=["a", "b"]))
print()

print("rank,求排名的平均位置(从1开始)")
obj = Series([7, -5, 7, 4, 2, 0, 4])
# ranks: -5(1), 0(2), 2(3), 4(4), 4(5), 7(6), 7(7)
print(obj.rank())
print(obj.rank(method="first"))  # ties broken by order of appearance, no averaging
print(obj.rank(ascending=False, method="max"))  # descending, max of tied ranks; -5 ranks 7
frame = DataFrame({"b": [4.3, 7, -3, 2], "a": [0, 1, 0, 1], "c": [-2, 5, 8, -2.5]})
print(frame)
print(frame.rank(axis=1))
Ejemplo n.º 41
0
obj.sort_values()  # Series.order() was removed; sort_values() puts NaNs last too
# Sort on multiple columns
frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame.sort_values(by=['a', 'b'])  # replaces removed sort_index(by=...)
frame.sort_values(by=['a', 'b'])  # replaces removed .order(by=...)
# Ranking
obj = Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()
# Equal values are ranked in order of appearance
obj.rank(method='first')
# Descending
obj.rank(ascending=False, method='max')
# Rank across the columns
frame = DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1], 'c': [-2, 5, 8, -2.5]})
frame.rank(axis=1)

## Axis indexes with duplicate labels
obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
# Check whether the labels are unique
obj.index.is_unique
# A duplicated label returns every matching value.
obj['a']

## Summarizing and computing descriptive statistics
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'], columns=['one', 'two'])
# Column sums
df.sum()
# Row sums
df.sum(axis=1)
Ejemplo n.º 42
0
print(obj.sort_values())

frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
print(frame)
print(frame.sort_values(by='b'))
print(frame.sort_values(by=['a', 'b']))

# rank
obj = Series([7, -5, 7, 4, 2, 0, 4])
print(obj.rank())
print(obj.rank(method='first'))
print(obj.rank(method='max', ascending=False))

frame = DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1], 'c': [-2, 5, 8, -2.5]})
print(frame)
print(frame.rank(axis=1))

'''
duplicate index
'''
obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
print(obj)
print(obj.index.is_unique)
print(obj['a'])
print(obj['c'])

df = DataFrame(np.random.randn(4, 3),
               index=['a', 'a', 'b', 'b'])
print(df)
print(df.loc['b'])  # .ix was removed from pandas; use label-based .loc
Ejemplo n.º 43
0
# Python 2 print statements converted to print() calls.
import numpy as np
randn = np.random.randn
import pandas as pd
from pandas import Series, DataFrame

frame = DataFrame(np.random.randn(4, 3), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(frame)

f = lambda x: x.max() - x.min()
print(frame.apply(f))        # column-wise
print(frame.apply(f, axis=1))  # row-wise

print('fancy f(x)---------')

def f(x):
    return Series([x.min(), x.max()], index=['min', 'max'])

print(frame.apply(f))
format = lambda x: '%.2f' % x
print(frame.applymap(format))

print('sorting-------------')
print(frame.sort_values(by='b'))  # sort_index(by=...) was removed
print(frame.rank(method='max', axis=1))
Ejemplo n.º 44
0
 def test_pct_max_many_rows(self):
     """Percentile ranks must top out at exactly 1.0 even on frames larger
     than 2**24 rows (regression test for GH 18271)."""
     n_rows = 2 ** 24 + 1
     df = DataFrame({'A': np.arange(n_rows),
                     'B': np.arange(n_rows, 0, -1)})
     column_maxima = df.rank(pct=True).max()
     assert (column_maxima == 1).all()
Ejemplo n.º 45
0
d    2.0
'''
# Mangled Python 2 "print"/expression pairs repaired into print() calls.
frame = DataFrame({'b': [4.3, 7, -3, 2],
                   'a': [0, 1, 0, 1],
                   'c': [-2, 5, 8, -2.5]})
print(frame)
'''
   a    b    c
0  0  4.3 -2.0
1  1  7.0  5.0
2  0 -3.0  8.0
3  1  2.0 -2.5
'''
# NOTE: modern pandas keeps dict insertion order (b, a, c), so the column
# order differs from the sorted order shown in the tables above/below.
print(frame.rank(axis=1))  # rank across each row; ascending by default
'''
     a    b    c
0  2.0  3.0  1.0
1  1.0  3.0  2.0
2  2.0  1.0  3.0
3  2.0  3.0  1.0
'''

# Duplicate index labels
obj = Series([0, 1, 2, 3, 4], index=['a', 'a', 'b', 'b', 'c'])
print(obj.index.is_unique)  # check whether the index has duplicate labels
# False
print()
Ejemplo n.º 46
0
 def test_rank_axis(self):
     # check if using axes' names gives the same result
     df = DataFrame([[2, 1], [4, 3]])
     tm.assert_frame_equal(df.rank(axis=0), df.rank(axis='index'))
     tm.assert_frame_equal(df.rank(axis=1), df.rank(axis='columns'))