def test_rank(self):
    tm._skip_if_no_scipy()
    from scipy.stats import rankdata

    self.frame['A'][::2] = np.nan
    self.frame['B'][::3] = np.nan
    self.frame['C'][::4] = np.nan
    self.frame['D'][::5] = np.nan

    ranks0 = self.frame.rank()
    ranks1 = self.frame.rank(1)
    mask = np.isnan(self.frame.values)

    fvals = self.frame.fillna(np.inf).values

    exp0 = np.apply_along_axis(rankdata, 0, fvals)
    exp0[mask] = np.nan

    exp1 = np.apply_along_axis(rankdata, 1, fvals)
    exp1[mask] = np.nan

    tm.assert_almost_equal(ranks0.values, exp0)
    tm.assert_almost_equal(ranks1.values, exp1)

    # integers
    df = DataFrame(np.random.randint(0, 5, size=40).reshape((10, 4)))

    result = df.rank()
    exp = df.astype(float).rank()
    tm.assert_frame_equal(result, exp)

    result = df.rank(1)
    exp = df.astype(float).rank(1)
    tm.assert_frame_equal(result, exp)
def test_rank2(self):
    df = DataFrame([[1, 3, 2], [1, 2, 3]])
    expected = DataFrame([[1.0, 3.0, 2.0], [1, 2, 3]]) / 3.0
    result = df.rank(1, pct=True)
    tm.assert_frame_equal(result, expected)

    df = DataFrame([[1, 3, 2], [1, 2, 3]])
    expected = df.rank(0) / 2.0
    result = df.rank(0, pct=True)
    tm.assert_frame_equal(result, expected)

    df = DataFrame([['b', 'c', 'a'], ['a', 'c', 'b']])
    expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]])
    result = df.rank(1, numeric_only=False)
    tm.assert_frame_equal(result, expected)

    expected = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]])
    result = df.rank(0, numeric_only=False)
    tm.assert_frame_equal(result, expected)

    df = DataFrame([['b', np.nan, 'a'], ['a', 'c', 'b']])
    expected = DataFrame([[2.0, nan, 1.0], [1.0, 3.0, 2.0]])
    result = df.rank(1, numeric_only=False)
    tm.assert_frame_equal(result, expected)

    expected = DataFrame([[2.0, nan, 1.0], [1.0, 1.0, 2.0]])
    result = df.rank(0, numeric_only=False)
    tm.assert_frame_equal(result, expected)

    # f7u12, this does not work without extensive workaround
    data = [[datetime(2001, 1, 5), nan, datetime(2001, 1, 2)],
            [datetime(2000, 1, 2), datetime(2000, 1, 3),
             datetime(2000, 1, 1)]]
    df = DataFrame(data)

    # check the rank
    expected = DataFrame([[2., nan, 1.],
                          [2., 3., 1.]])
    result = df.rank(1, numeric_only=False, ascending=True)
    tm.assert_frame_equal(result, expected)

    expected = DataFrame([[1., nan, 2.],
                          [2., 1., 3.]])
    result = df.rank(1, numeric_only=False, ascending=False)
    tm.assert_frame_equal(result, expected)

    # mixed-type frames
    self.mixed_frame['datetime'] = datetime.now()
    self.mixed_frame['timedelta'] = timedelta(days=1, seconds=1)

    result = self.mixed_frame.rank(1)
    expected = self.mixed_frame.rank(1, numeric_only=True)
    tm.assert_frame_equal(result, expected)

    df = DataFrame({"a": [1e-20, -5, 1e-20 + 1e-40, 10,
                          1e60, 1e80, 1e-30]})
    exp = DataFrame({"a": [3.5, 1., 3.5, 5., 6., 7., 2.]})
    tm.assert_frame_equal(df.rank(), exp)
def test_rank(self, float_frame):
    # pytest.importorskip only accepts module names, so import the module
    # and pull the function from it
    rankdata = pytest.importorskip("scipy.stats").rankdata

    float_frame["A"][::2] = np.nan
    float_frame["B"][::3] = np.nan
    float_frame["C"][::4] = np.nan
    float_frame["D"][::5] = np.nan

    ranks0 = float_frame.rank()
    ranks1 = float_frame.rank(1)
    mask = np.isnan(float_frame.values)

    fvals = float_frame.fillna(np.inf).values

    exp0 = np.apply_along_axis(rankdata, 0, fvals)
    exp0[mask] = np.nan

    exp1 = np.apply_along_axis(rankdata, 1, fvals)
    exp1[mask] = np.nan

    tm.assert_almost_equal(ranks0.values, exp0)
    tm.assert_almost_equal(ranks1.values, exp1)

    # integers
    df = DataFrame(np.random.randint(0, 5, size=40).reshape((10, 4)))

    result = df.rank()
    exp = df.astype(float).rank()
    tm.assert_frame_equal(result, exp)

    result = df.rank(1)
    exp = df.astype(float).rank(1)
    tm.assert_frame_equal(result, exp)
def calculate_prcc(self, parameter, result, plot=False):
    """
    Calculate a partial rank correlation coefficient (PRCC) value for
    uncertain parameters against the output.

    Partial correlation characterizes the linear relationship between an
    input and an output after the linear effects of the remaining inputs on
    the output are discounted. This is calculated by constructing a linear
    regression model between the input and the remaining inputs, and
    calculating the residuals between the input values and the model.
    Similarly, a linear regression model between the output and the
    remaining inputs is created and residuals are calculated. The PRCC is
    then the correlation coefficient between the input residuals and the
    output residuals (note that all data is rank transformed).

    :param parameter: the uncertain parameter of interest
    :param result: the output to correlate the parameter against
    :param plot: whether to plot the result
    :return: tuple of (PRCC, p-value)
    """
    # Linear regression model
    regr = linear_model.LinearRegression()

    df = self.dataframe_aggregated()
    ranked_params = DataFrame.rank(df[self.uncertain_parameters()])
    ranked_results = DataFrame.rank(df[self._result_keys])

    # Turn data into numpy arrays
    all_params_data = numpy.asarray(ranked_params)
    result_data = numpy.asarray(ranked_results[result]).reshape(
        (len(ranked_results), 1))

    # Number of parameters
    k = all_params_data.shape[1]

    # Create a truth table to split the parameter data into the parameter
    # of interest and all other parameters (true for the parameter, false
    # for all others)
    param_col_truth_table = [ranked_params.columns[j] == parameter
                             for j in range(k)]

    # Numpy array of just the parameter
    param_data = all_params_data[:, param_col_truth_table]
    # Numpy array of all other parameters
    remaining_param_data = all_params_data[
        :, numpy.logical_not(param_col_truth_table)]

    # Fit a linear regression model between the remaining parameters and
    # the parameter
    regr.fit(remaining_param_data, param_data)
    # Use it to construct a line
    linreg_param = regr.predict(remaining_param_data)
    # Calculate residuals
    param_resid = param_data - linreg_param

    # Fit a linear regression model between the remaining parameters and
    # the result
    regr.fit(remaining_param_data, result_data)
    # Use it to construct a line
    linreg_result = regr.predict(remaining_param_data)
    # Calculate residuals
    result_resid = result_data - linreg_result

    # Determine correlation between residuals
    corr, p = stats.pearsonr(param_resid, result_resid)
    return corr, p
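# A minimal, self-contained sketch of the same PRCC recipe on synthetic data
# (the column names, the relationship in `y`, and the helper `prcc` are all
# invented for illustration): rank-transform everything, regress both the
# parameter and the output on the remaining inputs, then correlate the two
# residual series.
import numpy as np
import pandas as pd
from scipy import stats
from sklearn import linear_model

rng = np.random.default_rng(0)
df = pd.DataFrame({"x1": rng.uniform(size=200),
                   "x2": rng.uniform(size=200),
                   "x3": rng.uniform(size=200)})
df["y"] = 3 * df["x1"] - df["x2"] + rng.normal(scale=0.1, size=200)


def prcc(df, parameter, output):
    ranked = df.rank()
    others = ranked.drop(columns=[parameter, output]).to_numpy()
    x = ranked[parameter].to_numpy()
    y = ranked[output].to_numpy()
    regr = linear_model.LinearRegression()
    # residuals after removing the linear effect of the other inputs
    x_resid = x - regr.fit(others, x).predict(others)
    y_resid = y - regr.fit(others, y).predict(others)
    return stats.pearsonr(x_resid, y_resid)


print(prcc(df, "x1", "y"))  # strongly positive PRCC expected for x1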
def test_rank_does_not_mutate(self):
    # GH#18521
    # Check rank does not mutate DataFrame
    df = DataFrame(np.random.randn(10, 3), dtype="float64")
    expected = df.copy()
    df.rank()
    result = df
    tm.assert_frame_equal(result, expected)
def test_rank2(self):
    df = DataFrame([[1, 3, 2], [1, 2, 3]])
    expected = DataFrame([[1.0, 3.0, 2.0], [1, 2, 3]]) / 3.0
    result = df.rank(1, pct=True)
    tm.assert_frame_equal(result, expected)

    df = DataFrame([[1, 3, 2], [1, 2, 3]])
    expected = df.rank(0) / 2.0
    result = df.rank(0, pct=True)
    tm.assert_frame_equal(result, expected)

    df = DataFrame([["b", "c", "a"], ["a", "c", "b"]])
    expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]])
    result = df.rank(1, numeric_only=False)
    tm.assert_frame_equal(result, expected)

    expected = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]])
    result = df.rank(0, numeric_only=False)
    tm.assert_frame_equal(result, expected)

    df = DataFrame([["b", np.nan, "a"], ["a", "c", "b"]])
    expected = DataFrame([[2.0, np.nan, 1.0], [1.0, 3.0, 2.0]])
    result = df.rank(1, numeric_only=False)
    tm.assert_frame_equal(result, expected)

    expected = DataFrame([[2.0, np.nan, 1.0], [1.0, 1.0, 2.0]])
    result = df.rank(0, numeric_only=False)
    tm.assert_frame_equal(result, expected)

    # f7u12, this does not work without extensive workaround
    data = [
        [datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)],
        [datetime(2000, 1, 2), datetime(2000, 1, 3), datetime(2000, 1, 1)],
    ]
    df = DataFrame(data)

    # check the rank
    expected = DataFrame([[2.0, np.nan, 1.0], [2.0, 3.0, 1.0]])
    result = df.rank(1, numeric_only=False, ascending=True)
    tm.assert_frame_equal(result, expected)

    expected = DataFrame([[1.0, np.nan, 2.0], [2.0, 1.0, 3.0]])
    result = df.rank(1, numeric_only=False, ascending=False)
    tm.assert_frame_equal(result, expected)

    df = DataFrame(
        {"a": [1e-20, -5, 1e-20 + 1e-40, 10, 1e60, 1e80, 1e-30]})
    exp = DataFrame({"a": [3.5, 1.0, 3.5, 5.0, 6.0, 7.0, 2.0]})
    tm.assert_frame_equal(df.rank(), exp)
class Rank:
    param_names = ["dtype"]
    params = [
        ["int", "uint", "float", "object"],
    ]

    def setup(self, dtype):
        self.df = DataFrame(
            np.random.randn(10000, 10), columns=range(10), dtype=dtype
        )

    def time_rank(self, dtype):
        self.df.rank()
def test_pct_max_many_rows(self):
    # GH 18271
    df = DataFrame(
        {"A": np.arange(2 ** 24 + 1), "B": np.arange(2 ** 24 + 1, 0, -1)}
    )
    result = df.rank(pct=True).max()
    assert (result == 1).all()
def test_rank_methods_frame(self):
    tm.skip_if_no_package('scipy', min_version='0.13',
                          app='scipy.stats.rankdata')
    import scipy
    from scipy.stats import rankdata

    xs = np.random.randint(0, 21, (100, 26))
    xs = (xs - 10.0) / 10.0
    cols = [chr(ord('z') - i) for i in range(xs.shape[1])]

    for vals in [xs, xs + 1e6, xs * 1e-6]:
        df = DataFrame(vals, columns=cols)

        for ax in [0, 1]:
            for m in ['average', 'min', 'max', 'first', 'dense']:
                result = df.rank(axis=ax, method=m)
                sprank = np.apply_along_axis(
                    rankdata, ax, vals,
                    m if m != 'first' else 'ordinal')
                sprank = sprank.astype(np.float64)
                expected = DataFrame(sprank, columns=cols)

                if LooseVersion(scipy.__version__) >= '0.17.0':
                    expected = expected.astype('float64')
                tm.assert_frame_equal(result, expected)
from typing import Tuple

import pandas as pd


def determine_rank_invariance(
        df: pd.DataFrame,
        nri_threshold: float = 0.5) -> Tuple[pd.Index, pd.Index]:
    """
    Determines rank invariance along axis 0, meaning columns should be
    samples and rows genes or proteins.

    The rank invariant (RI) and near rank invariant (NRI) indices can give
    information about how problematic a quantile normalization might be,
    since biological variance would be lost.

    Parameters
    ----------
    df
        input dataframe with samples as columns
    nri_threshold
        the fraction threshold at which the near rank invariance condition
        is fulfilled. Should be between 0 and 1.

    Returns
    -------
    Tuple of the rank invariant index and the near rank invariant index.
    """
    df_rank = df.rank()
    # fraction of samples in which each row keeps its most common rank
    max_rank_percentages = (df_rank.apply(pd.Series.value_counts, axis=1)
                            .max(axis=1) / df.shape[1])
    ri = df.index[max_rank_percentages == 1]
    nri = df.index[max_rank_percentages >= nri_threshold]
    nri = nri.difference(ri)
    return ri, nri
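# A quick check of determine_rank_invariance on a toy matrix (assumed layout:
# columns are samples, rows are genes). "g1" has the largest value in every
# sample, so its rank never changes and it should come back as rank
# invariant; "g2" and "g3" keep their modal rank in 2 of 3 samples, so with
# a threshold of 0.5 they land in the near rank invariant set.
toy = pd.DataFrame(
    {"s1": [10.0, 1.0, 5.0], "s2": [9.0, 2.0, 4.0], "s3": [8.0, 3.0, 1.0]},
    index=["g1", "g2", "g3"],
)
ri, nri = determine_rank_invariance(toy, nri_threshold=0.5)
print(list(ri))   # expected: ['g1']
print(list(nri))  # expected: ['g2', 'g3']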
def generate_defaults_discretized(self, df: pd.DataFrame, num_defaults: int,
                                  minimize: bool, aggregate: typing.Callable,
                                  config_space: ConfigSpace.ConfigurationSpace,
                                  raise_no_improvement: bool) \
        -> typing.Tuple[typing.List, typing.Dict[str, typing.Any]]:
    """
    Takes a data frame with a discretized set of defaults and returns the
    average rank. The data frame should be structured as follows: each
    column represents a task, each row represents a configuration. As such,
    each cell represents the evaluation of that configuration on that task.
    The sum of the selected configurations across tasks is to be minimized
    or maximized.

    Parameters
    ----------
    df: pd.DataFrame
        The data frame as described above
    num_defaults: int
        The number of configurations to be selected
    minimize: bool
        Will minimize the objective function iff this is true.
    aggregate: callable
        function to aggregate per task results
    config_space: ConfigurationSpace
        the configuration space object corresponding to the defaults. Will
        be used for casting defaults to the right data type.
    raise_no_improvement: bool
        if true, an error will be raised if no improvement is obtained
        before the correct number of defaults was obtained

    Returns
    -------
    selected_indices: List[int]
        List of indices, as given by the dataframe
    results_dict: Dict[str, Any]
        Additional meta-information. Containing at least the key
        'run_time', but potentially more information
    """
    logging.info('Started %s, dimensions config frame %s'
                 % (self.name, str(df.shape)))
    if num_defaults < 1:
        raise ValueError()
    start_time = time.time()

    # note that the ranking is deliberately oriented so that the best
    # configuration gets the lowest average rank (to work with np.argsort)
    inv_avg_ranks = df.rank(axis=0, method='average',
                            ascending=minimize).sum(axis=1) / df.shape[1]
    selected_indices = list(np.argsort(inv_avg_ranks.values))
    results_dict = {
        'run_time': time.time() - start_time,
    }
    return selected_indices, results_dict
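# A small illustration (made-up numbers) of the average-rank criterion used
# above, outside the class: rank configurations per task, average the ranks
# across tasks, and argsort so the configuration with the best mean rank
# comes first.
import numpy as np
import pandas as pd

scores = pd.DataFrame({"task_a": [0.80, 0.70, 0.90],
                       "task_b": [0.75, 0.60, 0.85]},
                      index=["cfg0", "cfg1", "cfg2"])
# minimize=False here, so ascending=False: higher scores get better (lower) ranks
avg_ranks = scores.rank(axis=0, method="average",
                        ascending=False).sum(axis=1) / scores.shape[1]
print(list(np.argsort(avg_ranks.values)))  # [2, 0, 1] -> cfg2 is the best default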
def test_rank_pct_true(self, method, exp):
    # see gh-15630.
    df = DataFrame([[2012, 66, 3], [2012, 65, 2], [2012, 65, 1]])
    result = df.rank(method=method, pct=True)
    expected = DataFrame(exp)
    tm.assert_frame_equal(result, expected)
def _discretize_based_on_quantile(self, data: pd.DataFrame) -> pd.DataFrame:
    # The higher the values, the higher the rank
    rank = data.rank(method="first", axis=1, ascending=True)
    labels = range(1, self.bins + 1)
    discretized = rank.apply(
        lambda x: pd.qcut(x, self.bins, labels), axis=1, raw=True
    )
    return discretized
def execute_scoring(self, labels: DataFrame, prediction: DataFrame) -> float:
    """
    Scores the correlation as defined by the Numerai tournament rules.

    Arguments:
        labels: The real labels of the output
        prediction: The predicted labels

    Returns:
        The correlation coefficient
    """
    ranked_prediction = prediction.rank(pct=True, method="first")
    return np.corrcoef(labels, ranked_prediction, rowvar=False)[0, 1]
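# A hedged usage sketch of the scoring rule above, replicated outside the
# class on invented data: percentile-rank the predictions, then take the
# Pearson correlation with the labels.
import numpy as np
import pandas as pd

labels = pd.DataFrame({"target": [0.0, 0.25, 0.5, 0.75, 1.0]})
prediction = pd.DataFrame({"prediction": [0.1, 0.3, 0.2, 0.8, 0.9]})
ranked_prediction = prediction.rank(pct=True, method="first")
print(np.corrcoef(labels, ranked_prediction, rowvar=False)[0, 1])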
def dataFrameUtilityMethodTest():
    # Load a DataFrame from a CSV file
    df = pd.read_csv('mlg.csv')
    df = df.copy()             # copy a DataFrame
    df = df.rank()             # rank each column (the default)
    df = df.sort_values(by='maker')
    print(df)
    df = df.sort_values(by=['maker', 'modelyear'])
    print(df)
    df = df.sort_index()
    df = df.astype(int)        # type conversion
    print(df)
def rank(prefix, target, links, vectors, theta, k=2):
    res = DataFrame(columns=["p"], index=vectors.index)
    for i in range(vectors.shape[0]):
        t = vectors.index[i]
        # skip entries that are already part of the prefix
        flag = False
        for j in range(k):
            if t == prefix[j]:
                flag = True
        if not flag:
            res.loc[t] = probability2(prefix, t, links, vectors, theta, k)
    print(res)
    r = res.rank(ascending=False)['p'][target]
    return r
def _infer_index_from_coords(self, coords: pd.DataFrame) -> pd.DataFrame:
    """
    Discrete tile index. Using TILE_INDEX_STR as header.

    Returns:
        (pd.DataFrame): these are used as the index
    """
    logger.warning(
        "inferring the index from raw coordinates may lead to unwanted errors")
    index = coords.rank(axis="index", method="dense", ascending=True)
    # integer 0-based index
    index = index.astype(int) - 1
    return index
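# Tiny demonstration of the dense-rank trick above on toy coordinates:
# raw grid coordinates with gaps collapse to a contiguous 0-based tile index.
import pandas as pd

coords = pd.DataFrame({"x": [10, 10, 30, 50], "y": [0, 20, 20, 40]})
tile_index = coords.rank(axis="index", method="dense", ascending=True)
print(tile_index.astype(int) - 1)
#    x  y
# 0  0  0
# 1  0  1
# 2  1  1
# 3  2  2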
import numpy as np
import pandas as pd


def quantile_normalize(df: pd.DataFrame) -> pd.DataFrame:
    """
    input: dataframe with numerical columns
    output: dataframe with quantile normalized values
    """
    df_sorted = pd.DataFrame(np.sort(df.values, axis=0),
                             index=df.index,
                             columns=df.columns,
                             dtype=np.float32)
    # the mean of each sorted row defines the reference distribution
    df_mean = df_sorted.mean(axis=1)
    df_sorted = 0  # free the sorted copy early to save memory
    df_mean.index = np.arange(1, len(df_mean) + 1)
    # map each value's (min-)rank to the reference distribution
    df_qn = df.rank(method="min").stack().astype(int).map(df_mean).unstack()
    return df_qn
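# A small sanity check (toy, tie-free data) for quantile_normalize: after
# normalization both columns draw from the same pool of reference values.
toy = pd.DataFrame({"a": [5.0, 2.0, 3.0, 4.0], "b": [4.0, 1.0, 3.0, 2.0]})
qn = quantile_normalize(toy)
print(qn)
print(sorted(qn["a"]) == sorted(qn["b"]))  # True: identical distributions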
def rank(self, prefix, target, theta, k=2):
    # links = self.links
    vectors = self.vectors
    res = DataFrame(columns=["p"], index=vectors.index)
    for i in range(vectors.shape[0]):
        t = vectors.index[i]
        # skip entries that are already part of the prefix
        flag = False
        for j in range(k):
            if t == prefix[j]:
                flag = True
        if not flag:
            res.loc[t] = self.probability(prefix, t, theta, k)
            # print(res.loc[t])
    r = res.rank(ascending=False)['p'][target]
    return r
def _winsorize_data(self, ranked_df: pd.DataFrame):
    if self.winsorizing_fraction:
        count_non_nan_s = ranked_df.count(axis=1)
        rank_number_s = round(count_non_nan_s * self.winsorizing_fraction)
        winsorizing_array = np.where(ranked_df.le(rank_number_s, axis=0),
                                     np.nan, 1)
    elif self.winsorizing_number:
        winsorizing_array = np.where(ranked_df <= self.winsorizing_number,
                                     np.nan, 1)
    else:
        return ranked_df
    ranked_df *= winsorizing_array
    return ranked_df.rank(axis='columns', method='first', ascending=True,
                          numeric_only=True)
def transform(self, data: pd.DataFrame):
    if not self.rank_replace:
        raise ValueError("Please call fit first or use fit_transform")
    if self.missing_value_handler is not None:
        na_mask = data.notna()
        data = self.missing_value_handler(data)
    result = data.rank()
    if self.missing_value_handler is not None:
        result = result[na_mask]
    result = result.apply(pd.Series.map, arg=self.rank_replace)
    if self.output_scale == "normal":
        result = np.exp2(result)
    if self.col_name_prefix is not None:
        result.rename(lambda x: f"{self.col_name_prefix} {x}",
                      axis=1, inplace=True)
    return result
def _split_long_short(self, data: pd.DataFrame) -> pd.DataFrame:
    """
    split data into long (1) and short (-1) legs, taking a certain
    percentage (ls_percentage) of each row for the long and the short side
    """
    # The higher the values, the higher the rank
    rank = data.rank(method="first", axis=1, ascending=True)

    def split_long_short(row):
        count = int(len(row.dropna()) * self.ls_percentage)
        long = row.nlargest(count).where(row.isnull(), 1)
        short = row.nsmallest(count).where(row.isnull(), -1)
        return pd.concat([long, short])

    split_df = rank.apply(split_long_short, axis=1)
    return split_df
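# A standalone sketch of the row-wise assignment above, with ls_percentage
# assumed to be 0.5: given ranks for four assets, the top half becomes +1
# and the bottom half -1.
import pandas as pd

row = pd.Series({"a": 1.0, "b": 2.0, "c": 3.0, "d": 4.0})  # ranks, method="first"
count = int(len(row.dropna()) * 0.5)
long_leg = row.nlargest(count).where(row.isnull(), 1)
short_leg = row.nsmallest(count).where(row.isnull(), -1)
print(pd.concat([long_leg, short_leg]).sort_index())
# a   -1.0
# b   -1.0
# c    1.0
# d    1.0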
import numpy as np
import pandas as pd


def grouping(data: pd.DataFrame, n):
    """
    1. Assume the sample size is M. The factor is split into N groups: the
       first N-1 groups each hold int(M/N) valid samples, and the last
       group holds M-(N-1)*int(M/N);
    2. Invalid samples do not take part in the calculation;
    3. Equal ranks are assigned to the same group;
    4. After a tie, the next element continues without skipping ranks
       (dense ranking);
    5. Sorted in ascending order.

    :param data:
    :param n: number of groups
    :return:
    """
    rank_data = data.rank(axis=1, ascending=True, method='dense')
    effect_data = rank_data.max(axis=1)
    amount_each_group = effect_data // n
    data_group = rank_data.floordiv(amount_each_group, axis=0) + np.sign(
        rank_data.mod(amount_each_group, axis=0))
    data_group[data_group > n] = n
    return data_group
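# Toy run of grouping (invented factor values, one date row, n=3): dense
# ranks 1..6 are folded into three groups of two.
data = pd.DataFrame([[0.1, 0.5, 0.3, 0.9, 0.7, 0.2]],
                    index=['2020-01-01'], columns=list('abcdef'))
print(grouping(data, 3))
#               a    b    c    d    e    f
# 2020-01-01  1.0  2.0  2.0  3.0  3.0  1.0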
import numpy as np
import pandas as pd


def rank_filter(df: pd.DataFrame, ascending: bool, inclusive: bool,
                rank_threshold: {int, float}):
    """
    Returns a DataFrame with 1 if the element has passed the specified
    ranking filter, else nan. The ranking threshold can be in percentage
    terms or a number.

    :param df: DataFrame
    :param ascending: bool
    :param inclusive: bool
    :param rank_threshold: float or int
    :return: DataFrame
    """
    # check the inputs
    if rank_threshold <= 0:
        raise ValueError(
            "'rank_threshold' needs to be an int or float strictly larger than 0"
        )
    elif type(rank_threshold) == float and rank_threshold > 1:
        raise ValueError(
            "if 'rank_threshold' is a float, it needs to be less than or equal to 1"
        )

    # rank the elements in the DataFrame
    ranked_df = df.rank(axis='columns', method='first',
                        ascending=ascending, numeric_only=True)

    # set DataFrame to 1 if the ranking filter is passed, else nan
    if type(rank_threshold) == float:
        num_numeric_per_row = ranked_df.count(axis=1)
        rank_threshold = round(num_numeric_per_row * rank_threshold)
        _filter = np.where(ranked_df.le(rank_threshold, axis=0),
                           inclusive, not inclusive)
    else:
        _filter = np.where(ranked_df <= rank_threshold,
                           inclusive, not inclusive)

    # store result in a DataFrame
    filter_df = pd.DataFrame(index=df.index, columns=df.columns,
                             data=_filter)
    filter_df *= 1  # convert True to 1 and False to 0
    filter_df *= np.where(~df.isnull(), 1, np.nan)  # nan in case data is nan
    filter_df.replace(0, np.nan, inplace=True)
    return filter_df
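# Hedged usage sketch for rank_filter on invented prices: keep the top two
# values per row (ascending=False makes rank 1 the largest), inclusive.
prices = pd.DataFrame({'x': [3.0, 1.0], 'y': [2.0, np.nan], 'z': [1.0, 5.0]})
print(rank_filter(prices, ascending=False, inclusive=True, rank_threshold=2))
#      x    y    z
# 0  1.0  1.0  NaN
# 1  1.0  NaN  1.0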
def calculate_ranks(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate the expanded rank dataframe to match the length of the
    animation.

    Returns:
        pd.DataFrame: df_rank contains the interpolated rank
    """
    df_rank = df.rank(axis=1, method="first",
                      ascending=False).clip(upper=self.n_visible + 1)
    if (self.sort == "desc" and self.orientation == "h") or (
            self.sort == "asc" and self.orientation == "v"):
        # This flips all rankings, e.g. if n_visible = 5 then score 1 in
        # the table becomes (6 - 1 = 5)
        df_rank = self.n_visible + 1 - df_rank
    df_rank = self.get_interpolated_df(df_rank, self.steps_per_period,
                                       self.interpolate_period)
    # new_index = range(df.index.max() + 1)
    # df_rank = df_rank.reindex(new_index).interpolate()
    return df_rank
def test_rank_methods_frame(self):
    # pytest.importorskip only accepts module names, so import the module
    # and pull the function from it
    rankdata = pytest.importorskip('scipy.stats').rankdata

    xs = np.random.randint(0, 21, (100, 26))
    xs = (xs - 10.0) / 10.0
    cols = [chr(ord('z') - i) for i in range(xs.shape[1])]

    for vals in [xs, xs + 1e6, xs * 1e-6]:
        df = DataFrame(vals, columns=cols)

        for ax in [0, 1]:
            for m in ['average', 'min', 'max', 'first', 'dense']:
                result = df.rank(axis=ax, method=m)
                sprank = np.apply_along_axis(
                    rankdata, ax, vals,
                    m if m != 'first' else 'ordinal')
                sprank = sprank.astype(np.float64)
                expected = DataFrame(sprank, columns=cols).astype('float64')
                tm.assert_frame_equal(result, expected)
def test_df_series_inf_nan_consistency(self):
    # GH#32593
    index = [5, 4, 3, 2, 1, 6, 7, 8, 9, 10]
    col1 = [5, 4, 3, 5, 8, 5, 2, 1, 6, 6]
    col2 = [5, 4, np.nan, 5, 8, 5, np.inf, np.nan, 6, -np.inf]
    df = DataFrame(
        data={
            "col1": col1,
            "col2": col2,
        },
        index=index,
        dtype="f8",
    )
    df_result = df.rank()

    series_result = df.copy()
    series_result["col1"] = df["col1"].rank()
    series_result["col2"] = df["col2"].rank()

    tm.assert_frame_equal(df_result, series_result)
def __init__(self, daily_return: pd.DataFrame, num_of_partner: int,
             num_take_account: int):
    self.daily_return = daily_return
    self.ticker_list = list(daily_return.columns)
    self.num_of_partner = num_of_partner
    self.num_take_account = num_take_account
    self.potential_partner_dict = {}
    self.potential_partner_combinations_dict = {}
    self.approach_result_dict = {}
    self.d = num_of_partner + 1  # Total stocks as a cohort
    # Scalar needed for the implementation of the extended_approach method
    self.h_d = (self.d + 1) / (2 ** self.d - self.d - 1)
    # NaNs are kept without affecting the ranking
    self.ranked_daily_return = daily_return.rank(
        axis=0, pct=True, ascending=True, na_option='keep')
    self.pairwise_spearman_corr = self.ranked_daily_return.corr()
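# Quick check (synthetic returns, invented tickers) that the construction
# above -- percentile ranks followed by a plain Pearson correlation --
# matches pandas' built-in Spearman correlation.
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
rets = pd.DataFrame(rng.normal(size=(50, 3)), columns=['AAA', 'BBB', 'CCC'])
via_ranks = rets.rank(axis=0, pct=True, ascending=True,
                      na_option='keep').corr()
print(np.allclose(via_ranks.values, rets.corr(method='spearman').values))  # True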
def _perform_ranking_on_dataframe(
        self, data_to_be_ranked: pd.DataFrame) -> pd.DataFrame:
    """Ranks data in a DataFrame in either descending or ascending order"""
    ranked_df = data_to_be_ranked.rank(axis='columns', method='first',
                                       ascending=not self.descending,
                                       numeric_only=True)
    if self.rank_number is not None:
        signal_array = np.where(ranked_df <= self.rank_number,
                                self.include, not self.include)
    else:
        count_non_nan_s = ranked_df.count(axis=1)
        rank_number_s = round(count_non_nan_s * self.rank_fraction)
        # True if the DataFrame is less than or equal to the series
        signal_array = np.where(ranked_df.le(rank_number_s, axis=0),
                                self.include, not self.include)
    signal_df = pd.DataFrame(index=data_to_be_ranked.index,
                             columns=data_to_be_ranked.columns,
                             data=signal_array)
    signal_df *= 1  # convert True to 1 and False to 0
    return signal_df
class Scores(object):
    """

    Parameters
    ----------
    uri : str, optional

    modality : str, optional

    Returns
    -------
    scores : `Scores`

    Examples
    --------

        >>> s = Scores(uri='video', modality='speaker')

        >>> s[Segment(0, 1), 's1', 'A'] = 0.1
        >>> s[Segment(0, 1), 's1', 'B'] = 0.2
        >>> s[Segment(0, 1), 's1', 'C'] = 0.3

        >>> s[Segment(0, 1), 's2', 'A'] = 0.4
        >>> s[Segment(0, 1), 's2', 'B'] = 0.3
        >>> s[Segment(0, 1), 's2', 'C'] = 0.2

        >>> s[Segment(2, 3), 's1', 'A'] = 0.2
        >>> s[Segment(2, 3), 's1', 'B'] = 0.1
        >>> s[Segment(2, 3), 's1', 'C'] = 0.3

    """

    @classmethod
    def from_df(cls, df, uri=None, modality=None, aggfunc=np.mean):
        """

        Parameters
        ----------
        df : DataFrame
            Must contain the following columns:
            'segment', 'track', 'label' and 'value'
        uri : str, optional
            Resource identifier
        modality : str, optional
            Modality
        aggfunc : func
            Value aggregation function in case of duplicate
            (segment, track, label) tuples

        Returns
        -------

        """
        dataframe = pivot_table(
            df, values=PYANNOTE_SCORE,
            index=[PYANNOTE_SEGMENT, PYANNOTE_TRACK],
            columns=PYANNOTE_LABEL, aggfunc=aggfunc
        )

        annotation = Annotation(uri=uri, modality=modality)
        for index, _ in dataframe.iterrows():
            segment = Segment(*index[0])
            track = index[1]
            annotation[segment, track] = ''

        labels = dataframe.columns

        return cls(uri=uri, modality=modality,
                   annotation=annotation, labels=labels,
                   values=dataframe.values)

    def __init__(self, uri=None, modality=None,
                 annotation=None, labels=None,
                 values=None, dtype=None):
        super(Scores, self).__init__()

        names = [PYANNOTE_SEGMENT + '_' + field
                 for field in Segment._fields] + [PYANNOTE_TRACK]

        if annotation:
            annotation = annotation.copy()
            index = Index(
                [s + (t, ) for s, t in annotation.itertracks()],
                name=names)

        else:
            annotation = Annotation(uri=uri, modality=modality)
            index = MultiIndex(levels=[list() for name in names],
                               labels=[list() for name in names],
                               names=names)

        self.annotation_ = annotation
        columns = None if labels is None else list(labels)
        data = None if values is None else np.array(values)
        dtype = float if values is None else values.dtype

        self.dataframe_ = DataFrame(data=data, dtype=dtype,
                                    index=index, columns=columns)

        self.hasChanged_ = True

        self.modality = modality
        self.uri = uri

    def copy(self):
        self._reindexIfNeeded()
        copied = self.__class__(uri=self.uri, modality=self.modality)
        copied.dataframe_ = self.dataframe_.copy()
        copied.annotation_ = self.annotation_.copy()
        copied.hasChanged_ = self.hasChanged_
        return copied

    # del scores[segment]
    # del scores[segment, :]
    # del scores[segment, track]
    def __delitem__(self, key):

        if isinstance(key, Segment):
            segment = key
            self.dataframe_.drop(tuple(segment), axis=0, inplace=True)
            del self.annotation_[segment]
            self.hasChanged_ = True

        elif isinstance(key, tuple) and len(key) == 2:
            segment, track = key
            self.dataframe_.drop(tuple(segment) + (track, ),
                                 axis=0, inplace=True)
            del self.annotation_[segment, track]
            self.hasChanged_ = True

        else:
            raise KeyError('')

    # value = scores[segment, track, label]
    def __getitem__(self, key):

        if len(key) == 2:
            key = (key[0], '_', key[1])

        segment, track, label = key
        return self.dataframe_.at[tuple(segment) + (track, ), label]

    # scores[segment, track, label] = value
    # scores[segment, label] ==== scores[segment, '_', label]
    def __setitem__(self, key, value):

        if len(key) == 2:
            key = (key[0], '_', key[1])

        segment, track, label = key

        # do not add empty track
        if not segment:
            return

        self.dataframe_.at[tuple(segment) + (track,), label] = value
        self.annotation_[segment, track] = label
        self.hasChanged_ = True

    def __len__(self):
        """Number of annotated segments"""
        return len(self.annotation_)

    def __nonzero__(self):
        return self.__bool__()

    def __bool__(self):
        """False if annotation is empty"""
        return True if self.annotation_ else False

    def __contains__(self, included):
        """Check if segments are annotated

        Parameters
        ----------
        included : `Segment` or `Timeline`

        Returns
        -------
        contains : bool
            True if every segment in `included` is annotated,
            False otherwise.
        """
        return included in self.annotation_

    def __iter__(self):
        """Iterate over sorted segments"""
        return iter(self.annotation_.get_timeline())

    def __reversed__(self):
        """Reverse iterate over sorted segments"""
        return reversed(self.annotation_.get_timeline())

    def itersegments(self):
        return iter(self)

    def tracks(self, segment):
        """Set of tracks for query segment

        Parameters
        ----------
        segment : `Segment`
            Query segment

        Returns
        -------
        tracks : set
            Set of tracks for query segment
        """
        return self.annotation_.get_tracks(segment)

    def has_track(self, segment, track):
        """Check whether a given track exists

        Parameters
        ----------
        segment : `Segment`
            Query segment
        track :
            Query track

        Returns
        -------
        exists : bool
            True if track exists for segment
        """
        return self.annotation_.has_track(segment, track)

    def get_track_by_name(self, track):
        """Get all tracks with given name

        Parameters
        ----------
        track : any valid track name
            Requested track name

        Returns
        -------
        tracks : list
            List of (segment, track) tuples
        """
        return self.annotation_.get_track_by_name(track)

    def new_track(self, segment, candidate=None, prefix=None):
        """Track name generator

        Parameters
        ----------
        segment : Segment
        prefix : str, optional
        candidate : any valid track name

        Returns
        -------
        track : str
            New track name
        """
        return self.annotation_.new_track(segment, candidate=candidate,
                                          prefix=prefix)

    def itertracks(self):
        """Iterate over annotation as (segment, track) tuple"""
        return self.annotation_.itertracks()

    def itervalues(self):
        """Iterate over scores as (segment, track, label, value) tuple"""

        # make sure segment/track pairs are sorted
        self._reindexIfNeeded()

        labels = self.labels()

        # yield one (segment, track, label) tuple per loop
        for index, columns in self.dataframe_.iterrows():
            segment = Segment(*index[:-1])
            track = index[-1]
            for label in labels:
                value = columns[label]
                if not np.isnan(value):
                    yield segment, track, label, value

    def get_track_scores(self, segment, track):
        """Get all scores for a given track.

        Parameters
        ----------
        segment : Segment
        track : hashable
            segment, track must be a valid track

        Returns
        -------
        scores : dict
            {label: score} dictionary
        """
        return dict(self.dataframe_.xs(tuple(segment) + (track, )))

    def labels(self, unknown=True):
        """List of labels

        Parameters
        ----------
        unknown : bool, optional
            When False, do not return Unknown instances
            When True, return any label (even Unknown instances)

        Returns
        -------
        labels : list
            Sorted list of existing labels

        Remarks
        -------
            Labels are sorted based on their string representation.
        """
        labels = sorted(self.dataframe_.columns, key=str)
        if unknown:
            return labels
        else:
            return [l for l in labels if not isinstance(l, Unknown)]

    def _reindexIfNeeded(self):

        if not self.hasChanged_:
            return

        names = [PYANNOTE_SEGMENT + '_' + field
                 for field in Segment._fields] + [PYANNOTE_TRACK]

        new_index = Index(
            [s + (t, ) for s, t in self.annotation_.itertracks()],
            name=names)

        self.dataframe_ = self.dataframe_.reindex(new_index)

        self.hasChanged_ = False

        return

    def retrack(self):
        """
        """
        self._reindexIfNeeded()

        retracked = self.copy()

        annotation = self.annotation_.retrack()
        retracked.annotation_ = annotation

        names = [PYANNOTE_SEGMENT + '_' + field
                 for field in Segment._fields] + [PYANNOTE_TRACK]
        new_index = Index(
            [s + (t, ) for s, t in annotation.itertracks()],
            name=names)
        retracked.dataframe_.index = new_index

        return retracked

    def apply(self, func, axis=0):

        applied = self.copy()
        applied.dataframe_ = self.dataframe_.apply(func, axis=axis)
        applied.hasChanged_ = True

        return applied

    def rank(self, ascending=False):
        """

        Parameters
        ----------
        ascending : boolean, default False
            False for ranks by high (0) to low (N-1)

        Returns
        -------
        rank : `Scores`

        """
        ranked = self.copy()
        ranked.dataframe_ = -1 + self.dataframe_.rank(axis=1,
                                                      ascending=ascending)
        ranked.hasChanged_ = True
        return ranked

    def nbest(self, n, ascending=False):
        """

        Parameters
        ----------
        n : int
            Size of n-best list
        ascending : boolean, default False
            False for ranks by high (0) to low (N-1)

        Returns
        -------
        nbest : `Scores`
            New scores where only n-best are kept.

        """
        filtered = self.copy()
        ranked_ = -1 + self.dataframe_.rank(axis=1, ascending=ascending)
        filtered.dataframe_ = filtered.dataframe_.where(ranked_ < n,
                                                        other=np.NaN)
        filtered.hasChanged_ = True
        return filtered

    def subset(self, labels, invert=False):
        """Scores subset

        Extract scores subset based on labels

        Parameters
        ----------
        labels : set
            Set of labels
        invert : bool, optional
            If invert is True, extract all but requested `labels`

        Returns
        -------
        subset : `Scores`
            Scores subset.
        """

        self._reindexIfNeeded()

        if not isinstance(labels, set):
            raise TypeError('labels must be provided as a set of labels.')

        if invert:
            labels = set(self.labels()) - labels
        else:
            labels = labels & set(self.labels())

        subset = Scores(uri=self.uri, modality=self.modality)
        subset.annotation_ = self.annotation_
        subset.dataframe_ = self.dataframe_[list(labels)]

        return subset

    def to_annotation(self, threshold=-np.inf, posterior=False):
        """

        Parameters
        ----------
        threshold : float, optional
            Each track is annotated with the label with the highest score.
            Yet, if the latter is smaller than `threshold`, label is
            replaced with an `Unknown` instance.
        posterior : bool, optional
            If True, scores are posterior probabilities in open-set
            identification. If top model posterior is higher than unknown
            posterior, it is selected. Otherwise, label is replaced with
            an `Unknown` instance.
        """

        if not self:
            return Annotation(uri=self.uri, modality=self.modality)

        best = self.nbest(1, ascending=False)
        large_enough = best.copy()

        if posterior:
            unknown_posterior = 1. - self.dataframe_.sum(axis=1)
            large_enough.dataframe_ = (
                ((best.dataframe_.T > unknown_posterior) &
                 (best.dataframe_.T > threshold)).T
            )
        else:
            large_enough.dataframe_ = (
                (best.dataframe_.T > threshold).T
            )

        large_enough.dataframe_.where(best.dataframe_.notnull(),
                                      inplace=True, other=np.NaN)

        annotation = Annotation(uri=self.uri, modality=self.modality)
        for segment, track, label, value in large_enough.itervalues():
            label = label if value else Unknown()
            annotation[segment, track] = label

        return annotation

    def map(self, func):
        """Apply function to all values"""
        mapped = self.copy()
        mapped.dataframe_ = self.dataframe_.applymap(func)
        mapped.hasChanged_ = True
        return mapped

    def crop(self, focus, mode='strict'):
        """Crop on focus

        Parameters
        ----------
        focus : `Segment` or `Timeline`

        mode : {'strict', 'loose', 'intersection'}
            In 'strict' mode, only segments fully included in focus
            coverage are kept. In 'loose' mode, any intersecting segment
            is kept unchanged. In 'intersection' mode, only intersecting
            segments are kept and replaced by their actual intersection
            with the focus.

        Returns
        -------
        cropped : same type as caller
            Cropped version of the caller containing only tracks matching
            the provided focus and mode.

        Remarks
        -------
        In 'intersection' mode, the best is done to keep the track names
        unchanged. However, in some cases where two original segments are
        cropped into the same resulting segments, conflicting track names
        are modified to make sure no track is lost.
        """

        if isinstance(focus, Segment):
            return self.crop(Timeline([focus], uri=self.uri), mode=mode)

        self._reindexIfNeeded()
        cropped = self.copy()

        if mode in ['strict', 'loose']:

            new_annotation = self.annotation_.crop(focus, mode=mode)
            keep = [new_annotation.has_track(segment, track)
                    for segment, track in self.itertracks()]
            cropped.dataframe_ = self.dataframe_[keep]
            cropped.annotation_ = new_annotation
            cropped.hasChanged_ = True

            return cropped

        elif mode in ['intersection']:

            raise NotImplementedError('')

            # # two original segments might be cropped into the same
            # # resulting segment -- therefore, we keep track of the mapping
            # intersection, mapping = timeline.crop(coverage,
            #                                       mode=mode, mapping=True)

            # # create new empty annotation
            # A = self.__class__(uri=self.uri, modality=self.modality)

            # for cropped in intersection:
            #     for original in mapping[cropped]:
            #         for track in self.tracks(original):
            #             # try to use original track name (candidate)
            #             # if it already exists, create a brand new one
            #             new_track = A.new_track(cropped, candidate=track)
            #             # copy each value, column by column
            #             for label in self.dataframe_.columns:
            #                 value = self.dataframe_.get_value(
            #                     (original, track), label)
            #                 A.dataframe_ = A.dataframe_.set_value(
            #                     (cropped, new_track), label, value)

            # return A

    def __str__(self):
        """Human-friendly representation"""
        if self:
            self._reindexIfNeeded()
            return str(self.dataframe_)
        else:
            return ""

    def _repr_png_(self):
        from .notebook import repr_scores
        return repr_scores(self)
d1.sort_index()
#%% sort by columns
d1.sort(['b', 'c'], ascending=[1, 0])
#%%
d1.sort(['c', 'b'], ascending=[1, 0])
#%% sort_index can cover what sort does
d1.sort_index(by='c', ascending=False)
#%%
d1.sort_index(by=['b', 'c'], ascending=[0, 1])
#%% sort the column names
d1.sort_index(axis=1, ascending=False)
#%% specify the order in which the columns appear
d1[['c', 'b', 'a']]
#%% to sort the columns according to the values of one row
d1.reindex(columns=d1.ix['j'].order().index)
#%% reorder the rows in a specified order
d1.ix[['b', 'c', 'a']]
#%% sort by a multi-level index
d2.sortlevel(0, ascending=False)
#%%
d2.sortlevel(0, ascending=False).sortlevel(1)
#%% rank the elements
d1.rank()
#%% use the smallest rank for tied values, in descending order
d1.rank(method='min', ascending=False)
print("Sort by index; for a DataFrame the axis can be specified.")
obj = Series(range(4), index=["d", "a", "b", "c"])
print(obj.sort_index())
frame = DataFrame(np.arange(8).reshape((2, 4)),
                  index=["three", "one"],
                  columns=list("dabc"))
print(frame.sort_index())
print(frame.sort_index(axis=1))  # axis=1 operates on the columns
print(frame.sort_index(axis=1, ascending=False))  # descending
print()

print("Sort by values")
obj = Series([4, 7, -3, 2])
print(obj.sort_values())  # order() is deprecated
print()

print("Sort a DataFrame by a specified column")
frame = DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})
print(frame)
print(frame.sort_values(by="b"))  # sort_index(by=...) is deprecated
print(frame.sort_values(by=["a", "b"]))
print()

print("rank: the average rank position (starting from 1)")
obj = Series([7, -5, 7, 4, 2, 0, 4])
# corresponding ranks: -5(1), 0(2), 2(3), 4(4), 4(5), 7(6), 7(7)
print(obj.rank())
# rank ties in order of first appearance instead of averaging
print(obj.rank(method="first"))
# descending, using the max rank for ties, so -5 has rank 7
print(obj.rank(ascending=False, method="max"))
frame = DataFrame({"b": [4.3, 7, -3, 2],
                   "a": [0, 1, 0, 1],
                   "c": [-2, 5, 8, -2.5]})
print(frame)
print(frame.rank(axis=1))
obj.order()  # missing values are placed at the end when sorting

# sort by multiple columns
frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame.sort_index(by=['a', 'b'])
frame.order(by=['a', 'b'])

# ranking
obj = Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()
# ties are ranked in order of appearance
obj.rank(method='first')
# descending
obj.rank(ascending=False, method='max')
# rank across the columns
frame = DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                   'c': [-2, 5, 8, -2.5]})
frame.rank(axis=1)

## Axis indexes with duplicate values
obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
# check whether the index is unique
obj.index.is_unique
# an index that maps to several values returns all of them
obj['a']

## Summarizing and computing descriptive statistics
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
# column-wise
df.sum()
# row-wise
df.sum(axis=1)
print(obj.sort_values())

frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
print(frame)
print(frame.sort_values(by='b'))
print(frame.sort_values(by=['a', 'b']))

# rank
obj = Series([7, -5, 7, 4, 2, 0, 4])
print(obj.rank())
print(obj.rank(method='first'))
print(obj.rank(method='max', ascending=False))

frame = DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                   'c': [-2, 5, 8, -2.5]})
print(frame)
print(frame.rank(axis=1))

'''
duplicate index
'''
obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
print(obj)
print(obj.index.is_unique)
print(obj['a'])
print(obj['c'])

df = DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
print(df)
print(df.loc['b'])  # .ix is deprecated; .loc selects rows by label
import numpy as np
randn = np.random.randn
import pandas as pd
from pandas import Series, DataFrame

frame = DataFrame(np.random.randn(4, 3),
                  columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(frame)

f = lambda x: x.max() - x.min()
print(frame.apply(f))
print(frame.apply(f, axis=1))

print('fancy f(x)---------')

def f(x):
    return Series([x.min(), x.max()], index=['min', 'max'])

print(frame.apply(f))

format = lambda x: '%.2f' % x
print(frame.applymap(format))

print('sorting-------------')
print(frame.sort_index(by='b'))
print(frame.rank(method='max', axis=1))
d    2.0
'''
frame = DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                   'c': [-2, 5, 8, -2.5]})
print(frame)
'''
   a    b    c
0  0  4.3 -2.0
1  1  7.0  5.0
2  0 -3.0  8.0
3  1  2.0 -2.5
'''
print(frame.rank(axis=1))  # rank within each row, ascending by default
'''
     a    b    c
0  2.0  3.0  1.0
1  1.0  3.0  2.0
2  2.0  1.0  3.0
3  2.0  3.0  1.0
'''

print('Duplicate index: performs a two-level lookup')
obj = Series([0, 1, 2, 3, 4], index=['a', 'a', 'b', 'b', 'c'])
print(obj.index.is_unique)  # check whether the index has duplicates
# False
print()
def test_rank_axis(self):
    # check if using axes' names gives the same result
    df = DataFrame([[2, 1], [4, 3]])
    tm.assert_frame_equal(df.rank(axis=0), df.rank(axis='index'))
    tm.assert_frame_equal(df.rank(axis=1), df.rank(axis='columns'))