Example #1
def _spearman_r(a, b, weights, axis, skipna):
    """ndarray implementation of scipy.stats.spearmanr.

    Parameters
    ----------
    a : ndarray
        Input array.
    b : ndarray
        Input array.
    weights : ndarray
        Weights array.
    axis : int
        The axis to apply the correlation along.
    skipna : bool
        If True, skip NaNs when computing function.

    Returns
    -------
    res : ndarray
        Spearman's rank correlation coefficient.

    See Also
    --------
    scipy.stats.spearmanr
    """
    if skipna:
        a, b, weights = _match_nans(a, b, weights)
    _a = bn.nanrankdata(a, axis=axis)
    _b = bn.nanrankdata(b, axis=axis)
    return _pearson_r(_a, _b, weights, axis, skipna)
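A minimal sketch of the rank-then-Pearson idea behind this helper, checked against scipy.stats.spearmanr for the unweighted, NaN-free 1-D case (the private _pearson_r and _match_nans helpers are not shown here):

import numpy as np
import bottleneck as bn
from scipy import stats

rng = np.random.default_rng(0)
a = rng.normal(size=100)
b = a + rng.normal(size=100)

# Spearman = Pearson on the ranks (no ties, no NaNs in this sketch).
rho_rank = np.corrcoef(bn.nanrankdata(a), bn.nanrankdata(b))[0, 1]
rho_scipy, _ = stats.spearmanr(a, b)
assert np.isclose(rho_rank, rho_scipy)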
Example #2
def getStatsTS(X, Y, quantile=10, window=500, minCnt=250):
    """
    X: Input factor, shape should be 40320*1082
    Y: Existing factor, price
    Calculate the return of 10, 20 ,30 by
    Standardized Return_i = (Price_t+i-Price_t)/Price_t/i
    """
    def calcFwdRet(price, window=30):
        """
        """
        fwd = np.roll(price, -window, axis=0)
        fwd[-window:, :] = np.nan

        return fwd / price - 1

    print('Now calculating IC and LS, start timing...')
    t0 = time.time()
    X = np.asarray(X)
    Y = np.asarray(Y)
    # 30-step standardized forward return; rows without a full look-ahead
    # window are left as NaN (consistent with calcFwdRet).
    Y_ = np.full(Y.shape, np.nan)
    for i in range(len(Y) - 30):
        for j in range(Y.shape[1]):
            Y_[i, j] = (Y[i + 30, j] - Y[i, j]) / Y[i, j] / 30

    Y = Y_
    if X.shape != Y.shape:
        raise ValueError(
            "shape mismatch: X is {}, Y is {}".format(X.shape, Y.shape))
    N = len(X)
    IC = np.zeros((N, ))

    bottom = 1.0 / quantile
    top = 1 - bottom

    # ts rank
    X = bn.move_rank(X, window=window, min_count=minCnt, axis=0)
    print(np.isnan(X).sum())
    # norm to [0, 1]
    X = 0.5 * (X + 1)

    # keep only entries where both X and Y are observed
    valid = ~np.isnan(X) & ~np.isnan(Y)
    X = np.where(valid, X, np.nan)
    Y = np.where(valid, Y, np.nan)
    # cross-rank Y
    Y_rk = bn.nanrankdata(Y, axis=1)
    Y_rk /= bn.nanmax(Y_rk, axis=1)[:, np.newaxis]

    # long-short (LS) spread: top-quantile mean minus bottom-quantile mean
    LS = np.nanmean(np.where(X > top, Y, np.nan), axis=1) \
         - np.nanmean(np.where(X < bottom, Y, np.nan), axis=1)

    # Loop
    for ii in range(N):
        ok = ~np.isnan(X[ii]) & ~np.isnan(Y_rk[ii])
        IC[ii] = np.corrcoef(X[ii][ok], Y_rk[ii][ok])[0, 1]

    t1 = time.time()
    print("total time used for IC and LS matrix calculation is:", (t1 - t0))
    return IC, LS
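A hypothetical smoke test with synthetic data (shapes, window and minCnt are chosen for illustration; the 40320 x 1082 shape in the docstring is the author's dataset, not a requirement, and the module-level imports used by getStatsTS, numpy as np, bottleneck as bn and time, are assumed present):

import numpy as np

rng = np.random.default_rng(1)
# Synthetic random-walk prices and a random factor, 400 time steps x 8 assets.
price = 100 * np.exp(np.cumsum(rng.normal(0, 0.01, size=(400, 8)), axis=0))
factor = rng.normal(size=(400, 8))

IC, LS = getStatsTS(factor, price, quantile=10, window=60, minCnt=30)
print(np.nanmean(IC), np.nanmean(LS))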
Example #3
def gev_from_samples(arr_ams, n_sample, shape_param):
    """draw sample with replacement and find GEV parameters using the
    Probability-Weighted Moments method.
    """
    assert arr_ams.ndim == 1
    # Remove NaN
    arr_ams = arr_ams[np.isfinite(arr_ams)]
    # Record length excludes NaNs
    n_obs = len(arr_ams)
    # print('n_obs', n_obs)
    # Random sampling with replacement of indices
    sampling_idx = helper.get_sampling_idx(n_sample, n_obs)
    # Draw samples. Add dimension n_sample in first position.
    arr_samples = arr_ams[sampling_idx]
    # print(arr_samples.shape)
    ax_year = 1
    # rank samples
    rank = bottleneck.nanrankdata(arr_samples, axis=ax_year).astype(fscalar)
    # fit distribution. ev_params is a tuple of ndarrays.
    ecdf = ecdf_jit(rank, n_obs)
    gev_pwm_njit = nb.njit(gev_pwm)
    ev_params = gev_pwm_njit(arr_samples,
                             ecdf,
                             n_obs,
                             ax_year,
                             shape=shape_param)
    # Stack the parameter tuple into an array of shape (ev_params, samples).
    return np.array(ev_params)
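The helpers (helper.get_sampling_idx, ecdf_jit, gev_pwm, fscalar) are project-specific and not shown. A minimal sketch of the core pattern, bootstrap resampling with replacement followed by per-sample ranking and a plotting-position ECDF, using only NumPy and bottleneck (the (n_sample, n_obs) index shape and the Weibull plotting position are assumptions):

import numpy as np
import bottleneck

arr_ams = np.array([31.0, 54.0, np.nan, 42.0, 61.0, 38.0, 47.0])
arr_ams = arr_ams[np.isfinite(arr_ams)]
n_obs = len(arr_ams)
n_sample = 1000

rng = np.random.default_rng(42)
# Bootstrap: one row of resampled indices per sample, drawn with replacement.
sampling_idx = rng.integers(0, n_obs, size=(n_sample, n_obs))
arr_samples = arr_ams[sampling_idx]                 # shape (n_sample, n_obs)
rank = bottleneck.nanrankdata(arr_samples, axis=1)  # rank within each sample
ecdf = rank / (n_obs + 1)                           # Weibull plotting position (assumed)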
Example #4
def hsa(gefs_sprd, subset, debug=False):
    '''Standardizes the spread, rescales it to (-1, 1), and applies arctanh to
    derive an approximately normal distribution, giving the z-score values more
    statistical relevance.

    Known as the historical spread anomaly, or HSA.'''
    try:
        gefs_sprd = gefs_sprd.rename({'time': 'fhour'})
    except Exception:
        pass
    try:
        gefs_sprd = gefs_sprd.assign_coords(fhour=subset.fhour)
    except Exception:
        pass
    subset_vals = (gefs_sprd['Pressure'] - subset.mean(
        'time', skipna=True)) / subset.std('time', skipna=True)
    new_stacked = xr.concat(
        [subset.drop('timestr').to_dataset(),
         gefs_sprd.expand_dims('time')], 'time')
    percentile = bottleneck.nanrankdata(
        new_stacked['Pressure'], axis=0) / np.count_nonzero(
            ~np.isnan(new_stacked['Pressure'][:, 0, 0, 0]))
    perc_ds = xr.Dataset(
        data_vars=dict(spread_percentile=(["fhour", "lat", "lon"],
                                          percentile[-1])),
        coords=dict(lon=new_stacked.lon.values,
                    lat=new_stacked.lat.values,
                    fhour=new_stacked.fhour.values),
        attrs=dict(description="Spread percentile based on reforecast\
                    of similar mean anomalies by gridpoint."),
    )
    if debug:
        return subset_vals
    # subset_vals = (0.99-(-0.99))*(subset_vals-subset_vals.min(['lat','lon']))/(subset_vals.max(['lat','lon'])-subset_vals.min(['lat','lon'])) + -0.99
    # subset_vals = np.arctanh(subset_vals)
    return subset_vals, perc_ds
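The percentile step ranks the forecast against the reforecast climatology along the time dimension. A 1-D illustration of the same rank-to-percentile conversion:

import numpy as np
import bottleneck

climatology = np.array([2.1, 3.4, np.nan, 1.8, 2.9])
forecast = 3.0
stacked = np.append(climatology, forecast)
ranks = bottleneck.nanrankdata(stacked)
# Fraction of the non-NaN record at or below the forecast value.
percentile = ranks[-1] / np.count_nonzero(~np.isnan(stacked))
print(percentile)  # 0.8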
Example #5
 def _nanrankdata(self, X, axis=1):
     """
     Replaces bottleneck's nanrankdata with scipy and numpy alternative.
     """
     #ranks = sp.stats.mstats.rankdata(np.ma.masked_invalid(X), axis=axis)
     ranks = bn.nanrankdata(X, axis=axis)
     ranks[ranks == 0] = np.nan
     return ranks
Example #7
def rank(data):
    """Rank normalize data
    
    Rank standardize data to make nonparametric
    
    Arguments:
        data {np.array} -- 2-D coexpression network
    
    Returns:
        np.array -- Rank normalized between 0 and 1 array
    """
    orig_shape = data.shape
    data = bottleneck.nanrankdata(data) - 1
    return (data / np.sum(~np.isnan(data))).reshape(orig_shape)
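A small usage sketch (values are arbitrary; numpy and bottleneck are assumed imported at module level): ties receive averaged ranks, NaNs stay NaN, and the output lies in [0, 1):

import numpy as np

net = np.array([[0.2, 0.9, 0.9],
                [0.1, np.nan, 0.5]])
print(rank(net))
# approximately [[0.2 0.7 0.7]
#                [0.  nan 0.4]]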
Example #8
def compute_aurocs_default(sum_in, study_ct_uniq, pheno, study_col, ct_col,
                           compute_p):
    """Helper function to compute AUROCs from votes matrix of cells


    Arguments:
        sum_in {np.ndarray} -- votes matrix, cells x cell types votes
        study_ct_uniq {vector} -- vector of study_id|cell_type labels
        pheno {pd.DataFrame} -- dataframe wtih study_ct, study_id and ct_col for all cells
        study_col {str} -- String name of study_col in pheno
        ct_col {str} -- Stirng name of cell type col in pheno

    Returns:
        pd.DataFrame -- ROCs for cell type x cell type labels
    """
    cell_nv = pd.DataFrame(index=study_ct_uniq)
    if compute_p:
        cell_p = pd.DataFrame(index=study_ct_uniq)
    for ct in study_ct_uniq:
        predicts_tmp = sum_in.copy()
        study, cellT = (pheno[pheno.study_ct == ct].drop_duplicates()[[
            study_col, ct_col
        ]].values[0])  # Don't want to split the string in case of character issues
        slicer = pheno[study_col] == study
        pheno2 = pheno[slicer]
        predicts_tmp = predicts_tmp[slicer]
        predicts_tmp = bottleneck.nanrankdata(predicts_tmp, axis=0)

        filter_mat = np.zeros_like(predicts_tmp)
        filter_mat[pheno2.study_ct == ct] = 1

        predicts_tmp[filter_mat == 0] = 0

        n_p = bottleneck.nansum(filter_mat, axis=0)
        nn = filter_mat.shape[0] - n_p
        p = bottleneck.nansum(predicts_tmp, axis=0)
        roc = (p / n_p - (n_p + 1) / 2) / nn
        cell_nv[ct] = roc
        if compute_p:
            U = roc * n_p * nn
            Z = (np.abs(U - (n_p * nn / 2))) / np.sqrt(n_p * nn *
                                                       (n_p + nn + 1) / 12)
            P = stats.norm.sf(Z)
            cell_p[ct] = P
        del predicts_tmp, filter_mat
        gc.collect()
    if compute_p:
        return cell_nv, cell_p
    return cell_nv
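The roc line above is the rank-sum (Mann-Whitney U) form of the AUROC: with p the sum of the positives' ranks, n_p positives and nn negatives, AUROC = (p - n_p*(n_p+1)/2) / (n_p*nn). A small check of that identity against scikit-learn, assuming untied scores:

import numpy as np
import bottleneck
from sklearn.metrics import roc_auc_score

rng = np.random.default_rng(3)
scores = rng.normal(size=50)           # continuous, so effectively no ties
labels = rng.integers(0, 2, size=50)

ranks = bottleneck.nanrankdata(scores)
n_p = labels.sum()
nn = len(labels) - n_p
p = ranks[labels == 1].sum()
auroc = (p / n_p - (n_p + 1) / 2) / nn
assert np.isclose(auroc, roc_auc_score(labels, scores))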
Example #9
    def _fit(self, X, y):
        # check input params
        self._check_params(X, y)
        # setup variables for Boruta
        n_sample, n_feat = X.shape
        _iter = 1
        # holds the decision about each feature:
        # 0  - default state = tentative in original code
        # 1  - accepted in original code
        # -1 - rejected in original code
        dec_reg = np.zeros(n_feat, dtype=int)
        # counts how many times a given feature was more important than
        # the best of the shadow features
        hit_reg = np.zeros(n_feat, dtype=int)
        # these record the history of the iterations
        imp_history = np.zeros(n_feat, dtype=float)
        sha_max_history = []

        # set n_estimators
        if self.n_estimators != 'auto':
            self.estimator.set_params(n_estimators=self.n_estimators)

        # main feature selection loop
        while np.any(dec_reg == 0) and _iter < self.max_iter:
            # find optimal number of trees and depth
            if self.n_estimators == 'auto':
                # number of features that aren't rejected
                not_rejected = np.where(dec_reg >= 0)[0].shape[0]
                n_tree = self._get_tree_num(not_rejected)
                self.estimator.set_params(n_estimators=n_tree)

            # make sure we start with a new tree in each iteration
            rnd_st = np.random.randint(1,1e6,1)[0]
            self.estimator.set_params(random_state=rnd_st)

            # add shadow attributes, shuffle them and train estimator, get imps
            cur_imp = self._add_shadows_get_imps(X, y, dec_reg)

            # get the threshold of shadow importances we will use for rejection
            imp_sha_max = np.percentile(cur_imp[1], self.perc)

            # record importance history
            sha_max_history.append(imp_sha_max)
            imp_history = np.vstack((imp_history, cur_imp[0]))

            # register which feature is more imp than the max of shadows
            hit_reg = self._assign_hits(hit_reg, cur_imp, imp_sha_max)

            # based on hit_reg we check if a feature is doing better than
            # expected by chance
            dec_reg = self._do_tests(dec_reg, hit_reg, _iter)

            # print out confirmed features
            if self.verbose > 0 and _iter < self.max_iter:
                self._print_results(dec_reg, _iter, 0)
            if _iter < self.max_iter:
                _iter += 1

        # we automatically apply R package's rough fix for tentative ones
        confirmed = np.where(dec_reg == 1)[0]
        tentative = np.where(dec_reg == 0)[0]
        # ignore the first row of zeros
        tentative_median = np.median(imp_history[1:, tentative], axis=0)
        # which tentative to keep
        tentative_confirmed = np.where(tentative_median
                                       > np.median(sha_max_history))[0]
        tentative = tentative[tentative_confirmed]

        # basic result variables
        self.n_features_ = confirmed.shape[0]
        self.support_ = np.zeros(n_feat, dtype=bool)
        self.support_[confirmed] = 1
        self.support_weak_ = np.zeros(n_feat, dtype=bool)
        self.support_weak_[tentative] = 1

        # ranking, confirmed variables are rank 1
        self.ranking_ = np.ones(n_feat, dtype=int)
        # tentative variables are rank 2
        self.ranking_[tentative] = 2
        # selected = confirmed and tentative
        selected = np.hstack((confirmed, tentative))
        # all rejected features are sorted by importance history
        not_selected = np.setdiff1d(np.arange(n_feat), selected)
        # large importance values should rank higher = lower ranks -> *(-1)
        imp_history_rejected = imp_history[1:, not_selected] * -1
        # calculate ranks in each iteration, then median of ranks across feats
        iter_ranks = bn.nanrankdata(imp_history_rejected, axis=1)
        #iter_ranks = self._nanrankdata(imp_history_rejected, axis=1)
        rank_medians = np.nanmedian(iter_ranks, axis=0)
        ranks = bn.nanrankdata(rank_medians)
        #ranks = self._nanrankdata(rank_medians, axis=0)

        # update rank for not_selected features
        if not_selected.shape[0] > 0:
            # set smallest rank to 3 if there are tentative feats
            if tentative.shape[0] > 0:
                ranks = ranks - np.min(ranks) + 3
            else:
                # and 2 otherwise
                ranks = ranks - np.min(ranks) + 2
            self.ranking_[not_selected] = ranks

        # notify user
        if self.verbose > 0:
            self._print_results(dec_reg, _iter, 1)
        return self
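The final ranking of rejected features ranks the negated importance history within each iteration and then takes the median rank per feature. A small standalone illustration of that pattern (importance values are made up):

import numpy as np
import bottleneck as bn

# 3 iterations x 4 rejected features of importances (made up), negated so
# that more important features receive lower (better) ranks.
imp_history_rejected = np.array([[0.10, 0.40, 0.05, 0.20],
                                 [0.15, 0.35, 0.10, 0.25],
                                 [0.05, 0.50, 0.02, 0.30]]) * -1
iter_ranks = bn.nanrankdata(imp_history_rejected, axis=1)  # rank within each iteration
rank_medians = np.nanmedian(iter_ranks, axis=0)            # median rank per feature
ranks = bn.nanrankdata(rank_medians)                       # 1 = best of the rejected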
Example #10
 def rank_alpha(self, alpha):
     # Convert the alpha factor into a cross-sectional rank scaled to (0, 1]
     rankAlpha = bk.nanrankdata(alpha, axis=1)
     rankAlpha = (rankAlpha.T / bk.nanmax(rankAlpha, axis=1)).T
     return rankAlpha
Example #11
 def time_nanrankdata(self, dtype, shape):
     bn.nanrankdata(self.arr)
Example #12
def ranking(x, axis=0, norm='-1,1'):
    """
    Normalized ranking treating NaN as missing and averaging ties.
    
    Parameters
    ----------
    x : ndarray
        Data to be ranked.
    axis : {int, None} optional
        Axis to rank over. Default axis is 0.
    norm: str, optional
        A string that specifies the normalization:
            ==========  ================================================
            '0,N-1'     Zero to N-1 ranking
            '-1,1'      Scale zero to N-1 ranking to be between -1 and 1
            'gaussian'  Rank data then scale to a Gaussian distribution
            ==========  ================================================
        The default ranking is '-1,1'.
        
    Returns
    -------
    idx : ndarray
        The ranked data. The dtype of the output is always float even if
        the dtype of the input is int.
    
    Notes
    -----
    If there is only one non-NaN value along the given axis, then that value
    is set to the midpoint of the specified normalization method. For example,
    if the input is array([1.0, nan]), then 1.0 is set to zero for the '-1,1'
    and 'gaussian' normalizations and is set to 0.5 (mean of 0 and 1) for the
    '0,N-1' normalization.
    
    For '0,N-1' normalization, note that N is x.shape[axis] even if there are
    NaNs. That ensures that when ranking along the columns of a 2d array, for
    example, the output will have the same min and max along all columns.
    
    """
    if axis is None:
        ranked_x = ranking(x.reshape(-1), norm=norm)
        return ranked_x.reshape(*x.shape)
    ax = axis
    if ax < 0:
        # This converts a negative axis to the equivalent positive axis
        ax = range(x.ndim)[ax]
    masknan = np.isnan(x)
    countnan = np.expand_dims(masknan.sum(ax), ax)
    countnotnan = x.shape[ax] - countnan
    idx = bn.nanrankdata(x, ax)
    idx -= 1
    if norm == '-1,1':
        idx /= (countnotnan - 1)
        idx *= 2
        idx -= 1
        middle = 0.0
    elif norm == '0,N-1':
        idx *= (1.0 * (x.shape[ax] - 1) / (countnotnan - 1))
        middle = (idx.shape[ax] + 1.0) / 2.0 - 1.0
    elif norm == 'gaussian':
        global ndtri
        if ndtri is None:
            try:
                from scipy.special import ndtri
            except ImportError:
                msg = "SciPy required for `gaussian` normalization"
                raise ImportError(msg)
        idx *= (1.0 * (x.shape[ax] - 1) / (countnotnan - 1))
        idx = ndtri((idx + 1.0) / (x.shape[ax] + 1.0))
        middle = 0.0
    else:
        msg = "norm must be '-1,1', '0,N-1', or 'gaussian'."
        raise ValueError(msg)
    np.putmask(idx, (countnotnan == 1) * (~masknan), middle)
    return idx
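Usage sketch along axis 0 (bn and np are assumed imported at module level); the 'gaussian' norm additionally needs a module-level ndtri, imported from scipy.special or set to None so the lazy import above can fill it in:

import numpy as np

x = np.array([[3.0, np.nan],
              [1.0, 2.0],
              [2.0, 5.0]])
print(ranking(x, axis=0, norm='-1,1'))   # each column scaled to [-1, 1]
print(ranking(x, axis=0, norm='0,N-1'))  # each column scaled to [0, N-1]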
Example #13
def score_default(
    X, S, C, node_degree_normalization, means=True
):
    """Compute ROCs according to the default procedure

     Default procedure computes ranked cell similarity matrix and then uses neighbor voting

    Arguments:
         X {array} -- Array (sparse or dense) of geneset x cells
         S {vector} -- Study labels, length cells
         C {vector} -- Cell type labels, legnth cells
         node_degree_normalization {bool} -- Flag for whether to normalize votes by node degree

     Returns:
         pd.Series -- Series containing AUROCs for each cell type for the given gene set
    """
    nw = create_nw_spearman(X.T)
    nw = (nw + nw.T) / 2

    cell_labels = design_matrix(C)

    x1 = cell_labels.shape[1]
    x2 = cell_labels.shape[0]

    studies = np.unique(S)
    exp_cols = np.repeat(studies, x1)

    test_cell_labels = np.tile(cell_labels.values, studies.shape[0])
    for study in studies:  # Hide testing labels
        d = np.where(study == S)[0]
        a = np.where(study == exp_cols)[0]
        for i in a:
            test_cell_labels[d, i] = 0

    predicts = nw @ test_cell_labels

    if node_degree_normalization:
        sum_all = np.sum(nw, axis=0)
        predicts /= sum_all[:, None]

    predicts[test_cell_labels == 1] = np.nan

    exp_cols = np.repeat(studies, x1)

    filter_mat = np.tile(cell_labels.values, studies.shape[0])
    for study in studies:
        mask = (study != S).astype(float)[:, None] @ (study == exp_cols).astype(float)[
            :, None
        ].T
        mask = mask.astype(bool)
        filter_mat[mask] = np.nan
        predicts[mask] = np.nan

    predicts = bottleneck.nanrankdata(np.abs(predicts), axis=0)
    predicts[filter_mat == 0] = 0

    n_p = bottleneck.nansum(filter_mat, axis=0)
    n_n = bottleneck.nansum((filter_mat == 0).astype(float), axis=0)
    p = bottleneck.nansum(predicts, axis=0)
    rocNV = (p / n_p - (n_p + 1) / 2) / n_n

    # reshape uses C (row-major) order, the opposite of R's Fortran (column-major) order
    rocNV = rocNV.reshape([studies.shape[0], x1]).T
    if means:
        return pd.Series(bottleneck.nanmean(rocNV, axis=1), index=cell_labels.columns)
    else:
        return pd.DataFrame(rocNV, index=cell_labels.columns, columns=studies)
Example #14
def Rank(A):
    '''
    Cross-sectional rank (rank within each row)
    '''
    return bk.nanrankdata(A, axis=1)
Example #15
def spearman_correlation(x, y, axis=0):
    import bottleneck as bn
    x_ranks = bn.nanrankdata(x, axis=axis)
    y_ranks = bn.nanrankdata(y, axis=axis)
    return pearson_correlation(x_ranks, y_ranks, axis=axis)
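pearson_correlation is not defined in this snippet; a minimal axis-aware sketch that could pair with it (the name and NaN behaviour are assumptions, not the original implementation):

import numpy as np

def pearson_correlation(x, y, axis=0):
    # Hypothetical companion: Pearson's r along `axis`, ignoring NaNs.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    xm = x - np.nanmean(x, axis=axis, keepdims=True)
    ym = y - np.nanmean(y, axis=axis, keepdims=True)
    cov = np.nansum(xm * ym, axis=axis)
    denom = np.sqrt(np.nansum(xm ** 2, axis=axis) * np.nansum(ym ** 2, axis=axis))
    return cov / denom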
Example #16
    def _fit(self, X, y):
        # check input params
        self._check_params(X, y)
        # setup variables for Boruta
        n_sample, n_feat = X.shape
        _iter = 1
        # holds the decision about each feature:
        # 0  - default state = tentative in original code
        # 1  - accepted in original code
        # -1 - rejected in original code
        dec_reg = np.zeros(n_feat, dtype=int)
        # counts how many times a given feature was more important than
        # the best of the shadow features
        hit_reg = np.zeros(n_feat, dtype=int)
        # these record the history of the iterations
        imp_history = np.zeros(n_feat, dtype=float)
        sha_max_history = []

        # set n_estimators
        if self.n_estimators != 'auto':
            self.estimator.set_params(n_estimators=self.n_estimators)

        # main feature selection loop
        while np.any(dec_reg == 0) and _iter < self.max_iter:
            # find optimal number of trees and depth
            if self.n_estimators == 'auto':
                # number of features that aren't rejected
                not_rejected = np.where(dec_reg >= 0)[0].shape[0]
                n_tree = self._get_tree_num(not_rejected)
                self.estimator.set_params(n_estimators=n_tree)

            # make sure we start with a new tree in each iteration
            rnd_st = np.random.randint(1, 1e6, 1)[0]
            self.estimator.set_params(random_state=rnd_st)

            # add shadow attributes, shuffle them and train estimator, get imps
            cur_imp = self._add_shadows_get_imps(X, y, dec_reg)

            # get the threshold of shadow importances we will use for rejection
            imp_sha_max = np.percentile(cur_imp[1], self.perc)

            # record importance history
            sha_max_history.append(imp_sha_max)
            imp_history = np.vstack((imp_history, cur_imp[0]))

            # register which feature is more imp than the max of shadows
            hit_reg = self._assign_hits(hit_reg, cur_imp, imp_sha_max)

            # based on hit_reg we check if a feature is doing better than
            # expected by chance
            dec_reg = self._do_tests(dec_reg, hit_reg, _iter)

            # print out confirmed features
            if self.verbose > 0 and _iter < self.max_iter:
                self._print_results(dec_reg, _iter, 0)
            if _iter < self.max_iter:
                _iter += 1

        # we automatically apply R package's rough fix for tentative ones
        confirmed = np.where(dec_reg == 1)[0]
        tentative = np.where(dec_reg == 0)[0]
        # ignore the first row of zeros
        tentative_median = np.median(imp_history[1:, tentative], axis=0)
        # which tentative to keep
        tentative_confirmed = np.where(
            tentative_median > np.median(sha_max_history))[0]
        tentative = tentative[tentative_confirmed]

        # basic result variables
        self.n_features_ = confirmed.shape[0]
        self.support_ = np.zeros(n_feat, dtype=bool)
        self.support_[confirmed] = 1
        self.support_weak_ = np.zeros(n_feat, dtype=bool)
        self.support_weak_[tentative] = 1

        # ranking, confirmed variables are rank 1
        self.ranking_ = np.ones(n_feat, dtype=int)
        # tentative variables are rank 2
        self.ranking_[tentative] = 2
        # selected = confirmed and tentative
        selected = np.hstack((confirmed, tentative))
        # all rejected features are sorted by importance history
        not_selected = np.setdiff1d(np.arange(n_feat), selected)
        # large importance values should rank higher = lower ranks -> *(-1)
        imp_history_rejected = imp_history[1:, not_selected] * -1
        # calculate ranks in each iteration, then median of ranks across feats
        iter_ranks = bn.nanrankdata(imp_history_rejected, axis=1)
        #iter_ranks = self._nanrankdata(imp_history_rejected, axis=1)
        rank_medians = np.nanmedian(iter_ranks, axis=0)
        ranks = bn.nanrankdata(rank_medians)
        #ranks = self._nanrankdata(rank_medians, axis=0)

        # update rank for not_selected features
        if not_selected.shape[0] > 0:
            # set smallest rank to 3 if there are tentative feats
            if tentative.shape[0] > 0:
                ranks = ranks - np.min(ranks) + 3
            else:
                # and 2 otherwise
                ranks = ranks - np.min(ranks) + 2
            self.ranking_[not_selected] = ranks

        # notify user
        if self.verbose > 0:
            self._print_results(dec_reg, _iter, 1)
        return self
Example #17
def ranking(x, axis=0, norm='-1,1'):
    """
    Normalized ranking treating NaN as missing and averaging ties.
    
    Parameters
    ----------
    x : ndarray
        Data to be ranked.
    axis : {int, None} optional
        Axis to rank over. Default axis is 0.
    norm: str, optional
        A string that specifies the normalization:
            ==========  ================================================
            '0,N-1'     Zero to N-1 ranking
            '-1,1'      Scale zero to N-1 ranking to be between -1 and 1
            'gaussian'  Rank data then scale to a Gaussian distribution
            ==========  ================================================
        The default ranking is '-1,1'.
        
    Returns
    -------
    idx : ndarray
        The ranked data. The dtype of the output is always float even if
        the dtype of the input is int.
    
    Notes
    -----
    If there is only one non-NaN value along the given axis, then that value
    is set to the midpoint of the specified normalization method. For example,
    if the input is array([1.0, nan]), then 1.0 is set to zero for the '-1,1'
    and 'gaussian' normalizations and is set to 0.5 (mean of 0 and 1) for the
    '0,N-1' normalization.
    
    For '0,N-1' normalization, note that N is x.shape[axis] even if there are
    NaNs. That ensures that when ranking along the columns of a 2d array, for
    example, the output will have the same min and max along all columns.
    
    """
    if axis is None:
        ranked_x = ranking(x.reshape(-1), norm=norm)
        return ranked_x.reshape(*x.shape)
    ax = axis
    if ax < 0:
        # This converts a negative axis to the equivalent positive axis
        ax = range(x.ndim)[ax]
    masknan = np.isnan(x)
    countnan = np.expand_dims(masknan.sum(ax), ax)
    countnotnan = x.shape[ax] - countnan
    idx = bn.nanrankdata(x, ax)
    idx -= 1
    if norm == '-1,1':
        idx /= (countnotnan - 1)
        idx *= 2
        idx -= 1
        middle = 0.0
    elif norm == '0,N-1':
        idx *= (1.0 * (x.shape[ax] - 1) / (countnotnan - 1))
        middle = (idx.shape[ax] + 1.0) / 2.0 - 1.0
    elif norm == 'gaussian':
        global ndtri
        if ndtri is None:
            try:
                from scipy.special import ndtri
            except ImportError:
                msg = "SciPy required for `gaussian` normalization"
                raise ImportError(msg)
        idx *= (1.0 * (x.shape[ax] - 1) / (countnotnan - 1))
        idx = ndtri((idx + 1.0) / (x.shape[ax] + 1.0))
        middle = 0.0
    else:
        msg = "norm must be '-1,1', '0,N-1', or 'gaussian'."
        raise ValueError(msg)
    np.putmask(idx, (countnotnan==1)*(~masknan), middle)
    return idx
Example #18
def spearman_rho(x, y, axis=-1):
    """Spearman rho. Pearson's r of on the rank.
    """
    rank_x = bottleneck.nanrankdata(x, axis=axis)
    rank_y = bottleneck.nanrankdata(y, axis=axis)
    return pearson_r(rank_x, rank_y, axis=axis)
Example #19
 def _rank_first(x, y):
     """Concatenates x and y and returns the rank of the
     first element along the last axes"""
     xy = np.concatenate((x[..., np.newaxis], y), axis=-1)
     return bn.nanrankdata(xy, axis=-1)[..., 0]
Example #20
def _spearman_correlation(x, y):
    x_ranks = bottleneck.nanrankdata(x, axis=-1)
    y_ranks = bottleneck.nanrankdata(y, axis=-1)
    return _pearson_correlation(x_ranks, y_ranks)