コード例 #1
0
def compute_mncp(predicted, cutoff, label):
    """
    Compute the MNCP (mean normalized conditional probability) score,
    adopted from Clarke 2003.

    MNCP is a rank-based metric similar to AUC, but it relates true
    positives to all positives and is therefore considered less affected
    by false positives.

    `predicted` is split at `cutoff`: when `label` is 1 the leading slice
    is the foreground and the trailing slice the background; for any other
    `label` the roles are swapped. The score is the mean, over foreground
    items, of the ratio between the item's rank fraction within the
    foreground and its rank fraction within the combined data.
    """
    from numpy import mean, array, hstack

    head, tail = predicted[:cutoff], predicted[cutoff:]
    if label == 1:
        fg_vals, bg_vals = head, tail
    else:
        fg_vals, bg_vals = tail, head

    fg_len = len(fg_vals)
    total_len = fg_len + len(bg_vals)

    # Anything that is not exactly an ndarray is converted to one.
    ndarray_type = type(array([]))
    if type(fg_vals) != ndarray_type:
        fg_vals = array(fg_vals)
    if type(bg_vals) != ndarray_type:
        bg_vals = array(bg_vals)

    # Ranks within the foreground alone and within foreground + background.
    fg_rank = stats.rankdata(fg_vals)
    total_rank = stats.rankdata(hstack((fg_vals, bg_vals)))

    # The foreground occupies the first fg_len positions of the stacked data.
    fg_fraction = (fg_len - fg_rank + 1) / fg_len
    total_fraction = (total_len - total_rank[:fg_len] + 1) / total_len
    return mean(fg_fraction / total_fraction)
コード例 #2
0
    def rank_texts(cls):

        """
        Get total citation counts and ranks for texts.

        Only texts flagged both `display` and `valid` are considered. Each
        entry of the result is a dict with keys:

        - "text":  the Text row (carrying its citation count as `.count`)
        - "rank":  overall rank by citation count, 1 = most cited
        - "score": dense rank divided by the highest dense rank, in (0, 1]

        Returns: list (empty when no text matches)
        """

        count = fn.Count(Citation.id)

        query = (
            Text.select(Text, count)
            .join(Citation)
            .where(Text.display == True)
            .where(Text.valid == True)
            .group_by(Text.id)
            .order_by(Text.id)
            .naive()
        )

        texts = list(query)

        # Nothing to rank; max() below would raise on an empty sequence.
        if not texts:
            return []

        counts = [t.count for t in texts]

        # Compute dense-rank ratios. Ranks for non-average methods may be
        # integers, so divide as floats to avoid floor division.
        dense_ranks = rankdata(counts, "dense")
        top_dense = float(max(dense_ranks))
        scores = [float(r) / top_dense for r in dense_ranks]

        # Compute overall ranks (#1 is most frequent).
        max_ranks = rankdata(counts, "max")
        top_max = max(max_ranks)
        ranks = [int(top_max - r + 1) for r in max_ranks]

        return [
            dict(zip(["text", "rank", "score"], row))
            for row in zip(texts, ranks, scores)
        ]
コード例 #3
0
	def get_scores(self, a, b):
		"""
		Return the difference between the normalized dense ranks of `a` and `b`.

		Each input is dense-ranked and divided by its highest rank, so both
		terms lie in (0, 1]; the result is rank(a) - rank(b) element-wise.

		If `a` is a pandas Series (or a subclass), the result is returned as
		a Series carrying `a`'s index; otherwise a NumPy array is returned.
		"""
		# Rank each input once; float dtype keeps the division exact even
		# when rankdata hands back integer ranks.
		a_ranks = np.asarray(rankdata(a, 'dense'), dtype=float)
		b_ranks = np.asarray(rankdata(b, 'dense'), dtype=float)
		to_ret = a_ranks / np.max(a_ranks) - b_ranks / np.max(b_ranks)

		if isinstance(a, pd.Series):
			return pd.Series(to_ret, index=a.index)
		return to_ret
コード例 #4
0
ファイル: test_rsa.py プロジェクト: mfalkiewicz/PyMVPA
def test_DissimilarityConsistencyMeasure():
    """Check DissimilarityConsistencyMeasure against reference values.

    Covers the default measure, the `center_data=True` variant, the
    `consistency_metric='spearman'` variant, and a rerun of the default
    measure after the dataset has been extended and re-chunked.
    Relies on a module-level `data` array defined outside this function.
    """
    # Six samples: targets 0,1,2 repeated twice, split into chunks 0 and 1.
    targets = np.tile(xrange(3),2)
    chunks = np.repeat(np.array((0,1)),3)
    # correct results
    # (hard-coded reference values for the default measure, see asserts below)
    cres1 = 0.41894348
    cres2 = np.array([[ 0.16137995, 0.73062639, 0.59441713]])
    # Expected centered result: column-mean-center each block of three rows,
    # then correlate the two blocks' correlation-distance vectors.
    dc1 = data[0:3,:] - np.mean(data[0:3,:],0)
    dc2 = data[3:6,:] - np.mean(data[3:6,:],0)
    center = squareform(np.corrcoef(pdist(dc1,'correlation'),pdist(dc2,'correlation')), 
                        checks=False).reshape((1,-1))
    # Expected Spearman result: rank the (uncentered) distance vectors first.
    dsm1 = stats.rankdata(pdist(data[0:3,:],'correlation').reshape((1,-1)))
    dsm2 = stats.rankdata(pdist(data[3:6,:],'correlation').reshape((1,-1)))

    spearman = squareform(np.corrcoef(np.vstack((dsm1,dsm2))), 
                        checks=False).reshape((1,-1))
    
    ds = dataset_wizard(samples=data, targets=targets, chunks=chunks)
    dscm = DissimilarityConsistencyMeasure()
    res1 = dscm(ds)
    dscm_c = DissimilarityConsistencyMeasure(center_data=True)
    res2 = dscm_c(ds)
    dscm_sp = DissimilarityConsistencyMeasure(consistency_metric='spearman')
    res3 = dscm_sp(ds)
    # res1..res3 must be computed before this point: the dataset is appended
    # to itself and then given 12 chunk labels (three chunks of four).
    ds.append(ds)
    chunks = np.repeat(np.array((0,1,2,)),4)
    ds.sa['chunks'] = chunks
    res4 = dscm(ds)
    assert_almost_equal(np.mean(res1.samples),cres1)
    assert_array_almost_equal(res2.samples, center)
    assert_array_almost_equal(res3.samples, spearman)
    assert_array_almost_equal(res4.samples,cres2)
コード例 #5
0
ファイル: util.py プロジェクト: vipasu/addseds
def match_number_density(dats, nd=None, mstar=None):
    """
    Cuts catalogs at a stellar mass such that the number density matches that
    found in Hearin and Watson
    """

    new_dats = defaultdict(dict)
    if nd is None:
        fiducial = get_catalog('HW')['dat']
        m_f = fiducial['dat']['mstar']
        n_f= rankdata(-m_f)/fiducial['box_size']**3
        nd = max(n_f)
        print nd
    for name, cat in dats.items():
        m = cat['dat']['mstar']
        n = rankdata(-m)/cat['box_size']**3
        m_s, n_s = zip(*sorted(zip(m,n)))

        idx = np.digitize(nd, n_s, right=True)
        ms_cut = m_s[min(idx, len(m_s)-1)]
        nd_cut = n_s[min(idx, len(n_s)-1)]
        print "Cut in ", name, " at ", ms_cut, " with nd: ", nd_cut

        d = cat['dat']
        new_cat = cat.copy()
        new_cat['cut'] = ms_cut
        new_cat['dat'] = d[d['mstar'] > ms_cut]

        new_dats[name] = new_cat
    return new_dats
コード例 #6
0
ファイル: statsrunner.py プロジェクト: MarkAWard/scatterize
    def to_dict(self):
        """
        Build a plain-dict description of the rank-transformed regression.

        The independent and dependent variables from `self.result` are
        replaced by their ranks (ties averaged) for the plotted points and
        the regression line; the raw values are kept as the two trailing
        columns of `all_point_data`.

        Returns a dict with keys: points, stats_diagnostics, all_point_data,
        all_point_cols, regression_line, group_list, x_label, y_label and
        model_type.
        """
        params = self.regression_params
        result = self.result
        good_rows = self._possible_rows()
        row_ids = self._row_id_array()[good_rows]
        group_data = self._group_data(good_rows)

        groups = group_data['group_array'][good_rows]
        iv = result['iv']
        dv = result['dv']
        weights = self._non_censored_mask()[good_rows]
        logger.debug(repr(iv.shape))
        logger.debug(repr(weights.shape))

        # Work on ranks rather than raw values.
        xvals = ss.rankdata(iv)
        yvals = ss.rankdata(dv)
        regression_line = self._estimate_regression_line(xvals, yvals)
        logger.debug(xvals)
        logger.debug(yvals)

        points = np.column_stack((row_ids, xvals, yvals, weights, groups))
        all_point_data = np.column_stack((points, iv, dv))

        # Fetch the column names once; log the names, not the bound method.
        col_names = self._all_point_cols()
        logger.debug(col_names)
        x_label, y_label = self._x_y_labels()

        return dict(
            points=points.tolist(),
            stats_diagnostics=self.diagnostics_list(),
            all_point_data=all_point_data.tolist(),
            all_point_cols=col_names,
            regression_line=regression_line,
            group_list=group_data['group_list'],
            x_label=x_label,
            y_label=y_label,
            model_type=params.model_type)
0
    def _call(self, dataset):
        """Correlate behavioral values with neural dissimilarity across chunks.

        For every unique chunk, two neural dissimilarities are computed as
        1 - correlation between the samples of the two targets named in
        `self.targ_comp1` and in `self.targ_comp2`. These are paired with
        the per-chunk values in `self.xSs_behav1` / `self.xSs_behav2` and
        correlated across all chunk/comparison pairs, using Spearman (ranks)
        when `self.comparison_metric == 'spearman'` and Pearson otherwise.

        Returns a Dataset holding the Fisher z-transformed (arctanh)
        correlation coefficient.

        Raises ValueError if the dataset has fewer than two chunks.
        """

        chunks_attr = self.chunks_attr
        chunk_ids = np.unique(dataset.sa[chunks_attr])
        if len(chunk_ids) < 2:
            # ValueError is a StandardError subclass on Python 2 and, unlike
            # StandardError itself, also exists on Python 3.
            raise ValueError("This measure calculates similarity consistency across "
                             "chunks and is not meaningful for datasets with only "
                             "one chunk:")

        # Neural dissimilarity between the compared targets, per chunk.
        # Keys are the chunk id suffixed with '1' / '2' for the comparison.
        # NOTE(review): the selection below uses `sa.chunks` rather than
        # `chunks_attr` - confirm this is intended when chunks_attr differs.
        neur_sim = {}
        comparisons = (('1', self.targ_comp1), ('2', self.targ_comp2))
        for s in chunk_ids:
            ds_s = dataset[dataset.sa.chunks == s]
            for suffix, targ_pair in comparisons:
                first = ds_s[ds_s.sa.targets == targ_pair[0]]
                second = ds_s[ds_s.sa.targets == targ_pair[1]]
                neur_sim[s + suffix] = 1 - np.corrcoef(first, second)[0][1]

        # Combine the two behavioral dicts under the same suffixed keys.
        xSs_behav = {}
        for s in self.xSs_behav1:
            xSs_behav[s+'1'] = self.xSs_behav1[s]
        for s in self.xSs_behav2:
            xSs_behav[s+'2'] = self.xSs_behav2[s]

        # Column 0: behavioral value, column 1: neural dissimilarity.
        behav_neur = np.array([[xSs_behav[s], neur_sim[s]] for s in neur_sim])

        # Correlate behavior with neural dissimilarity between subjects.
        # The Pearson call sits in the else branch so that it no longer
        # overwrites the Spearman result.
        if self.comparison_metric == 'spearman':
            xSs_corr = pearsonr(rankdata(behav_neur[:,0]), rankdata(behav_neur[:,1]))
        else:
            xSs_corr = pearsonr(behav_neur[:,0], behav_neur[:,1])

        # Fisher z-transformed r; xSs_corr[1] holds the p value if wanted.
        return Dataset(np.array([np.arctanh(xSs_corr[0])]))
コード例 #8
0
ファイル: npc.py プロジェクト: statlab/permute
def t2p(distr, alternative="greater", plus1=True):
    r"""
    Use the empirical distribution of a test statistic to compute
    p-values for every value in the distribution.

    For a value :math:`x` among the :math:`B` entries of `distr`:

    * ``'greater'``: :math:`(\#\{d \ge x\} + plus1) / (B + plus1)`
    * ``'less'``: :math:`(\#\{d \le x\} + plus1) / (B + plus1)`
    * ``'two-sided'``: twice the smaller of the two, capped at 1

    Tied values are counted on both sides, so they share the same p-value.

    Parameters
    ----------
    distr : array_like
        Empirical distribution of statistic
    alternative : {'greater', 'less', 'two-sided'}
        The alternative hypothesis to test (default is 'greater')
    plus1 : bool
        flag for whether to add 1 to the numerator and denominator of the
        p-value based on the empirical permutation distribution.
        Default is True.

    Returns
    -------
    ndarray
        the estimated p-value for each entry of `distr`

    Raises
    ------
    ValueError
        If `alternative` is not one of the three supported values.
    """

    if alternative not in ('greater', 'less', 'two-sided'):
        raise ValueError('Bad alternative')
    B = len(distr)
    # Float denominator: ranks may be integers, avoid floor division.
    denom = float(B + plus1)
    if alternative != "less":
        # B - min_rank + 1 is the number of entries >= x. plus1 is added
        # exactly once, so the result can never exceed 1.
        n_at_least = B - rankdata(distr, method="min") + 1
        pupper = (n_at_least + plus1) / denom
        pvalue = pupper
    if alternative != "greater":
        # The max rank is the number of entries <= x, ties included.
        n_at_most = rankdata(distr, method="max")
        plower = (n_at_most + plus1) / denom
        pvalue = plower
    if alternative == "two-sided":
        pvalue = np.min([np.ones(B), 2 * np.min([plower, pupper], 0)], 0)
    return pvalue
コード例 #9
0
def plot_predicted_vs_observed_pks(obs_score, pred_score, ofname):
    """
    Save a 2-D histogram heatmap of observed versus predicted scores.

    The x axis is ``-log(1 + obs_score)`` clipped to [-0.1, 0] and the y
    axis is ``log(1 + pred_score)`` clipped to [0, 0.1], binned 100 x 100.
    The figure is written to `ofname` and then closed.

    A rank-based 20-bin histogram used to be computed first, but its result
    was overwritten before use; that dead computation has been removed.
    """
    plt.figure()
    heatmap, xedges, yedges = numpy.histogram2d(
        numpy.clip(-numpy.log(1+obs_score), -0.1, 0),
        numpy.clip(numpy.log(1+pred_score), 0, 0.1),
        bins=100)
    # Axis extent for imshow comes from the outermost histogram bin edges.
    extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
    plt.clf()
    plt.imshow(heatmap, extent=extent)
    plt.savefig(ofname)
    plt.close()
コード例 #10
0
ファイル: npc.py プロジェクト: ml-ai-nlp-ir/permute
def t2p(distr, alternative="greater"):
    '''
    Use the empirical distribution of a test statistic to compute
    p-values for every value in the distribution.

    For a value x among the B entries of `distr`, the 'greater' p-value is
    the fraction of entries >= x, the 'less' p-value is the fraction of
    entries <= x, and the 'two-sided' p-value is twice the smaller of the
    two, capped at 1. Tied values are counted on both sides.

    Parameters
    ----------
    distr : array_like
        Empirical distribution of statistic
    alternative : {'greater', 'less', 'two-sided'}
        The alternative hypothesis to test (default is 'greater')

    Returns
    -------
    ndarray
        the estimated p-value for each entry of `distr`
    '''

    n_entries = len(distr)
    # Float denominator: ranks may be integers, avoid floor division.
    B = float(n_entries)
    if alternative != "less":
        # B - min_rank + 1 is the number of entries >= x.
        pupper = (B - rankdata(distr, method="min") + 1) / B
        pvalue = pupper
    if alternative != "greater":
        # The max rank is the number of entries <= x, ties included.
        plower = rankdata(distr, method="max") / B
        pvalue = plower
    if alternative == "two-sided":
        pvalue = np.min([np.ones(n_entries), 2 * np.min([plower, pupper], 0)], 0)
    return pvalue
コード例 #11
0
ファイル: test_rank.py プロジェクト: BranYang/scipy
 def test_empty(self):
     """stats.rankdata([]) should return an empty array."""
     a = np.array([], dtype=int)
     r = rankdata(a)
     assert_array_equal(r, np.array([], dtype=np.float64))
     r = rankdata([])
     assert_array_equal(r, np.array([], dtype=np.float64))
コード例 #12
0
def rdm_similarity(ref_rdms, rdm, similarity_type='spearman', computation_method=None,
                   rdm_as_list=False):
    """
    Compute the Spearman similarity between `rdm` and each reference RDM.

    Parameters
    ----------
    ref_rdms : ndarray
        Reference RDMs, one flattened RDM per row (a single 1-D RDM is
        promoted to one row).
    rdm : ndarray
        The RDM to compare. Flattened to a single row unless `rdm_as_list`
        is True, in which case each row is treated as a separate RDM.
    similarity_type : str
        can only be 'spearman' now.
    computation_method : str or None
        'spearmanr' or 'rankdata+cdist'; both apply to Spearman only, so it
        must match `similarity_type`. Defaults to 'spearmanr' for Spearman.
    rdm_as_list : bool
        Treat `rdm` as a stack of RDMs (supported by 'rankdata+cdist' only).

    Returns
    -------
    ndarray
        1-D array with one similarity per reference RDM when `rdm_as_list`
        is False; otherwise a 2-D array of shape (n_rdm, n_ref).

    Raises
    ------
    ValueError
        If `computation_method` is not supported.
    """
    if computation_method is None and similarity_type == 'spearman':
        computation_method = 'spearmanr'

    ref_rdms = np.atleast_2d(np.asarray(ref_rdms))
    # asarray also copes with inputs such as a tuple of things.
    rdm = np.asarray(rdm)
    assert type(rdm) == np.ndarray and type(ref_rdms) == np.ndarray
    assert ref_rdms.ndim == 2
    if not rdm_as_list:
        # a single RDM becomes one row: 1 x N
        rdm = np.atleast_2d(rdm.ravel())
    assert rdm.ndim == 2
    assert rdm.shape[1] == ref_rdms.shape[1]

    if computation_method == 'spearmanr':
        assert similarity_type == 'spearman'
        assert not rdm_as_list, 'only supporting one by one!'
        correlation = spearmanr(ref_rdms, rdm, axis=1).correlation
        if ref_rdms.shape[0] >= 2:
            # last row of the matrix = rdm against every reference
            rdm_similarities = correlation[-1, :-1]
        else:
            # with a single reference spearmanr returns a scalar instead
            rdm_similarities = np.atleast_1d(correlation)
    elif computation_method == 'rankdata+cdist':
        assert similarity_type == 'spearman'
        # rank-transform first, then Pearson via the correlation distance
        ref_rdms_ranked = np.array([rankdata(ref_row) for ref_row in ref_rdms])
        if rdm_as_list:
            rdm_ranked = np.array([rankdata(rdm_row) for rdm_row in rdm])
        else:
            rdm_ranked = np.atleast_2d(rankdata(rdm.ravel()))
        rdm_similarities = 1 - cdist(rdm_ranked, ref_rdms_ranked, 'correlation')
    else:
        raise ValueError('unsupported computation method {}'.format(computation_method))

    # 1-D unless rdm_as_list, in which case it must be 2-D.
    assert rdm_similarities.ndim in (1, 2)
    if rdm_as_list:
        assert rdm_similarities.ndim == 2
    else:
        rdm_similarities = rdm_similarities.ravel()

    return rdm_similarities
コード例 #13
0
ファイル: weirdTest.py プロジェクト: mbarakatt/dist
	def spear(self, xs):
		"""
		Return 1 minus the sum of squared differences between the ranks of
		`xs` and a capped rank vector for the y data.

		The capped ranks are the element-wise minimum of (a) the ranks of
		`self.distRelated`, pushed up by LARGE_NUMBER wherever
		`self.maskDistJaccard` is False, and (b) the ranks of `xs` pushed up
		by LARGE_NUMBER wherever the mask is True, shifted by
		`self.nb_y_below_t`. Vector (a) is stored on `self.y_temp` as a
		side effect.
		"""
		mask = self.maskDistJaccard
		self.y_temp = rankdata(self.distRelated) + (~mask)*LARGE_NUMBER
		shifted_x_ranks = rankdata((mask) * LARGE_NUMBER + xs)  + self.nb_y_below_t
		yRanks = np.amin([self.y_temp, shifted_x_ranks], axis=0)
		squared_diffs = np.power(rankdata(xs) - yRanks, 2)
		return 1 - np.sum(squared_diffs)
コード例 #14
0
ファイル: correlate.py プロジェクト: tisimst/mcerp
def induce_correlations(data, corrmat):
    """
    Induce a set of correlations on a column-wise dataset

    The values within each column are only re-ordered, never changed, so
    every column keeps its original set of values.

    Parameters
    ----------
    data : 2d-array
        An m-by-n array where m is the number of samples and n is the
        number of independent variables, each column of the array corresponding
        to each variable. NOTE: this array is modified in place.
    corrmat : 2d-array
        An n-by-n array that defines the desired correlation coefficients
        (between -1 and 1). Note: the matrix must be symmetric and
        positive-definite in order to induce.

    Returns
    -------
    data : 2d-array
        The same m-by-n array object that was passed in, with the rows of
        each column re-ordered to have the desired correlations.

    """
    # Create an rank-matrix
    # (each column is ranked independently of the others)
    data_rank = np.vstack([rankdata(datai) for datai in data.T]).T

    # Generate van der Waerden scores
    # rank / (m + 1) lies strictly inside (0, 1), so the normal ppf is finite
    data_rank_score = data_rank / (data_rank.shape[0] + 1.0)
    data_rank_score = norm(0, 1).ppf(data_rank_score)

    # Calculate the lower triangular matrix of the Cholesky decomposition
    # of the desired correlation matrix
    p = chol(corrmat)

    # Calculate the current correlations
    # (rowvar=0: columns are the variables, rows the observations)
    t = np.corrcoef(data_rank_score, rowvar=0)

    # Calculate the lower triangular matrix of the Cholesky decomposition
    # of the current correlation matrix
    q = chol(t)

    # Calculate the re-correlation matrix
    s = np.dot(p, np.linalg.inv(q))

    # Calculate the re-sampled matrix
    new_data = np.dot(data_rank_score, s.T)

    # Create the new rank matrix
    new_data_rank = np.vstack([rankdata(datai) for datai in new_data.T]).T

    # Sort the original data according to new_data_rank
    for i in range(data.shape[1]):
        # Stack the old and new ranks of this column and map every rank to
        # its index among the sorted unique values (return_inverse).
        vals, order = np.unique(
            np.hstack((data_rank[:, i], new_data_rank[:, i])), return_inverse=True
        )
        # First m entries belong to the old ranks, last m to the new ranks.
        old_order = order[: new_data_rank.shape[0]]
        new_order = order[-new_data_rank.shape[0] :]
        # Order the column by its old rank, then pick values by new rank.
        tmp = data[np.argsort(old_order), i][new_order]
        # Written back into the caller's array (in-place modification).
        data[:, i] = tmp[:]

    return data
コード例 #15
0
ファイル: test_hbos.py プロジェクト: flaviassantos/pyod
    def test_predict_rank(self):
        pred_socres = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is reserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_socres), atol=2)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)
コード例 #16
0
ファイル: test_rank.py プロジェクト: BranYang/scipy
 def test_one(self):
     """Check stats.rankdata with an array of length 1."""
     data = [100]
     a = np.array(data, dtype=int)
     r = rankdata(a)
     assert_array_equal(r, np.array([1.0], dtype=np.float64))
     r = rankdata(data)
     assert_array_equal(r, np.array([1.0], dtype=np.float64))
コード例 #17
0
def getKeywords(cityA,cityB,nid_A,nid_B):
    """
    Return up to 20 features with the highest combined PMI rank, plus weights.

    The feature scores `cityA.cat_pmi[nid_A]` and `cityB.cat_pmi[nid_B]`
    are ranked separately and the ranks summed. The returned words are the
    features with the largest rank sums, best first. The returned weights
    are the 20 largest values of ``2 * len(features) - rank_sum``, in
    descending order.

    NOTE(review): the weights are sorted independently of the words, so
    weight k is not derived from word k - confirm this is intended.
    """
    top_n = 20
    vocabulary = cityA.features
    rank_sum = rankdata(cityA.cat_pmi[nid_A]) + rankdata(cityB.cat_pmi[nid_B])

    best_first = rank_sum.argsort()[::-1]
    good_words = [vocabulary[idx] for idx in best_first[:top_n]]

    descending = np.sort(len(vocabulary)*2.0 - rank_sum)[::-1]
    weights = descending[:top_n]
    return good_words,weights
コード例 #18
0
ファイル: test_lof.py プロジェクト: flaviassantos/pyod
    def test_predict_rank_normalized(self):
        pred_socres = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is reserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_socres), atol=2)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)
コード例 #19
0
ファイル: rw.py プロジェクト: rybak/xyz
def mutualInformation(A, B):
    """
    Estimate a dependence score between `A` and `B` from their rank points.

    Both inputs are rank-transformed and scaled by their lengths, paired
    into two-column points and passed to `evalNearest`. The result is
    divided by `Gamma` and by ``len(A) ** alpha``, log-transformed and
    divided by ``1 - alpha``.

    NOTE(review): `evalNearest`, `Gamma` and `alpha` are defined outside
    this function; their exact meaning is not visible here.
    """
    n_samples = len(A)
    rank_points = np.transpose(
        np.array([rankdata(A) / n_samples, rankdata(B) / len(B)]))
    nearest = evalNearest(rank_points)
    scaled = nearest / Gamma / np.power(n_samples, alpha)
    return np.log(scaled) / (1 - alpha)
コード例 #20
0
ファイル: pcorr.py プロジェクト: mrgeorge/transformers
def pcorr(x,y,z,method):
    """
    Compute the partial correlation coefficient between x and y given z.

    This is a port of the "var-covar" method pcor.mat written in R at
    http://www.yilab.gatech.edu/pcor.R
    See http://www.yilab.gatech.edu/pcor.html for more info.

    Parameters
    ----------
    x, y : n-element arrays
    z : m x n array
        One row per control variable.
    method : {'p', 's'}
        'p' for Pearson, 's' for Spearman rank.

    Returns
    -------
    (coeff, pvalue, significance)
        The partial correlation coefficient, followed by the two values
        returned by pcorr_pvalue(coeff, nData, nControl).

    Raises
    ------
    ValueError
        If `method` is not 'p' or 's', or if the covariance matrix of the
        control variables has a negative eigenvalue.

    Notes
    -----
    Some of the original naming conventions are kept, e.g. Sxx is the
    covariance between x and y. Also note that cov returns different
    things in R and in numpy.
    """
    nData = len(x)
    nControl = (np.shape(z))[0]  # number of control variables

    # The only difference between Pearson and Spearman is that
    # we rank the variables first for Spearman. In the large-N
    # limit they should also have the same null distribution (for pvalue)
    if method == 'p':  # pearson
        xc = x
        yc = y
        zc = z
    elif method == 's':  # spearman rank
        xc = stats.rankdata(x)
        yc = stats.rankdata(y)
        # Explicit float buffer: zeros_like(z) would inherit an integer
        # dtype from z and truncate the fractional ranks of tied values.
        zc = np.zeros(np.shape(z), dtype=float)
        for ii in range(nControl):
            zc[ii] = stats.rankdata(z[ii])
    else:
        # Used to print a message and carry on into a NameError.
        raise ValueError("Error in pcorr: must specify method")

    Sxx = np.cov(xc, yc)

    Sxz = np.zeros((2, nControl))
    Szz = np.zeros((nControl, nControl))
    for ii in range(nControl):
        Sxz[0, ii] = (np.cov(xc, zc[ii]))[0, 1]
        Sxz[1, ii] = (np.cov(yc, zc[ii]))[0, 1]
        for jj in range(nControl):
            Szz[ii, jj] = (np.cov(zc[ii], zc[jj]))[0, 1]

    # Check that Szz is positive definite before inverting. Szz is a
    # symmetric covariance matrix, so eigvalsh returns real eigenvalues.
    if np.min(np.linalg.eigvalsh(Szz)) < 0:
        raise ValueError("Error in pcorr: Szz is not positive definite")

    SzzInv = np.linalg.inv(Szz)
    Sxxz = Sxx - np.dot(np.dot(Sxz, SzzInv), Sxz.T)

    coeff = Sxxz[0, 1]/(np.sqrt(Sxxz[0, 0])*np.sqrt(Sxxz[1, 1]))

    pvalue, significance = pcorr_pvalue(coeff, nData, nControl)

    return (coeff, pvalue, significance)
コード例 #21
0
def process_file(args):
    """
    Filter, weight and export the table read from `args.infile`.

    The table is read as tab-separated text and narrowed step by step
    according to the options on `args` (paralog removal, minimum length,
    offset restrictions, sequence filters, 3' trimming). A 'WEIGHT' column
    is then derived from 'COUNT', optionally normalized by the number of
    mappings and/or replaced by its ranks. Finally either nucleotide
    frequencies of the 3'-of-cleavage sequences or the filtered table
    itself are written, tab-separated, to `args.outfile`.
    """
    df = pd.read_table(args.infile, index_col=None)
    if args.drop_paralogs:
        df = drop_all_paralogs(df)
    if args.min_length:
        df = df[df['ORIGINAL_SEQUENCE_LENGTH'] >= args.min_length]
    if args.restrict_to_zero:
        df = df[df['OFFSET_FROM_START'] == 0]
    if args.restrict_around:
        df = df[(df['OFFSET_FROM_START'] >= args.restrict_around[0]) &
                (df['OFFSET_FROM_START'] <= args.restrict_around[1])]
    if args.remove_sequence:
        df = df[df['ORIGINAL_SEQUENCE'] != args.remove_sequence]
    if args.endswith:
        criterion = df['ORIGINAL_SEQUENCE'].map(
            lambda x: x.endswith(args.endswith))
        df = df[criterion]
    if args.threeprime_startswith:
        df = df[df['THREEPRIME_OF_CLEAVAGE'].map(
            lambda x: x.startswith(args.threeprime_startswith))]
    if args.three_prime_trimmed is not None:
        # An empty string selects the rows that were not trimmed at all.
        if args.three_prime_trimmed == '':
            df = df[df['3PTRIMMED'].isnull()]
        else:
            df = df[df['3PTRIMMED'] == args.three_prime_trimmed]
    if args.normalize_by_num_maps:
        df = calculate_num_times_map(df)
        df['WEIGHT'] = df['COUNT'] / df['NUM_TIMES_MAP']
        df['WEIGHT'] = df['WEIGHT'].fillna(0)
    if args.use_rank:
        # Rank the normalized weight if present, the raw count otherwise.
        if 'WEIGHT' in df.columns:
            df['WEIGHT'] = rankdata(df['WEIGHT'])
        else:
            df['WEIGHT'] = rankdata(df['COUNT'])
    if 'WEIGHT' not in df.columns:
        df['WEIGHT'] = df['COUNT']
    # combine upstream and downstream into one sequence
    if args.combine_sequences:
        left = df['ORIGINAL_SEQUENCE'].map(lambda x: x[args.combine_sequences[0]:])
        right = df['THREEPRIME_OF_CLEAVAGE'].map(lambda x: x[:args.combine_sequences[1]])
        df['FINAL_SEQUENCE'] = left + right
    # get the selected columns
    if 'all' not in args.columns:
        df = df[args.columns]

    if args.threeprime_of_cleavage:
        # get rid of empty and null sequences
        nucs = df[['THREEPRIME_OF_CLEAVAGE', 'COUNT']]
        nucs = nucs[nucs['THREEPRIME_OF_CLEAVAGE'] != '']
        # Element-wise negation: `not series` raises ValueError because the
        # truth value of a Series is ambiguous.
        nucs = nucs[~nucs['THREEPRIME_OF_CLEAVAGE'].isnull()]
        export = explode_series(nucs['THREEPRIME_OF_CLEAVAGE'])
        export = nucleotide_frequencies(export, count_series=nucs['COUNT'],
                                        normalize=True, ignore_ns=args.ignore_ns)
        export.to_csv(args.outfile, sep='\t')
    else:
        df.to_csv(args.outfile, sep='\t', index=False)
コード例 #22
0
 def train(self, features, labels):
     """
     Train the wrapped base learner on rank-transformed labels.

     With `self.axis == 0` each row of `labels` is ranked on its own;
     otherwise each column is ranked. Positions that are NaN in `labels`
     are reset to NaN in the ranked labels. Returns whatever
     `self.base.train(features, ranked_labels)` returns.
     """
     from scipy import stats
     if self.axis == 0:
         rlabels = np.array([stats.rankdata(ells) for ells in labels])
     else:
         # Rank along columns: iterate the transpose, then transpose back.
         rlabels = np.array([stats.rankdata(ells) for ells in labels.T])
         rlabels = rlabels.T
     rlabels[np.isnan(labels)] = np.nan
     return self.base.train(features, rlabels)
コード例 #23
0
ファイル: Scalers.py プロジェクト: JasonKessler/scattertext
def scale_neg_1_to_1_with_zero_mean_rank_abs_max(v):
	"""
	Rank-scale the positive and negative halves of ``v * 2 - 1`` separately.

	`v` is first mapped through ``v * 2 - 1`` (presumably taking scores in
	[0, 1] to [-1, 1] - confirm against callers). Positive entries are
	replaced by their dense rank divided by the highest positive rank.
	Negative entries are replaced by ``-(1 - r)``, where ``r`` is their
	dense rank divided by the highest negative rank. Entries equal to zero
	are left untouched. The result is passed through
	scale_neg_1_to_1_with_zero_mean_abs_max.

	A side with no entries is skipped; calling ``.max()`` on the empty
	rank array used to raise for inputs with no positive or no negative
	values.
	"""
	rankv = v * 2 - 1
	# Build both masks before any assignment so they describe the
	# unmodified values.
	pos_mask = rankv > 0
	neg_mask = rankv < 0

	pos_v = None
	if pos_mask.any():
		pos_v = rankdata(rankv[pos_mask], 'dense')
		# float() guards against floor division on integer ranks
		pos_v = pos_v / float(pos_v.max())

	neg_v = None
	if neg_mask.any():
		neg_v = rankdata(rankv[neg_mask], 'dense')
		neg_v = neg_v / float(neg_v.max())

	if pos_v is not None:
		rankv[pos_mask] = pos_v
	if neg_v is not None:
		rankv[neg_mask] = - (neg_v.max() - neg_v)

	return scale_neg_1_to_1_with_zero_mean_abs_max(rankv)
コード例 #24
0
ファイル: test_rank.py プロジェクト: BranYang/scipy
    def test_large_int(self):
        data = np.array([2**60, 2**60+1], dtype=np.uint64)
        r = rankdata(data)
        assert_array_equal(r, [1.0, 2.0])

        data = np.array([2**60, 2**60+1], dtype=np.int64)
        r = rankdata(data)
        assert_array_equal(r, [1.0, 2.0])

        data = np.array([2**60, -2**60+1], dtype=np.int64)
        r = rankdata(data)
        assert_array_equal(r, [2.0, 1.0])
コード例 #25
0
ファイル: windowSize.py プロジェクト: rjeschmi/PePr
def estimate_normalization_constant(readData, parameter): 
    '''Estimate the normalization constant for all samples.

    The constants are stored in readData.normalization_constant, keyed by
    file name; nothing is returned.

    parameter.normalization selects the mode:
      "NO"   - every sample gets the constant 1.0
      "YES"  - constants are estimated from read counts in 1kb windows
      other  - a comma-separated list of constants, one per sample, in the
               order of readData.filename_list (an Exception is raised if
               the number of constants does not match)
    '''
    # The user told PePr not to estimate the normalization constants.
    if parameter.normalization != "YES":
        n_samples = len(readData.filename_list)
        if parameter.normalization == "NO":
            norm_constants = [1.0]*n_samples
        else:
            supplied = parameter.normalization.strip().split(',')
            if len(supplied) != n_samples:
                raise Exception(
                    'The number of normalization constants does\n'
                    '                not match the number of samples. Quiting..')
            norm_constants = [float(value) for value in supplied]
        for sample, constant in zip(readData.filename_list, norm_constants):
            readData.normalization_constant[sample] = constant
        return

    # Split the genome into 1kb windows and count the reads per window.
    window = 1000
    array_dict = {}
    for sample in readData.filename_list:
        # Seed with an empty array so the result is well-defined even
        # when there are no chromosomes.
        per_chrom = [numpy.array([], dtype=numpy.float64)]
        for chrom in readData.chr_list:
            n_windows = int(readData.chr_length_dict[chrom]/window)
            counts = numpy.zeros(n_windows, dtype=numpy.float64)
            for position in readData.data_dict[chrom][sample]:
                # Reads beyond the last full window are ignored.
                try:
                    counts[int(position/window)] += 1
                except IndexError:
                    pass
            per_chrom.append(counts)
        array_dict[sample] = numpy.concatenate(per_chrom)

    # Create a mixed chip sample and use it as the reference.
    mixed_chip_array = numpy.array([], dtype=numpy.float64)
    for idx, chip in enumerate(readData.chip_filename_list):
        chip_counts = array_dict[chip]
        if idx == 0:
            mixed_chip_array = chip_counts.copy()
            rep_rank_sum = rankdata(-chip_counts)
        else:
            mixed_chip_array += chip_counts
            rep_rank_sum += rankdata(-chip_counts)
    mixed_chip_array /= len(readData.chip_filename_list)

    # Estimate the input normalization constant using NCIS.
    for input_name in readData.input_filename_list:
        norm_constant = estimate_input_normalization(
                mixed_chip_array, array_dict[input_name])
        readData.normalization_constant[input_name] = norm_constant
        debug("The scaling factor for %s is %s", input_name, norm_constant)

    # Estimate the chip normalization constant using the modified TMM method.
    for chip in readData.chip_filename_list:
        norm_constant = estimate_chip_normalization(
                mixed_chip_array, array_dict[chip], rep_rank_sum)
        readData.normalization_constant[chip] = norm_constant
        debug("The scaling factor for %s is %s", chip, norm_constant)
    return
コード例 #26
0
def plotScores(ax, scores, guideFreqs, scoreType, annotate, diam, doLegend=False):
    """
    Create a scatter plot of one score type against modification frequency.

    Parameters
    ----------
    ax : axes-like object
        Must provide scatter, set_xlim, plot and annotate.
    scores : dict
        Maps an extended sequence to a dict of score type -> score.
    guideFreqs : dict
        Maps an extended sequence to a (guideName, modFreq) pair.
    scoreType : str
        Which score to plot on the x axis.
    annotate, doLegend : unused
        Accepted for interface compatibility; not read by this function.
    diam : number
        Marker size passed to ax.scatter.

    Returns
    -------
    float
        The Spearman correlation between scores and frequencies.

    The plot also gets a linear regression line and an annotation with the
    Pearson and Spearman coefficients; the Pearson result of the
    regression is printed.
    """
    regrX = []
    regrY = []
    plotX = []
    plotY = []

    # .items() works on both Python 2 and Python 3 dicts.
    for extSeq, (guideName, modFreq) in guideFreqs.items():
        y = modFreq
        x = scores[extSeq][scoreType]

        # the statistics always use the un-jittered values
        regrX.append(x)
        regrY.append(y)

        # just for plot: adding jitter for a scoretype with many identical scores
        if scoreType.startswith('final'):
            x -= random.random()*0.25

        plotX.append(x)
        plotY.append(y)

    ax.scatter(plotX, plotY, alpha=.5, marker="o", s=diam, linewidth=0)

    if scoreType in ["wang", "wangOrig"]:
        ax.set_xlim(0, 1.0)
    elif scoreType in ["doench"]:
        ax.set_xlim(0, 100)
    elif scoreType=="chariRank":
        ax.set_xlim(0, 100.0)

    slope, intercept, r_value, p_value, std_err = linregress(regrX,regrY)
    # Single pre-formatted string: prints the same text on Python 2 and 3.
    print("score type %s: Pearson R %f, P %f" % (scoreType, r_value, p_value))
    line = slope*np.asarray(regrX)+intercept
    ax.plot(regrX,line, linestyle='-', color="orange")

    pearR, pearP = pearsonr(regrX, regrY)
    spearR, spearP = spearmanr(rankdata(regrX), rankdata(regrY))
    ret = spearR

    ax.annotate(r'Pearson R = %0.3f (p %0.3f)' % (pearR, pearP) + '\n' + r'Spearman $\rho$ = %0.3f (p %0.3f)' % (spearR, spearP), xy=(0.40,0.06), fontsize=9, xycoords='axes fraction')

    return ret
コード例 #27
0
ファイル: qq.py プロジェクト: hippozhu/dcs
  def rank_all(self):
    """
    Rank competing methods against each other, one column per score entry.

    Four comparisons are ranked: the lec, lmnn and combine groups (the
    reference scores, that group's best-des scores and its best scores)
    and an overall group with the reference, des, lec, lmnn and combine
    best scores. Within each column the highest score gets rank 1 and
    ties share the lowest rank.

    Returns (lec_rank, lmnn_rank, combine_rank, all_rank), each with one
    row per compared score vector.
    """
    def rank_columns(*score_rows):
      stacked = np.vstack(score_rows)
      # Negate so that the largest score receives rank 1.
      return np.hstack([rankdata(column, 'min')[:, None] for column in -stacked.T])

    lec_rank = rank_columns(self.refs, self.best_lec_des, self.lec_scores_best)
    lmnn_rank = rank_columns(self.refs, self.best_lmnn_des, self.lmnn_scores_best)
    combine_rank = rank_columns(self.refs, self.best_combine_des, self.combine_scores_best)
    all_rank = rank_columns(self.refs, self.des_scores_best, self.lec_scores_best,
                            self.lmnn_scores_best, self.combine_scores_best)
    return lec_rank, lmnn_rank, combine_rank, all_rank
コード例 #28
0
ファイル: misc.py プロジェクト: chrisburr/copulae
def rank_data(obs: np.ndarray, axis=0, ties='average'):
    """
    Assign ranks to data, dealing with ties appropriately. This function
    works on both vectors (1-D) and matrices (2-D).

    Parameters
    ----------
    obs: ndarray
        Data to be ranked. Can only be 1 or 2 dimensional.
    axis: {0, 1}, optional
        Only used for 2-D data. With 0 (the default) the values within each
        row are ranked against each other; with any other value the values
        within each column are ranked against each other.
    ties: str, default 'average'
        The method used to assign ranks to tied elements. The options are
        'average', 'min', 'max', 'dense' and 'ordinal'.
        'average': The average of the ranks that would have been assigned
            to all the tied values is assigned to each value.
        'min': The minimum of the ranks that would have been assigned to
            all the tied values is assigned to each value. (This is also
            referred to as "competition" ranking.)
        'max': The maximum of the ranks that would have been assigned to
            all the tied values is assigned to each value.
        'dense': Like 'min', but the rank of the next highest element is
            assigned the rank immediately after those assigned to the tied
            elements.
        'ordinal': All values are given a distinct rank, corresponding to
            the order that the values occur in the data.

    Returns
    -------
    ndarray
        Vector or matrix of ranks with the same shape as `obs`.

    Raises
    ------
    ValueError
        If `obs` has more than 2 dimensions (or none).
    """
    obs = np.asarray(obs)

    if obs.ndim not in (1, 2):
        raise ValueError('Can only rank data which is 1 or 2 dimensions')

    if obs.ndim == 1:
        return stats.rankdata(obs, ties)

    if axis == 0:
        # rank within each row
        return np.array([stats.rankdata(row, ties) for row in obs])

    # rank within each column, then restore the original orientation
    return np.array([stats.rankdata(column, ties) for column in obs.T]).T
コード例 #29
0
ファイル: data_consistency.py プロジェクト: feiyu1990/intern
def kendall_w(list_all):
    """Compute Kendall's coefficient of concordance W, corrected for ties.

    Parameters
    ----------
    list_all : mapping
        Container that is iterated for its keys and indexed with them
        (``list_all[key]``), e.g. a dict mapping each rater to the scores it
        gave to the same ``n`` items, in the same item order.

    Returns
    -------
    float
        ``(12 * sum(R_i**2) - 3 * m**2 * n * (n + 1)**2)
        / (m**2 * (n**3 - n) - m * T)`` where ``R_i`` is the rank sum of item
        ``i`` over all raters, ``m`` the number of raters, ``n`` the number of
        items and ``T`` the sum of ``t**3 - t`` over every group of ``t`` tied
        ranks of every rater.

    Raises
    ------
    ValueError
        If `list_all` is empty.
    """
    if len(list_all) == 0:
        raise ValueError('kendall_w needs at least one set of ratings')

    rank_all = []
    corrections = 0
    for key in list_all:
        # Rank once and reuse the result for both the rank sums and the tie
        # correction.
        rater_ranks = rankdata(list_all[key])
        rank_all.append(rater_ranks)
        # Equal ranks mark tied items; untied items contribute 1**3 - 1 == 0.
        tie_sizes = Counter(rater_ranks).values()
        # Built-in sum: np.sum() on a generator is deprecated.
        corrections += sum(t ** 3 - t for t in tie_sizes)

    rank_sums = np.sum(rank_all, axis=0)
    S_prime = np.sum(rank_sums ** 2)
    m = len(list_all)
    n = len(rank_sums)
    W = (12 * S_prime - 3 * m**2 * n * (n + 1)**2) / (m**2 * (n**3 - n) - m * corrections)
    return W
コード例 #30
0
    def __init__(self, target_dsm, control_dsms=None, resid=False,
                 pairwise_metric='correlation', comparison_metric='pearson',
                 center_data=False, corrcoef_only=False, **kwargs):
        """
        Initialize the measure and store its configuration.

        The measure is meant for a dataset with N samples, such that the
        corresponding dissimilarity matrix has N*(N-1)/2 unique pairwise
        distances.

        Parameters
        ----------
        target_dsm :        numpy array, length N*(N-1)/2. Target dissimilarity
                            matrix; this is the predictor whose results get
                            mapped back. Stored rank-transformed when
                            comparison_metric is 'spearman'.
        control_dsms :      list of numpy arrays, length N*(N-1)/2. DMs to be
                            controlled for when getting results of target_dsm
                            back. Each one is stored rank-transformed when
                            comparison_metric is 'spearman'.
                            Default: None
        resid :             Set to True to return residuals to searchlight
                            center for smoothing estimation.
                            Default: False
        pairwise_metric :   To be used by pdist to calculate dataset DSM
                            Default: 'correlation',
                            see scipy.spatial.distance.pdist for other metric
                            options.
        comparison_metric : To be used for comparing dataset dsm with target
                            dsm. Default: 'pearson'.
                            Options: 'pearson' or 'spearman'
        center_data :       Center data by subtracting mean column values from
                            columns prior to calculating dataset dsm.
                            Default: False
        corrcoef_only :     If true, return only the correlation coefficient
                            (rho), otherwise return rho and probability, p.
                            Default: False
        **kwargs :          Forwarded unchanged to Measure.__init__.

        Raises
        ------
        ValueError :        If comparison_metric is neither 'spearman' nor
                            'pearson'.

        Notes
        -----
        Nothing is returned here. When the measure is applied, the resulting
        Dataset contains the correlation coefficient (rho) only, or rho plus
        p when corrcoef_only is set to false.
        """
        # init base classes first
        Measure.__init__(self, **kwargs)
        if comparison_metric not in ('spearman', 'pearson'):
            raise ValueError("comparison_metric %s is not in "
                             "['spearman','pearson']" % comparison_metric)

        # Spearman is evaluated on ranks, so rank the fixed matrices once here.
        use_ranks = comparison_metric == 'spearman'

        self.target_dsm = rankdata(target_dsm) if use_ranks else target_dsm
        self.pairwise_metric = pairwise_metric
        self.comparison_metric = comparison_metric
        self.center_data = center_data
        self.corrcoef_only = corrcoef_only
        # Identity test: `!= None` is element-wise (and therefore ambiguous in
        # a boolean context) should an ndarray be passed.
        if use_ranks and control_dsms is not None:
            self.control_dsms = [rankdata(dm) for dm in control_dsms]
        else:
            self.control_dsms = control_dsms
        self.resid = resid
コード例 #31
0
    def test_on_set(fid, dataset, data_generator, label_generator,
                    num_batches):
        '''Helper function that works for both training and validation sets

        Pulls `num_batches` batches from the two generators, predicts them
        with `model` (taken from the enclosing scope) and writes one
        tab-separated line per example to `fid`.

        Parameters
        ----------
        fid : object with a `write` method receiving the result lines
        dataset : label for the data split; printed and written on each line
        data_generator : object whose `.next()` returns an `(x, y)` batch;
            only `y[0]` is used as target
        label_generator : object whose `.next()` returns a dict with the keys
            'candidate_edits', 'candidate_smiles', 'reaction_true' and 'rxdid'
        num_batches : number of batches to draw from each generator

        Returns
        -------
        (our_preds, corr) : the score given to the true candidate of every
            example, and the number of examples whose top-scoring candidate
            is the true one
        '''
        print('Testing on {} data'.format(dataset))
        # Need to process data using generator

        our_preds = []
        corr = 0

        for batch_num in range(num_batches):
            (x, y) = data_generator.next()
            labels = label_generator.next()
            y = y[0]  # only one output, which is True/False or yield

            # TODO: pre-fetch data in queue
            preds = model.predict_on_batch(x)

            for i in range(preds.shape[0]):

                edits = labels['candidate_edits'][i]
                pred = preds[i, :]
                true_edit_i = np.argmax(y[i, :])

                # prob assigned to true outcome
                trueprob = pred[y[i, :] != 0][0]
                # rankdata ranks ascending; flip so that rank 1 is the best
                rank_true_edit = 1 + len(pred) - (ss.rankdata(pred))[true_edit_i]

                our_preds.append(pred[true_edit_i])
                if np.argmax(pred) == true_edit_i:
                    corr += 1

                # Get most informative labels for the highest predictions
                if rank_true_edit != 1:
                    # record highest probability
                    most_likely_edit_i = np.argmax(pred)
                    most_likely_prob = np.max(pred)
                else:
                    # record number two prediction
                    # The argmax over the filtered scores is a position inside
                    # the filtered array; map it back through `below_top` so
                    # it indexes the unfiltered candidate lists used below.
                    below_top = np.flatnonzero(pred != np.max(pred))
                    most_likely_edit_i = below_top[np.argmax(pred[below_top])]
                    most_likely_prob = pred[most_likely_edit_i]
                trueyield = 0.0

                try:
                    most_likely_smiles = labels['candidate_smiles'][i][
                        most_likely_edit_i]
                    most_likely_edit = edits[most_likely_edit_i]
                except IndexError:
                    most_likely_smiles = 'no_reaction'
                    most_likely_edit = 'no_reaction'

                fid.write(
                    '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                        labels['reaction_true'][i], dataset,
                        edits[true_edit_i], trueprob, most_likely_edit,
                        most_likely_prob, rank_true_edit,
                        labels['reaction_true'][i].split('>')[-1],
                        most_likely_smiles, labels['rxdid'][i], trueyield))

        return our_preds, corr
コード例 #32
0
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 10 11:47:53 2017

@author: kcarnold
"""

import numpy as np
from suggestion import suggestion_generator
from scipy.special import logsumexp
from scipy.stats import rankdata
#%%
# Load the model whose unigram probabilities are inspected below.
model = suggestion_generator.get_model('yelp_train-balanced')
#%%
# Rank the unigram probabilities (ties share their average rank) and scale the
# ranks into integer bins: 10 * rank / (max_rank + 1) is always below 10, so
# the truncated bin ids stay within 0..9.
# NOTE(review): the ranks come from `unigram_probs` while the cells below index
# `unigram_probs_wordsonly` with the same ids -- confirm both share one layout.
wf_bins_rank = rankdata(model.unigram_probs, method='average')
# np.arange(len(model.unigram_probs_wordsonly))[
#wf_bins_rank = np.argsort(model.unigram_probs_wordsonly)
wf_bins = (10 * wf_bins_rank / (wf_bins_rank.max() + 1)).astype(int)
# Number of vocabulary entries that landed in each bin.
bin_counts = np.bincount(wf_bins)
#%%
# Spot-check a few words: print their words-only probability and their bin.
for word in 'huevos tri place'.split():
    idx = model.model.vocab_index(word)
    print(f"{model.unigram_probs_wordsonly[idx]:.2f}, bin={wf_bins[idx]}")

#%%
# np.eye(10)[wf_bins] one-hot encodes the bin of every entry, so the product
# accumulates the words-only values per bin.
# NOTE(review): despite the name this looks like a per-bin sum, not a mean.
mean_probs = model.unigram_probs_wordsonly @ np.eye(10)[wf_bins]
#%%
# bin 6 seems high, and bin 1. Why?
# Show the first 20 entries assigned to bin 6.
[model.id2str[idx] for idx in np.flatnonzero(wf_bins == 6)[:20]]
[
    wf_bins_rank[idx] / wf_bins_rank.max()
コード例 #33
0
 def rank_configs_by_width(models, C):
     """Rank the given models by the value `pi.mw(model, C)` yields for each.

     Tied models all receive the highest rank of their tie group ('max').
     """
     widths = []
     for candidate in models:
         widths.append(pi.mw(candidate, C))
     return rankdata(widths, method='max')
コード例 #34
0
ファイル: _mannwhitneyu.py プロジェクト: Aathi410/Pro123
def mannwhitneyu(x,
                 y,
                 use_continuity=True,
                 alternative="two-sided",
                 axis=0,
                 method="auto"):
    r'''Perform the Mann-Whitney U rank test on two independent samples.

    The Mann-Whitney U test is a nonparametric test of the null hypothesis
    that the distribution underlying sample `x` is the same as the
    distribution underlying sample `y`. It is often used as a test of
    difference in location between distributions.

    Parameters
    ----------
    x, y : array-like
        N-d arrays of samples. The arrays must be broadcastable except along
        the dimension given by `axis`.
    use_continuity : bool, optional
        Whether a continuity correction (1/2) should be applied.
        Default is True when `method` is ``'asymptotic'``; has no effect
        otherwise.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis. Default is 'two-sided'.
        Let *F(u)* and *G(u)* be the cumulative distribution functions of the
        distributions underlying `x` and `y`, respectively. Then the following
        alternative hypotheses are available:

        * 'two-sided': the distributions are not equal, i.e. *F(u) ≠ G(u)* for
          at least one *u*.
        * 'less': the distribution underlying `x` is stochastically less
          than the distribution underlying `y`, i.e. *F(u) > G(u)* for all *u*.
        * 'greater': the distribution underlying `x` is stochastically greater
          than the distribution underlying `y`, i.e. *F(u) < G(u)* for all *u*.

        Under a more restrictive set of assumptions, the alternative hypotheses
        can be expressed in terms of the locations of the distributions;
        see [5] section 5.1.
    axis : int, optional
        Axis along which to perform the test. Default is 0.
    method : {'auto', 'asymptotic', 'exact'}, optional
        Selects the method used to calculate the *p*-value.
        Default is 'auto'. The following options are available.

        * ``'asymptotic'``: compares the standardized test statistic
          against the normal distribution, correcting for ties.
        * ``'exact'``: computes the exact *p*-value by comparing the observed
          :math:`U` statistic against the exact distribution of the :math:`U`
          statistic under the null hypothesis. No correction is made for ties.
        * ``'auto'``: chooses ``'exact'`` when the size of one of the samples
          is less than 8 and there are no ties; chooses ``'asymptotic'``
          otherwise.

    Returns
    -------
    res : MannwhitneyuResult
        An object containing attributes:

        statistic : float
            The Mann-Whitney U statistic corresponding with sample `x`. See
            Notes for the test statistic corresponding with sample `y`.
        pvalue : float
            The associated *p*-value for the chosen `alternative`.

    Notes
    -----
    If ``U1`` is the statistic corresponding with sample `x`, then the
    statistic corresponding with sample `y` is
    ``U2 = x.shape[axis] * y.shape[axis] - U1``.

    `mannwhitneyu` is for independent samples. For related / paired samples,
    consider `scipy.stats.wilcoxon`.

    `method` ``'exact'`` is recommended when there are no ties and when either
    sample size is less than 8 [1]_. The implementation follows the recurrence
    relation originally proposed in [1]_ as it is described in [3]_.
    Note that the exact method is *not* corrected for ties, but
    `mannwhitneyu` will not raise errors or warnings if there are ties in the
    data.

    The Mann-Whitney U test is a non-parametric version of the t-test for
    independent samples. When the means of samples from the populations
    are normally distributed, consider `scipy.stats.ttest_ind`.

    See Also
    --------
    scipy.stats.wilcoxon, scipy.stats.ranksums, scipy.stats.ttest_ind

    References
    ----------
    .. [1] H.B. Mann and D.R. Whitney, "On a test of whether one of two random
           variables is stochastically larger than the other", The Annals of
           Mathematical Statistics, Vol. 18, pp. 50-60, 1947.
    .. [2] Mann-Whitney U Test, Wikipedia,
           http://en.wikipedia.org/wiki/Mann-Whitney_U_test
    .. [3] A. Di Bucchianico, "Combinatorics, computer algebra, and the
           Wilcoxon-Mann-Whitney test", Journal of Statistical Planning and
           Inference, Vol. 79, pp. 349-364, 1999.
    .. [4] Rosie Shier, "Statistics: 2.3 The Mann-Whitney U Test", Mathematics
           Learning Support Centre, 2004.
    .. [5] Michael P. Fay and Michael A. Proschan. "Wilcoxon-Mann-Whitney
           or t-test? On assumptions for hypothesis tests and multiple \
           interpretations of decision rules." Statistics surveys, Vol. 4, pp.
           1-39, 2010. https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2857732/

    Examples
    --------
    We follow the example from [4]_: nine randomly sampled young adults were
    diagnosed with type II diabetes at the ages below.

    >>> males = [19, 22, 16, 29, 24]
    >>> females = [20, 11, 17, 12]

    We use the Mann-Whitney U test to assess whether there is a statistically
    significant difference in the diagnosis age of males and females.
    The null hypothesis is that the distribution of male diagnosis ages is
    the same as the distribution of female diagnosis ages. We decide
    that a confidence level of 95% is required to reject the null hypothesis
    in favor of the alternative that the distributions are different.
    Since the number of samples is very small and there are no ties in the
    data, we can compare the observed test statistic against the *exact*
    distribution of the test statistic under the null hypothesis.

    >>> from scipy.stats import mannwhitneyu
    >>> U1, p = mannwhitneyu(males, females, method="exact")
    >>> print(U1)
    17.0

    `mannwhitneyu` always reports the statistic associated with the first
    sample, which, in this case, is males. This agrees with :math:`U_M = 17`
    reported in [4]_. The statistic associated with the second sample
    can be calculated:

    >>> nx, ny = len(males), len(females)
    >>> U2 = nx*ny - U1
    >>> print(U2)
    3.0

    This agrees with :math:`U_F = 3` reported in [4]_. The two-sided
    *p*-value can be calculated from either statistic, and the value produced
    by `mannwhitneyu` agrees with :math:`p = 0.11` reported in [4]_.

    >>> print(p)
    0.1111111111111111

    The exact distribution of the test statistic is asymptotically normal, so
    the example continues by comparing the exact *p*-value against the
    *p*-value produced using the normal approximation.

    >>> _, pnorm = mannwhitneyu(males, females, method="asymptotic")
    >>> print(pnorm)
    0.11134688653314041

    Here `mannwhitneyu`'s reported *p*-value appears to conflict with the
    value :math:`p = 0.09` given in [4]_. The reason is that [4]_
    does not apply the continuity correction performed by `mannwhitneyu`;
    `mannwhitneyu` reduces the distance between the test statistic and the
    mean :math:`\mu = n_x n_y / 2` by 0.5 to correct for the fact that the
    discrete statistic is being compared against a continuous distribution.
    Here, the :math:`U` statistic used is less than the mean, so we reduce
    the distance by adding 0.5 in the numerator.

    >>> import numpy as np
    >>> from scipy.stats import norm
    >>> U = min(U1, U2)
    >>> N = nx + ny
    >>> z = (U - nx*ny/2 + 0.5) / np.sqrt(nx*ny * (N + 1)/ 12)
    >>> p = 2 * norm.cdf(z)  # use CDF to get p-value from smaller statistic
    >>> print(p)
    0.11134688653314041

    If desired, we can disable the continuity correction to get a result
    that agrees with that reported in [4]_.

    >>> _, pnorm = mannwhitneyu(males, females, use_continuity=False,
    ...                         method="asymptotic")
    >>> print(pnorm)
    0.0864107329737

    Regardless of whether we perform an exact or asymptotic test, the
    probability of the test statistic being as extreme or more extreme by
    chance exceeds 5%, so we do not consider the results statistically
    significant.

    Suppose that, before seeing the data, we had hypothesized that females
    would tend to be diagnosed at a younger age than males.
    In that case, it would be natural to provide the female ages as the
    first input, and we would have performed a one-sided test using
    ``alternative = 'less'``: females are diagnosed at an age that is
    stochastically less than that of males.

    >>> res = mannwhitneyu(females, males, alternative="less", method="exact")
    >>> print(res)
    MannwhitneyuResult(statistic=3.0, pvalue=0.05555555555555555)

    Again, the probability of getting a sufficiently low value of the
    test statistic by chance under the null hypothesis is greater than 5%,
    so we do not reject the null hypothesis in favor of our alternative.

    If it is reasonable to assume that the means of samples from the
    populations are normally distributed, we could have used a t-test to
    perform the analysis.

    >>> from scipy.stats import ttest_ind
    >>> res = ttest_ind(females, males, alternative="less")
    >>> print(res)
    Ttest_indResult(statistic=-2.239334696520584, pvalue=0.030068441095757924)

    Under this assumption, the *p*-value would be low enough to reject the
    null hypothesis in favor of the alternative.

    '''

    # Validate and normalise all arguments in one place.
    # NOTE(review): `axis_int` is returned here but not used below; the raw
    # `axis` is what gets passed on.
    x, y, use_continuity, alternative, axis_int, method = (
        _mwu_input_validation(x, y, use_continuity, alternative, axis, method))

    # `xy` is the combined sample. Everything below works along the last axis,
    # so the helper presumably moves the sample axis there -- confirm against
    # `_broadcast_concatenate`.
    x, y, xy = _broadcast_concatenate(x, y, axis)

    n1, n2 = x.shape[-1], y.shape[-1]

    if method == "auto":
        method = _mwu_choose_method(n1, n2, xy, method)

    # Follows [2]
    ranks = stats.rankdata(xy, axis=-1)  # method 2, step 1
    R1 = ranks[..., :n1].sum(axis=-1)  # method 2, step 2
    U1 = R1 - n1 * (n1 + 1) / 2  # method 2, step 3
    U2 = n1 * n2 - U1  # as U1 + U2 = n1 * n2

    if alternative == "greater":
        U, f = U1, 1  # U is the statistic to use for p-value, f is a factor
    elif alternative == "less":
        U, f = U2, 1  # Due to symmetry, use SF of U2 rather than CDF of U1
    else:
        U, f = np.maximum(U1, U2), 2  # multiply SF by two for two-sided test

    if method == "exact":
        p = _mwu_state.sf(U.astype(int), n1, n2)
    elif method == "asymptotic":
        z = _get_mwu_z(U, n1, n2, ranks, continuity=use_continuity)
        p = stats.norm.sf(z)
    p *= f

    # Ensure that the p-value is not greater than 1
    # This could happen for exact test when U = m*n/2
    p = np.clip(p, 0, 1)

    # The reported statistic is always U1 (sample `x`), whichever of U1/U2
    # was used to compute the p-value.
    return MannwhitneyuResult(U1, p)
コード例 #35
0
def ranks(scores, ascending=False):
    """Return, for every row of `scores`, the rank of that row's first entry.

    With ``ascending=False`` (the default) the scores are negated before
    ranking, so the largest score of a row gets rank 1; with
    ``ascending=True`` the smallest score gets rank 1. Tied scores share
    their average rank. The result is a plain list with one rank per row.
    """
    direction = 1 if ascending else -1
    signed = scores * direction

    first_entry_ranks = []
    for row_idx in range(signed.shape[0]):
        row_ranks = stats.rankdata(signed[row_idx])
        first_entry_ranks.append(row_ranks[0])
    return first_entry_ranks
 def get_background_rank_df(cls, frequency_path=None):
     """Return the background frequencies replaced by their scaled dense rank.

     The frame obtained from `cls.get_background_frequency_df(frequency_path)`
     receives a 'rank' column holding the dense rank of its 'background'
     column; 'background' is then overwritten with rank / highest rank. Only
     the 'background' column is returned.
     """
     frame = cls.get_background_frequency_df(frequency_path)
     dense_rank = rankdata(frame.background, method='dense')
     frame['rank'] = dense_rank
     top_rank = frame['rank'].max()
     frame['background'] = frame['rank'] / top_rank
     return frame[['background']]
コード例 #37
0
ファイル: statistics.py プロジェクト: biocore-ntnu/pyranges
def fdr(p_vals):
    """Adjust p-values with Benjamini-Hochberg.

    Every p-value is multiplied by the number of p-values and divided by its
    rank among them (ties share their average rank); adjusted values above 1
    are set to 1.

    Parameters
    ----------
    p_vals : array-like
        P-values to adjust. Must support element-wise arithmetic and
        assignment through a boolean mask.


    Returns
    -------
    array-like

        Adjusted values, in the same order as `p_vals`.

    Examples
    --------

    >>> np.random.seed(0)
    >>> x = np.random.random(10) / 100

    >>> gr = pr.random(10)
    >>> gr.PValue = x
    >>> gr
    +--------------+-----------+-----------+--------------+----------------------+
    | Chromosome   | Start     | End       | Strand       | PValue               |
    | (category)   | (int32)   | (int32)   | (category)   | (float64)            |
    |--------------+-----------+-----------+--------------+----------------------|
    | chr1         | 176601938 | 176602038 | +            | 0.005488135039273248 |
    | chr1         | 155082851 | 155082951 | -            | 0.007151893663724195 |
    | chr2         | 211134424 | 211134524 | -            | 0.006027633760716439 |
    | chr9         | 78826761  | 78826861  | -            | 0.005448831829968969 |
    | ...          | ...       | ...       | ...          | ...                  |
    | chr16        | 52216522  | 52216622  | +            | 0.004375872112626925 |
    | chr17        | 8085927   | 8086027   | -            | 0.008917730007820798 |
    | chr19        | 17333425  | 17333525  | +            | 0.009636627605010294 |
    | chr22        | 16728001  | 16728101  | +            | 0.003834415188257777 |
    +--------------+-----------+-----------+--------------+----------------------+
    Stranded PyRanges object has 10 rows and 5 columns from 9 chromosomes.
    For printing, the PyRanges was sorted on Chromosome and Strand.

    >>> gr.FDR = pr.stats.fdr(gr.PValue)
    >>> gr.print(formatting={"PValue": "{:.4f}", "FDR": "{:.4}"})
    +--------------+-----------+-----------+--------------+-------------+-------------+
    | Chromosome   | Start     | End       | Strand       | PValue      | FDR         |
    | (category)   | (int32)   | (int32)   | (category)   | (float64)   | (float64)   |
    |--------------+-----------+-----------+--------------+-------------+-------------|
    | chr1         | 176601938 | 176602038 | +            | 0.0055      | 0.01098     |
    | chr1         | 155082851 | 155082951 | -            | 0.0072      | 0.00894     |
    | chr2         | 211134424 | 211134524 | -            | 0.0060      | 0.01005     |
    | chr9         | 78826761  | 78826861  | -            | 0.0054      | 0.01362     |
    | ...          | ...       | ...       | ...          | ...         | ...         |
    | chr16        | 52216522  | 52216622  | +            | 0.0044      | 0.01459     |
    | chr17        | 8085927   | 8086027   | -            | 0.0089      | 0.009909    |
    | chr19        | 17333425  | 17333525  | +            | 0.0096      | 0.009637    |
    | chr22        | 16728001  | 16728101  | +            | 0.0038      | 0.03834     |
    +--------------+-----------+-----------+--------------+-------------+-------------+
    Stranded PyRanges object has 10 rows and 6 columns from 9 chromosomes.
    For printing, the PyRanges was sorted on Chromosome and Strand.
    """

    from scipy.stats import rankdata

    n_tests = len(p_vals)
    positions = rankdata(p_vals)

    # NOTE(review): no monotonicity (cumulative-minimum) step is applied after
    # the scaling, so a smaller p-value can end up with a larger adjusted
    # value than a bigger one.
    adjusted = p_vals * n_tests / positions
    above_one = adjusted > 1
    adjusted[above_one] = 1

    return adjusted
コード例 #38
0
def match_digit(digits_used, anchor, y_pred, anchor_pred):
    """Translate predicted labels into digits by matching ranks against an anchor.

    For every index ``k`` of `anchor_pred`, the rank ``r`` of
    ``anchor_pred[k]`` (1 = smallest) selects the position of the ``r``-th
    smallest value of `anchor`; the digit stored at that position of
    `digits_used` is written wherever `y_pred` equals ``k``.

    Parameters
    ----------
    digits_used : ndarray
        Digits to choose from; also determines the dtype of the result.
    anchor : array-like
        Reference values, aligned with `digits_used`.
    y_pred : ndarray
        Predicted labels, compared against the indices of `anchor_pred`.
    anchor_pred : array-like
        Predicted anchor values, one per label.

    Returns
    -------
    ndarray
        Array shaped like `y_pred` with the dtype of `digits_used`. Entries
        whose label matches no index of `anchor_pred` keep the initial zero.

    Notes
    -----
    Tied values in `anchor_pred` share an average rank, which is truncated
    with ``int()`` before it is used as a position.
    """
    y_pred_matched = np.zeros(y_pred.shape).astype(digits_used.dtype)
    # The anchor ordering does not depend on the loop variable: sort once.
    anchor_order = np.argsort(anchor)
    for y_pred_i, rank in enumerate(stats.rankdata(anchor_pred)):
        digit = digits_used[anchor_order[int(rank) - 1]]
        y_pred_matched[y_pred == y_pred_i] = digit
    return y_pred_matched
コード例 #39
0
    stackx_data["answer"] = stackx_data["answer"].astype(str)
    #print(stackx_data.shape)
    log.write("data loaded of size %s \n" % (str(stackx_data.shape)))

    # Normalize aux targets
    encoded = []
    trange = tqdm(stackx_data["host"].unique())
    for host in trange:
        host_mask = stackx_data["host"] == host
        trange.set_description(str(host))
        host_labels = deepcopy(stackx_data[host_mask][TARGETS])
        for col in [
                "question_score", "question_views", "question_favs",
                "answer_score"
        ]:
            host_labels[col] = rankdata(
                stackx_data[host_mask][col]) / host_mask.sum()
        encoded.append(host_labels)

    encoded = pd.concat(encoded, sort=False).reindex(stackx_data.index)
    stackx_data[encoded.columns] = encoded
    log.write("Aux targets are normalized \n")

    #Train-Val Split
    train_df, test_df = train_test_split(stackx_data,
                                         test_size=0.1,
                                         random_state=args.seed)

    log.write(" Train-Val Split : train_df size %s \t val_df size is %s \n" %
              (str(train_df.shape), str(test_df.shape)))

    #tokenizer
コード例 #40
0
    def ranksum_thread(vec):
        """
        Wilcoxon rank sum test of a single feature, run once per group.

        Adapted from the following R functions:
            `Seurat::FindMarkers` and `stats::wilcox.test`

        `vec` holds the feature's values; the group membership
        (`group_onehot`), the group sizes (`n_x`, `n_y`, `n_xy_prod`,
        `n_xy_plus`) and the thresholds (`pseudocount`, `min_pct`,
        `min_pct_diff`, `logfc_threshold`, `alternative`) are read from the
        enclosing scope.

        Returns a 6-tuple: fraction of positive values inside each group,
        fraction outside, log fold change, U statistic, z score and p-value.
        The last three are all-NaN when no group passes the filters.
        """

        # Preparation: sparse input is flattened into a dense vector
        if issparse(vec):
            vec = vec.toarray().ravel()
        n_groups = group_onehot.shape[1]
        pct = np.empty((n_groups, 2))
        logfc = np.empty(n_groups)

        for g in range(n_groups):
            members = group_onehot[:, g].ravel()
            inside = vec[members]
            outside = vec[~members]
            pct[g, 0] = round(np.sum(inside > 0) / n_x[g], 3)
            pct[g, 1] = round(np.sum(outside > 0) / n_y[g], 3)
            logfc[g] = np.log(inside.mean() + pseudocount) - \
                np.log(outside.mean() + pseudocount)

        # Percent expressed filtering
        pct_high = pct.max(axis=1)
        pct_low = pct.min(axis=1)
        pct_mask = (pct_high > min_pct) & ((pct_high - pct_low) > min_pct_diff)

        # Fold change filtering
        if alternative == "greater":
            logfc_mask = logfc > logfc_threshold
        elif alternative == "less":
            logfc_mask = logfc < -logfc_threshold
        elif alternative == "two-sided":
            logfc_mask = abs(logfc) > logfc_threshold

        pct_in = pct[:, 0].ravel()
        pct_out = pct[:, 1].ravel()

        total_mask = pct_mask & logfc_mask
        if not np.any(total_mask):
            # Nothing to test: one shared all-NaN vector fills stat, z and pval
            nan_placeholder = np.full(n_groups, np.nan)
            return pct_in, pct_out, logfc, \
                nan_placeholder, nan_placeholder, nan_placeholder

        # Rank sum test
        rank = rankdata(vec)
        n_ties = np.unique(rank, return_counts=True)[1]

        # Groups that failed the filters keep NaN as their statistic
        stat = np.full(n_groups, np.nan)
        for g in range(n_groups):
            if not total_mask[g]:
                continue
            members = group_onehot[:, g].ravel()
            stat[g] = rank[members].sum() - n_x[g] * (n_x[g] + 1) / 2

        z = stat - n_x * n_y / 2
        tie_term = (n_ties**3 - n_ties).sum() / (n_xy_plus * (n_xy_plus - 1))
        sigma = np.sqrt((n_xy_prod / 12) * ((n_xy_plus + 1) - tie_term))

        # Continuity correction, standardisation and tail probability
        if alternative == "two-sided":
            z = (z - np.sign(z) * 0.5) / sigma
            upper_tail = norm.sf(z)
            lower_tail = norm.cdf(z)
            pval = 2 * np.stack([upper_tail, lower_tail], axis=0).min(axis=0)
        elif alternative == "greater":
            z = (z - 0.5) / sigma
            pval = norm.sf(z)
        elif alternative == "less":
            z = (z - -0.5) / sigma
            pval = norm.cdf(z)
        return pct_in, pct_out, logfc, stat, z, pval
コード例 #41
0
def pbo(M, S, metric_func, threshold,
        n_jobs=1,
        verbose=False,
        plot=False,
        hist=False):
    '''
    Estimate the probability of backtest overfitting (PBO) with
    combinatorially symmetric cross-validation (CSCV).

    Based on http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2326253

    Features:
    * training and test sets are of equal size, providing comparable accuracy
    to both IS and OOS Sharpe ratios.
    * CSCV is symmetric, decline in performance can only result from
    overfitting, not arbitrary discrepancies between the training and test
    sets.
    * CSCV respects the time-dependence and other season-dependent features
    present in the data.
    * Results are deterministic, can be replicated.
    * Dispersion in the distribution of logits conveys relevant info regarding
    the robustness of the strategy selection process.
    * Model-free, non-parametric. Logits distribution resembles the cumulative
    Normal distribution if w_bar are close to uniform distribution (i.e. the
    backtest appears to be information-less). Therefore, for good backtesting,
    the distribution of logits will be centered in a significantly positive
    value, and its tail will marginally cover the region of negative logit
    values.

    Limitations:
    * CSCV is symmetric, for some strategies, K-fold CV might be better.
    * Not suitable for time series with strong auto-correlation, especially
    when S is large.
    * Assumes all the sample statistics carry the same weight.
    * Entirely possible that all the N strategy configs have high but similar
    Sharpe ratios. Therefore, PBO may appear high, however, 'overfitting' here
    is among many 'skilful' strategies.

    Parameters:

    M:
        returns data, numpy or dataframe format. Rows are observations,
        columns are the N strategy configurations. If the number of rows is
        not a multiple of S, the leading surplus rows are dropped.
    S:
        number of chunks to divide M into, must be an even number. Paper
        suggests setting S = 16. See paper for details of choice of S.
    metric_func:
        evaluation function for returns data
    threshold:
        used as prob. of OOS Loss calculation cutoff. For Sharpe ratio,
        this should be 0 to indicate probability of loss.
    n_jobs:
        if greater than 1 then enable parallel mode; a negative value is
        replaced by the CPU count reported by `ps.cpu_count(logical=False)`
        (at least 1).
    verbose:
        Default False, whether to print progress information.
    plot:
        Default False, whether to call `plot_pbo` on the result.
    hist:
        Default False, whether to plot histogram for rank of logits.
        Some problems exist when S >= 10. Need to look at why numpy /
        matplotlib does it.

    Returns:
    PBO result in namedtuple, instance of PBO.

    Raises:
    ValueError if S is odd.
    '''
    if S % 2 == 1:
        raise ValueError('S must be an even integer, {:.1f} was given'
                         .format(S))

    n_jobs = int(n_jobs)
    if n_jobs < 0:
        n_jobs = max(1, ps.cpu_count(logical=False))

    if isinstance(M, pd.DataFrame):
        # convert to numpy values
        if verbose:
            print('Convert from DataFrame to numpy array.')
        M = M.values

    # Paper suggests T should be 2x the no. of observations used by investor
    # to choose a model config, due to the fact that CSCV compares combinations
    # of T/2 observations with their complements.
    T, N = M.shape
    # Drop the leading rows that do not fit into S equally sized chunks.
    residual = T % S
    if residual != 0:
        M = M[residual:]
        T, N = M.shape

    sub_T = T // S

    if verbose:
        print('Total sample size: {:,d}, chunck size: {:,d}'.format(T, sub_T))

    # generate subsets, each of length sub_T
    # Ms keeps (chunk index, chunk) pairs; Ms_values stacks the chunks alone.
    Ms = []
    Ms_values = []
    for i in range(S):
        start, end = i * sub_T, (i + 1) * sub_T
        Ms.append((i, M[start:end, :]))
        Ms_values.append(M[start:end, :])
    Ms_values = np.array(Ms_values)

    if verbose:
        print('No. of Chuncks: {:,d}'.format(len(Ms)))

    # generate combinations: every way of picking S/2 chunks as the
    # in-sample (IS) set
    Cs = [x for x in itr.combinations(Ms, S // 2)]
    if verbose:
        print('No. of combinations = {:,d}'.format(len(Cs)))

    # Ms_index used to find J_bar (complementary OOS part)
    Ms_index = set([x for x in range(len(Ms))])

    # create J and J_bar
    if n_jobs < 2:
        J = []
        J_bar = []

        for i in range(len(Cs)):
            # make sure chunks are concatenated in their original order
            order = [x for x, _ in Cs[i]]
            sort_ind = np.argsort(order)

            Cs_values = np.array([v for _, v in Cs[i]])
            # if verbose:
            #     print('Cs index = {}, '.format(order), end='')
            joined = np.concatenate(Cs_values[sort_ind, :])
            J.append(joined)

            # find Cs_bar: the chunks that are not part of this combination
            Cs_bar_index = list(sorted(Ms_index - set(order)))
            # if verbose:
            # print('Cs_bar_index = {}'.format(Cs_bar_index))
            J_bar.append(np.concatenate(Ms_values[Cs_bar_index, :]))

        # compute matrices for J and J_bar, e.g. Sharpe ratio
        R = [metric_func(j) for j in J]
        R_bar = [metric_func(j) for j in J_bar]

        # compute ranks of metrics
        R_rank = [ss.rankdata(x) for x in R]
        R_bar_rank = [ss.rankdata(x) for x in R_bar]

        # find highest metric, rn contains the index position of max value
        # in each set of R (IS)
        rn = [np.argmax(r) for r in R_rank]
        # use above index to find R_bar (OOS) in same index position
        # i.e. the same config / setting
        rn_bar = [R_bar_rank[i][rn[i]] for i in range(len(R_bar_rank))]

        # formula in paper used N+1 as the denominator for w_bar.
        # NOTE(review): the code divides by N instead, so the top OOS rank
        # gives w_bar == 1 -- confirm which denominator is intended.
        w_bar = [float(r) / N for r in rn_bar]
        # logit(.5) gives 0 so if w_bar value is equal to median logits is 0
        logits = [spec.logit(w) for w in w_bar]
    else:
        # use joblib for parallel calc
        # print('Run in parallel mode.')
        cores = job.Parallel(n_jobs=n_jobs)(
            job.delayed(pbo_core_calc)(Cs_x,
                                       Ms, Ms_values, Ms_index,
                                       metric_func,
                                       verbose)
            for Cs_x in Cs)
        # core_df = pd.DataFrame(cores, columns=PBOCore._fields)
        # convert to values needed.
        # # core_df = pd.DataFrame.from_records(cores)

        # J = core_df.J.values
        # J_bar = core_df.J_bar.values
        # R = core_df.R.values
        # R_bar = core_df.R_bar.values
        # R_rank = core_df.R_rank.values
        # R_bar_rank = core_df.R_bar_rank.values
        # rn = core_df.rn.values
        # rn_bar = core_df.rn_bar.values
        # w_bar = core_df.w_bar.values
        # logits = core_df.logits.values

        # Collect the per-combination results into the same lists the serial
        # branch builds.
        J = [c.J for c in cores]
        J_bar = [c.J_bar for c in cores]
        R = [c.R for c in cores]
        R_bar = [c.R_bar for c in cores]
        R_rank = [c.R_rank for c in cores]
        R_bar_rank = [c.R_bar_rank for c in cores]
        rn = [c.rn for c in cores]
        rn_bar = [c.rn_bar for c in cores]
        w_bar = [c.w_bar for c in cores]
        logits = [c.logits for c in cores]

    # prob of overfitting: share of combinations with a non-positive logit
    phi = np.array([1.0 if lam <= 0 else 0.0
                    for lam in logits]) / len(Cs)
    pbo_test = np.sum(phi)

    # performance degradation: regress the OOS metric of the IS-best config
    # on its IS metric
    R_n_star = np.array([R[i][rn[i]] for i in range(len(R))])
    R_bar_n_star = np.array([R_bar[i][rn[i]] for i in range(len(R_bar))])
    lm = ss.linregress(x=R_n_star, y=R_bar_n_star)

    # share of combinations whose selected config falls below the threshold OOS
    prob_oos_loss = np.sum([1.0 if r < threshold else 0.0
                            for r in R_bar_n_star]) / len(R_bar_n_star)

    # Stochastic dominance
    y = np.linspace(min(R_bar_n_star), max(R_bar_n_star),
                    endpoint=True, num=1000)
    R_bar_n_star_cdf = smd.ECDF(R_bar_n_star)
    optimized = R_bar_n_star_cdf(y)

    R_bar_cdf = smd.ECDF(np.concatenate(R_bar))
    non_optimized = R_bar_cdf(y)

    dom_df = pd.DataFrame(dict(optimized_IS=optimized,
                               non_optimized_OOS=non_optimized))
    dom_df.index = y
    # visually, non_optimized curve above optimized curve indicates good
    # backtest with low overfitting.
    dom_df['SD2'] = dom_df.non_optimized_OOS - dom_df.optimized_IS

    result = PBO(pbo_test,
                 prob_oos_loss,
                 lm,
                 dom_df,
                 Cs,
                 J, J_bar,
                 R, R_bar,
                 R_rank, R_bar_rank,
                 rn, rn_bar,
                 w_bar,
                 logits,
                 R_n_star,
                 R_bar_n_star)

    if plot:
        plot_pbo(result, hist=hist)

    return result
コード例 #42
0
 def rankSelection(self):
     """Draw two individuals from `self.pop` by rank-based selection.

     The fitness values in `self.fitnesses` are ranked (1 = lowest, ties share
     their average rank) and each individual is drawn with a probability
     proportional to its rank. The two draws go through one
     `np.random.choice` call with its default settings, so the same
     individual may be returned twice.

     Returns
     -------
     tuple
         The two selected individuals ``(p1, p2)``.
     """
     rank_fitnesses = rankdata(self.fitnesses)
     # Sum the ranks once instead of once per individual.
     rank_total = sum(rank_fitnesses)
     probs = [f / rank_total for f in rank_fitnesses]
     p1, p2 = np.random.choice(self.pop, 2, p=probs)
     return p1, p2
コード例 #43
0
ファイル: views.py プロジェクト: sunvir72/EduWeb
def prec_(request):
    """Evaluate the session's trained models and rank them with TOPSIS.

    For a POST request the view:

    1. reads the selected feature column indices (``checks_[]``), the target
       column index (``target_``) and the data split (``typedata_``) from
       ``request.POST``;
    2. loads ``test_df``, ``train_df``, ``regList`` and ``regnames`` from the
       session, label/one-hot encodes string columns and standardises the
       features;
    3. computes AUC, confusion-matrix cells, accuracy, recall, precision and
       F1 for every model in ``regList`` (on the training split when
       ``typedata_ == 'train'``, otherwise on the test split);
    4. stores the F1 list in ``request.session['f1']``;
    5. ranks the models with TOPSIS over the eight non-AUC metrics and puts
       the rows, best first, in ``resultDict['models']``. Each row is the
       eight metrics followed by the model name and its original index.

    Returns:
        ``JsonResponse`` with the result dictionary for POST requests, or a
        redirect to ``'Link5'`` for any other method.
    """
    if request.method != 'POST':
        return redirect('Link5')

    checks = request.POST.getlist('checks_[]')
    target = request.POST['target_']
    typedata = request.POST.get('typedata_')
    checks = list(map(int, checks))
    target = int(target)

    print(typedata)
    test_df = request.session['test_df']
    x = test_df.iloc[:, checks].values
    y = test_df.iloc[:, target].values.astype('int64')

    train_df = request.session['train_df']
    x_tr = train_df.iloc[:, checks].values
    y_tr = train_df.iloc[:, target].values

    # Preprocessing: positions (within the selected features) of columns
    # whose dtype in the test frame is ``object``.
    dtypes_list = list(test_df.dtypes)
    categorical_lst = [
        pos for pos, col in enumerate(checks) if dtypes_list[col] == 'object'
    ]

    labelencoder = LabelEncoder()
    for i in categorical_lst:
        x[:, i] = labelencoder.fit_transform(x[:, i])
        x_tr[:, i] = labelencoder.fit_transform(x_tr[:, i])

    if categorical_lst:
        # NOTE(review): ``categorical_features`` was removed from
        # scikit-learn's OneHotEncoder in later releases; the encoders are
        # also fitted separately on each split. Confirm both are intended.
        oneh = OneHotEncoder(categorical_features=categorical_lst)
        x = oneh.fit_transform(x).toarray()
        x_tr = oneh.fit_transform(x_tr).toarray()
    # TODO: avoid the dummy-variable trap.

    sc = StandardScaler()
    x = sc.fit_transform(x)
    x_tr = sc.fit_transform(x_tr)

    regList = request.session['regList']
    regnames = request.session['regnames']
    print(regnames)
    resultDict = {
        'regs': len(regList),
        'regnames': regnames,
        'auc': [],
        'tp': [],
        'tn': [],
        'fn': [],
        'fp': [],
        'accuracy': [],
        'recall': [],
        'precision': [],
        'f1': []
    }

    if typedata == 'train':
        x = x_tr
        y = y_tr

    for model in regList:
        y_pred = model.predict(x)
        cm = confusion_matrix(y, y_pred.round())
        # Keep the predicted probability of the second class only.
        probs = model.predict_proba(x)[:, 1]
        resultDict['auc'].append(roc_auc_score(y, probs))
        # NOTE(review): cells are labelled as if the first class were the
        # positive one (cm[0][0] is reported as 'tp'). scikit-learn's binary
        # convention reads cm[0][0] as true negatives; confirm this
        # labelling is intended before changing it.
        resultDict['tp'].append(int(cm[0][0]))
        resultDict['tn'].append(int(cm[1][1]))
        resultDict['fn'].append(int(cm[0][1]))
        resultDict['fp'].append(int(cm[1][0]))
        accuracy = (cm[0][0] + cm[1][1]) / (cm[0][0] + cm[1][0] +
                                            cm[0][1] + cm[1][1])
        resultDict['accuracy'].append(round(accuracy, 4))
        recall = cm[0][0] / (cm[0][0] + cm[0][1])
        precision = cm[0][0] / (cm[0][0] + cm[1][0])
        resultDict['recall'].append(round(recall, 4))
        resultDict['precision'].append(round(precision, 4))
        resultDict['f1'].append(
            round((2 * (recall * precision) / (recall + precision)), 4))
    # Kept in the session for the save-model view.
    request.session['f1'] = resultDict['f1']

    # TOPSIS decision matrix: one row per model, one column per criterion.
    criteria = ['tp', 'tn', 'fn', 'fp',
                'accuracy', 'recall', 'precision', 'f1']
    arr = [[resultDict[name][i] for name in criteria]
           for i in range(resultDict['regs'])]
    print(arr)
    weights = [1, 1, 1, 1, 1, 1, 1, 1]
    # '+' = larger is better, '-' = smaller is better.
    impacts = ['+', '+', '-', '-', '+', '+', '+', '+']

    p = _topsis_closeness(arr, weights, impacts)
    print(p)

    # Best score first. The sort is stable, so tied models keep their
    # original relative order and every model gets a distinct position
    # regardless of how many of them tie.
    ranking = sorted(range(len(p)), key=lambda k: -p[k])

    resultDict['models'] = []
    for i in ranking:
        # Row = raw metrics + model name + original index (the index is used
        # when the save-model button is clicked).
        resultDict['models'].append(arr[i] + [regnames[i], i])

    return JsonResponse(resultDict, status=200)


def _topsis_closeness(matrix, weights, impacts):
    """Return the TOPSIS relative-closeness score for each row of *matrix*.

    Columns are vector-normalised and multiplied by the normalised weights;
    each row's score is ``d_worst / (d_best + d_worst)``, where the distances
    are Euclidean distances to the ideal-best and ideal-worst points chosen
    per column according to *impacts* ('+' or '-').

    A column whose values are all zero is left at zero instead of dividing by
    a zero norm, and a row set with no spread (best == worst everywhere, e.g.
    a single row) scores 0.0 instead of dividing zero by zero. An empty
    matrix yields an empty list. *matrix* is not modified.
    """
    if not matrix:
        return []
    cols = len(matrix[0])
    weight_total = sum(weights)
    norm_weights = [wt / weight_total for wt in weights]

    col_norms = [
        math.sqrt(sum(row[c] * row[c] for row in matrix))
        for c in range(cols)
    ]
    weighted = [
        [(row[c] / col_norms[c]) * norm_weights[c] if col_norms[c] else 0.0
         for c in range(cols)]
        for row in matrix
    ]

    best = []
    worst = []
    for c in range(cols):
        column = [row[c] for row in weighted]
        high, low = max(column), min(column)
        if impacts[c] == '+':
            best.append(high)
            worst.append(low)
        else:
            best.append(low)
            worst.append(high)

    scores = []
    for row in weighted:
        d_best = math.sqrt(sum((v - b) * (v - b) for v, b in zip(row, best)))
        d_worst = math.sqrt(
            sum((v - w) * (v - w) for v, w in zip(row, worst)))
        total = d_best + d_worst
        scores.append(d_worst / total if total else 0.0)
    return scores
コード例 #44
0
def mantel_test(X, Y, perms=10000, method='pearson', tail='two-tail'):
  """
  Takes two distance matrices (either redundant matrices or condensed vectors)
  and performs a Mantel test. The Mantel test is a significance test of the
  correlation between two distance matrices.

  Source: https://github.com/jwcarr/MantelTest/blob/master/Mantel.py

  Parameters
  ----------
  X : array_like
      First distance matrix (condensed or redundant).
  Y : array_like
      Second distance matrix (condensed or redundant), where the order of
      elements corresponds to the order of elements in the first matrix.
  perms : int, optional
      The number of permutations to perform (default: 10000). A larger number
      gives more reliable results but takes longer to run. If the actual number
      of possible permutations is smaller, the program will enumerate all
      permutations. Enumeration can be forced by setting this argument to 0.
  method : str, optional
      Type of correlation coefficient to use; either 'pearson' or 'spearman'
      (default: 'pearson').
  tail : str, optional
      Which tail to test in the calculation of the empirical p-value; either
      'upper', 'lower', or 'two-tail' (default: 'two-tail').

  Returns
  -------
  r : float
      Veridical correlation
  p : float
      Empirical p-value

  Raises
  ------
  ValueError
      If X or Y is not a valid distance matrix, if their sizes differ, if
      fewer than 3 condensed distances are given, or if `method` or `tail`
      is not one of the accepted values.

  Notes
  -----
  Only ``(r, p)`` is returned; no standard score (z-score) is computed.
  """
  # Local import: the ``np.math`` alias this code used to rely on is no longer
  # available in current NumPy releases.
  from math import factorial

  # Ensure that X and Y are formatted as Numpy arrays.
  X, Y = np.asarray(X, dtype=float), np.asarray(Y, dtype=float)

  # Check that X and Y are valid distance matrices.
  if not spatial.distance.is_valid_dm(X) and not spatial.distance.is_valid_y(X):
    raise ValueError('X is not a valid condensed or redundant distance matrix')
  if not spatial.distance.is_valid_dm(Y) and not spatial.distance.is_valid_y(Y):
    raise ValueError('Y is not a valid condensed or redundant distance matrix')

  # If X or Y is a redundant distance matrix, reduce it to a condensed distance matrix.
  if len(X.shape) == 2:
    X = spatial.distance.squareform(X, force='tovector', checks=False)
  if len(Y.shape) == 2:
    Y = spatial.distance.squareform(Y, force='tovector', checks=False)

  # Check for size equality.
  if X.shape[0] != Y.shape[0]:
    raise ValueError('X and Y are not of equal size')

  # Check for minimum size.
  if X.shape[0] < 3:
    raise ValueError('X and Y should represent at least 3 objects')

  # If Spearman correlation is requested, convert X and Y to ranks.
  if method == 'spearman':
    X, Y = stats.rankdata(X), stats.rankdata(Y)

  # Check for valid method parameter.
  elif method != 'pearson':
    raise ValueError('The method should be set to "pearson" or "spearman"')

  # Check for valid tail parameter.
  if tail not in ('upper', 'lower', 'two-tail'):
    raise ValueError('The tail should be set to "upper", "lower", or "two-tail"')

  # Optimizations used below:
  #
  # 1. Pairwise distances are never recomputed; the matrix of Y residuals is
  #    re-indexed (rows and columns permuted together) instead.
  #
  # 2. Only covariances are computed per permutation. The denominator of the
  #    correlation coefficient is the same for every permutation, so it is
  #    irrelevant for the empirical p-value.
  #
  # 3. If the number of possible permutations does not exceed the number
  #    requested, all of them are enumerated, giving a deterministic result.

  # Calculate the X and Y residuals, which will be used to compute the
  # covariance under each permutation.
  X_residuals, Y_residuals = X - X.mean(), Y - Y.mean()

  # Expand the Y residuals to a redundant matrix.
  Y_residuals_as_matrix = spatial.distance.squareform(Y_residuals, force='tomatrix', checks=False)

  # Get the number of objects.
  m = Y_residuals_as_matrix.shape[0]

  # Calculate the number of possible matrix permutations.
  n = factorial(m)

  # Row-major indices of the strict upper triangle. This is the element order
  # of a condensed distance vector, so gathering with them condenses a
  # (permuted) square matrix without calling SciPy's private C wrapper.
  upper_rows, upper_cols = np.triu_indices(m, k=1)

  # If the number of requested permutations is greater than the number of
  # possible permutations (m!) or the perms parameter is set to 0, then run a
  # deterministic Mantel test ...
  if perms >= n or perms == 0:

    # Initialize an empty array to store the covariances.
    covariances = np.zeros(n, dtype=float)

    # Enumerate all permutations of row/column orders. The first one is the
    # identity, so covariances[0] is the veridical covariance.
    for i, order in enumerate(permutations(range(m))):
      order = np.asarray(order)

      # Element (a, b) of the permuted matrix is M[order[a], order[b]].
      Y_residuals_permuted = Y_residuals_as_matrix[order[upper_rows], order[upper_cols]]

      # Compute and store the covariance.
      covariances[i] = (X_residuals * Y_residuals_permuted).sum()

  # ... otherwise run a stochastic Mantel test.
  else:

    # Initialize an empty array to store the covariances.
    covariances = np.zeros(perms, dtype=float)

    # Initialize an array to store the permutation order.
    order = np.arange(m)

    # Store the veridical covariance in 0th position...
    covariances[0] = (X_residuals * Y_residuals).sum()

    # ...and then run the random permutations.
    for i in range(1, perms):

      # Choose a random order in which to permute the rows and columns.
      np.random.shuffle(order)

      # Condense the permuted matrix directly.
      Y_residuals_permuted = Y_residuals_as_matrix[order[upper_rows], order[upper_cols]]

      # Compute and store the covariance.
      covariances[i] = (X_residuals * Y_residuals_permuted).sum()

  # Calculate the veridical correlation coefficient from the veridical covariance.
  r = covariances[0] / np.sqrt((X_residuals ** 2).sum() * (Y_residuals ** 2).sum())

  # Calculate the empirical p-value for the requested tail.
  if tail == 'upper':
    p = (covariances >= covariances[0]).sum() / float(covariances.shape[0])
  elif tail == 'lower':
    p = (covariances <= covariances[0]).sum() / float(covariances.shape[0])
  elif tail == 'two-tail':
    p = (abs(covariances) >= abs(covariances[0])).sum() / float(covariances.shape[0])

  return r, p
コード例 #45
0
 def updateErr(self, indx, error):
     """Store new error magnitudes and rebuild the rank-based weights.

     For every position ``k``, ``self.err[indx[k]]`` is overwritten with
     ``sqrt(error[k])``. All entries of ``self.err`` are then ranked
     (rank 1 = smallest error) and ``self.prob`` is rebuilt as
     ``1 / (n - rank + 1)``, so the largest error gets weight 1 and the
     smallest gets ``1 / n``.
     """
     for pos, target in enumerate(indx):
         self.err[target] = math.sqrt(error[pos])

     # rankdata assigns 1 to the smallest error and n to the largest.
     ranks = ss.rankdata(self.err)
     n_items = len(ranks)
     self.prob = [1 / (n_items - rank + 1) for rank in ranks]
コード例 #46
0
 def check_case(values, method, expected):
     """Assert that ranking *values* with *method* yields *expected*."""
     assert_array_equal(rankdata(values, method=method), expected)
コード例 #47
0
 def ranked_mwr(mwr, mat, wcol):
     """Dense-rank column *wcol* of *mat* in place, then call ``direct_mwr``.

     The column is overwritten with its dense ranks (ties share a rank and
     no rank values are skipped) before *mwr*, *mat* and *wcol* are handed
     to ``direct_mwr``, whose result is returned unchanged.
     """
     dense_ranks = rankdata(mat[:, wcol], method='dense')
     mat[:, wcol] = dense_ranks
     return direct_mwr(mwr, mat, wcol)
コード例 #48
0
def rolling_rank(np_data):
    """Return the rank of the last element of *np_data* within the window.

    Ranks start at 1 for the smallest value; tied values all receive the
    lowest rank of their group (``method='min'``).
    """
    window_ranks = rankdata(np_data, method='min')
    return window_ranks[-1]
コード例 #49
0
def fitmodels_direct(catd,
                     mmix,
                     mask,
                     t2s,
                     t2s_full,
                     tes,
                     combmode,
                     ref_img,
                     reindex=False,
                     mmixN=None,
                     full_sel=True,
                     label=None,
                     out_dir='.',
                     verbose=False):
    """
    Fit TE-dependence and -independence models to components.

    Parameters
    ----------
    catd : (S x E x T) array_like
        Input data, where `S` is samples, `E` is echos, and `T` is time
    mmix : (T x C) array_like
        Mixing matrix for converting input data to component space, where `C`
        is components and `T` is the same as in `catd`
    mask : (S [x E]) array_like
        Boolean mask array. Only its first dimension is validated; the value
        is then replaced inside the function by ``t2s != 0``.
    t2s : (S [x T]) array_like
        Limited T2* map or timeseries.
    t2s_full : (S [x T]) array_like
        Full T2* map or timeseries. For voxels with good signal in only one
        echo, which are zeros in the limited T2* map, this map uses the T2*
        estimate using the first two echoes.
    tes : list
        List of echo times associated with `catd`, in milliseconds
    combmode : {'t2s', 'ste'} str
        How optimal combination of echos should be made, where 't2s' indicates
        using the method of Posse 1999 and 'ste' indicates using the method of
        Poser 2006
    ref_img : str or img_like
        Reference image to dictate how outputs are saved to disk
    reindex : bool, optional
        Whether to re-order components (and the per-component arrays) by
        descending Kappa. Default: False
    mmixN : array_like, optional
        Mixing matrix used to compute the un-normalized weight maps. When
        None, `mmix` is used. Default: None
    full_sel : bool, optional
        Whether to perform selection of components based on Rho/Kappa scores.
        When False, no spatial clustering is run and `seldict` is None.
        Default: True
    label : str, optional
        Prefix for the names of the files written when `verbose` is True.
        Default: None
    out_dir : str, optional
        Directory in which the `verbose` output files are written.
        Default: '.'
    verbose : bool, optional
        Whether to write the beta, predicted-value and metric-weight maps to
        disk. Default: False

    Returns
    -------
    seldict : dict or None
        Maps each of the names 'WTS', 'tsoc_B', 'PSC', 'Z_maps', 'F_R2_maps',
        'F_S0_maps', 'Z_clmaps', 'F_R2_clmaps', 'F_S0_clmaps', 'Br_R2_clmaps'
        and 'Br_S0_clmaps' to the array of that name computed here. None when
        `full_sel` is False.
    comptab : :obj:`pandas.DataFrame`
        One row per component (index named 'component') with the columns
        'kappa', 'rho', 'variance explained' and 'normalized variance
        explained'.
    betas : :obj:`numpy.ndarray`
        Coefficients returned by ``get_coeffs`` for `catd`, re-ordered along
        the last axis when `reindex` is True.
    mmix_new : :obj:`numpy.ndarray`
        Sign-corrected copy of `mmix`, with columns re-ordered when `reindex`
        is True.

    Raises
    ------
    ValueError
        If the array shapes are inconsistent with one another.
    """
    # NOTE(review): t2s_full is part of this check but is not mentioned in
    # the error message below.
    if not (catd.shape[0] == t2s.shape[0] == t2s_full.shape[0] ==
            mask.shape[0]):
        raise ValueError('First dimensions (number of samples) of catd ({0}), '
                         't2s ({1}), and mask ({2}) do not '
                         'match'.format(catd.shape[0], t2s.shape[0],
                                        mask.shape[0]))
    elif catd.shape[1] != len(tes):
        raise ValueError('Second dimension of catd ({0}) does not match '
                         'number of echoes provided (tes; '
                         '{1})'.format(catd.shape[1], len(tes)))
    elif catd.shape[2] != mmix.shape[0]:
        raise ValueError('Third dimension (number of volumes) of catd ({0}) '
                         'does not match first dimension of '
                         'mmix ({1})'.format(catd.shape[2], mmix.shape[0]))
    elif t2s.shape != t2s_full.shape:
        raise ValueError('Shape of t2s array {0} does not match shape of '
                         't2s_full array {1}'.format(t2s.shape,
                                                     t2s_full.shape))
    elif t2s.ndim == 2:
        if catd.shape[2] != t2s.shape[1]:
            raise ValueError('Third dimension (number of volumes) of catd '
                             '({0}) does not match second dimension of '
                             't2s ({1})'.format(catd.shape[2], t2s.shape[1]))

    # The caller's mask is discarded from here on: every later use of `mask`
    # refers to the non-zero entries of the limited T2* map.
    mask = t2s != 0  # Override mask because problems

    # compute optimal combination of raw data
    tsoc = combine.make_optcom(catd,
                               tes,
                               mask,
                               t2s=t2s_full,
                               combmode=combmode,
                               verbose=False).astype(float)[mask]

    # demean optimal combination
    tsoc_dm = tsoc - tsoc.mean(axis=-1, keepdims=True)

    # compute un-normalized weight dataset (features)
    if mmixN is None:
        mmixN = mmix
    WTS = computefeats2(utils.unmask(tsoc, mask), mmixN, mask, normalize=False)

    # compute PSC dataset - shouldn't have to refit data
    tsoc_B = get_coeffs(tsoc_dm, mmix, mask=None)
    tsoc_Babs = np.abs(tsoc_B)
    # coefficients expressed as a percentage of each sample's mean signal
    PSC = tsoc_B / tsoc.mean(axis=-1, keepdims=True) * 100

    # compute skews to determine signs based on unnormalized weights,
    # correct mmix & WTS signs based on spatial distribution tails
    signs = stats.skew(WTS, axis=0)
    # reduce each skew to its sign (+1 / -1)
    signs /= np.abs(signs)
    # copy first so the caller's mixing matrix is not modified in place
    mmix = mmix.copy()
    mmix *= signs
    WTS *= signs
    PSC *= signs
    totvar = (tsoc_B**2).sum()
    totvar_norm = (WTS**2).sum()

    # compute Betas and means over TEs for TE-dependence analysis
    betas = get_coeffs(catd, mmix,
                       np.repeat(mask[:, np.newaxis], len(tes), axis=1))
    n_samp, n_echos, n_components = betas.shape
    n_voxels = mask.sum()
    n_data_voxels = (t2s != 0).sum()
    mu = catd.mean(axis=-1, dtype=float)
    tes = np.reshape(tes, (n_echos, 1))
    fmin, _, _ = utils.getfbounds(n_echos)

    # mask arrays
    mumask = mu[t2s != 0]
    t2smask = t2s[t2s != 0]
    betamask = betas[t2s != 0]

    # set up Xmats
    X1 = mumask.T  # Model 1
    X2 = np.tile(tes, (1, n_data_voxels)) * mumask.T / t2smask.T  # Model 2

    # tables for component selection
    kappas = np.zeros([n_components])
    rhos = np.zeros([n_components])
    varex = np.zeros([n_components])
    varex_norm = np.zeros([n_components])
    Z_maps = np.zeros([n_voxels, n_components])
    F_R2_maps = np.zeros([n_data_voxels, n_components])
    F_S0_maps = np.zeros([n_data_voxels, n_components])
    Z_clmaps = np.zeros([n_voxels, n_components])
    F_R2_clmaps = np.zeros([n_data_voxels, n_components])
    F_S0_clmaps = np.zeros([n_data_voxels, n_components])
    Br_R2_clmaps = np.zeros([n_voxels, n_components])
    Br_S0_clmaps = np.zeros([n_voxels, n_components])
    pred_R2_maps = np.zeros([n_data_voxels, n_echos, n_components])
    pred_S0_maps = np.zeros([n_data_voxels, n_echos, n_components])

    LGR.info('Fitting TE- and S0-dependent models to components')
    for i_comp in range(n_components):
        # size of B is (n_echoes, n_samples)
        B = np.atleast_3d(betamask)[:, :, i_comp].T
        # total sum of squares of the betas across echoes, per sample
        alpha = (np.abs(B)**2).sum(axis=0)
        varex[i_comp] = (tsoc_B[:, i_comp]**2).sum() / totvar * 100.
        varex_norm[i_comp] = (utils.unmask(WTS, mask)[t2s != 0][:, i_comp]**2).sum() /\
            totvar_norm * 100.

        # S0 Model
        # (S,) model coefficient map
        coeffs_S0 = (B * X1).sum(axis=0) / (X1**2).sum(axis=0)
        pred_S0 = X1 * np.tile(coeffs_S0, (n_echos, 1))
        pred_S0_maps[:, :, i_comp] = pred_S0.T
        SSE_S0 = (B - pred_S0)**2
        SSE_S0 = SSE_S0.sum(axis=0)  # (S,) prediction error map
        F_S0 = (alpha - SSE_S0) * (n_echos - 1) / (SSE_S0)
        F_S0_maps[:, i_comp] = F_S0

        # R2 Model
        coeffs_R2 = (B * X2).sum(axis=0) / (X2**2).sum(axis=0)
        pred_R2 = X2 * np.tile(coeffs_R2, (n_echos, 1))
        pred_R2_maps[:, :, i_comp] = pred_R2.T
        SSE_R2 = (B - pred_R2)**2
        SSE_R2 = SSE_R2.sum(axis=0)
        F_R2 = (alpha - SSE_R2) * (n_echos - 1) / (SSE_R2)
        F_R2_maps[:, i_comp] = F_R2

        # compute weights as Z-values
        wtsZ = (WTS[:, i_comp] - WTS[:, i_comp].mean()) / WTS[:, i_comp].std()
        # clip to +/- Z_MAX while keeping the sign
        wtsZ[np.abs(wtsZ) > Z_MAX] = (
            Z_MAX * (np.abs(wtsZ) / wtsZ))[np.abs(wtsZ) > Z_MAX]
        Z_maps[:, i_comp] = wtsZ

        # compute Kappa and Rho
        # cap the F-statistics, then average them weighted by squared Z
        F_S0[F_S0 > F_MAX] = F_MAX
        F_R2[F_R2 > F_MAX] = F_MAX
        norm_weights = np.abs(
            np.squeeze(utils.unmask(wtsZ, mask)[t2s != 0]**2.))
        kappas[i_comp] = np.average(F_R2, weights=norm_weights)
        rhos[i_comp] = np.average(F_S0, weights=norm_weights)

    # tabulate component values
    comptab = np.vstack([kappas, rhos, varex, varex_norm]).T
    if reindex:
        # re-index all components in Kappa order
        # (descending: argsort is reversed)
        sort_idx = comptab[:, 0].argsort()[::-1]
        comptab = comptab[sort_idx, :]
        mmix_new = mmix[:, sort_idx]
        betas = betas[..., sort_idx]
        pred_R2_maps = pred_R2_maps[:, :, sort_idx]
        pred_S0_maps = pred_S0_maps[:, :, sort_idx]
        F_S0_maps = F_S0_maps[:, sort_idx]
        F_R2_maps = F_R2_maps[:, sort_idx]
        Z_maps = Z_maps[:, sort_idx]
        WTS = WTS[:, sort_idx]
        PSC = PSC[:, sort_idx]
        tsoc_B = tsoc_B[:, sort_idx]
        tsoc_Babs = tsoc_Babs[:, sort_idx]
    else:
        mmix_new = mmix

    if verbose:
        # NOTE(review): `label` defaults to None, and `label + '...'` below
        # would then raise TypeError; callers presumably always pass a string
        # together with verbose=True. Confirm.
        # Echo-specific weight maps for each of the ICA components.
        io.filewrite(betas, op.join(out_dir, label + 'betas_catd.nii'),
                     ref_img)
        # Echo-specific maps of predicted values for R2 and S0 models for each
        # component.
        io.filewrite(utils.unmask(pred_R2_maps, mask),
                     op.join(out_dir, label + 'R2_pred.nii'), ref_img)
        io.filewrite(utils.unmask(pred_S0_maps, mask),
                     op.join(out_dir, label + 'S0_pred.nii'), ref_img)
        # Weight maps used to average metrics across voxels
        io.filewrite(utils.unmask(Z_maps**2., mask),
                     op.join(out_dir, label + 'metric_weights.nii'), ref_img)

    comptab = pd.DataFrame(comptab,
                           columns=[
                               'kappa', 'rho', 'variance explained',
                               'normalized variance explained'
                           ])
    comptab.index.name = 'component'

    # full selection including clustering criteria
    seldict = None
    if full_sel:
        LGR.info('Performing spatial clustering of components')
        # minimum cluster size: 0.05% of the masked voxels plus 5, never
        # less than 20
        csize = np.max([int(n_voxels * 0.0005) + 5, 20])
        LGR.debug('Using minimum cluster size: {}'.format(csize))
        for i_comp in range(n_components):
            # save out files
            # columns: 0 = PSC, 1 = F_R2, 2 = F_S0, 3 = Z
            out = np.zeros((n_samp, 4))
            out[:, 0] = np.squeeze(utils.unmask(PSC[:, i_comp], mask))
            out[:, 1] = np.squeeze(utils.unmask(F_R2_maps[:, i_comp],
                                                t2s != 0))
            out[:, 2] = np.squeeze(utils.unmask(F_S0_maps[:, i_comp],
                                                t2s != 0))
            out[:, 3] = np.squeeze(utils.unmask(Z_maps[:, i_comp], mask))

            ccimg = io.new_nii_like(ref_img, out)

            # Do simple clustering on F
            sel = spatclust(ccimg,
                            min_cluster_size=csize,
                            threshold=fmin,
                            index=[1, 2],
                            mask=(t2s != 0))
            F_R2_clmaps[:, i_comp] = sel[:, 0]
            F_S0_clmaps[:, i_comp] = sel[:, 1]
            # sums of the cluster maps; used below to set the rank thresholds
            countsigFR2 = F_R2_clmaps[:, i_comp].sum()
            countsigFS0 = F_S0_clmaps[:, i_comp].sum()

            # Do simple clustering on Z at p<0.05
            sel = spatclust(ccimg,
                            min_cluster_size=csize,
                            threshold=1.95,
                            index=3,
                            mask=mask)
            Z_clmaps[:, i_comp] = sel

            # Do simple clustering on ranked signal-change map
            spclust_input = utils.unmask(stats.rankdata(tsoc_Babs[:, i_comp]),
                                         mask)
            spclust_input = io.new_nii_like(ref_img, spclust_input)
            Br_R2_clmaps[:, i_comp] = spatclust(
                spclust_input,
                min_cluster_size=csize,
                threshold=max(tsoc_Babs.shape) - countsigFR2,
                mask=mask)
            Br_S0_clmaps[:, i_comp] = spatclust(
                spclust_input,
                min_cluster_size=csize,
                threshold=max(tsoc_Babs.shape) - countsigFS0,
                mask=mask)

        seldict = {}
        selvars = [
            'WTS', 'tsoc_B', 'PSC', 'Z_maps', 'F_R2_maps', 'F_S0_maps',
            'Z_clmaps', 'F_R2_clmaps', 'F_S0_clmaps', 'Br_R2_clmaps',
            'Br_S0_clmaps'
        ]
        # eval() is applied only to the fixed names listed above, to fetch the
        # local array of the same name.
        for vv in selvars:
            seldict[vv] = eval(vv)

    return seldict, comptab, betas, mmix_new
コード例 #50
0
#    plt.plot(prop_average_scores, 'b--^',label='PI_avg_score')
#    plt.legend(bbox_to_anchor=(0.7, 1))
#    plt.xlabel('proposal number')
#    plt.ylabel('scores')
#    plt.show()
    
    ######################## Step 3, Generate global ranking #################
    #generate score matrix and rank matrix for the assignment
    simulated_assignment_scores=np.zeros((n,m))
    simulated_assignment_ranks=np.zeros((n,m))
    for i in range(n):
        for j in range(m):
            ##fill in the simulated scores for the successful assignment        
            simulated_assignment_scores[i,j]=prop_scores[i,assignment[i,j]]
        ##Assign ranks to assignment_scores, dealing with ties appropriately
        simulated_assignment_ranks[i]=rankdata(simulated_assignment_scores[i])-1   
    
    
    prop_total_scores=np.zeros(n)
    MBC_total_scores=np.zeros(n)
    for i in range(n):
        #find index of assignment for each proposal
        find_index=np.where(assignment==i)
        prop_index=np.asarray(find_index).T.tolist()
        for (r,v) in prop_index:
            ##summarize simulated ranks from different PIs for each proposal
            prop_total_scores[i]+=simulated_assignment_ranks[r,v]
            MBC_total_scores[i]=prop_total_scores[i] / (m*(m-1))
            
    
    prop_global_ranks=rankdata(MBC_total_scores)-1