Example #1
0
	def significance_assessment(self, cscPairA, cscPairD, leftregion, rightregion, meta_chrome, arm, AmpPat, DelPat, chrm_genebkt):
		"""Annotate candidate pairs with permutation p-values, in place.

		A null distribution is built from ``self.num_permutation`` rounds:
		each round permutes the regions with
		``cna_utils.cycle_shift_permutation``, re-runs ``self.RAIG_algo`` and
		records the best score found (0 when the round yields no candidate),
		separately for the amplification (A) and deletion (D) results.

		The score of a candidate is ``2 * min(lcount, rcount)``.  Its raw
		p-value is the fraction of rounds whose best score is strictly
		greater; it is stored under ``'p-val'``.  The Benjamini/Hochberg
		adjusted value is stored under ``'corrected-p-val'``.

		Nothing is returned; ``cscPairA`` and ``cscPairD`` are mutated.
		When both are empty the method does nothing at all.
		"""
		def pair_score(entry):
			# score of one candidate: twice the smaller of its two side counts
			return 2 * min(entry['lcount'], entry['rcount'])

		def top_score(pairs):
			# best score of one permutation round, 0 when nothing was found
			if len(pairs.keys()) == 0:
				return 0
			return max([pair_score(pairs[cid]) for cid in pairs.keys()])

		def annotate(pairs, null_scores):
			# write raw and FDR-adjusted p-values into every entry of ``pairs``
			cids = list(pairs.keys())
			if len(cids) == 0:
				return
			raw_pvals = []
			for cid in cids:
				observed = pair_score(pairs[cid])
				exceed = len([s for s in null_scores if s > observed])
				raw = float(exceed) / self.num_permutation
				pairs[cid]['p-val'] = raw
				raw_pvals.append(raw)
			adjusted = smm.multipletests(raw_pvals, alpha=0.05, method='fdr_bh')[1]
			for cid, adj in zip(cids, adjusted):
				pairs[cid]['corrected-p-val'] = adj

		if len(cscPairA.keys()) == 0 and len(cscPairD.keys()) == 0:
			return

		null_scoresA, null_scoresD = [], []
		patients = AmpPat.union(DelPat)
		for _ in range(0, self.num_permutation):
			permute_regionA, permute_regionD = cna_utils.cycle_shift_permutation(
				self.dlcall.regionA[meta_chrome][arm],
				self.dlcall.regionD[meta_chrome][arm],
				leftregion, rightregion)
			pedgesetA, pedgesetD, pedgetoPatient, pedgewA, pedgewD, pposA, pposD = cna_utils.formatEdgeId(
				patients, permute_regionA, permute_regionD)
			pcscPairA, pcscPairD = self.RAIG_algo(
				pedgesetA, pedgesetD, pedgetoPatient, pedgewA, pedgewD,
				pposA, pposD, chrm_genebkt, len(AmpPat), len(DelPat))
			null_scoresA.append(top_score(pcscPairA))
			null_scoresD.append(top_score(pcscPairD))

		annotate(cscPairA, null_scoresA)
		annotate(cscPairD, null_scoresD)
Example #2
0
def test_issorted(method):
    """Check that ``is_sorted=True`` gives the same answer as unsorted input.

    The same p-values are passed once in their original order and once
    pre-sorted with ``is_sorted=True``.  After mapping the sorted results
    back to the original order, both the reject decisions and the corrected
    p-values have to agree.

    The fdrcorrection functions are tested indirectly through this.
    """
    # data generated as random numbers np.random.beta(0.2, 0.5, size=10)
    pvals = np.array([31, 9958111, 7430818, 8653643, 9892855, 876, 2651691,
                      145836, 9931, 6174747]) * 1e-7
    sortind = np.argsort(pvals)
    # inverse permutation: position of each original element in sorted order
    sortrevind = sortind.argsort()
    pvals_sorted = pvals[sortind]

    res1 = multipletests(pvals, method=method, is_sorted=False)
    res2 = multipletests(pvals_sorted, method=method, is_sorted=True)
    # element 0: boolean reject decisions
    assert_equal(res2[0][sortrevind], res1[0])
    # element 1: corrected p-values (previously element 0 was checked twice,
    # so the corrected p-values were never compared)
    assert_allclose(res2[1][sortrevind], res1[1], rtol=1e-10)
Example #3
0
def get_p_values(dat):
    """Compare the 'Before' and 'After' regions feature by feature.

    For every name in the module-level ``feat_avg_names`` a two-sample
    ``ttest_ind`` is run between the non-missing values of the rows whose
    ``'region'`` is ``'Before'`` and those whose ``'region'`` is ``'After'``.
    Features with no data on either side are left out of the result.

    Parameters
    ----------
    dat : pandas.DataFrame
        Must contain a ``'region'`` column and one column per feature.

    Returns
    -------
    p_values : pandas.Series
        Raw p-values indexed by feature name, sorted ascending, NaN removed.
    pvals_corrected : pandas.Series
        ``fdr_tsbky`` corrected p-values with the same index, or an empty
        float Series when no feature could be tested.
    """
    before = dat[dat['region'] == 'Before']
    after = dat[dat['region'] == 'After']

    feats = []
    raw = []
    for feat in feat_avg_names:
        x = before[feat].dropna()
        y = after[feat].dropna()

        if x.size > 0 and y.size > 0:
            _, p = ttest_ind(x, y)
        else:
            p = np.nan
        feats.append(feat)
        raw.append(p)

    # Building the Series from two lists (instead of unpacking zip(*pairs))
    # keeps an empty feature list from raising ValueError.
    p_values = pd.Series(raw, index=feats, dtype=float).dropna()
    p_values = p_values.sort_values(ascending=True)

    if p_values.size > 0:
        corrected = smm.multipletests(p_values.values, method='fdr_tsbky')[1]
        pvals_corrected = pd.Series(corrected, index=p_values.index)
    else:
        # explicit dtype: a bare pd.Series() has an ambiguous default dtype
        pvals_corrected = pd.Series(dtype=float)
    return p_values, pvals_corrected
Example #4
0
def multi_correct(data, meth='fdr_bh'):
    """
    Apply a multiple-testing correction to an array of p values.

    Parameters:
    -----------
    data : numpy array
        p values to correct; passed unchanged to
        ``smm.multipletests``
    meth : str
        Method of correction. Options are:
            `bonferroni` : one-step correction
            `sidak` : one-step correction
            `holm-sidak` :
            `holm` :
            `simes-hochberg` :
            `hommel` :
            `fdr_bh` : Benjamini/Hochberg (default)
            `fdr_by` : Benjamini/Yekutieli

    Returns:
    ----------
    corrp : numpy array
        corrected p values (second element of the ``multipletests`` result,
        evaluated with alpha=0.05)
    """
    outcome = smm.multipletests(data, alpha=0.05, method=meth)
    _rejected, corrected, _alpha_sidak, _alpha_bonf = outcome
    return corrected
Example #5
0
def test_hommel():
    # Regression test for method='ho': the corrected p-values and the reject
    # decisions at alpha=0.1 are compared with a stored reference table.
    # tested against R stats p_adjust(pval0, method='hommel')

    # raw p-values, already in ascending order
    pval0 = np.array(
              [ 0.00116,  0.00924,  0.01075,  0.01437,  0.01784,  0.01918,
                0.02751,  0.02871,  0.03054,  0.03246,  0.04259,  0.06879,
                0.0691 ,  0.08081,  0.08593,  0.08993,  0.09386,  0.09412,
                0.09718,  0.09758,  0.09781,  0.09788,  0.13282,  0.20191,
                0.21757,  0.24031,  0.26061,  0.26762,  0.29474,  0.32901,
                0.41386,  0.51479,  0.52461,  0.53389,  0.56276,  0.62967,
                0.72178,  0.73403,  0.87182,  0.95384])

    # expected corrected p-values, one per entry of pval0
    result_ho = np.array(
              [ 0.0464            ,  0.25872           ,  0.29025           ,
                0.3495714285714286,  0.41032           ,  0.44114           ,
                0.57771           ,  0.60291           ,  0.618954          ,
                0.6492            ,  0.7402725000000001,  0.86749           ,
                0.86749           ,  0.8889100000000001,  0.8971477777777778,
                0.8993            ,  0.9175374999999999,  0.9175374999999999,
                0.9175374999999999,  0.9175374999999999,  0.9175374999999999,
                0.9175374999999999,  0.95384           ,  0.9538400000000001,
                0.9538400000000001,  0.9538400000000001,  0.9538400000000001,
                0.9538400000000001,  0.9538400000000001,  0.9538400000000001,
                0.9538400000000001,  0.9538400000000001,  0.9538400000000001,
                0.9538400000000001,  0.9538400000000001,  0.9538400000000001,
                0.9538400000000001,  0.9538400000000001,  0.9538400000000001,
                0.9538400000000001])

    # NOTE(review): 'ho' is presumably the short alias for the Hommel
    # method -- confirm against the multipletests method table.
    rej, pvalscorr, _, _ = multipletests(pval0, alpha=0.1, method='ho')
    # corrected p-values must match the reference to 15 decimals
    assert_almost_equal(pvalscorr, result_ho, 15)
    # reject flags must agree with thresholding the reference at alpha
    assert_equal(rej, result_ho < 0.1)  #booleans
Example #6
0
def DEGI(gctfile,clsfile,number):
    """Print the genes that are differentially expressed between two classes.

    For every gene the absolute difference between the class "0" mean and the
    class "1" mean is compared with the same statistic computed on ``number``
    random shufflings of the class labels.  The permutation p-value is the
    fraction of shufflings whose difference is at least as large as the
    observed one.  The p-values are then adjusted with Benjamini/Hochberg
    (alpha=0.05) and every significant gene is printed.

    Parameters
    ----------
    gctfile : str
        Path to a tab-delimited expression file.  The first two lines are
        skipped, the following row is treated as a header, column 1 holds
        the gene names and columns 2 onward the expression values.
    clsfile : str
        Path to a class file whose third line holds the whitespace-separated
        class labels ("0" or "1"), one per expression column.
    number : int
        Number of label permutations per gene.

    Returns
    -------
    None
        Results are printed only.
    """
    #open and save input files
    with open(gctfile) as gct_handle:
        table=numpy.genfromtxt(gct_handle,dtype=None,delimiter="\t",missing_values="NA",invalid_raise=False,skip_header=2)
    gct_exp=table[1:,2:].astype(float) #matrix of expression values
    gct_genes=table[1:,1] #list of gene names
    with open(clsfile) as cls_handle:
        label=cls_handle.read().splitlines()[2].split() #list of class labels

    def mean_diff(row,labels):
        #absolute difference between the class "0" and class "1" means
        class0=[row[j] for j in range(len(labels)) if labels[j]=="0"]
        class1=[row[j] for j in range(len(labels)) if labels[j]=="1"]
        return abs(sum(class0)/len(class0)-sum(class1)/len(class1))

    pvals=[]
    for i in range(len(gct_genes)):
        row=gct_exp[i]
        #difference in means with the original labels
        observed_diff=mean_diff(row,label)

        #p-value = proportion of permuted differences that are at least as
        #extreme as the observed one.  (The comparison used to be the other
        #way round, which gave strongly separated genes p-values near 1.)
        extreme=0.
        for _ in range(number):
            label_shuffle=numpy.random.permutation(label)
            if mean_diff(row,label_shuffle)>=observed_diff:
                extreme+=1.
        pvals.append(extreme/number)

    #correct for multiple hypothesis tests using benjamini-hochberg
    bh=smm.multipletests(pvals,alpha=0.05,method='fdr_bh')
    bh_sig=bh[0]
    bh_pvals=bh[1].astype(str)

    sig=0
    for i in range(len(bh_sig)):
        if bh_sig[i]:
            #single-argument print(...) behaves the same on Python 2 and 3
            print(gct_genes[i]+" is differentially expressed.\nThe adjusted p-value is "+bh_pvals[i]+"\n")
            sig+=1
    if sig==0:
        print("There are no differentially expressed genes.")
Example #7
0
    def pval_corrected(self, method=None):
        '''Return ``self.pvals_raw`` corrected for multiple testing.

        When ``method`` is None the instance default stored in
        ``self.multitest_method`` is used instead.

        '''
        import statsmodels.stats.multitest as smt
        chosen = self.multitest_method if method is None else method
        #TODO: breaks with method=None
        corrected = smt.multipletests(self.pvals_raw, method=chosen)[1]
        return corrected
Example #8
0
def test_pvalcorrection_reject(alpha, method, ii):
    """Reject flags must equal ``pvalscorr <= alpha`` for the given case.

    The ten raw p-values are ``ii`` small ones followed by ``10 - ii``
    larger ones.
    """
    small = np.linspace(0.0001, 0.0100, ii)
    # using .05001 instead of 0.05 to avoid edge case issue #768
    large = np.linspace(0.05001, 0.11, 10 - ii)
    pval1 = np.hstack((small, large))

    outcome = multipletests(pval1, alpha=alpha, method=method)
    reject, pvalscorr = outcome[:2]

    details = (method, alpha, reject.sum(), pval1, pvalscorr)
    msg = 'case %s %3.2f rejected:%d\npval_raw=%r\npvalscorr=%r' % details
    assert_equal(reject, pvalscorr <= alpha, err_msg=msg)
Example #9
0
    def test_multi_pvalcorrection_rmethods(self, key, val):
        """Compare one correction method with the stored reference table.

        ``val`` is a ``(column, method)`` pair: ``column`` selects the
        expected values in ``self.res2`` and ``method`` is the name passed
        to ``multipletests``.  Methods not in ``self.methods`` are skipped.
        Reference values: R package multtest mt.rawp2adjp.
        """
        column, name = val[0], val[1]
        if name not in self.methods:
            return

        reference = self.res2
        raw = reference[:, 0]
        outcome = multipletests(raw, alpha=self.alpha, method=name)
        reject, pvalscorr = outcome[:2]
        assert_almost_equal(pvalscorr, reference[:, column], 15)
        assert_equal(reject, pvalscorr <= self.alpha)
Example #10
0
    def get_score_df(self, correction_method=None):
        '''
        Compute Cohen's d and Hedges' r effect sizes per term.

        The matrix from ``self._get_X()`` is divided by its row sums and
        split by ``self._get_cat_and_ncat``.  Means and variances are taken
        over ``axis=0`` (rows), giving one value per column; the result is
        indexed by ``self.corpus_.get_terms()``.  NaN entries of the final
        table are replaced by 0.

        :param correction_method: str or None, correction method from statsmodels.stats.multitest.multipletests
         'fdr_bh' is recommended.  When given, ``cohens_d_p_corr`` and
         ``hedges_r_p_corr`` columns are added; rows where both ``m1`` and
         ``m2`` are 0 keep the placeholder value 0.5 and are left out of the
         correction.
        :return: pd.DataFrame
        '''
        # From https://people.kth.se/~lang/Effect_size.pdf
        # Shinichi Nakagawa1 and Innes C. Cuthill. 2007. In Biological Reviews 82.
        X = self._get_X().astype(np.float64)
        X = X / X.sum(axis=1)
        cat_X, ncat_X = self._get_cat_and_ncat(X)
        # Group sizes are the number of rows, because the means and variances
        # below are computed over axis=0.  The column count (shape[1]) is the
        # number of terms and is identical for both groups, so it cannot be a
        # sample size.
        n1, n2 = float(cat_X.shape[0]), float(ncat_X.shape[0])
        n = n1 + n2
        m1 = cat_X.mean(axis=0).A1
        m2 = ncat_X.mean(axis=0).A1
        v1 = cat_X.var(axis=0).A1
        v2 = ncat_X.var(axis=0).A1
        s_pooled = np.sqrt(((n2 - 1) * v2 + (n1 - 1) * v1) / (n - 2.))
        cohens_d = (m1 - m2) / s_pooled
        cohens_d_se = np.sqrt(((n - 1.) / (n - 3)) * (4. / n) * (1 + np.square(cohens_d)))
        cohens_d_z = cohens_d / cohens_d_se
        cohens_d_p = norm.sf(cohens_d_z)
        hedges_r = cohens_d * (1 - 3. / ((4. * (n - 2)) - 1))
        hedges_r_se = np.sqrt(n / (n1 * n2) + np.square(hedges_r) / (n - 2.))
        hedges_r_z = hedges_r / hedges_r_se
        hedges_r_p = norm.sf(hedges_r_z)

        score_df = pd.DataFrame({
            'cohens_d': cohens_d,
            'cohens_d_se': cohens_d_se,
            'cohens_d_z': cohens_d_z,
            'cohens_d_p': cohens_d_p,
            'hedges_r': hedges_r,
            'hedges_r_se': hedges_r_se,
            'hedges_r_z': hedges_r_z,
            'hedges_r_p': hedges_r_p,
            'm1': m1,
            'm2': m2,
        }, index=self.corpus_.get_terms()).fillna(0)
        if correction_method is None:
            return score_df

        from statsmodels.stats.multitest import multipletests
        # created first so the historical column order is preserved
        score_df['hedges_r_p_corr'] = 0.5
        # only terms that occur in at least one group take part in the correction
        observed = (score_df['m1'] != 0) | (score_df['m2'] != 0)
        for method in ['cohens_d', 'hedges_r']:
            corrected_col = method + '_p_corr'
            score_df[corrected_col] = 0.5
            score_df.loc[observed, corrected_col] = multipletests(
                score_df.loc[observed, method + '_p'],
                method=correction_method)[1]

        return score_df
Example #11
0
    def is_from_null(self,alpha,samples,chane_prob):
        """Test every column of ``samples`` and combine with Holm correction.

        For each column index a statistic is obtained from
        ``self.tester.get_statistic_multiple_dim`` and turned into a p-value
        by ``self.tester.compute_pvalues_for_processes`` using
        ``10 * int(n_columns / alpha)`` bootstrap draws.  The raw p-values
        are printed.

        Returns a pair: whether any column is rejected at level ``alpha``
        after Holm correction, and the corrected p-values.
        """
        n_dims = samples.shape[1]
        n_boots = 10*int(n_dims/alpha)
        pvals = np.zeros(n_dims)
        for d in range(n_dims):
            stat,_ = self.tester.get_statistic_multiple_dim(samples,d)
            pvals[d] = self.tester.compute_pvalues_for_processes(stat,chane_prob,n_boots)

        print(pvals)
        rejected, corrected, _, _ = multipletests(pvals,alpha,method='holm')
        return any(rejected),corrected
Example #12
0
def test_pvalcorrection_reject():
    """Reject flags must equal ``pvalscorr <= alpha`` for every case.

    Every combination of three alpha levels, eleven method names and eleven
    splits of ten p-values (``ii`` small ones, ``10 - ii`` larger ones) is
    checked.
    """
    alphas = [0.01, 0.05, 0.1]
    methods = ['b', 's', 'sh', 'hs', 'h', 'hommel', 'fdr_i', 'fdr_n',
               'fdr_tsbky', 'fdr_tsbh', 'fdr_gbs']
    cases = [(alpha, method, ii)
             for alpha in alphas
             for method in methods
             for ii in range(11)]

    for alpha, method, ii in cases:
        small = np.linspace(0.0001, 0.0100, ii)
        # using .05001 instead of 0.05 to avoid edge case issue #768
        large = np.linspace(0.05001, 0.11, 10 - ii)
        pval1 = np.hstack((small, large))
        outcome = multipletests(pval1, alpha=alpha, method=method)
        reject, pvalscorr = outcome[:2]
        details = (method, alpha, reject.sum(), pval1, pvalscorr)
        msg = 'case %s %3.2f rejected:%d\npval_raw=%r\npvalscorr=%r' % details
        assert_equal(reject, pvalscorr <= alpha, err_msg=msg)
Example #13
0
def correct_enrichment_pvalues(enrichments, method, sig_cutoff):
    """Correct each enrichment result and keep only the significant sets.

    Parameters
    ----------
    enrichments : iterable of dict
        Each dict maps a gene-set name to its raw p-value.
    method : str or None
        Correction method passed to ``smm.multipletests``.  ``None`` or the
        string ``'none'`` disables the correction; the raw p-values are then
        compared with ``sig_cutoff`` directly.
    sig_cutoff : float
        Significance level.  A gene set is kept when its (corrected)
        p-value is at most this value.

    Returns
    -------
    list of dict
        One dict per input dict, mapping the names of the significant gene
        sets to their (corrected) p-values.  An empty input dict gives an
        empty output dict.
    """
    corrected_enrichments = []
    for enrichment in enrichments:
        # Materialise names and values in one consistent order; dict views
        # cannot be indexed or compared with a number.
        gene_set_names = list(enrichment.keys())
        pvalues = np.asarray([enrichment[name] for name in gene_set_names],
                             dtype=float)
        if pvalues.size == 0:
            corrected_enrichments.append({})
            continue

        if method == 'none' or method is None:
            corrected_pvalues = pvalues
            # significant means small p-value, the same rule that
            # multipletests applies to the corrected values
            reject = pvalues <= sig_cutoff
        else:
            reject, corrected_pvalues, _, _ = smm.multipletests(
                pvalues, alpha=sig_cutoff, method=method)

        accepted_pvalues = {}
        for name, pval, keep in zip(gene_set_names, corrected_pvalues, reject):
            if keep:
                accepted_pvalues[name] = float(pval)
        corrected_enrichments.append(accepted_pvalues)
    return corrected_enrichments
    def __call__(self, track):
        """Summarise how many rows of a track are significant after BH.

        The table returned by ``self.openFile(track)`` is read as
        tab-separated text whose first line is a header; its columns are
        renamed to ``contig``, ``start`` and ``p``.  The ``p`` column is
        adjusted with ``fdr_bh`` and stored as ``qvalues``.

        Returns a dict with the keys ``Bases`` (number of rows),
        ``Significant`` (rows with q-value below 0.01) and
        ``Fraction_Significant`` (their ratio).
        """
        # single-argument print(...) behaves the same on Python 2 and 3,
        # whereas the print statement is a syntax error on Python 3
        print("Reading %s" % track)
        data = pandas.read_csv(self.openFile(track),
                               header=0,
                               names=["contig", "start", "p"],
                               sep="\t")
        print("Done")
        data["qvalues"] = multipletests(data["p"], method="fdr_bh")[1]

        output = dict()

        output["Bases"] = data.shape[0]
        output["Significant"] = (data["qvalues"] < 0.01).sum()
        output["Fraction_Significant"] = \
            float(output["Significant"])/output["Bases"]

        return output
Example #15
0
def test_multi_pvalcorrection():
    """Compare corrected p-values with the stored reference table.

    Reference values: R package multtest mt.rawp2adjp.  The results of
    ``multipletests`` are reordered with a fixed index before the
    comparison; the ``fdrcorrection`` results are sorted instead.
    """
    rmethods = {'rawp':(0,'pval'), 'Bonferroni':(1,'b'), 'Holm':(2,'h'),
                'Hochberg':(3,'sh'), 'SidakSS':(4,'s'), 'SidakSD':(5,'hs'),
                'BH':(6,'fdr_i'), 'BY':(7,'fdr_n')}
    checked = ['b', 's', 'sh', 'hs', 'h', 'fdr_i', 'fdr_n']
    # ordering that lines the results up with the rows of the reference table
    r_sortindex = [6, 8, 9, 7, 5, 1, 2, 4, 0, 3]

    for column, code in rmethods.values():
        if code not in checked:
            continue
        corrected = multipletests(pval0, alpha=0.1, method=code)[1]
        assert_almost_equal(corrected[r_sortindex], res_multtest[:,column], 15)

    for fdr_method, column in (('n', 7), ('i', 6)):
        corrected = np.sort(fdrcorrection(pval0, method=fdr_method)[1])
        assert_almost_equal(corrected, res_multtest[:,column], 15)
Example #16
0
    def test_multi_pvalcorrection(self):
        """Compare corrected p-values with the reference table ``self.res2``.

        Reference values: R package multtest mt.rawp2adjp.  Column 0 of the
        table holds the raw p-values.  Every entry of ``rmethods`` whose
        method name is in ``self.methods`` is checked for both the corrected
        values and the reject flags; ``fdrcorrection`` is checked on sorted
        output against columns 7 and 6.
        """
        reference = self.res2
        raw = reference[:,0]

        for _, (column, code) in iteritems(rmethods):
            if code not in self.methods:
                continue
            outcome = multipletests(raw, alpha=self.alpha, method=code)
            reject, pvalscorr = outcome[:2]
            assert_almost_equal(pvalscorr, reference[:,column], 15)
            assert_equal(reject, pvalscorr <= self.alpha)

        for fdr_method, column in (('n', 7), ('i', 6)):
            pvalscorr = np.sort(fdrcorrection(raw, method=fdr_method)[1])
            assert_almost_equal(pvalscorr, reference[:,column], 15)
Example #17
0
def compute_q_values(contingencies, bonferroni_count=None):
    """Compute a chi-square p-value and an adjusted q-value per table.

    Parameters
    ----------
    contingencies : dict
        Maps ``(target, event)`` pairs to contingency tables accepted by
        ``stats.chi2_contingency``.
    bonferroni_count : number or None
        When truthy, q-values are Bonferroni adjusted with this number of
        tests: ``min(1, p * bonferroni_count)``.  Otherwise the Holm
        correction from ``multipletests`` is applied to the p-values.

    Returns
    -------
    target_event_pairs : list of tuple
        The dictionary keys, in iteration order.
    p_vals : list of float
        Raw p-values, aligned with ``target_event_pairs``.
    q_vals : sequence of float
        Adjusted p-values, aligned with ``target_event_pairs``.  All three
        results are empty lists when ``contingencies`` is empty.
    """
    logging.info("Computing p and q-values")
    target_event_pairs = []
    p_vals = []
    # .items() works on Python 2 and 3; .iteritems() exists only on Python 2
    for (target, event), table in contingencies.items():
        _chi2, pvalue, _dof, _expected = stats.chi2_contingency(table)
        target_event_pairs.append((target, event))
        p_vals.append(pvalue)

    if not p_vals:
        # nothing to adjust
        return target_event_pairs, p_vals, []

    if bonferroni_count:
        logging.info("Using Bonferroni correction for q-value calculations")
        n_tests = float(bonferroni_count)
        # an adjusted p-value is a probability, so it is capped at 1
        q_vals = [min(1.0, pval * n_tests) for pval in p_vals]
    else:
        logging.info("Using Holm correction for q-value calculations")
        q_vals = multipletests(p_vals, alpha=0.05, method='holm')[1]
    return target_event_pairs, p_vals, q_vals
Example #18
0
def test_associations(data, test_types=("two-sided",), threshold=None, corr_method="fdr_bh", associations=None):
    """Test pairs of columns for association and adjust the p-values.

    Every pair in ``associations`` (all 2-combinations of ``data.columns``
    when None) is passed to ``test_association`` once per entry of
    ``test_types``.  The p-values are adjusted with ``corr_method`` and the
    table is sorted by the adjusted value.  When ``threshold`` is given only
    rows with ``p_value_adj <= threshold`` are returned.

    Returns a DataFrame with the columns ``a``, ``b``, ``test_type``,
    ``p_value`` and ``p_value_adj``.
    """
    if associations is None:
        pairs = itertools.combinations(data.columns, 2)
    else:
        pairs = associations

    rows = []
    for a, b in pairs:
        for test_type in test_types:
            p_value = test_association(data[[a, b]], test_type=test_type)
            rows.append((a, b, test_type, p_value))

    frame = pd.DataFrame(rows, columns=["a", "b", "test_type", "p_value"])
    adjusted = multipletests(frame["p_value"], method=corr_method)[1]
    frame["p_value_adj"] = adjusted
    frame = frame.sort_values(by="p_value_adj")

    if threshold is None:
        return frame
    return frame.query("p_value_adj <= {}".format(threshold))
def calc_kruskal(x, sample_num_l, alpha):
	"""Run a Kruskal-Wallis test on one row and flag fully separated groups.

	``x[1:]`` (the first element is an id) is split into groups with
	``split_list(x[1:], sample_num_l)`` and tested with ``stats.kruskal``.
	When the test is significant at ``alpha``, every pair of groups is
	compared with ``stats.mannwhitneyu``; the pairwise p-values are doubled,
	capped at 1 and corrected with Benjamini/Hochberg.

	Returns ``x`` extended by two strings: the Kruskal-Wallis p-value and a
	flag that is ``'1'`` only when every pairwise comparison is rejected.
	``['1.00', '0']`` is appended when the Kruskal-Wallis test raises
	ValueError or yields NaN.
	"""
	tmp_input_l = split_list(x[1:],sample_num_l) #ignore id column

	try:
		h,p = stats.kruskal(*tmp_input_l) #run kruskal-wallis test
	except ValueError:
		return x+['1.00','0']

	if math.isnan(p):
		return x+['1.00','0']

	# repr(float(p)) replaces the Python-2-only backtick syntax; the float()
	# keeps numpy-specific reprs out of the output
	p_text = repr(float(p))

	if not p < alpha:
		return x+[p_text,'0']

	num = len(sample_num_l)
	pval_l = []
	for i in range(num-1):
		for j in range(i+1, num):
			try:
				tmp_p = stats.mannwhitneyu(tmp_input_l[i],tmp_input_l[j])[1]
			except ValueError:
				tmp_p = 0.5
			# The doubling assumes mannwhitneyu returned a one-sided p-value.
			# NOTE(review): newer SciPy versions return a two-sided p-value by
			# default -- confirm which SciPy this runs on.  The cap keeps the
			# value a valid probability either way.
			pval_l.append(min(1.0, tmp_p*2))

	rej = smm.multipletests(pval_l, alpha=alpha, method='fdr_bh')[0] # fdr correction

	# flag is 1 only if every pairwise comparison is rejected
	flag = '1' if all(rej) else '0'
	return x+[p_text,flag]
def main(table_fpath, fdr=.1):
    """Print, per table row, which of its first two p-values are below .05.

    Lines containing ``#`` are skipped.  From every remaining line that has
    exactly five tab-separated fields, the last four are read as p-values.
    All p-values are passed through ``fdr_bh`` at level ``fdr``; the
    resulting decisions are computed but not printed.
    """
    collected = []
    with open(table_fpath) as tables_file:
        for line in tables_file:
            if '#' in line:
                continue
            fields = line.split('\t')
            if len(fields) != 5:
                continue
            collected.extend(float(field) for field in fields[1:])

    pvalues = np.asarray(collected)
    reject = multitest.multipletests(pvalues, fdr, method='fdr_bh')[0]
    n = reject.shape[0]
    n_rows = n // 4
    # first two p-value columns of every row; X holds the FDR decisions and
    # is currently unused
    X = reject.reshape((n_rows, 4))[:, 0:2]
    P = pvalues.reshape((n_rows, 4))[:, 0:2]

    for row in P:
        print(row < .05)
 def calcEnrichment(self, method='Fisher', correction='FDR'):
     """Test every property for enrichment among the foreground drugs.

     For each key ``k`` of ``self.bg_cid_prop_map`` a 2x2 table is built
     from the foreground (keys of ``self.fg_score_dict``), the rest of
     ``self.background`` and the ligand set ``self.db_prop_cid_map[k]``
     (rows: in F / not in F; cols: in L / not in L), and tested with
     ``stats.fisher_exact``.

     :param method: enrichment test; only ``'Fisher'`` is implemented.
     :param correction: ``'FDR'`` (Benjamini/Hochberg) or ``'Bonferroni'``
         (``min(1, p * number_of_tests)``).  Any other value prints a
         message and calls ``exit(1)``.
     :return: tuple ``(props, odds_r, n_r, p_val, p_adj)`` of parallel
         lists: property, odds ratio, ``"in F & in L/in L"`` counts as a
         string, raw p-value and adjusted p-value.
     :raises ValueError: if ``method`` is unsupported or not implemented.
     """
     if method not in self.__SUPP_METHODS:
         raise ValueError('\'%s\' is not a supported method' % method)
     if method != 'Fisher':
         raise ValueError('\'%s\' is not yet implemented' % method)

     p_val = list()
     odds_r = list()
     n_r = list()
     props = list()
     # set(d) and "for k in d" use the keys on Python 2 and 3 alike;
     # dict.viewkeys() exists only on Python 2
     foreground = set(self.fg_score_dict)
     not_foreground = self.background.difference(foreground)
     # test each property (k) independently for enrichment
     # (e.g. drug targets with ligand set L in foreground F)
     for k in self.bg_cid_prop_map:
         ligands = self.db_prop_cid_map[k]
         ct_11 = len(foreground.intersection(ligands)) # in F & in L
         ct_12 = len(foreground.difference(ligands)) # in F & not in L
         ct_21 = len(not_foreground.intersection(ligands)) # not in F & in L
         ct_22 = len(not_foreground.difference(ligands)) # not in F & not in L
         table = [[ct_11, ct_12], [ct_21, ct_22]]
         o, p = stats.fisher_exact(table)
         props.append(k)
         odds_r.append(o)
         n_r.append(str(ct_11)+'/'+str(ct_11+ct_21))
         p_val.append(p)

     # correct for multiple testing
     if correction=='FDR':
         p_adj = list(multitest.multipletests(p_val, method='fdr_bh')[1])
     elif correction=='Bonferroni':
         # an adjusted p-value is a probability, so it is capped at 1
         n_tests = len(p_val)
         p_adj = [min(1.0, p*n_tests) for p in p_val]
     else:
         # single-argument print(...) behaves the same on Python 2 and 3
         print('Unknown method for multiple hypothesis correction:')
         print(correction)
         print('Exiting')
         exit(1)
     return(props, odds_r, n_r, p_val, p_adj)
Example #22
0
    def get_corrected_pvalues(self, pvalues, method=None):
        """Return the p-values corrected for multiple testing

        :param list pvalues: list or array of pvalues to correct.
        :param method: correction method. When given, it is stored in
            ``self.method`` and therefore also used by later calls;
            otherwise the current ``self.method`` is used.
        :return: result of ``QValue(pvalues).qvalue()`` when the method is
            ``'qvalue'``, else the corrected p-values computed by
            ``multitest.multipletests`` with ``self.alpha``.
        """
        if method is not None:
            self.method = method

        pvalues = np.array(pvalues)

        if self.method != 'qvalue':
            outcome = multitest.multipletests(pvalues,
                alpha=self.alpha, method=self.method)
            return outcome[1]

        return QValue(pvalues).qvalue()
def getPValues(feat_mean, strain_list, feat_list):
    """Compare every strain with 'N2', feature by feature.

    Rows of ``feat_mean`` are grouped by the ``'Strain'`` column.  For each
    strain in ``strain_list`` and each feature in ``feat_list`` a Welch
    t-test (``equal_var=False``) is run between the 'N2' group and that
    strain.  Within each strain the non-NaN p-values are then replaced by
    their ``fdr_tsbky`` (two-stage Benjamini-Krieger-Yekutieli) corrected
    values.

    Returns a DataFrame of corrected p-values with features as rows and
    strains as columns.
    """
    by_strain = feat_mean.groupby('Strain')
    reference = by_strain.get_group('N2')

    pvalue_table = pd.DataFrame(np.nan, index = feat_list, columns = strain_list, dtype = np.float64)
    for strain in pvalue_table.columns.values:
        candidate = by_strain.get_group(strain)
        for feat in pvalue_table.index.values:
            outcome = ttest_ind(reference[feat].values, candidate[feat].values, equal_var=False)
            pvalue_table.loc[feat, strain] = outcome[1]

        # tests that produced NaN are left out of the correction
        good = ~np.isnan(pvalue_table[strain])
        corrected = smm.multipletests(pvalue_table.loc[good,strain].values, method = 'fdr_tsbky')[1]
        pvalue_table.loc[good,strain] = corrected
    return pvalue_table
Example #24
0
 def deg_stat(data, classes, pos, neg, adjust='fdr_bh'):
     '''
     Row-wise Welch t-test between two groups of columns.

     The columns of a copy of ``data`` are grouped by ``classes``; for each
     row the ``pos`` group is compared with the ``neg`` group using
     ``ttest_ind(..., equal_var=False)``.

     :param data: the pandas dataframe
     :param classes: the class vector used to group the columns
     :param pos: the positive class name
     :param neg: the negative class name
     :param adjust: the multipletests adjust method
     :return: a copy of ``data`` with three added columns: ``fold-change``
         (mean of ``pos`` minus mean of ``neg``), ``p-value`` and ``fdr``
         (adjusted p-value).
     '''
     result = data.copy()
     by_class = result.groupby(classes, axis=1)
     PDF = by_class.get_group(pos)
     CDF = by_class.get_group(neg)

     ttests = []
     for i in range(PDF.shape[0]):
         outcome = ttest_ind(PDF.iloc[i], CDF.iloc[i], equal_var=False)
         ttests.append(outcome[1])

     fc = PDF.mean(axis=1) - CDF.mean(axis=1)
     adjusted = multipletests(ttests, method=adjust)[1]
     result['fold-change'] = pd.Series(fc, index=result.index)
     result['p-value'] = pd.Series(ttests, index=result.index)
     result['fdr'] = pd.Series(adjusted, index=result.index)
     return result
Example #25
0
def multi_correct(data, noi_idx, meth='fdr_bh'):
    """
    Run fdr correction on nodes of interest contained in an array of p values.

    Only the entries above the diagonal of the nodes-of-interest sub-matrix
    are corrected; the result is mirrored to make it symmetric.  Entries
    that are exactly 0 are left out of the correction and stay 0.

    Parameters:
    -----------
    data : numpy array
        nnodes x nnodes array containing p values of correlation between each node
    noi_idx : numpy
        indices (applicable to both row and column) of nodes of interest. This
        reduces the number of nodes corrected for
    meth : str
        Method of correction. Options are:
            `bonferroni` : one-step correction
            `sidak` : one-step correction
            `holm-sidak` :
            `holm` :
            `simes-hochberg` :
            `hommel` :
            `fdr_bh` : Benjamini/Hochberg (default)
            `fdr_by` : Benjamini/Yekutieli

    Returns:
    ----------
    fdr_corrected : numpy array
        len(noi_idx) x len(noi_idx) symmetric array containing the corrected
        p values; the diagonal is 0
    """
    noi_data = data[np.ix_(noi_idx,noi_idx)]
    upper_rows, upper_cols = np.triu_indices_from(noi_data, k=1)
    upper_pvals = noi_data[upper_rows, upper_cols]

    # Select the tested entries with a mask over the triangle positions so
    # that every corrected value is written back to the cell it came from.
    # (Indexing the positions by a running counter misplaced all values
    # after the first zero entry.)
    tested = upper_pvals != 0

    fdr_corr_array = np.zeros((len(noi_idx),len(noi_idx)))
    if tested.any():
        corrp = smm.multipletests(upper_pvals[tested],
                                  alpha=0.05,
                                  method=meth)[1]
        fdr_corr_array[upper_rows[tested], upper_cols[tested]] = corrp
    return fdr_corr_array + fdr_corr_array.T
def BH_correct(data,indx,thresh):
    """Append Benjamini/Hochberg adjusted p-values to the rows that pass a filter.

    A row passes when ``float(row[indx]) > float(thresh)``.  For passing
    rows the numeric-looking fields (per ``is_number``) are converted to
    float, the last field is taken as the raw p-value and its ``fdr_bh``
    adjusted value is appended.  These rows are returned sorted by the
    adjusted value, then by the original last field.

    Rows that do not pass are reduced to ``[row[0], "-10000"]`` and placed
    after the passing rows, in input order.

    Returns the combined list of rows.
    """
    cutoff = float(thresh)
    datums = []
    vals = []
    d_exclude = []
    for lines in data:
        if float(lines[indx]) > cutoff:
            datums.append([float(num) if is_number(num) else num for num in lines])
            vals.append(float(lines[-1]))
        else:
            d_exclude.append([lines[0],"-10000"])

    if not vals:
        # nothing passed the filter, so there is nothing to correct
        return d_exclude

    bhs = ssm.multipletests(vals,method="fdr_bh")[1]
    # zip replaces the Python-2-only xrange index loop
    for datum, adjusted in zip(datums, bhs):
        datum.append(adjusted)
    datums.sort(key=itemgetter(-1,-2))
    return datums + d_exclude
Example #27
0
def t_test_multi(result, contrasts, method='hs', alpha=0.05, ci_method=None,
                 contrast_names=None):
    """perform t_test and add multiplicity correction to results dataframe

    Parameters
    ----------
    result results instance
        results of an estimated model
    contrasts : ndarray
        restriction matrix for t_test
    method : string or list or tuple of strings
        method for multiple testing p-value correction, default is 'hs'.
        Each method adds its own pair of columns.
    alpha : float
        significance level for multiple testing reject decision.
    ci_method : None
        not used yet, will be for multiplicity corrected confidence intervals
    contrast_names : list of strings or None
        If contrast_names are provided, then they are used in the index of the
        returned dataframe, otherwise some generic default names are created.

    Returns
    -------
    res_df : pandas DataFrame
        The dataframe contains the results of the t_test and, per method,
        a ``pvalue-<method>`` column with the multiplicity corrected
        p-values and a ``reject-<method>`` column with the boolean indicator
        for whether the Null hypothesis is rejected.
    """
    tt = result.t_test(contrasts)
    res_df = tt.summary_frame(xname=contrast_names)

    # isinstance also accepts tuples and list subclasses; the exact-type
    # check wrapped those as if they were a single method name
    if not isinstance(method, (list, tuple)):
        method = [method]
    for meth in method:
        mt = multipletests(tt.pvalue, method=meth, alpha=alpha)
        res_df['pvalue-%s' % meth] = mt[1]
        res_df['reject-%s' % meth] = mt[0]
    return res_df
Example #28
0
    def close(self):
        """Combine the collected results, optionally correct them, and write them out.

        The pieces in ``self.output`` are concatenated and sorted by index.
        When ``self.correct`` is set, the values are replaced by their
        Benjamini/Hochberg corrected p-values.  Significant windows are
        written to ``self.outfile_windows`` and per-base values to
        ``self.outfile_bases``, each only if that attribute is set.
        """

        output = pd.concat(self.output)
        output = output.sort_index()
        # last index entry after sorting
        E.debug("most 3' coordingate seen is %s" % (output.index.values[-1],))
        if self.correct:
            E.info("Correcting p-values using BH ...")
            # NOTE(review): presumably ``output`` is a Series of p-values at
            # this point -- it is rebuilt as a Series on the next line.
            corrected_pvals = multipletests(output, method="fdr_bh")
            # element 1 holds the corrected p-values; keep the original index
            output = pd.Series(corrected_pvals[1], index=output.index)

        E.info("Writing output")
        E.debug("output contains %i entries" % len(output))
 
        if self.outfile_windows:
            E.info("Writing windows")
            # only entries strictly below the threshold are passed on
            sig_windows = output[output < self.threshold]
            for gene in self.genes:
                windows = bases_to_windows(sig_windows, gene, self.window_size,
                                           self.threshold)
                # one line per returned window
                for bed in windows:
                    self.outfile_windows.write(str(bed) + "\n")

        if self.outfile_bases:
            E.info("Writing bases")
           
            # turn the index levels into ordinary columns
            output = output.reset_index()
            output.drop("strand", axis=1, inplace=True)
            output.drop("gene_id", axis=1, inplace=True)
            # keep the smallest value per (contig, position)
            output = output.groupby(["contig", "position"], as_index=False).min()
            output.position = output.position.astype("int64")
            # each base becomes a one-position interval
            output["end"] = output["position"] + 1
            # NOTE(review): column 0 is presumably the value column, named 0
            # by reset_index on an unnamed Series -- confirm.
            output = output[["contig", "position", "end", 0]]

            # tab-separated, no header row and no index column
            output.to_csv(self.outfile_bases,
                          sep="\t",
                          header=False,
                          index=False)
def outlier_test(model_results, method='bonf', alpha=.05, labels=None,
                 order=False, cutoff=None):
    """
    Test each observation of a fitted regression for being an outlier.

    Parameters
    ----------
    model_results : RegressionResults
        Linear model results. Must provide ``get_influence``.
    method : str
        Multiple-testing correction, passed on to ``multipletests``:

        - `bonferroni` : one-step correction
        - `sidak` : one-step correction
        - `holm-sidak` :
        - `holm` :
        - `simes-hochberg` :
        - `hommel` :
        - `fdr_bh` : Benjamini/Hochberg
        - `fdr_by` : Benjamini/Yekutieli

        See `statsmodels.stats.multitest.multipletests` for details.
    alpha : float
        Familywise error rate, passed on to ``multipletests``.
    labels : None or array_like
        Used as index of the returned DataFrame. When None, the
        ``row_labels`` attribute of ``model_results.model.data`` is used if it
        exists.
    order : bool
        If True, rows are sorted by decreasing absolute studentized
        residual. Labels, when available, are reordered the same way.
    cutoff : None or float in [0, 1]
        If given, only observations whose corrected p-value is strictly
        below the cutoff are returned. The result can be empty.

    Returns
    -------
    table : ndarray or DataFrame
        A DataFrame when labels are available, otherwise an ndarray. The
        columns are the studentized residuals, the unadjusted p-values and
        the p-values corrected according to `method`.

    Raises
    ------
    AttributeError
        If ``model_results`` has no ``get_influence`` attribute.

    Notes
    -----
    The unadjusted p-value is two-sided: 2 * stats.t.sf(abs(resid), df)
    where df = df_resid - 1.
    """
    from scipy import stats  # lazy import
    if labels is None:
        labels = getattr(model_results.model.data, 'row_labels', None)
    get_influence = getattr(model_results, 'get_influence', None)
    if get_influence is None:
        unwrapped = maybe_unwrap_results(model_results)
        raise AttributeError("model_results object %s does not have a "
                             "get_influence "
                             "method." % unwrapped.__class__.__name__)

    student_resid = get_influence().resid_studentized_external
    if order:
        # argsort is ascending, so reverse it to put the largest first
        descending = np.abs(student_resid).argsort()[::-1]
        student_resid = student_resid[descending]
        if labels is not None:
            labels = np.asarray(labels)[descending]

    dof = model_results.df_resid - 1
    raw_p = 2 * stats.t.sf(np.abs(student_resid), dof)
    corrected_p = multipletests(raw_p, alpha=alpha, method=method)[1]

    table = np.c_[student_resid, raw_p, corrected_p]
    if cutoff is None:
        keep = slice(None)
    else:
        keep = table[:, -1] < cutoff
        table = table[keep]

    if labels is None:
        return table

    from pandas import DataFrame
    return DataFrame(table,
                     columns=['student_resid', 'unadj_p', method + "(p)"],
                     index=np.asarray(labels)[keep])
Example #30
0
def html_report(outdir, infile, pwmfile, threshold=0.01):
    """Write an HTML report of the motifs that pass a corrected p-value cutoff.

    The statistics table in ``infile`` is read, a Benjamini/Hochberg
    corrected p-value column is added, and rows with a corrected p-value
    above ``threshold`` are discarded.  A logo image is rendered for every
    remaining motif into ``<outdir>/logos`` and the table is written to
    ``<outdir>/gimme.roc.report.html``.

    :param outdir: Output directory; ``<outdir>/logos`` is created if needed.
    :param infile: Tab-separated statistics file; the first column is used as
        index and a "P-value" column is required.
    :param pwmfile: Motif file passed to ``read_motifs``.  If a file with the
        same name but ending in ``.motif2factors.txt`` exists, its content is
        joined to the table as a "factors" column.
    :param threshold: Maximum corrected p-value (inclusive) for a motif to be
        reported.
    """
    df = pd.read_table(infile, index_col=0)
    # `del df.index.name` raises on recent pandas; assigning None is the
    # portable way to drop the index name.
    df.index.name = None
    df["corrected P-value"] = multipletests(df["P-value"], method="fdr_bh")[1]

    cols = [
        "Logo", "# matches", "# matches background", "P-value",
        "log10 P-value", "corrected P-value", "ROC AUC", "Enr. at 1% FPR",
        "Recall at 10% FDR"
    ]

    m2f = pwmfile.replace(".pwm", ".motif2factors.txt")
    if os.path.exists(m2f):
        sys.stderr.write("reading mapping\n")
        m2f = pd.read_table(m2f, index_col=0)
        m2f.columns = ["factors"]
        # Long factor lists are truncated to 30 characters in the cell; the
        # full text stays available as the tooltip (title attribute).
        f = m2f["factors"].str.len() > 30
        m2f["factors"] = '<div title="' + m2f["factors"] + '">' + m2f[
            "factors"].str.slice(0, 30)
        m2f.loc[f, "factors"] += '(...)'
        m2f['factors'] += '</div>'
        df = df.join(m2f)
        cols = ["factors"] + cols

    # Copy so that adding the "Logo" column does not write into a slice of
    # the unfiltered frame.
    df = df[df["corrected P-value"] <= threshold].copy()

    df["Logo"] = [
        '<img src="logos/{}.png" height=40/>'.format(x) for x in list(df.index)
    ]

    df = df[cols]
    os.makedirs(outdir + "/logos", exist_ok=True)
    with open(pwmfile) as pwm_handle:
        for motif in read_motifs(pwm_handle):
            if motif.id in df.index:
                motif.to_img(outdir + "/logos/{}.png".format(motif.id),
                             fmt="PNG")

    bar_cols = [
        "log10 P-value", "ROC AUC", "MNCP", "Enr. at 1% FPR", "Max enr.",
        "Recall at 10% FDR"
    ]
    # Only columns that survived `df = df[cols]` can be styled; asking the
    # Styler for a missing column raises a KeyError.
    bar_cols = [col for col in bar_cols if col in df.columns]

    template_dir = MotifConfig().get_template_dir()
    with open(os.path.join(template_dir, "sortable/sortable.min.js"),
              encoding="utf-8") as js_handle:
        js = js_handle.read()
    with open(os.path.join(template_dir, "sortable/sortable-theme-slick.css"),
              encoding="utf-8") as css_handle:
        css = css_handle.read()

    with open(outdir + "/gimme.roc.report.html", "w", encoding="utf-8") as f:
        f.write("<head>\n")
        f.write("<style>{}</style>\n".format(css))
        f.write("</head>\n")
        f.write("<body>\n")
        if df.shape[0] > 0:
            # NOTE(review): Styler.set_precision() and Styler.render() were
            # removed in pandas 2.0; switch to .format(precision=3) and
            # .to_html() once the minimum supported pandas allows it.
            f.write(
                df.sort_values(
                    "ROC AUC",
                    ascending=False).style.bar(bar_cols).set_precision(3).
                set_table_attributes("data-sortable").render().replace(
                    "data-sortable",
                    'class="sortable-theme-slick" data-sortable'))
        else:
            f.write("No enriched motifs found.")
        f.write("<script>{}</script>\n".format(js))
        f.write("</body>\n")
	sample_num_l = map(int,sys.argv[2].split(","))
	
	alpha = float(sys.argv[3])

	partial_kruskal = partial(calc_kruskal, sample_num_l=sample_num_l, alpha=alpha)
	
	pool = Pool(processes=int(sys.argv[4]))

	result = pool.map(partial_kruskal,[row for row in reader])

	p_val_list=[]

	for elem in result:
		p_val_list += [float(elem[-2])]
	
	rej, pval_corr = smm.multipletests(p_val_list, alpha=alpha, method=sys.argv[6])[:2]

	for index in range(len(result)):
			result[index] = result[index] + [`pval_corr[index]`]
	
	with open(sys.argv[5], 'w') as f_out:
		f_out.write(header_line)
		f_out.writelines('\t'.join(i) + '\n' for i in result)
	
#	with open(sys.argv[5], 'r') as correc:
#		correc_reader = csv.reader(correc, delimiter="\t")
		
#		correc_header_line = next(correc)
#		correc_header_line = correc_header_line.rstrip() + '\tp.adj'
#
#		p_val_list=[]
Example #32
0
# Differential protein abundance
# For every entry of `comparisons`, run a per-protein t-test between the
# "control" and "condition" columns of `prot` and collect one long-format
# table of statistics per comparison.
comparisons_fc = []
for k, v in comparisons.items():
    # ttest_ind returns (statistic, pvalue); wrapping it in a DataFrame and
    # transposing gives one row per protein. equal_var=False selects Welch's
    # t-test; NaNs are ignored within each protein.
    df = pd.DataFrame(
        ttest_ind(
            prot[v["control"]].T,
            prot[v["condition"]].T,
            equal_var=False,
            nan_policy="omit",
        ),
        index=["tstat", "pvalue"],
        columns=prot.index,
    ).T.astype(float).sort_values("pvalue").dropna()

    df["comparison"] = k
    # Benjamini/Hochberg correction within this comparison only
    df["fdr"] = multipletests(df["pvalue"], method="fdr_bh")[1]
    # NOTE(review): the control side uses the median while the condition side
    # uses the mean - confirm this asymmetry is intended.
    df["diff"] = prot.loc[df.index, v["control"]].median(1) - prot.loc[
        df.index, v["condition"]].mean(1)

    comparisons_fc.append(df.reset_index())
# Stack all comparisons and order by corrected p-value
comparisons_fc = pd.concat(comparisons_fc).sort_values("fdr")
# Wide table of differences: one row per "GeneSymbol", one column per comparison
comparisons_diff = pd.pivot_table(comparisons_fc,
                                  index="GeneSymbol",
                                  columns="comparison",
                                  values="diff")
comparisons_fc.to_csv(f"{DPATH}/perturbation_proteomics_diff_analysis.csv",
                      index=False)

# Plot distributions
fig, ax = plt.subplots(1, 1, figsize=(2, 1), dpi=600)
Example #33
0
def run_experiment(depth,
                   cutoff,
                   out_folder,
                   expression_path,
                   categories_path,
                   id_names_path,
                   col_names,
                   phase_2_index,
                   alter_id=True,
                   only_save=True,
                   total_count_all=True,
                   sig_p=0.05):
    """
    Performs enrichment analysis of gene functions.
    The analysis is performed with a hypergeometric test; the resulting
    p-values are corrected for multiple testing with the Benjamini-Hochberg
    procedure.

    All results are written to files inside ``out_folder``: the gene/category
    annotations per phase, the full and the significance-filtered test
    results, and the tables produced by ``plot_store_heatmap`` and
    ``store_genes_with_significant_functions``.

    :param depth: The depth of the functions (categories) used
    :param cutoff: Cutoff value for expression
    :param out_folder: Path to output folder
    :param expression_path: Path to expression values
    :param categories_path: Path to mappings of genes to functions
    :param id_names_path: Path to names and id-s
    :param col_names: Names of columns used in the id_names_path file
    :param phase_2_index: Indexes of phases in the expression value file
    :param alter_id: Alternative id
    :param only_save: If true values are stored to out directory, heatmaps are not plotted
    :param total_count_all: If true the number of successes per function in population is calculated from all genes,
    else only from expressed
    :param sig_p: The p-value considered as significant
    :return: None
    :raises Exception: if ``total_count_all`` is false and the per-phase
        sample counts do not add up to the total count
    """

    gene_2_profile = import_profiles(expression_path)
    gene_2_cat = import_gene_2_categories(categories_path, depth)
    orig_2_alter = import_mappings(id_names_path, col_names, alter_id)
    """
    Get expressed genes by phases.
    """
    phase_2_genes = {}
    for phase in phase_2_index:
        phase_2_genes[phase] = []

    for gene in gene_2_profile:
        profile = gene_2_profile[gene]

        # A gene can be listed under several phases.
        for phase in phase_2_index:
            if is_expressed(profile, phase_2_index[phase], cutoff):
                phase_2_genes[phase].append(gene)

    # cluster_sample_count[i] is the number of (gene, category) annotations of
    # the i-th phase, in the iteration order of phase_2_genes.
    cluster_sample_count = []
    # category -> number of annotations over all phases
    cat_2_tot_count = {}
    total_count = 0
    # phase -> {category -> number of annotations in that phase}
    clus_2_res = {}
    all_cats = set()
    no_annot_gene_count = 0

    annotation_out_path = os.path.join(
        out_folder, "annotations_phase_cutoff_" +
        str(cutoff).replace(".", "_") + "_depth_" + str(depth) + ".txt")

    with open(annotation_out_path, "w") as ann_out:
        for phase in phase_2_genes:
            print("Analyzing phase " + phase)

            ann_out.write("\n")
            ann_out.write(phase + "\n")

            clus_2_count_sample = {}
            sample_count = 0

            for gene_id in phase_2_genes[phase]:
                # Genes without an id mapping are reported and skipped.
                if gene_id in orig_2_alter:
                    alter_gene_id = orig_2_alter[gene_id]
                else:
                    print("No orig -> alter mapping for " + gene_id)
                    continue

                if alter_gene_id in gene_2_cat:
                    categories = gene_2_cat[alter_gene_id]

                    # Every category of the gene counts once for the phase
                    # and once for the overall totals.
                    for cat in categories:
                        if cat not in clus_2_count_sample:
                            clus_2_count_sample[cat] = 0
                        if cat not in cat_2_tot_count:
                            cat_2_tot_count[cat] = 0

                        clus_2_count_sample[cat] += 1
                        sample_count += 1

                        ann_out.write(gene_id + "###" + cat + "\n")
                        all_cats.add(cat)

                        cat_2_tot_count[cat] += 1
                        total_count += 1
                else:
                    # Genes without any category are counted under the
                    # pseudo-category "no_annotation".
                    no_annot_gene_count += 1
                    cat = "no_annotation"
                    if cat not in clus_2_count_sample:
                        clus_2_count_sample[cat] = 0
                    if cat not in cat_2_tot_count:
                        cat_2_tot_count[cat] = 0

                    clus_2_count_sample[cat] += 1
                    sample_count += 1

                    ann_out.write(gene_id + "###" + cat + "\n")
                    all_cats.add(cat)

                    cat_2_tot_count[cat] += 1
                    total_count += 1

            clus_2_res[phase] = clus_2_count_sample.copy()
            cluster_sample_count.append(sample_count)

    # Optionally replace the population counts gathered above by counts
    # computed by get_total_function_counts.
    if total_count_all:
        cat_2_tot_count, total_count = get_total_function_counts(
            gene_2_profile, orig_2_alter, gene_2_cat, out_folder)

    print("Total number of different categories: " + str(len(all_cats)))
    p_values = []
    total_sample_count = sum(cluster_sample_count)

    # Sanity check, only meaningful when the totals come from the loop above.
    if total_sample_count != total_count and not total_count_all:
        raise Exception("Total count and total sample count must be equal!!!")
    else:
        print("OK")

    cluster_counter = 0

    # NOTE(review): p_values is a flat list filled in the iteration order of
    # clus_2_res and its inner dicts; the writing loop below must walk them
    # in exactly the same order for p_counter to stay aligned.
    for c in clus_2_res:
        print("Calculating p-values sample: " + str(cluster_counter))

        for cat in clus_2_res[c]:
            pval = hypergeometric_over(clus_2_res[c][cat],
                                       cluster_sample_count[cluster_counter],
                                       cat_2_tot_count[cat], total_count)
            p_values.append(pval)

        cluster_counter += 1

    # benjamini hochberg correction
    # applied once over the p-values of all phases together
    p_adjusted = multi.multipletests(p_values, method="fdr_bh")[1]

    p_counter = 0
    cluster_counter = 0
    res_out_path = os.path.join(out_folder, "hyper_cutoff_" + str(cutoff).replace(".", "_") \
                   + "_depth_" + str(depth) + ".txt")
    res_out_path_filtered = os.path.join(out_folder, "hyper_filtered_cutoff_" + str(cutoff).replace(".", "_") \
                            + "_depth_" + str(depth) + ".txt")

    with open(res_out_path, "w") as out:
        with open(res_out_path_filtered, "w") as out_filter:

            for c in clus_2_res:
                print("Storing sample " + str(cluster_counter))
                # One header line per phase; the first column name is the phase.
                out.write(
                    c + "\tquant\tsample\thit\ttotal\tp_value\tp_adj\tlog_odds"
                    "\n")
                out_filter.write(
                    c + "\tquant\tsample\thit\ttotal\tp_value\tp_adj\tlog_odds"
                    "\n")

                sample_count = cluster_sample_count[cluster_counter]
                temp_out = []

                for cat in clus_2_res[c]:
                    quant = clus_2_res[c][cat]
                    sample = sample_count
                    hit = cat_2_tot_count[cat]
                    total = total_count

                    out_line = cat + "\t" + str(quant) + "\t" + str(sample) + "\t" \
                               + str(hit) + "\t" + str(total) + "\t" + \
                               str(p_values[p_counter]) + "\t" + str(p_adjusted[p_counter])

                    # The log odds ratio is only defined when none of the
                    # four counts involved is zero; otherwise "nan" is written.
                    if quant != 0 and (sample - quant) != 0 and (
                            total - hit - sample +
                            quant) != 0 and (hit - quant) != 0:
                        odds_sample = quant / (sample - quant)
                        odds_rest = (hit - quant) / (total - hit - sample +
                                                     quant)
                        real_log_odds = math.log2(odds_sample / odds_rest)
                        out_line += "\t" + "%.2f" % real_log_odds
                        temp_out.append((p_adjusted[p_counter], out_line))
                    else:
                        out_line += "\tnan"
                        temp_out.append((p_adjusted[p_counter], out_line))

                    p_counter += 1
                # Sort the lines of this phase by adjusted p-value (ties are
                # broken by the line text).
                temp_out.sort()
                cluster_counter += 1

                for t in temp_out:
                    out.write(t[1] + "\n")
                    # The filtered file only receives significant categories.
                    if t[0] < sig_p:
                        out_filter.write(t[1] + "\n")

                out.write("\n")
                out_filter.write("\n")

    # set paths
    table_path_log = os.path.join(out_folder, "log_odds_cutoff_" + str(cutoff).replace(".", "_") + "_depth_" \
                     + str(depth) + ".xlsx" )
    table_p_values_path = os.path.join(out_folder, "p_adj_values_cutoff_" + str(cutoff).replace(".", "_") + "_depth_" \
                     + str(depth) + ".xlsx")
    gene_sig_path = os.path.join(out_folder, "genes_functions_cutoff_" + str(cutoff).replace(".", "_") + "_depth_" \
                       + str(depth) + ".xlsx")

    # Both tables are derived from the unfiltered result file written above.
    plot_store_heatmap(res_out_path,
                       table_path_log,
                       p_adj=False,
                       only_save=only_save,
                       sig_p=sig_p)
    plot_store_heatmap(res_out_path,
                       table_p_values_path,
                       p_adj=True,
                       only_save=only_save,
                       sig_p=sig_p)
    store_genes_with_significant_functions(annotation_out_path,
                                           table_p_values_path, id_names_path,
                                           col_names, gene_sig_path, alter_id)
Example #34
0
# Draw a QQ plot of observed against expected -log10 p-values
fig, ax = plt.subplots()

ax.scatter(p_values_df["uniform_logP"], p_values_df["log_p_values"])
# Diagonal reference line: observed equals expected
ax.plot([8, 0], [8, 0], color="black")

ax.set(title="QQ Plot",
       xlabel="Expected -log10(p-value)",
       ylabel="Observed -log10(p-value)")

fig.savefig("qq_plot.png")

# Flag the transcripts whose null hypothesis is rejected by the
# Benjamini/Hochberg procedure at a 10% false discovery rate
p_values_df["fdr_0.10"] = multitest.multipletests(
    p_values_df["p_values"], alpha=0.10, method="fdr_bh")[0]

# Save the names of the flagged transcripts
p_values_df.loc[p_values_df["fdr_0.10"], "Transcript"].to_csv(
    "diff_expression.txt", index=False)

# Second pass of the analysis, this time including sex as a covariate
p_values_cov = []

for transcript in fpkm_reformat["t_name"].unique():

    # Get all expression data for one transcript
    transcript_data = fpkm_reformat[fpkm_reformat["t_name"] == transcript]

    # Use OLS to test if transcript is differentially expressed across stages while controlling for sex
Example #35
0

def melt_upper_triangle(df_, val_str):
    """Return the upper triangle of a matrix-like frame in long format.

    Cells below the main diagonal are discarded, the remaining cells are
    melted into one row per (row label, column label) pair, and pairs whose
    two labels are equal are removed.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame whose row and column labels identify the paired items.
    val_str : str
        Name given to the value column of the result.

    Returns
    -------
    pandas.DataFrame
        Single column ``val_str`` indexed by a two-level index named
        ``('OTU_1', 'OTU_2')``. Missing values are never included.
    """
    # Builtin `bool` instead of the `np.bool` alias removed in NumPy 1.24.
    upper_mask = np.triu(np.ones(df_.shape)).astype(bool)
    upper_only = df_.where(upper_mask)
    # Drop the masked (NaN) cells explicitly: the legacy stack() did this
    # implicitly, the newer pandas implementation keeps them.
    melted_df = upper_only.stack().dropna().reset_index()
    melted_df.columns = ['OTU_1', 'OTU_2', val_str]
    off_diagonal = melted_df[melted_df['OTU_1'] != melted_df['OTU_2']]
    return off_diagonal.set_index(['OTU_1', 'OTU_2'])


# Long-format tables of the pairwise p-values and correlations, one row per
# (OTU_1, OTU_2) pair.
mpdf = melt_upper_triangle(p_df, 'p-value')
mdf = melt_upper_triangle(df, 'correlation')

# Both tables share the (OTU_1, OTU_2) index, so this puts the correlation
# and its p-value side by side.
fulldf = mdf.join(mpdf)

# pull total abundances
# pull taxonomy (order?)

# Benjamini/Hochberg correction; `reject` is a boolean array aligned with the
# rows of fulldf.
reject, pvals_corrected = multipletests(fulldf['p-value'].values,
                                        alpha=0.05,
                                        method='fdr_bh')[:2]

# Keep the pairs that are significant after correction and whose absolute
# correlation exceeds 0.5.
thresholded = fulldf.loc[fulldf.index[reject], ['correlation']].reset_index()
corr_cutoff = abs(thresholded.correlation) > 0.5
thresholded_cutoff = thresholded[corr_cutoff]

# NOTE(review): hard-coded absolute output path; consider making it configurable.
thresholded_cutoff.to_csv(
    "/Volumes/KeithSSD/CB_V4/otu_data/sparcc_data/test_correlations.txt",
    sep="\t",
    index=False)
Example #36
0
            hemi))

    compare_dict = CsvReader(compare_file).to_dict(1)
    valid_idx_mat = np.array(compare_dict['p']) != 'nan'
    if mask_file is not None:
        mask_vertices = nib.freesurfer.read_label(mask_file)
        mask_idx_mat = np.zeros_like(valid_idx_mat, dtype=np.bool)
        mask_idx_mat[mask_vertices] = True
        valid_idx_mat = np.logical_and(valid_idx_mat, mask_idx_mat)

    compare_data = np.zeros((3, maps.shape[1]))
    ps_uncorrected = np.array([
        float(p) for idx, p in enumerate(compare_dict['p'])
        if valid_idx_mat[idx]
    ])
    reject, ps_corrected, alpha_sidak, alpha_bonf = multipletests(
        ps_uncorrected, 0.05, 'fdr_bh')
    ts = [
        float(t) for idx, t in enumerate(compare_dict['t'])
        if valid_idx_mat[idx]
    ]
    compare_data[0, valid_idx_mat] = ts
    compare_data[1, valid_idx_mat] = -ps_uncorrected
    compare_data[2, valid_idx_mat] = -ps_corrected
    compare_data[0, np.logical_not(valid_idx_mat)] = np.min(ts)
    compare_data[1, np.logical_not(valid_idx_mat)] = np.min(-ps_uncorrected)
    compare_data[2, np.logical_not(valid_idx_mat)] = np.min(-ps_corrected)
    save2nifti(
        pjoin(compare_dir, '{}_g1_vs_g2_posterior_masked.nii.gz'.format(hemi)),
        compare_data)
    # ---compare2nifti end---
    else:
        sig_p.append(np.nan)

# Plot the p-values on a -log10 scale together with a corrected threshold.
log_pvals = -(np.log10(p_vals))

# Manual Bonferroni threshold.
# NOTE(review): 90 is hard-coded here and in plt.hlines below; presumably it
# is the number of tests, i.e. len(p_vals) - confirm.
cor_alpha = 0.05 / 90
cor_alphalog = -(np.log10(cor_alpha))

# NOTE(review): plt.subplots() returns (figure, axes), so these two names are
# swapped. Neither is used in the lines that follow.
ax, fig = plt.subplots()
plt.plot(log_pvals)
plt.xlabel('Samples')
plt.ylabel('-log(p)')
plt.hlines(cor_alphalog, 0, 90, color='red')

# Same correction through statsmodels; the last two return values are the
# Sidak- and Bonferroni-corrected alpha levels.
import statsmodels.stats.multitest as sm
bools, p_adj, x, x2 = sm.multipletests(p_vals, method='bonferroni')

# Use MNE cluster permutation
X_input = [face_5chs, scene_5chs]
X_3Dinput = [dat_5chs, dat_5chs_face]

Fobs, clusters, clusters_pval, H0 = mne.stats.permutation_cluster_test(
    X_3Dinput)
# NOTE(review): only 10 permutations are requested here - looks like a quick
# smoke test rather than a final analysis.
Fobs1, clusters1, clusters_pval1, H01 = mne.stats.permutation_cluster_test(
    X_input, n_permutations=10)

plt.plot(clusters1)

#%%
# Get evoked
scene_evoked_allSubs = MNEevoked_scene._data
Example #38
0
 def compute_fdr_by_dist(d):
     """Add a Benjamini/Hochberg corrected 'fdr_dist' column to ``d``.

     The correction is computed from the 'pvalue' column. ``d`` is modified
     in place and also returned.
     """
     # multipletests returns (reject, corrected p-values, ...)
     d.loc[:, 'fdr_dist'] = multipletests(list(d['pvalue']),
                                          method='fdr_bh')[1]
     return d
Example #39
0
def simple_auto_stationarize(df,
                             verbosity=None,
                             alpha=None,
                             multitest=None,
                             get_conclusions=False,
                             get_actions=False):
    """Auto-stationarize the given time-series dataframe.

    Every column is tested with the Augmented Dickey-Fuller and the KPSS
    tests, the resulting p-values are jointly corrected for multiple
    testing, and each column is then detrended and/or differenced according
    to the conclusion drawn from the two corrected tests.

    Parameters
    ----------
    df : pandas.DataFrame
        A dataframe composed solely of numeric columns.
    verbosity : int, logging.Logger, optional
        If an int is given, it is interpreted as the logging level to use. See
        https://docs.python.org/3/library/logging.html#levels for details. If a
        logging.Logger object is given, it is used for printing instead, with
        appropriate logging levels. If no value is provided, the default
        logging.Logger behaviour is used. The previous verbosity is restored
        before returning, also when an error is raised.
    alpha : float, optional
        Family-wise error rate (FWER) or false discovery rate (FDR), depending
        on the method used for multiple hypothesis testing error control. If no
        value is provided, the default value ``DEF_ALPHA`` is used.
    multitest : str, optional
        The multiple hypothesis testing error control method to use. If no
        value is provided, the Benjamini–Yekutieli procedure (``'fdr_by'``) is
        used. See
        `the documentation of statsmodels' multipletests method for supported values <https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html>`.
    get_conclusions : bool, defaults to False
        If set to true, a conclusions dict is returned.
    get_actions : bool, defaults to False
        If set to true, an actions dict is returned.

    Returns
    -------
    results : pandas.DataFrame or dict
        By default, only the transformed dataframe is returned. However, if
        get_conclusions or get_actions are set to True, a dict is returned
        instead, with the following mappings:
        - `postdf` - Maps to the transformed dataframe.
        - `conclusions` - Maps to a dict mapping each column name to the
          arrived conclusion regarding its stationarity.
        - `actions` - Maps to a dict mapping each column name to the
          transformations performed on it to stationarize it.

    Raises
    ------
    ValueError
        If any column of the input dataframe is not numeric.
    """  # noqa: E501
    if alpha is None:
        alpha = DEF_ALPHA
    if multitest is None:
        multitest = 'fdr_by'

    if verbosity is None:
        return _stationarize_columns(
            df, alpha, multitest, get_conclusions, get_actions)

    prev_verbosity = set_verbosity_level(verbosity)
    try:
        return _stationarize_columns(
            df, alpha, multitest, get_conclusions, get_actions)
    finally:
        # restore the caller's verbosity even if the run failed
        set_verbosity_level(prev_verbosity)


def _stationarize_columns(df, alpha, multitest, get_conclusions, get_actions):
    """Test, interpret and transform the columns of df; see the public entry point."""
    logger = get_logger()
    logger.info("Starting to auto-stationarize a dataframe!")
    logger.info("Starting to check input data validity...")
    logger.info(f"Data shape (time, variables) is {df.shape}.")
    # the first axis - rows - is expected to represent the time dimension,
    # while the second axis - columns - is expected to represent variables;
    # thus, the first expected to be much longer than the second
    logger.info(
        "Checking current data orientation (rows=time, columns=variables)...")
    if df.shape[1] >= df.shape[0]:
        logger.warning((
            "stationarizer's input dataframe has more columns than rows! "
            "Columns are expected to represent variables, while rows represent"
            " time steps, and thus the input dataframe is expected to have "
            "more rows than columns. Either the input data is inverted, or the"
            " data has far more variables than samples."))
    else:
        logger.info("Data orientation is valid.")
    # assert all columns are numeric
    if not all(np.issubdtype(x, np.number) for x in df.dtypes):
        err = ValueError(
            "All columns of stationarizer's input dataframe must be numeric!")
        # logger.exception() is only meaningful inside an except block, and
        # the error has to actually be raised to stop the run.
        logger.error(err)
        raise err

    # util var
    n = len(df.columns)

    # testing for unit root
    logger.info(
        ("Checking for the presence of a unit root in the input time series "
         "using the Augmented Dicky-Fuller test"))
    logger.info(
        ("Reminder:\n "
         "Null Hypothesis: The series has a unit root (value of a=1); meaning,"
         " it is NOT stationary.\n"
         "Alternate Hypothesis: The series has no unit root; it is either "
         "stationary or non-stationary of a different model than unit root."))
    adf_results = []
    for colname in df.columns:
        srs = df[colname]
        result = adfuller(srs, regression='ct')
        logger.info(
            (f"{colname}: test statistic={result[0]}, p-val={result[1]}."))
        adf_results.append(result)

    # testing for trend stationarity
    logger.info((
        "Testing for trend stationarity of input series using the KPSS test."))
    logger.info(("Reminder:\n"
                 "Null Hypothesis (H0): The series is trend-stationarity.\n"
                 "Alternative Hypothesis (H1): The series has a unit root."))
    kpss_results = []
    for colname in df.columns:
        srs = df[colname]
        result = kpss(srs, regression='ct')
        logger.info(
            (f"{colname}: test statistic={result[0]}, p-val={result[1]}."))
        kpss_results.append(result)

    # Controlling the multiple-testing error rate; the ADF and the KPSS
    # p-values are corrected together as one family of 2 * n tests.
    logger.info(
        ("Controlling the multiple hypothesis testing error rate using the "
         f"'{multitest}' method with α={alpha}."))
    adf_pvals = [x[1] for x in adf_results]
    kpss_pvals = [x[1] for x in kpss_results]
    pvals = adf_pvals + kpss_pvals
    correction = multipletests(
        pvals=pvals,
        alpha=alpha,
        method=multitest,
        is_sorted=False,
    )
    reject = correction[0]
    corrected_pvals = correction[1]
    # the first n entries belong to the ADF tests, the last n to KPSS
    adf_rejections = reject[:n]
    kpss_rejections = reject[n:]
    adf_corrected_pvals = corrected_pvals[:n]
    kpss_corrected_pvals = corrected_pvals[n:]

    # interpret results
    logger.info("Interpreting test results after FDR control...")
    conclusion_counts = {}
    conclusions = {}
    actions = {}
    for i, colname in enumerate(df.columns):
        conclusion = conclude_adf_and_kpss_results(
            adf_reject=adf_rejections[i], kpss_reject=kpss_rejections[i])
        conclusion_counts[conclusion] = (
            conclusion_counts.get(conclusion, 0) + 1)
        trans = CONCLUSION_TO_TRANSFORMATIONS[conclusion]
        conclusions[colname] = conclusion
        actions[colname] = trans
        logger.info((f"--{colname}--\n "
                     f"ADF corrected p-val: {adf_corrected_pvals[i]}, "
                     f"H0 rejected: {adf_rejections[i]}.\n"
                     f"KPSS corrected p-val: {kpss_corrected_pvals[i]}, "
                     f"H0 rejected: {kpss_rejections[i]}.\n"
                     f"Conclusion: {conclusion}\n Transformations: {trans}."))

    # making non-stationary series stationary!
    post_cols = {}
    logger.info("Applying transformations...")
    for colname in df.columns:
        srs = df[colname]
        if Transformation.DETREND in actions[colname]:
            logger.info(f"Detrending {colname} (len={len(srs)}).")
            srs = detrend(srs, order=1, axis=0)
        if Transformation.DIFFRENTIATE in actions[colname]:
            logger.info(f"Diffrentiating {colname} (len={len(srs)}).")
            srs = diff(srs, k_diff=1)
        post_cols[colname] = srs
        logger.info(f"{colname} transformed (len={len(post_cols[colname])}).")

    # equalizing lengths
    # NOTE(review): differencing presumably shortens a series at its start,
    # while the truncation below cuts every series (and the index of postdf)
    # at its end - confirm that columns stay aligned in time.
    min_len = min(len(post_cols[x]) for x in post_cols)
    for colname in df.columns:
        post_cols[colname] = post_cols[colname][:min_len]
    postdf = df.copy()
    postdf = postdf.iloc[:min_len]
    for colname in df.columns:
        postdf[colname] = post_cols[colname]
    logger.info(f"Post transformation shape: {postdf.shape}")

    for k in conclusion_counts:
        count = conclusion_counts[k]
        ratio = 100 * (count / len(df.columns))
        logger.info(f"{count} series ({ratio}%) found with conclusion: {k}.")

    if not get_actions and not get_conclusions:
        return postdf
    results = {'postdf': postdf}
    if get_conclusions:
        results['conclusions'] = conclusions
    if get_actions:
        results['actions'] = actions
    return results
Example #40
0
def getCorrectedPValues(pval_raw,alpha=0.05,method='fdr_i'):
    """Return the p-values of ``pval_raw`` corrected for multiple testing.

    ``alpha`` and ``method`` are handed to statsmodels' ``multipletests``;
    only its second return value, the corrected p-values, is returned.
    """
    corrected = smm.multipletests(pval_raw, alpha=alpha, method=method)[1]
    return corrected
Example #41
0
    

    return gsea_dat




### Thresholding
# Arguments forwarded to readin_gsea_result_SI, and the significance level
# applied to the corrected p-values below.
HIT = 10
LOWER = 20
UPPER = 500
alpha = 0.05

# First pass: the number of top genes is taken from the command line.
topGeneN = int(sys.argv[1])
mf_genes_SI = readin_gsea_result_SI('topGene%d' % topGeneN,HIT, lower=LOWER, higher=UPPER)
# Benjamini/Hochberg corrected p-values
mf_genes_SI['BH_p'] = multitest.multipletests(mf_genes_SI['pvalue'], method = 'fdr_bh')[1]
# Number of distinct index entries that are significant after correction
print(len(set(mf_genes_SI[mf_genes_SI['BH_p'] < alpha].index)))

# Same count, restricted to rows whose 'group' is not 0
x = mf_genes_SI[mf_genes_SI['group'] != 0]
print(len(set(x[x['BH_p'] < alpha].index)))

# NOTE(review): FMfn is not defined in the visible part of the script, and the
# output path is hard-coded.
suffix = '%s_stringent' % FMfn
mf_genes_SI.to_csv('/work-zfs/abattle4/heyuan/tissue_spec_eQTL_v8/plots/Fig3_GSEA_%s.txt' % suffix, sep='\t', index=True)



# Second pass: same steps with the fixed gene-set name 'topGene30'.
mf_genes_SI = readin_gsea_result_SI('topGene30',HIT, lower=LOWER, higher=UPPER)
mf_genes_SI['BH_p'] = multitest.multipletests(mf_genes_SI['pvalue'], method = 'fdr_bh')[1]
print(len(set(mf_genes_SI[mf_genes_SI['BH_p'] < alpha].index)))

x = mf_genes_SI[mf_genes_SI['group'] != 0]
Example #42
0
def rep_compare(valueD, total_first, total_second, method, log2, scale,
                log2_already):
    """Compare two groups of replicate values for every entry of ``valueD``.

    ``valueD`` maps an identifier to a pair ``(first_values, second_values)``.
    The values of each group are scaled by ``scale / total`` (and
    log2-transformed after adding 1 when ``log2`` is true).  The difference
    between the two group means is ``mean2 - mean1`` when ``log2_already`` is
    true and ``log2(mean2 / mean1)`` otherwise; if the product of the means is
    zero, 1 is added to both before the difference is taken.

    A p-value is computed with ``stat_pvalue`` only when the absolute
    difference is at least 0.2.  Only entries with such a difference and a
    p-value below 0.2 are returned; their p-values are corrected together
    with the Benjamini/Hochberg procedure.

    Returns a list of rows
    ``[id, *first, *second, mean1, mean2, diff, p, adjusted_p]`` sorted by the
    adjusted p-value.
    """
    def _normalise(counts, total):
        # scale the raw values, optionally on a log2(x + 1) scale
        if log2:
            return [log((float(c) + 1) * scale / total, 2) for c in counts]
        return [float(c) * scale / total for c in counts]

    candidates = []
    raw_pvalues = []
    for feature_id, replicates in valueD.items():
        first = _normalise(replicates[0], total_first)
        mean_first = sum(first) / len(first)
        second = _normalise(replicates[1], total_second)
        mean_second = sum(second) / len(second)

        # the reported means are the ones before the zero adjustment below
        row = [feature_id] + first + second + [mean_first, mean_second]

        if mean_first * mean_second == 0:
            mean_first += 1
            mean_second += 1
        if log2_already:
            diff = mean_second - mean_first
        else:
            diff = log(mean_second / mean_first, 2)
        row.append(diff)

        large_change = abs(diff) >= 0.2
        p = stat_pvalue(first, second, method) if large_change else 0.5
        row.append(p)

        if large_change and p < 0.2:
            # placeholder, overwritten by the adjusted p-value below
            row.append(1)
            candidates.append(row)
            raw_pvalues.append(p)

    if raw_pvalues:
        adjusted = multipletests(raw_pvalues, method="fdr_bh")[1]
        for row, p_adj in zip(candidates, adjusted):
            row[-1] = p_adj

    return sorted(candidates, key=lambda row: row[-1])
Example #43
0
def main(argv=sys.argv):
    """Command-line entry point for MODriver.

    Parses the command line, loads the element ids (``./chr_id.txt``) and the
    key gene list (``./key_2018.txt``) from the current directory, builds the
    positive/negative sets and the feature matrices, prints the matrix sizes
    and then runs the step selected with ``-m``:

    * ``train``      -- fit the model selected with ``-l``.
    * ``gen_bed``    -- drop banned elements from the PCAWG element list and
      write ``chr_id.bed``.
    * ``neg``        -- intersect, over several callers, the genes with
      ``qvalue > 0.6`` and write them to ``./neg_2018.txt``.
    * ``cv``         -- run ``fit_cv`` on the training data.
    * ``score``      -- score every element, turn scores into p-values with
      the pickled null distribution, BH-correct per element type and write
      one score table per type plus a combined table of significant hits.
    * ``null``       -- score the simulated data and pickle its ECDF.
    * ``simulation`` -- generate simulated mutations via ``parallel_do.py``.

    Any other mode (including the default ``sort``) only performs the loading
    steps.

    NOTE(review): ``argv`` is accepted but never used; ``parse_args()`` reads
    ``sys.argv`` directly.  The ``-c``, ``-n``, ``-s`` and ``-g`` options are
    parsed but not read anywhere in this function, and several help strings
    look copy-pasted -- confirm before relying on ``--help``.
    """
    parser = argparse.ArgumentParser(description='MODriver v1.0')
    parser.add_argument("-c",
                        dest='coding',
                        default="./coding_key.csv",
                        help="coding file")
    parser.add_argument("-n",
                        dest='non_coding',
                        default="./non_coding_key.csv",
                        help="non_coding file")
    parser.add_argument("-s",
                        dest='pos',
                        default="./pos_2018.txt",
                        help="coding file")
    parser.add_argument("-g",
                        dest='neg',
                        default="./neg_2018.txt",
                        help="non_coding file")
    parser.add_argument("-m", dest='mode', default="sort", help="mode")
    parser.add_argument("-l", dest='learn', default="MODNN", help="mode")
    parser.add_argument("-t",
                        dest='type',
                        default="Pancan",
                        help="cancer type")
    parser.add_argument("-o",
                        dest='out',
                        default="./score/",
                        help="coding file")
    parser.add_argument("-p",
                        dest='threads_num',
                        type=int,
                        default=1,
                        help="threads num")
    args = parser.parse_args()

    # Element ids are the 4th column of chr_id.txt.
    df_tmp = pd.read_csv('./chr_id.txt',
                         header=0,
                         index_col=3,
                         sep='\t',
                         usecols=[0, 1, 2, 3])
    all_list = df_tmp.index.tolist()
    key_2018 = './key_2018.txt'
    pd_key = pd.read_csv(key_2018, header=None, sep='\t')
    # neg_2018.txt is loaded here (so it must exist, even for mode 'neg'
    # which regenerates it), but its genes are not used below: the negative
    # set passed to build_set is the short hard-coded list.
    pd_neg = pd.read_csv('./neg_2018.txt', header=None, sep='\t')
    pd_neg.columns = ['gene']
    pd_key.columns = ['gene', 'type']
    pd_key = pd_key.drop_duplicates(subset=['gene'], keep='first')
    pd_neg = pd_neg.drop_duplicates(subset=['gene'], keep='first')
    key_18 = pd_key['gene'].values.tolist()
    known_key = ['TERT']
    neg_key = ['CACNA1E', 'COL11A1', 'DST', 'TTN']
    key_18 = list(set(key_18) | set(known_key))
    pos, neg = build_set(key_18, neg_key, all_list, nb_imb=20)
    X_train, y_train, X, X_sim, ids = file2data(args.type, pos, neg)
    print(X_train[0].shape[0], X[0].shape[0], X_sim[0].shape[0])

    if args.mode == 'train':
        fit(X_train, y_train, args.type, method=args.learn)
    elif args.mode == 'gen_bed':
        bed_input = 'PCAWG_test_genomic_elements.bed12.gz'
        out = 'chr_id.bed'
        df = pd.read_csv(bed_input,
                         header=None,
                         sep='\t',
                         usecols=[0, 1, 2, 3])
        df.columns = ['chr', 'start', 'end', 'id']
        df.index = df['id']
        ids = df.loc[::, 'id'].values.tolist()
        ban_list = [
            '::TTN::', '::DST::', '::DMD::', '::CACNA1E::', '::COL11A1::',
            '::mitranscriptome::'
        ]
        # Keep only the elements whose id contains none of the banned tags.
        ids_new = [
            elem_id for elem_id in ids
            if not any(ban in elem_id for ban in ban_list)
        ]
        df = df.loc[ids_new, ::]
        df['chr'] = df['chr'].apply(lambda x: str(x).replace("chr", ""))
        df = df.sort_values(by=['chr', 'start'], ascending=[True, True])
        df.to_csv(out, header=False, index=False, sep='\t')
    elif args.mode == 'neg':
        apps = [
            '2020plus', 'ActiveDriver', 'CompositeDriver', 'MuSiC',
            'MutSig2CV', 'OncodriveCLUST', 'OncodriveFML', 'e-Driver'
        ]
        thr = 0.6
        neg_list = None
        for app in apps:
            path = '../coding/%s/PANCAN.txt' % app
            df = pd.read_csv(path,
                             header=0,
                             sep='\t',
                             index_col=0,
                             usecols=['gene', 'qvalue'])
            df = df[df['qvalue'] > thr]
            app_genes = set(df.index.tolist())
            # A gene is negative only if every caller gives it a high q-value.
            neg_list = app_genes if neg_list is None else neg_list & app_genes
        neg = list(neg_list)
        df = pd.DataFrame(data=neg, index=None, columns=['gene'])
        out = './neg_2018.txt'
        df.to_csv(out, header=False, index=False, sep='\t')
    elif args.mode == 'cv':
        fit_cv(X_train, y_train, 10, args.learn, False)
    elif args.mode == 'score':
        y_p = predict(X, args.type, method=args.learn)
        null_dist_path = '%s%s.null' % (args.out, args.type)
        # NOTE: pickle.load executes arbitrary code from the file; only load
        # null distributions produced by this program's own 'null' mode.
        with open(null_dist_path, 'rb') as f:
            null_dist = pickle.load(f)
        df_all = pd.DataFrame(data=y_p, index=ids, columns=['score'])

        # Group element ids by their type prefix (text before the first '::').
        ge_type = {}
        for elem_id in ids:
            tmp = re.split('::', elem_id)[0]
            tmp = str(tmp).replace("gc19_pc.", "")
            ge_type.setdefault(tmp, []).append(elem_id)

        nb_coding_drivers = 0
        nb_noncoding_drivers = 0
        dfs = []
        for key, key_ids in ge_type.items():
            # Work on an explicit copy so the column assignments below do not
            # target a slice of df_all.
            df_score = df_all.loc[key_ids, ::].copy()
            out_path = '%s%s.%s.score' % (args.out, args.type, key)
            pvals = 1 - null_dist(df_score['score'].values.tolist())
            df_score['p'] = pvals
            p_min = 1e-6
            df_score.loc[df_score['p'] < p_min, 'p'] = p_min
            # NOTE(review): the correction uses the unclipped p-values, not
            # the floored 'p' column -- confirm this is intended.
            _, qvals, _, _ = mt.multipletests(pvals=pvals,
                                              alpha=0.1,
                                              method='fdr_bh')
            df_score['q'] = qvals
            df_show = df_score[df_score['q'] < 0.1]
            dfs.append(df_show)
            if key == 'cds':
                nb_coding_drivers += df_show.shape[0]
            else:
                nb_noncoding_drivers += df_show.shape[0]
            df_score = df_score.sort_values(by=['score'], ascending=[False])
            df_score.to_csv(out_path, header=True)
        out_path = "%s%s.%s.score" % ("./", args.type, 'all')
        df = pd.concat(dfs, axis=0)
        df = df.sort_values(by=['score'], ascending=[False])
        df.to_csv(out_path, header=True)
        print(nb_coding_drivers + nb_noncoding_drivers, nb_coding_drivers,
              nb_noncoding_drivers)
    elif args.mode == 'null':
        y_sim = predict(X_sim, args.type, method=args.learn, b_null=True)
        df_sim = pd.DataFrame(data=y_sim, columns=['score'])
        out_path = '%s%s.null' % (args.out, args.type)
        null_dist = sm.distributions.ECDF(df_sim['score'].values.tolist())
        with open(out_path, 'wb') as fp:
            pickle.dump(null_dist, fp)
    elif args.mode == 'simulation':
        tmp_dir = '/data/tmp/'
        random_out_file = 'simulation.txt.gz'
        # based on the ori maf file
        ori_input = '../data/ICGC/final_consensus_passonly.snv_mnv_indel.icgc.public.maf.gz'
        col0 = [
            'Chromosome', 'Start_position', 'End_position', 'Reference_Allele',
            'Tumor_Seq_Allele2', 'Tumor_Sample_Barcode',
            'Matched_Norm_Sample_Barcode'
        ]
        promoter_set = ['TERT', 'MALAT1', 'NEAT1']
        df = pd.read_csv(ori_input,
                         header=0,
                         sep='\t',
                         usecols=col0 + ['Hugo_Symbol'])
        # remove the mutations in the TERT promoter, MALAT1, or NEAT1
        df_anno = df.loc[~df['Hugo_Symbol'].isin(promoter_set), col0]
        all_input_file = '%s/all_input.txt' % tmp_dir
        all_out_file = '%s/all_out.txt' % tmp_dir
        df_anno.to_csv(all_input_file, header=False, index=False, sep='\t')
        # The command is assembled only from the fixed paths above and the
        # integer thread count, so running it through the shell is safe here.
        cmd = "python parallel_do.py -c 'python simulation.py -i %s -o %s' -t %d --r" % (
            all_input_file, all_out_file, args.threads_num)
        print(cmd)
        check_output(cmd, shell=True)
        df = pd.read_csv(all_out_file, header=None, sep='\t')
        df.columns = [
            'Chromosome', 'Start_position', 'End_position',
            'Variant_Classification', 'Variant_Type', 'Reference_Allele',
            'Tumor_Seq_Allele2', 'Tumor_Sample_Barcode',
            'Matched_Norm_Sample_Barcode', 'gc_content'
        ]
        df.to_csv(random_out_file,
                  header=True,
                  index=False,
                  sep='\t',
                  compression='gzip',
                  float_format='%.3f')
        print("random mutations: " + str(df.shape[0]))
def resampleAllGo(go_term_groups,
                  goi,
                  go_terms,
                  essential_count,
                  non_essential_count,
                  n=10000,
                  save_intermediate=True):
    """Resampling test for GO-term over-representation among ``goi``.

    ``go_terms`` is a DataFrame indexed by gene with a ``go_term`` column.
    For each GO term that occurs more than once among the genes of interest
    (and at least ``min_goi_count`` times), the count in ``goi`` is compared
    with the counts obtained in ``n`` random draws of ``essential_count``
    essential plus ``non_essential_count`` non-essential index entries.  The
    raw value is the fraction of draws whose count is at least the ``goi``
    count; the raw values are Benjamini-Hochberg adjusted and written to a
    TSV file in the working directory.

    Relies on module-level names: ``essential_genes``, ``min_goi_count``,
    ``all_go_counts``, ``min_go_count``, ``max_go_count`` and
    ``onlyBioprocess``.  ``go_term_groups`` is not used.

    Side effects: adds the columns ``essential``, ``in_goi`` and
    ``in_random`` to the caller's ``go_terms`` frame, prints progress, and
    optionally writes ``goi_go_df.tsv``.  Returns ``None``.
    """
    print('...resampling all GO terms...')

    # Flag each annotation row as essential / gene of interest.
    go_terms['essential'] = go_terms.index.isin(essential_genes)
    go_terms['in_goi'] = go_terms.index.isin(goi)

    essential_rows = go_terms[go_terms['essential']]
    non_essential_rows = go_terms[~go_terms['essential']]
    goi_rows = go_terms[go_terms['in_goi']]

    print('- saving intermediate?: ' + str(save_intermediate))
    if save_intermediate == True:
        goi_rows.to_csv('goi_go_df.tsv', sep='\t', index=True)

    # GO terms carried by more than one gene-of-interest row.
    goi_term_list = goi_rows.go_term.tolist()
    print('- number of go terms among goi (including dup.): ' +
          str(len(goi_term_list)))
    print('- number of go terms among goi (without dup.): ' +
          str(len(set(goi_term_list))))
    term_tally = collections.Counter(goi_term_list)
    goi_go_dupes = [term for term, hits in term_tally.items() if hits > 1]
    print('-number of duplicated go terms among goi): ' +
          str(len(goi_go_dupes)))
    goi_go_counts = goi_rows.go_term.value_counts()

    # Draw n random gene sets with the same essential / non-essential sizes.
    print('- number of random samples: ' + str(n))
    essential_pool = essential_rows.index.values.tolist()
    non_essential_pool = non_essential_rows.index.values.tolist()
    samples = []
    for _ in range(n):
        drawn = (random.sample(essential_pool, essential_count) +
                 random.sample(non_essential_pool, non_essential_count))
        go_terms['in_random'] = go_terms.index.isin(drawn)
        samples.append(go_terms[go_terms['in_random'] == True])
    print(samples[0])
    # Only rebinds the local name; the caller's frame keeps 'in_random'.
    go_terms = go_terms.drop(columns=['in_random'])

    # Per-sample GO term frequencies.
    sample_counts = [sample.go_term.value_counts() for sample in samples]

    rv_dict = {}
    for term in goi_go_dupes:
        n_go_goi = goi_go_counts[term]
        # Skip terms that are too rare among the genes of interest.
        if n_go_goi < min_goi_count:
            continue

        counts_from_random_samples = []
        for sample_count in sample_counts:
            try:
                n_go_sample = sample_count[term]
            except KeyError:
                # Term absent from this random sample.
                n_go_sample = 0
            counts_from_random_samples.append(n_go_sample)
        n_at_least_goi = sum(
            1 for drawn_count in counts_from_random_samples
            if drawn_count >= n_go_goi)

        rv_dict[term] = [
            n_go_goi,
            np.median(counts_from_random_samples),
            float(n_at_least_goi) / float(n),
        ]

    rv_df = pd.DataFrame.from_dict(
        rv_dict,
        orient='index',
        columns=['count_in_goi', 'median_count_in_random_sample', 'raw_rv'])

    # Adjust for multiple hypothesis testing (Benjamini-Hochberg).
    fdrbh_output = smm.multipletests(rv_df['raw_rv'].tolist(),
                                     method='fdr_bh')
    rv_df['bh_rv'] = np.asarray(fdrbh_output[1].tolist())

    # Attach the overall gene count of every GO term.
    rv_df.reset_index(inplace=True)
    rv_df['GO_term_gene_count'] = rv_df.apply(
        lambda row: all_go_counts.loc[row['index'], 'count'], axis=1)
    rv_df.set_index('index', inplace=True)

    rv_df.sort_values(by=['bh_rv'], inplace=True)

    out_name = ''.join([
        'resample_v6_goi_go_df_repeatfiltered_min',
        str(min_go_count),
        'max',
        str(max_go_count),
        '_mingoi',
        str(min_goi_count),
        'onlyBioprocess',
        str(onlyBioprocess),
        '.tsv',
    ])
    rv_df.to_csv(out_name, sep='\t', index=True)

    return
Example #45
0
    def get_sign_pvals(self, alpha=0.1, min_present=5):
        '''Get FDR corrected p-values for rejecting the null hypothesis that the signs of the ratios originate from a p=0.5 binomial distribution.

        This test is used in order to identify features that increase/decrease significantly. For example, if the RatioExperiment is created for
        pre- and post-treatment samples of individuals (ratio is pre/post), get_sign_pvals can be used to identify features that significantly
        increase/decrease following the treatment.

        NOTE: The test is performed only on the finite, non-zero feature values.

        Parameters
        ----------
        alpha: float, optional
            The required FDR control level
        min_present: int, optional
            The minimal number of samples where the ratio is not nan or zero in order to include in the test.
            Used as filtering to achieve better FDR power (less hypothesis to test)

        Returns
        -------
        RatioExperiment
            Only features with higher than random number of positive or negative ratios.
            Features are sorted by the effect size (and by p-value for similar effect size).
            The feature_metadata contains 4 new fields: '__calour_stat', '_calour_pval', '_calour_qval', '_calour_direction'
            , similar to calour.analysis.diff_abundance().
            If no feature has at least ``min_present`` usable ratios, the (empty) filtered experiment is returned as is.
        '''
        exp = self.copy()

        # need to convert to non-sparse in order to use np.isfinite()
        exp.sparse = False

        # scipy.stats.binom_test was deprecated in SciPy 1.10 and removed in
        # 1.12; prefer its replacement binomtest() and fall back to the legacy
        # function on older SciPy versions.
        binomtest = getattr(scipy.stats, 'binomtest', None)

        num_features = exp.shape[1]
        keep = []
        pvals = np.ones(num_features)
        esize = np.zeros(num_features)
        for idx in range(num_features):
            cdat = exp.data[:, idx]
            finite = cdat[np.isfinite(cdat)]
            cnpos = np.sum(finite > 0)
            cnneg = np.sum(finite < 0)
            ctotal = cnpos + cnneg
            # test if we have enough non-zero samples
            if ctotal >= min_present:
                # calculate the two-sided binomial p-value and effect size for the feature
                if binomtest is None:
                    pvals[idx] = scipy.stats.binom_test(cnpos, ctotal)
                elif ctotal > 0:
                    # binomtest() rejects n == 0 (only reachable when
                    # min_present <= 0); the p-value then stays at its
                    # initial value of 1.
                    pvals[idx] = binomtest(int(cnpos), int(ctotal)).pvalue
                esize[idx] = (cnpos - cnneg) / ctotal
                keep.append(idx)
        logger.debug('keeping %d features with enough ratios' % len(keep))
        exp = exp.reorder(keep, axis='f')
        if len(keep) == 0:
            # Nothing passed the min_present filter, so nothing can be tested.
            logger.warning('No significant features found')
            return exp

        pvals = pvals[keep]
        esize = esize[keep]

        # multiple testing correction using Benjamini-Hochberg FDR
        # note we cannot use dsFDR as this is not a 2 group test
        reject, qvals, *_ = multipletests(pvals, alpha=alpha, method='fdr_bh')
        newexp = _new_experiment_from_pvals(exp, None, reject, esize, pvals,
                                            qvals)
        # set the effect direction field
        newexp.feature_metadata[_CALOUR_DIRECTION] = [
            'positive' if x > 0 else 'negative'
            for x in newexp.feature_metadata[_CALOUR_STAT]
        ]

        logger.info('found %d significant' % len(newexp.feature_metadata))
        return newexp
Example #46
0
                'elem_id': setElem,
                'population_size': populationSize,
                'success_population': numSuccInPopulation,
                'sample_size': sampleSize,
                'success_samples': drawnSuccesses,
                'pval': pval,
                'sample_success_fraction': fractionOfHitSamples,
                'genes': ";".join(successIntersection),
                'direction': direction
            }

            setToResult[setElem] = resultObj

        sortedElems = [x for x in setToResult]
        elemPvals = [setToResult[x]["pval"] for x in sortedElems]

        rej, elemAdjPvals, _, _ = multipletests(elemPvals,
                                                alpha=0.05,
                                                method='fdr_bh',
                                                is_sorted=False,
                                                returnsorted=False)

        for eidx, elem in enumerate(sortedElems):
            assert (setToResult[elem]['pval'] == elemPvals[eidx])
            setToResult[elem]['adj_pval'] = elemAdjPvals[eidx]

        for elem in sortedElems:
            dr = DataRow.fromDict(setToResult[elem])
            outdf.addRow(dr)

    outdf.export(args.output.name)
Example #47
0
def plot_heatmaps(xs, ys, rhos, p_values, time):
    """Export four heatmaps for a correlation grid.

    The p-values are flattened, Benjamini-Hochberg corrected at level 0.05
    and reshaped to ``(len(ys), len(xs))``.  Four figures are then written to
    the module-level ``out_path`` as ``.html``, ``.png`` and ``.pdf``, each
    file name ending in ``_<time>``:

    * ``rhos``          -- the ``rhos`` values,
    * ``p_values``      -- ``-log10`` of the raw p-values,
    * ``p_values_corr`` -- ``-log10`` of the corrected p-values,
    * ``passed``        -- 1 where the corrected test rejects, else 0.
    """
    axis_font = dict(family='Arial', color='black', size=2)
    y_axis = dict(type='category',
                  showgrid=True,
                  showline=True,
                  mirror='ticks',
                  titlefont=dict(axis_font),
                  showticklabels=True,
                  tickangle=0,
                  tickfont=dict(axis_font),
                  exponentformat='e',
                  showexponent='all')
    layout = go.Layout(margin=get_margin(),
                       autosize=True,
                       showlegend=False,
                       yaxis=y_axis)

    passed, p_values_corr, _, _ = multipletests(p_values.flatten(),
                                                0.05,
                                                method='fdr_bh')
    grid_shape = (len(ys), len(xs))
    passed = passed.reshape(grid_shape).astype(int)
    p_values_corr = p_values_corr.reshape(grid_shape)

    def _export(stem, **heatmap_kwargs):
        # Render one heatmap and write it in all three output formats.
        trace = go.Heatmap(x=xs, y=ys, **heatmap_kwargs)
        fig = go.Figure(data=trace, layout=layout)
        target = out_path + '/' + stem + '_' + time
        plotly.offline.plot(fig,
                            filename=target + '.html',
                            auto_open=False,
                            show_link=True)
        for extension in ('.png', '.pdf'):
            plotly.io.write_image(fig, target + extension)

    _export('rhos', z=rhos, colorscale=balance)
    _export('p_values', z=-np.log10(p_values), colorscale=dense_inv)
    _export('p_values_corr', z=-np.log10(p_values_corr), colorscale=dense_inv)
    # The pass/fail map uses plotly's default colour scale.
    _export('passed', z=passed)
def anova_oneway_simulation(data,
                            variables,
                            effect_size,
                            sample_size,
                            alpha=0.05,
                            n_repeats=15,
                            weight_values=None,
                            weight_threshold=0.8,
                            modification_type='correlation',
                            class_balance=0.5,
                            multiple_testing_correction='fdr_by'):
    """
    Worker function to perform power calculations for a one-way ANOVA model, with the effect size added
    through ``effect_cohen_d``.

    For every combination of effect size and sample size, ``n_repeats`` Monte Carlo repeats are run: rows of
    ``data`` are sampled without replacement, the effect is added to a random subset of the sampled rows on
    the selected variables, a one-way ANOVA is computed per variable between modified and unmodified rows,
    and the p-values are scored with ``score_confusionmetrics``.

    :param numpy.ndarray data: X data matrix (real or simulated), samples in rows and variables in columns.
    :param variables: Index or indices of the variables to modify (used to index an array of length
        ``data.shape[1]``). Must be a float when ``modification_type`` is 'proportion'.
    :param numpy.ndarray effect_size: array with effect size values to test
    :param numpy.ndarray sample_size: array with sample sizes to test
    :param float alpha: significance level passed to ``score_confusionmetrics`` and to ``multipletests``.
    :param int n_repeats: number of Monte Carlo repeats per (effect size, sample size) combination.
    :param numpy.ndarray weight_values: optional weights. With 'correlation' or 'correlation_weighted',
        variables whose absolute weight (any column, when 2-D) reaches ``weight_threshold`` are also
        modified and counted as expected hits. Also passed to ``effect_cohen_d`` and
        ``score_confusionmetrics`` when ``modification_type`` is 'correlation_weighted'.
    :param float weight_threshold: threshold applied to ``abs(weight_values)``.
    :param str modification_type: one of 'correlation', 'manual', 'proportion', 'correlation_weighted'.
    :param float class_balance: fraction of the sampled rows that receive the effect.
    :param str multiple_testing_correction: ``method`` passed to ``multipletests``, or None to skip the
        adjusted scoring.
    :return: ``results`` when ``multiple_testing_correction`` is None, otherwise the tuple
        ``(results, adjusted_results)``. Both are dicts with one array of shape
        ``(effect_size.size, sample_size.size, n_repeats)`` per entry of ``score_metrics`` plus the keys
        'Sample Size' and 'Effect Size'; ``adjusted_results`` also has 'method'.
    :raises ValueError: if ``modification_type`` is not supported.
    :raises TypeError: if ``modification_type`` is 'proportion' and ``variables`` is not a float.
    :raises NotImplementedError: when a simulation is attempted with ``modification_type`` 'proportion',
        for which no variable-selection rule is implemented.
    """
    import warnings

    if modification_type not in [
            'correlation', 'manual', 'proportion', 'correlation_weighted'
    ]:
        raise ValueError("modification_type argument not supported")
    if modification_type == 'proportion' and not isinstance(variables, float):
        raise TypeError(
            "When using 'proportion' as modification_type 'variables' must be a float"
        )

    # One result array per metric reported by score_confusionmetrics.
    grid_shape = (effect_size.size, sample_size.size, n_repeats)
    results = {key: np.zeros(grid_shape) for key in score_metrics}

    adjust = multiple_testing_correction is not None
    if adjust:
        adjusted_results = {key: np.zeros(grid_shape) for key in score_metrics}
        adjusted_results['method'] = multiple_testing_correction

    use_weights = modification_type == 'correlation_weighted'
    weight = weight_values if use_weights else None
    n_vars = data.shape[1]

    # Silence warnings only for the duration of the simulation instead of
    # changing the process-wide warning filters permanently.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        # Loop over effect size, sample size and finally each monte carlo repeat.
        # np.ndenumerate yields index tuples, which are used directly below.
        for eff_idx, curr_effect in np.ndenumerate(effect_size):
            for ssize_idx, curr_ssize in np.ndenumerate(sample_size):
                for rep_idx in range(n_repeats):
                    if modification_type == 'proportion':
                        # Previously this path failed with an
                        # UnboundLocalError because no variables were ever
                        # selected for it.
                        raise NotImplementedError(
                            "modification_type 'proportion' is not implemented"
                        )

                    # Select a subset of the samples (rows) to use
                    mod_data = np.copy(data[np.random.choice(
                        data.shape[0], curr_ssize, replace=False), :])

                    # Indicator vector of the variables that receive the effect
                    var_to_mod = np.zeros(n_vars, dtype='int')
                    var_to_mod[variables] = 1
                    # Also modify variables whose weight reaches the threshold
                    if weight_values is not None and modification_type in [
                            "correlation", "correlation_weighted"
                    ]:
                        if weight_values.ndim == 1:
                            var_to_mod |= abs(weight_values) >= weight_threshold
                        else:
                            var_to_mod |= np.any(
                                abs(weight_values) >= weight_threshold, axis=1)
                    # Every modified variable is an expected hit
                    expected_hits = var_to_mod

                    # Select a subset of samples to add the effect on
                    which_samples = np.random.choice(
                        range(curr_ssize),
                        int(np.floor(class_balance * curr_ssize)),
                        replace=False)

                    mod_data = effect_cohen_d(mod_data,
                                              curr_effect,
                                              which_vars=var_to_mod,
                                              which_samples=which_samples,
                                              standardized=True,
                                              noise=0,
                                              weight=weight)

                    # P-values for the one-way ANOVA (unmodified vs modified rows)
                    pvals = scistats.f_oneway(
                        np.delete(mod_data, which_samples, axis=0),
                        mod_data[which_samples, :])[1]

                    scored_res = score_confusionmetrics(
                        result_vector=pvals,
                        expected_hits=expected_hits,
                        weight_vector=weight,
                        alpha=alpha)
                    for key in scored_res.keys():
                        results[key][eff_idx, ssize_idx,
                                     rep_idx] = scored_res[key]

                    if adjust:
                        # Use the caller's alpha rather than a fixed 0.05 so
                        # that alpha-dependent corrections stay consistent
                        # with the scoring threshold.
                        adjusted_pvalues = multipletests(
                            pvals,
                            alpha=alpha,
                            method=multiple_testing_correction)[1]

                        scored_res = score_confusionmetrics(
                            result_vector=adjusted_pvalues,
                            expected_hits=expected_hits,
                            weight_vector=None,
                            alpha=alpha)
                        for key in scored_res.keys():
                            adjusted_results[key][eff_idx, ssize_idx,
                                                  rep_idx] = scored_res[key]

    results['Sample Size'] = sample_size
    results['Effect Size'] = effect_size

    if not adjust:
        return results

    adjusted_results['Sample Size'] = sample_size
    adjusted_results['Effect Size'] = effect_size
    return results, adjusted_results
Example #49
0
def save_top_manova(config, attributes_types, attribute_target, num_top=500, window=3, test=MANOVATest.pillai_bartlett):
    """Rank BOPs by a sliding-window MANOVA p-value and save the top ones.

    For every BOP with at least ``window`` CpGs present in the loaded data, a
    MANOVA of ``window`` consecutive CpG beta vectors against the requested
    attributes is fitted for each window position, and the smallest p-value
    of ``attribute_target`` (for the statistic selected by ``test``) is kept
    for the BOP.  The per-BOP p-values are Benjamini-Hochberg corrected and
    the ``num_top`` BOPs with the smallest corrected p-values are written,
    together with their genes, via ``save_features``.  A second file with the
    unique genes of those BOPs is written after switching ``config`` to the
    gene data type.

    :param config: project configuration object; ``approach_gd`` and ``dt``
        are modified (``dt`` is set to ``DataType.cpg`` on exit).
    :param attributes_types: iterable of ``Attribute`` / ``CellPop`` members
        used as explanatory terms.
    :param attribute_target: member whose ``.value`` names the term whose
        p-value is extracted.
    :param num_top: number of top BOPs to save.
    :param window: number of consecutive CpGs per MANOVA (must be >= 1).
    :param test: ``MANOVATest`` member selecting the statistic; anything
        unrecognised falls back to the first row (Wilks).
    :raises ValueError: if ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError('window must be a positive integer')

    dict_bop_cpgs = load_bop_cpg_dict(config)
    dict_bop_genes = get_dict_bop_genes(config, dict_bop_cpgs)
    cpgs, betas = load_cpg_data(config)

    # Position of the first occurrence of every CpG (same answer as
    # cpgs.index, without rescanning the sequence for each lookup).
    cpg_row = {}
    for row_id, cpg in enumerate(cpgs):
        cpg_row.setdefault(cpg, row_id)

    atr_table = []
    atr_cols = []
    for atr_type in attributes_types:
        if isinstance(atr_type, Attribute):
            atr_table.append(get_attributes(config, atr_type))
        elif isinstance(atr_type, CellPop):
            atr_table.append(get_cell_pop(config, [atr_type]))
        atr_cols.append(atr_type.value)

    # The column names and the formula do not depend on the BOP or on the
    # window position, so build them once.
    val_cols = ['cpg_' + str(cpg_id) for cpg_id in range(window)]
    cols = atr_cols + val_cols
    formula = ' + '.join(val_cols) + ' ~ ' + ' + '.join(atr_cols)

    # Row of the statistics table that holds the requested test.
    if test is MANOVATest.pillai_bartlett:
        stat_row = 1
    elif test is MANOVATest.lawley_hotelling:
        stat_row = 2
    elif test is MANOVATest.roy:
        stat_row = 3
    else:
        stat_row = 0

    num_bops = 0
    bops_passed = []
    bops_pvals = []
    for bop in dict_bop_cpgs:
        cpgs_passed = [cpg for cpg in dict_bop_cpgs.get(bop) if cpg in cpg_row]
        # The window bounds follow the ``window`` argument; with the default
        # of 3 this is the same set of windows as the former hard-coded
        # ``> 2`` / ``- 2``.
        if len(cpgs_passed) >= window:
            pvals_on_bop = []
            for win_id in range(len(cpgs_passed) - window + 1):
                val_table = [
                    betas[cpg_row[cpg]]
                    for cpg in cpgs_passed[win_id:win_id + window]
                ]
                # Transpose from one list per column to one list per subject.
                table = list(map(list, zip(*(atr_table + val_table))))
                x = pd.DataFrame(table, columns=cols)
                manova = MANOVA.from_formula(formula, x)
                mv_test_res = manova.mv_test()
                pvals = mv_test_res.results[attribute_target.value]['stat'].values[0:4, 4]
                pvals_on_bop.append(pvals[stat_row])
            bops_passed.append(bop)
            bops_pvals.append(np.min(pvals_on_bop))
        num_bops += 1
        if num_bops % config.print_rate == 0:
            print('num_bops: ' + str(num_bops))

    pvals_corrected = multipletests(bops_pvals, 0.05, method='fdr_bh')[1]
    order = np.argsort(pvals_corrected)
    bops_opt = list(np.array(bops_passed)[order])[0:num_top]
    pvals_opt = list(np.array(pvals_corrected)[order])[0:num_top]

    genes_opt = []
    genes_from_bop = []
    seen_genes = set()
    for bop in bops_opt:
        curr_genes = dict_bop_genes.get(bop)
        genes_opt.append(';'.join(curr_genes))
        # Collect unique genes in first-seen order.
        for gene in curr_genes:
            if gene not in seen_genes:
                seen_genes.add(gene)
                genes_from_bop.append(gene)

    fn = 'top.txt'
    fn = get_result_path(config, fn)
    save_features(fn, [bops_opt, genes_opt, pvals_opt])

    # The gene list is stored under the gene-level result path, so the config
    # is switched before resolving the path and set back to CpG afterwards.
    config.approach_gd = GeneDataType.from_bop
    config.dt = DataType.gene
    fn = 'top.txt'
    fn = get_result_path(config, fn)
    save_features(fn, [genes_from_bop])
    config.dt = DataType.cpg
Example #50
0
def Enrichment_Analyses_GO_terms(Name_Network,
                                 save_directory,
                                 Annotation_Directory,
                                 Original_Network_Name,
                                 enrichment="NO",
                                 MaxSize=500,
                                 MinSize=5,
                                 Repetitions=10,
                                 total_K=100,
                                 Rand_K_Compare=66,
                                 comparison="Bin_VS_Prob"):
    """Run repeated GO-term enrichment over clusterings and save the results.

    For every repetition and every cluster count K in
    ``range(1, total_K, 5)`` the genes are clustered with ``Cluster_Option``
    (method "Kmedoids"), each GO term present in a cluster is tested with a
    hypergeometric upper-tail test, the p-values of that cluster are corrected
    with Benjamini-Hochberg, and terms with a corrected p-value below 0.05 are
    counted as enriched.  Per-repetition summaries are produced by
    ``Function_GO_enriched``, ``Function_Calculate_Clust_Perc`` and
    ``Function_Gene_enriched`` and accumulated over all repetitions.

    Parameters:
        Name_Network: suffix of the distance file
            (``save_directory + "_Result_Tijana_Final_" + Name_Network``) and
            part of every output file name.
        save_directory: path prefix used (by plain string concatenation) for
            all files that are read and written.
        Annotation_Directory: path of the tab-separated annotation file; it
            must contain a "Level" column, which is dropped, and two further
            columns that are renamed to "Gene" and "GO_Term".
        Original_Network_Name: suffix of the gene-name file
            (``save_directory + "_Gene_Names_" + Original_Network_Name``).
        enrichment: free-text tag embedded in the output file names.
        MaxSize: GO terms annotating this many genes or more are discarded.
        MinSize: GO terms annotating this many genes or fewer are discarded.
        Repetitions: number of times the whole K sweep is repeated.
        total_K: exclusive upper bound of the K sweep (step 5, starting at 1).
        Rand_K_Compare: not used by the active code (only referenced in a
            commented-out filter near the end).
        comparison: not used anywhere in this function.

    Returns:
        None.  Five ".txt" files (CSV format) are written next to
        ``save_directory``: "_Enrichment_GO", "_Enrichment_Cluster",
        "_Enrichment_Genes", "_Cluster_Information_" and "_Rand_Information_".
    """

    ## Preparation of the Network to Enrich ##

    # All the information of the data base:

    GO_Complete_Experimental = pd.read_csv(Annotation_Directory,
                                           sep="\t",
                                           header=0)

    GO_Complete_Experimental.drop("Level", inplace=True, axis=1)
    GO_Complete_Experimental.columns = ["Gene", "GO_Term"]

    # The genes of our Network:
    # (first line of the file is skipped; column 1 holds the gene names)

    genes_Network = pd.read_csv(save_directory + "_Gene_Names_" +
                                str(Original_Network_Name),
                                sep=",",
                                header=None,
                                skiprows=[0])
    genes_Network = pd.DataFrame({"Gene": genes_Network[1]})

    # Annotation of the genes of our Network:

    GO_Join_Network = GO_Complete_Experimental[
        GO_Complete_Experimental.Gene.isin(genes_Network.Gene)]

    # Filters:
    # keep GO terms whose number of distinct annotated genes is strictly
    # between MinSize and MaxSize (both bounds are exclusive).

    Cut_By = pd.DataFrame(
        GO_Join_Network.groupby('GO_Term')['Gene'].nunique(dropna=True))
    Cut_By = Cut_By[Cut_By.Gene > MinSize]
    Cut_By = Cut_By[Cut_By.Gene < MaxSize]

    GO_Join_Network = GO_Join_Network[GO_Join_Network.GO_Term.isin(
        Cut_By.index)]

    ## Distances Preparation for clustering ##

    # Loading:
    # space-separated file with a header row; the column literally named '1'
    # becomes the index.

    Distance = pd.read_csv(
        save_directory + "_Result_Tijana_Final_" + Name_Network,
        sep=" ",
        header=0,
    )
    Distance.set_index('1', inplace=True)

    ## Variables to store the results ##

    # Result DataFrames:

    Results_GO_Enrichment_Final = pd.DataFrame(
        columns=["K_Option", "Terms_Enriched"])
    Results_Cluster_Enrichment_Final = pd.DataFrame(
        columns=["K_Option", "Cluster_Enriched"])
    Results_Genes_Percent_Final = pd.DataFrame(
        columns=["K_Option", "Total_enriched"])

    # Results for Rand_Index:

    Results_Rand_Index = pd.DataFrame(columns=["Option", "Cluster", "Terms"])

    # Results for Gene:

    Results_Gene_GO_final = pd.DataFrame(
        columns=["Gene", "GO_Term", "Cluster", "Option", "Repetition"])

    ## Enrichment Analyses ##:

    for Statistics in range(Repetitions):

        print("Repetition number", Statistics)

        # Per each repetition we should reload the variables:

        Results_Enrichment = pd.DataFrame(columns=[
            "Option", "Enriched_GO", "Cluster", "Num_Genes_Annotated", "Term"
        ])

        Results_Gene_GO = pd.DataFrame(
            columns=["Gene", "GO_Term", "Cluster", "Option", "Repetition"])

        for option in range(1, total_K, 5):

            # First we start with the computing of the clusters

            medoids_ini = option

            print("K number", option)

            # NOTE(review): the code below indexes clusters[i] positionally
            # into genes_Network, so Cluster_Option presumably returns one
            # collection of row positions per cluster -- confirm.
            clusters = Cluster_Option(option,
                                      genes_Network,
                                      Distance,
                                      method_clust="Kmedoids")

            clusters_Data_frame = pd.DataFrame(
                columns=["Gene", "Cluster", "Option"])

            for i in range(len(clusters)):

                # Cluster ids are stored 1-based (i + 1).
                gene_selection = genes_Network.iloc[clusters[i]]
                cluster_Repetition = np.repeat(i + 1, len(gene_selection))
                Option_Repetition = np.repeat(option, len(gene_selection))
                Iterator_DB = pd.DataFrame({
                    'Gene': gene_selection["Gene"],
                    'Cluster': cluster_Repetition,
                    "Option": Option_Repetition
                })
                clusters_Data_frame = clusters_Data_frame.append(Iterator_DB)

            # Number of genes annotated and how many of them are in each category:

            Total_Annotated_Genes = GO_Join_Network["Gene"].nunique()
            Number_Genes_per_GO = GO_Join_Network.groupby(
                'GO_Term')['Gene'].nunique(dropna=True)

            # With this information we go to the cluster:

            # For each cluster:

            for cluster in range(option):

                # Cluster Selection:

                selection_cluster = clusters_Data_frame[
                    clusters_Data_frame.Cluster == (cluster + 1)]

                # Annotation of genes in the cluster with GO:

                GO_Selected = GO_Join_Network[GO_Join_Network.Gene.isin(
                    selection_cluster.Gene)]

                # Put inside of the external variable to keep info:

                Genes_itera = pd.DataFrame({
                    "Gene": GO_Selected.Gene,
                    "GO_Term": GO_Selected.GO_Term,
                    'Cluster': cluster + 1,
                    "Option": option,
                    "Repetition": Statistics
                })

                Results_Gene_GO = Results_Gene_GO.append(Genes_itera)

                # Number Genes with a concrete GO term in the cluster:

                k_selection = GO_Selected.groupby('GO_Term')['Gene'].nunique()

                # Right join: one row per GO term present in this cluster,
                # network-wide count first, in-cluster count second.
                K_and_k_data_frame = pd.merge(Number_Genes_per_GO,
                                              k_selection,
                                              on='GO_Term',
                                              how='right')

                # Total genes with annotation in the cluster:

                Total_Annotated_Cluster = GO_Selected["Gene"].nunique()

                # For the results of the enrichment in the cluster:

                Results_Enrichment_Cluster = pd.DataFrame(
                    columns=["GO_Term", "p_value", 'Cluster', 'Option'])

                # For each GO term in the cluster:

                # Probabilistic:

                for GO_term in range(len(K_and_k_data_frame)):

                    # M: annotated genes in the network, k: genes carrying
                    # this term in the network, N: annotated genes in the
                    # cluster, X: genes carrying this term in the cluster.
                    # NOTE(review): [0] / [1] are meant as positions in the
                    # merged row (left column, right column) -- confirm this
                    # still resolves positionally on the pandas version used.
                    Enrich_GO = K_and_k_data_frame.iloc[GO_term]
                    M = Total_Annotated_Genes
                    k = Enrich_GO[0]
                    N = Total_Annotated_Cluster
                    X = Enrich_GO[1]

                    # sf(X - 1) is the upper tail including X itself, i.e.
                    # the probability of seeing at least X annotated genes.
                    p_value = hypergeom.sf(X - 1, M, k, N)

                    results_db_iter = pd.DataFrame({
                        'GO_Term':
                        K_and_k_data_frame.index[GO_term],
                        'p_value': [p_value],
                        'Cluster': [cluster + 1],
                        'Option': [medoids_ini]
                    })

                    Results_Enrichment_Cluster = Results_Enrichment_Cluster.append(
                        results_db_iter)

                if len(K_and_k_data_frame
                       ) != 0:  # To avoid errors with empty clusters

                    # Only element [1] (the corrected p-values) is used below,
                    # so the alpha passed here does not influence the result;
                    # significance is decided by the explicit 0.05 cut-off.
                    p_value_Correction = multipletests(
                        Results_Enrichment_Cluster["p_value"],
                        alpha=0.01,
                        method='fdr_bh',
                        is_sorted=False,
                        returnsorted=False)

                    Results_Enrichment_Cluster[
                        'p_value_Correction'] = p_value_Correction[1]

                    count_GO_Enriched = sum(
                        Results_Enrichment_Cluster['p_value_Correction'] < 0.05
                    )

                    names_GO_Enriched = Results_Enrichment_Cluster[
                        Results_Enrichment_Cluster.p_value_Correction < 0.05]
                    names_GO_Enriched = list(names_GO_Enriched["GO_Term"])

                    Results_itera_Enrich = pd.DataFrame({
                        'Option': [medoids_ini],
                        "Enriched_GO": [count_GO_Enriched],
                        'Cluster': [cluster + 1],
                        "Num_Genes_Annotated": [Total_Annotated_Cluster],
                        "Term": [names_GO_Enriched]
                    })

                    Results_Enrichment = Results_Enrichment.append(
                        Results_itera_Enrich)

                elif len(K_and_k_data_frame) == 0:

                    # Cluster without any annotated gene: record a row with
                    # zero enriched terms so every cluster is represented.
                    Results_itera_Enrich = pd.DataFrame({
                        'Option': [medoids_ini],
                        "Enriched_GO": [0],
                        'Cluster': [cluster + 1],
                        "Num_Genes_Annotated": [Total_Annotated_Cluster],
                        "Term": [[]]
                    })

                    Results_Enrichment = Results_Enrichment.append(
                        Results_itera_Enrich)

                # Calculations per each iteration:

        # Summaries of this repetition (runs once the whole K sweep is done).
        GO_percent = Function_GO_enriched(Results_Enrichment,
                                          GO_Join_Network["GO_Term"].nunique(),
                                          total_K)
        Cluster_percent = Function_Calculate_Clust_Perc(
            Results_Enrichment, total_K)
        Genes_percent = Function_Gene_enriched(Results_Gene_GO,
                                               Results_Enrichment, total_K,
                                               genes_Network)

        Results_GO_Enrichment_Final = Results_GO_Enrichment_Final.append(
            GO_percent)
        Results_Cluster_Enrichment_Final = Results_Cluster_Enrichment_Final.append(
            Cluster_percent)
        Results_Genes_Percent_Final = Results_Genes_Percent_Final.append(
            Genes_percent)

        # For Rand:

        Rand = pd.DataFrame({
            "Option": Results_Enrichment.Option,
            "Cluster": Results_Enrichment.Cluster,
            "Terms": Results_Enrichment.Term
        })

        Results_Rand_Index = Results_Rand_Index.append(Rand)

        # For Cluster.

        Results_Gene_GO_final = Results_Gene_GO_final.append(Results_Gene_GO)

    # Only a concrete k value to compare with The Rand index:

    #Results_Rand_Index = Results_Rand_Index[Results_Rand_Index.Option == Rand_K_Compare]

    # Drop the placeholder columns the accumulators were created with.
    # NOTE(review): Results_Genes_Percent_Final keeps its placeholder columns
    # ("K_Option", "Total_enriched") -- confirm that this is intended.
    Results_GO_Enrichment_Final.drop("K_Option", inplace=True, axis=1)
    Results_GO_Enrichment_Final.drop("Terms_Enriched", inplace=True, axis=1)
    Results_Cluster_Enrichment_Final.drop("Cluster_Enriched",
                                          inplace=True,
                                          axis=1)
    Results_Cluster_Enrichment_Final.drop("K_Option", inplace=True, axis=1)

    # Save files just in case:

    Results_GO_Enrichment_Final.to_csv(save_directory + "_Enrichment_GO" +
                                       "_" + enrichment + "_" + Name_Network +
                                       ".txt",
                                       header=True,
                                       index=False)
    Results_Cluster_Enrichment_Final.to_csv(
        save_directory + "_Enrichment_Cluster" + "_" + enrichment + "_" +
        Name_Network + ".txt",
        header=True,
        index=False)
    Results_Genes_Percent_Final.to_csv(save_directory + "_Enrichment_Genes" +
                                       "_" + enrichment + "_" + Name_Network +
                                       ".txt",
                                       header=True,
                                       index=False)

    # Save info about Genes and Clusters:

    Results_Gene_GO_final.to_csv(save_directory + "_Cluster_Information_" +
                                 "_" + enrichment + "_" + Name_Network +
                                 ".txt",
                                 header=True,
                                 index=False)

    # Save Rand:

    Results_Rand_Index.to_csv(save_directory + "_Rand_Information_" + "_" +
                              enrichment + "_" + Name_Network + ".txt",
                              header=True,
                              index=False)
Example #51
0
def pfam_hyg(pfam):
    """Test one Pfam family for over-representation in ``gi1_pfams``.

    The module-level list ``full_pfams`` is the background population and
    ``gi1_pfams`` is the sample drawn from it.

    Parameters:
        pfam: family identifier to look up in both lists.

    Returns:
        (p, ratio): ``p`` is the hypergeometric probability of observing at
        least as many occurrences of ``pfam`` in the sample as were actually
        seen; ``ratio`` is the frequency of ``pfam`` in the sample divided by
        its frequency in the background.

    Raises:
        ZeroDivisionError: if the sample is empty or ``pfam`` does not occur
        in ``full_pfams``.
    """
    k = gi1_pfams.count(pfam)   # occurrences in the sample
    M = len(full_pfams)         # background size
    N = len(gi1_pfams)          # sample size
    n = full_pfams.count(pfam)  # occurrences in the background
    # sf(x) is P(X > x); an enrichment p-value must include the observed
    # count itself, hence k - 1 to obtain P(X >= k).
    p = hypergeom.sf(k - 1, M, n, N)
    ratio = (float(k) / N) / (float(n) / M)
    return p, ratio


# Run the hypergeometric test once per family and keep both of its outputs
# (p-value and sample/background frequency ratio).
pfam_stats = {p: pfam_hyg(p) for p in all_pfams}
pfams_pvals = {p: stat[0] for p, stat in pfam_stats.items()}
pfams_effect = {p: stat[1] for p, stat in pfam_stats.items()}

# Benjamini-Hochberg correction.  The corrected values are paired with the
# very keys the raw p-values were read from, so the pairing stays correct even
# if all_pfams contains duplicates or the dict iterates in a different order
# than all_pfams.
tested_pfams = list(pfams_pvals)
adj_pval = dict(
    zip(tested_pfams,
        multi.multipletests([pfams_pvals[p] for p in tested_pfams],
                            method="fdr_bh")[1]))

# Families significant at an adjusted p-value below 0.05, keyed by the
# accession without its ".version" suffix.
sigs = {
    k.split(".")[0]: {
        'name': pfam2name[k.split(".")[0]],
        'pval': pfams_pvals[k],
        'adj.pval': v,
        'ratio': pfams_effect[k]
    }
    for k, v in sorted(adj_pval.items(), key=lambda x: x[1]) if v < 0.05
}
pfam_table = DataFrame.from_dict(sigs, orient='index')
pfam_table = pfam_table.sort_values(by='adj.pval')
# Label families whose name contains "ransferase" (matches both
# "transferase" and "Transferase").
pfam_table['group'] = [
    "other transferase" if "ransferase" in p else "" for p in pfam_table.name
]
pfam_table['group'] = [
Example #52
0
def main(_):
  """Analyse rater agreement on the annotated data and write all plots.

  Steps, in order:
    1. Load the annotations (FLAGS.data) and target names (FLAGS.target_file);
       drop rows flagged "neutral", rows without any rating, and examples
       with fewer than three rows.
    2. Build a (examples x raters x targets) rating tensor plus a rater mask.
    3. Compute leave-one-rater-out correlations (LeaveOut), test each
       dimension with a Wilcoxon signed-rank test and Bonferroni-correct.
    4. Randomly split each example's raters in two halves, run PPCA on the
       half-averages and apply a varimax rotation.
    5. Write to FLAGS.plot_dir: component_loadings.pdf, target_agreements.csv,
       label_distr_agreement.pdf and tsne.html (PlotCovar is also called).

  Args:
    _: unused (positional argument supplied by the app runner).
  """
  print("Loading data...")
  data = pd.read_csv(FLAGS.data, encoding="utf-8")
  print("%d Examples" % (len(set(data["id"]))))
  print("%d Annotations" % len(data))
  os.makedirs(FLAGS.plot_dir, exist_ok=True)

  with open(FLAGS.target_file, "r") as f:
    all_targets = f.read().splitlines()
  all_targets_neutral = all_targets + ["neutral"]
  target2idx = {e: i for i, e in enumerate(all_targets)}
  print("%d Target Categories" % len(all_targets))

  print("Processing data...")

  # Remove neutral labels
  data = data[data["neutral"] == 0]

  # Remove examples with no ratings (difficult examples)
  data = data[data[all_targets_neutral].sum(axis=1) != 0]

  # Convert into num_examples x num_raters x num_ratings format
  data = data.groupby("id").filter(lambda x: len(x) >= 3)
  id_groups = data.groupby("id")

  worker2examples = {}  # dict mapping worker ids to (example, rater id) tuples
  max_num_raters = data.groupby("id").size().max()
  ratings = np.zeros(
      (len(id_groups), max_num_raters, len(all_targets)))  # ignore "neutral"
  rater_msk = np.zeros(
      (len(id_groups), max_num_raters))  # for masking out non-existent raters
  print("Ratings shape", ratings.shape)

  # Get ratings and rater mask
  texts = []
  for ex_idx, (_, g) in enumerate(id_groups):
    texts.append(g.iloc[0]["text"])
    rater_count = 0

    # iterate through workers
    for _, row in g.iterrows():
      for e in all_targets:
        ratings[ex_idx, rater_count, target2idx[e]] = row[e]
      # One mask entry per rater slot; it does not depend on the target.
      rater_msk[ex_idx, rater_count] = 1

      worker2examples.setdefault(row["rater_id"], []).append(
          (ex_idx, rater_count))
      rater_count += 1

  print("Calculating leave-out (partial) correlations...")
  partial_corr_per_rater = []
  corr_per_rater = []
  for worker_id in worker2examples:
    partial_corrs, corrs = LeaveOut(ratings, rater_msk, worker2examples,
                                    worker_id)
    # Skip raters for whom not every target dimension could be computed.
    if len(partial_corrs) < len(all_targets):
      continue

    partial_corr_per_rater.append(partial_corrs)
    corr_per_rater.append(corrs)
  corr_per_rater = np.array(corr_per_rater)
  partial_corr_per_rater = np.array(partial_corr_per_rater)

  # Verify that there are no NaN values
  assert np.isnan(corr_per_rater).sum() == 0

  # Apply Wilcoxon signed rank test to test significance of each dimension
  p_vals = np.apply_along_axis(wilcoxon, 0, partial_corr_per_rater)[1]

  # Apply Bonferroni correction
  reject, corr_pvals, _, newalpha = multipletests(
      p_vals, alpha=0.05, method="bonferroni")
  print("Which dimensions to keep?")
  print(reject)
  print(corr_pvals)
  print(newalpha)

  print("Running PPCA on all the data...")
  # Take all raters and split them randomly
  x = []
  y = []
  rater_counts = rater_msk.sum(axis=1).astype(int)
  all_ratings_avg = []
  for i, ex in enumerate(ratings):
    # Get actual raters based on mask
    keep = []
    for worker_rating in ex[:rater_counts[i]]:
      keep.append(list(worker_rating))
    all_ratings_avg.append(list(np.array(keep).mean(axis=0)))

    # Shuffle raters randomly
    random.shuffle(keep)

    # First half of the shuffled raters goes to x, the rest to y.
    num_raters = len(keep)
    x.append(list(np.array(keep[:int(num_raters / 2)]).mean(axis=0)))
    y.append(list(np.array(keep[int(num_raters / 2):]).mean(axis=0)))

  x = np.array(x)
  y = np.array(y)
  all_ratings_avg = np.array(all_ratings_avg)
  w, v = PPCA(x, y)  # final components (p-values determine which ones to keep)

  print("Plotting percentage of covariance explained...")
  PlotCovar(v)

  # Apply varimax rotation
  w_vari = Varimax(w)

  # Get mapping between ppcs and targets
  map_df = pd.DataFrame(
      w_vari, index=all_targets, columns=np.arange(len(all_targets))).round(4)
  # Sort to move values to diagonal
  # NOTE(review): pd.Series.nonzero only exists in older pandas releases --
  # confirm the pinned pandas version before upgrading.
  map_df = map_df[list(
      np.argsort(map_df.apply(lambda x: pd.Series.nonzero(x)[0]).values)[0])]
  f = plt.figure(figsize=(10, 6), dpi=300)
  sns.heatmap(
      map_df,
      center=0,
      cmap=sns.diverging_palette(240, 10, n=50),
      yticklabels=all_targets)
  plt.xlabel("Component")
  plt.savefig(
      FLAGS.plot_dir + "/component_loadings.pdf",
      dpi=600,
      format="pdf",
      bbox_inches="tight")
  # Each component is named after the target with the largest absolute loading.
  ppc2target = map_df.abs().idxmax().to_dict()
  target2ppc = {e: i for i, e in ppc2target.items()}
  print(ppc2target)

  print("Plotting frequency and mean left-out rater correlations...")
  corr_mean = corr_per_rater.mean(axis=0)
  corr_mean_ordered = [corr_mean[target2ppc[e]] for e in all_targets]
  df_plot = pd.DataFrame({
      "target": all_targets,
      "agreement": corr_mean_ordered
  })
  df_plot["count"] = df_plot["target"].map(
      data[all_targets].sum(axis=0).to_dict())
  df_plot.sort_values("count", ascending=False, inplace=True)
  df_plot.to_csv(FLAGS.plot_dir + "/target_agreements.csv", index=False)

  # Get colors
  norm = plt.Normalize(df_plot["agreement"].min(), df_plot["agreement"].max())
  sm = plt.cm.ScalarMappable(cmap="BuPu", norm=norm)
  sm.set_array([])

  # Generate figure
  fig = plt.figure(dpi=600, figsize=(5, 6))
  ax = sns.barplot(
      data=df_plot,
      y="target",
      x="count",
      orient="h",
      hue="agreement",
      palette="BuPu",
      dodge=False,
      edgecolor="black",
      linewidth=1)
  ax.get_legend().remove()
  ax.figure.colorbar(sm)
  plt.text(18000, 31, "Interrater\nCorrelation", ha="center")
  plt.xlabel("Number of Examples")
  plt.ylabel("")
  plt.draw()
  labels = [item.get_text() for item in ax.get_xticklabels()]
  ax.set_xticklabels(["%dk" % (int(int(label) / 1000)) for label in labels])
  plt.tight_layout()
  fig.savefig(
      FLAGS.plot_dir + "/label_distr_agreement.pdf",
      dpi=600,
      format="pdf",
      bbox_inches="tight")

  print("Generating t-SNE plot...")
  # Get PPC scores for all examples
  all_ratings_avg = Demean(all_ratings_avg)  # demean all ratings
  ppc_scores = all_ratings_avg.dot(w_vari)  # project onto ppcs
  ppc_scores_abs = np.absolute(ppc_scores)

  # Load maximally distinct colors
  colors = pd.read_csv(
      FLAGS.rgb_colors, sep="\t", header=None, names=np.arange(3))

  # Set colors (todo(ddemszky): add names to colors in file)
  palette_rgb = colors.values
  with open(FLAGS.target_color_order) as f:
    color_order = f.read().splitlines()
  ppc2color = {target2ppc[e]: i for i, e in enumerate(color_order)}
  # get rgb value for each example based on weighted average of top targets
  rgb_vals = []
  hex_vals = []
  top_categories = []
  threshold = 0.5  # exclude points not loading on any of the top 10 categories
  counter = 0
  rgb_max = 255
  # The palette row right after the per-target rows is the fallback colour.
  other_color = palette_rgb[len(all_targets), :]
  for i, scores in enumerate(ppc_scores_abs):

    # Up to two strongest components, kept only if above the threshold.
    top_ppcs = [
        idx for idx in (-scores).argsort()[:2] if scores[idx] > threshold
    ]
    top_targets = ",".join([ppc2target[idx] for idx in top_ppcs
                           ]) if top_ppcs else "other"
    top_categories.append(top_targets)
    if len(top_ppcs) < 1:  # doesn't have top targets from list
      color = other_color  # use grey
      counter += 1
    else:
      # Weighted average of top targets (square->weighted average->square root)
      color_ids = [ppc2color[idx] for idx in top_ppcs]
      weights = [scores[idx] for idx in top_ppcs]
      # Need to round, otherwise floating point precision issues will result
      # in values slightly above 1
      avg = np.round(
          np.sqrt(
              np.average(
                  np.power(palette_rgb[color_ids] * rgb_max, 2),
                  axis=0,
                  weights=weights)) / rgb_max, 4)
      if (avg > 1).sum() > 0:
        print(avg)
      color = avg
    rgb_vals.append(list(color))
    hex_vals.append("#%02x%02x%02x" %
                    tuple(np.array(color * rgb_max, dtype=int)))
  rgb_vals = np.array(rgb_vals)

  # Create t-SNE model
  tsne_model = TSNE(
      perplexity=30,
      n_components=2,
      n_iter=1000,
      random_state=23,
      learning_rate=500,
      init="pca")
  new_values = tsne_model.fit_transform(ppc_scores)
  x = []
  y = []
  for value in new_values:
    x.append(value[0])
    y.append(value[1])
  # Put data in dataframe
  df = pd.DataFrame({
      "x": x,
      "y": y,
      "color": hex_vals,
      "label(s)": top_categories,
      "text": texts
  })

  df = df[df["label(s)"] != "other"]
  df["top_label"] = df["label(s)"].str.split(",").str[0]

  # Two selections:
  # - a brush that is active on the top panel
  # - a multi-click that is active on the bottom panel
  brush = alt.selection(type="interval")
  click = alt.selection_multi(encodings=["color"])

  # max 5000 examples can be plotted; cap the request at the number of rows
  # available, because sampling more rows than exist (without replacement)
  # raises a ValueError.
  sample = df.sample(min(len(df), 5000))
  points = alt.Chart(sample).mark_point(
      filled=True, size=50).encode(
          x="x:Q",
          y="y:Q",
          color=alt.Color("color", scale=None),
          tooltip=["label(s)", "text"]).properties(
              width=700, height=600).add_selection(brush)

  # Bottom panel is a bar chart
  bars = alt.Chart(sample).mark_bar().encode(
      x="count()",
      y="top_label:N",
      color=alt.condition(click, alt.Color("color:N", scale=None),
                          alt.value("lightgray")),
  ).transform_filter(brush.ref()).properties(
      width=700, selection=click)

  chart = alt.vconcat(
      points, bars, data=sample, title="t-SNE Projection of Examples")

  chart.save(FLAGS.plot_dir + "/tsne.html", format="html")
Example #53
0
    def get_score_df(self, correction_method=None):
        '''
        Compute per-term effect sizes (Cohen's d and the small-sample
        corrected variant stored as "hedges_r") between the category and the
        non-category documents, with standard errors, z-scores and one-sided
        p-values.

        :param correction_method: str or None, correction method from statsmodels.stats.multitest.multipletests
         'fdr_bh' is recommended.
        :return: pd.DataFrame indexed by term with columns cohens_d,
         cohens_d_se, cohens_d_z, cohens_d_p, hedges_r, hedges_r_se,
         hedges_r_z, hedges_r_p, m1, m2 and, when a correction method is
         given, hedges_r_p_corr and cohens_d_p_corr.  NaN values are
         replaced by 0.
        '''
        # From https://people.kth.se/~lang/Effect_size.pdf
        # Shinichi Nakagawa1 and Innes C. Cuthill. Effect size, confidence interval and statistical
        # significance: a practical guide for biologists. 2007. In Biological Reviews 82.
        #
        # Modification: when calculating variance, an empty document is added to each set

        # Row-normalise so each document row holds term proportions; rows
        # that sum to zero produce NaN and are reset to 0.
        X = self._get_X().astype(np.float64)
        X = X / X.sum(axis=1)
        X[np.isnan(X)] = 0
        cat_X, ncat_X = self._get_cat_and_ncat(X)

        # Prepend one all-zero row (the "empty document") to each group.
        empty_cat_X_smoothing_doc = np.zeros((1, cat_X.shape[1]))
        empty_ncat_X_smoothing_doc = np.zeros((1, ncat_X.shape[1]))
        smoothed_cat_X = np.vstack([empty_cat_X_smoothing_doc, cat_X])
        smoothed_ncat_X = np.vstack([empty_ncat_X_smoothing_doc, ncat_X])

        # Sample sizes are the number of documents (rows) in each smoothed
        # group.  The column count is the vocabulary size, which is identical
        # for both groups and is not a sample size.
        n1, n2 = float(smoothed_cat_X.shape[0]), float(
            smoothed_ncat_X.shape[0])
        n = n1 + n2
        m1 = cat_X.mean(axis=0).A1
        m2 = ncat_X.mean(axis=0).A1
        v1 = smoothed_cat_X.var(axis=0).A1
        v2 = smoothed_ncat_X.var(axis=0).A1

        s_pooled = np.sqrt(((n2 - 1) * v2 + (n1 - 1) * v1) / (n - 2.))
        cohens_d = (m1 - m2) / s_pooled
        cohens_d_se = np.sqrt(
            ((n - 1.) / (n - 3)) * (4. / n) * (1 + np.square(cohens_d) / 8.))
        cohens_d_z = cohens_d / cohens_d_se
        cohens_d_p = norm.sf(cohens_d_z)  # one-sided (upper tail)
        hedges_r = cohens_d * (1 - 3. / ((4. * (n - 2)) - 1))
        hedges_r_se = np.sqrt(n / (n1 * n2) + np.square(hedges_r) / (n - 2.))
        hedges_r_z = hedges_r / hedges_r_se
        hedges_r_p = norm.sf(hedges_r_z)  # one-sided (upper tail)

        score_df = pd.DataFrame(
            {
                'cohens_d': cohens_d,
                'cohens_d_se': cohens_d_se,
                'cohens_d_z': cohens_d_z,
                'cohens_d_p': cohens_d_p,
                'hedges_r': hedges_r,
                'hedges_r_se': hedges_r_se,
                'hedges_r_z': hedges_r_z,
                'hedges_r_p': hedges_r_p,
                'm1': m1,
                'm2': m2,
            },
            index=self.corpus_.get_terms()).fillna(0)

        if correction_method is not None:
            from statsmodels.stats.multitest import multipletests
            # Created first so the column order (hedges_r_p_corr before
            # cohens_d_p_corr) stays what existing callers see.
            score_df['hedges_r_p_corr'] = 0.5
            # Only terms that occur in at least one group are corrected;
            # terms with zero mean in both groups keep the neutral 0.5.
            occurring = (score_df['m1'] != 0) | (score_df['m2'] != 0)
            for method in ['cohens_d', 'hedges_r']:
                score_df[method + '_p_corr'] = 0.5
                score_df.loc[occurring, method + '_p_corr'] = multipletests(
                    score_df.loc[occurring, method + '_p'],
                    method=correction_method)[1]

        return score_df
Example #54
0
def parse_IPMASS(t=None, mode='log2'):
    """Build the P-vs-N enrichment table for one IP-mass time point.

    Reads ``<data dir>/<t>_P_N.xlsx``, compares the two "P" replicate LFQ
    intensity columns against the two "N" replicate columns with a per-row
    two-sample t-test, corrects the p-values with ``multi.multipletests``
    (library default method), writes ``<t>_enrich_table.xlsx`` and saves a
    scatter plot of log2(mean(P)) against log2(mean(N)) as
    ``<t>_enrich_pvalue.pdf``.

    Parameters:
        t: time-point label used in file and column names; defaults to 'T1'.
        mode: 'log2' to log2-transform every replicate intensity before
            summing, averaging and testing (a raw intensity of 0 is stored
            as 0.001 on the log2 scale); any other value works on the raw
            intensities and takes log2 of the group means afterwards (a mean
            of 0 is replaced by 1 first).

    Returns:
        (df, rep_N_log2_ls, rep_P_log2_ls, table): ``df`` is the result table
        plus a 'Gene Names' column (first ';'-separated gene name);
        the two lists hold the names of the per-replicate log2 columns;
        ``table`` is ``df`` restricted to the kept and calculated columns.
    """
    if t is None:
        t = 'T1'
    data_dir = '/Share2/home/zhangqf7/gongjing/zebrafish/script/zebrafish_structure/data/IP-mass'
    f = '%s/%s_P_N.xlsx' % (data_dir, t)

    df = pd.read_excel(f)
    # Parenthesised single-argument form prints identically on Python 2 and 3.
    print("read: %s, num=%s" % (f, df.shape[0]))

    rep_N = ['LFQ intensity %s_N1' % (t), 'LFQ intensity %s_N2' % (t)]
    rep_P = ['LFQ intensity %s_P1' % (t), 'LFQ intensity %s_P2' % (t)]
    cols_keep = ['Gene names'] + rep_N + rep_P
    rep_N_log2_ls = ['log2(%s)' % (i) for i in rep_N]
    rep_P_log2_ls = ['log2(%s)' % (i) for i in rep_P]

    if mode == 'log2':
        # log2 first
        for raw_col, log_col in zip(rep_N + rep_P,
                                    rep_N_log2_ls + rep_P_log2_ls):
            df[log_col] = [
                0.001 if float(v) == 0 else np.log2(float(v))
                for v in df[raw_col]
            ]
        test_N, test_P = rep_N_log2_ls, rep_P_log2_ls
    else:
        test_N, test_P = rep_N, rep_P

    df['sum(N)'] = df.loc[:, test_N].sum(axis=1)
    df['sum(P)'] = df.loc[:, test_P].sum(axis=1)
    # Divide by the number of replicates actually summed (the divisor used to
    # be a fixed 4.0 left over from a four-replicate layout).
    df['mean(N)'] = df['sum(N)'] / float(len(rep_N))
    df['mean(P)'] = df['sum(P)'] / float(len(rep_P))

    if mode == 'log2':
        # The means are already on the log2 scale here, so they are reused
        # for the log2 columns that the table and the plot require; without
        # them the column selection below fails with a KeyError.
        df['log2(mean(N))'] = df['mean(N)']
        df['log2(mean(P))'] = df['mean(P)']
    else:
        # Avoid log2(0): a zero mean is replaced by 1 (log2 -> 0).
        df['mean(N)'] = [1 if i == 0 else i for i in df['mean(N)']]
        df['mean(P)'] = [1 if i == 0 else i for i in df['mean(P)']]
        df['log2(mean(N))'] = np.log2(df['mean(N)'])
        df['log2(mean(P))'] = np.log2(df['mean(P)'])

    df['sum(P)-sum(N)'] = df['sum(P)'] - df['sum(N)']
    df['mean(P)-mean(N)'] = df['mean(P)'] - df['mean(N)']
    df['log2(mean(P)/mean(N))'] = df['log2(mean(P))'] - df['log2(mean(N))']

    # Per-row two-sample t-test, P replicates against N replicates.
    pvalue = []
    for index, row in df.iterrows():
        rep_N_val_ls = [row[i] for i in test_N]
        rep_P_val_ls = [row[i] for i in test_P]
        s, p = stats.ttest_ind(rep_P_val_ls, rep_N_val_ls)
        # An undefined test result (NaN) is treated as "no evidence".
        pvalue.append(1 if np.isnan(p) else p)

    # Element [1] of the result holds the corrected p-values.
    qvalue = multi.multipletests(pvalue)

    df['pvalue'] = pvalue
    df['qvalue'] = qvalue[1]
    df['-log10(qvalue)'] = -np.log10(df['qvalue'])
    df['-log10(pvalue)'] = -np.log10(df['pvalue'])
    cols_calc = [
        'sum(N)', 'sum(P)', 'mean(N)', 'mean(P)', 'log2(mean(N))',
        'log2(mean(P))', 'sum(P)-sum(N)', 'mean(P)-mean(N)',
        'log2(mean(P)/mean(N))', 'pvalue', 'qvalue', '-log10(pvalue)',
        '-log10(qvalue)'
    ]
    df = df[cols_keep + cols_calc]
    df.to_excel('%s/%s_enrich_table.xlsx' % (data_dir, t),
                header=True,
                index=False)

    fig, ax = plt.subplots(figsize=(6, 6))
    x_col = 'log2(mean(N))'
    y_col = 'log2(mean(P))'
    df.plot(kind='scatter', x=x_col, y=y_col, ax=ax)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    plt.title('time: %s (n=%s)' % (t, df.shape[0]))
    savefn = '%s/%s_enrich_pvalue.pdf' % (data_dir, t)

    df['Gene Names'] = [i.split(';')[0] for i in df['Gene names']]
    texts = []
    # NOTE(review): the -log10(0.05) threshold is compared against
    # log2(mean(P)); it looks like a leftover from a p-value y-axis --
    # confirm the intended cut-off.
    for x, y, gene in zip(df[x_col], df[y_col], df['Gene Names']):
        if y > -np.log10(0.05) and gene == 'elavl1':
            texts.append(plt.text(x, y, gene, fontsize=12))

    adjust_text(texts, only_move={'text': 'x'})

    plt.tight_layout()
    plt.savefig(savefn)
    plt.close()

    return df, rep_N_log2_ls, rep_P_log2_ls, df[cols_keep + cols_calc]
Example #55
0
def build_graph(pairs_occurances,filtered_clustering_table,alpha,method,verb=False):
    """Build a graph whose edges are the significantly co-occurring pairs.

    Pairs are selected either on their raw p-value (``method == 'pvalue'``)
    or after multiple-testing correction: any other ``method`` is handed to
    ``multipletests`` as the correction method.  Every selected pair becomes
    an edge whose weight is ``-log10`` of the p-value used for the selection,
    stored as a string.

    Parameters:
        pairs_occurances (pandas.DataFrame): one row per pair; the columns
            ``V1``, ``V2`` and ``pvalue`` are read.  The frame is modified in
            place: a ``pair`` column is added, plus ``pvalue_correct`` and
            ``reject`` when a correction method is used.
        filtered_clustering_table (pandas.DataFrame): the columns ``seed``,
            ``well``, ``type``, ``seq``, ``clusterSize`` and ``domain`` are
            read to annotate the nodes.
        alpha (float): significance threshold (raw p-value cut-off, or the
            ``alpha`` passed to ``multipletests``).
        method (str): ``'pvalue'`` or a ``multipletests`` method name.
        verb (bool): when True an extra progress message is logged.

    Returns:
        G (nx.Graph): graph with string edge weights and the node attributes
            ``well``, ``seq``, ``clusterSize``, ``domain`` and ``compressed``.
    """
    G = nx.Graph()

    # Materialise the pairs: on Python 3 ``zip`` is a one-shot iterator.
    pairs_occurances['pair'] = list(
        zip(pairs_occurances.V1.values, pairs_occurances.V2.values))

    # Select the significant rows and the column that provides the weight.
    if method == 'pvalue':
        significant = pairs_occurances[pairs_occurances['pvalue'] < alpha]
        weight_column = 'pvalue'
    else:
        reject, pvals_correct, _, _ = multipletests(
            pairs_occurances.pvalue.values, alpha, method)
        pairs_occurances['pvalue_correct'] = pvals_correct
        pairs_occurances['reject'] = reject
        significant = pairs_occurances[pairs_occurances['reject']]
        weight_column = 'pvalue_correct'

    # Pairs stored as text are parsed back into tuples; real tuples make
    # ``literal_eval`` fail, in which case they are used unchanged.  Only the
    # parsing errors are caught so that unrelated failures stay visible.
    try:
        edges = significant['pair'].apply(ast.literal_eval).values
    except (ValueError, TypeError, SyntaxError):
        edges = significant['pair'].values

    weights = significant[weight_column].apply(
        lambda p: -np.log10(p)).astype(str).values
    weightedEdges = [edge + (weight,) for edge, weight in zip(edges, weights)]

    G.add_weighted_edges_from(weightedEdges)

    log("Constructing network --> %s %s" % (method,alpha))
    log("%s nodes and %s edges found..." % (len(G.nodes()),len(G.edges)))

    if verb:
        log('annotating netwrok file...')

    # Per seed: the distinct wells, sorted numerically and joined with '_'.
    # The table is cast to str first, so seeds and wells are strings here.
    wellsReadsDict = filtered_clustering_table.astype(str).groupby('seed')['well'].apply(
        lambda x: set(x.tolist())).to_dict()
    s = pd.Series(wellsReadsDict)
    attr_dict = s.apply(lambda x: '_'.join(sorted(list(x), key=int))).to_dict()
    nx.set_node_attributes(G, name='well', values=attr_dict)

    # Copy per-seed attributes from the rows whose type is 'S' onto the nodes.
    centroids_indexs = filtered_clustering_table[filtered_clustering_table['type'] == 'S'].index
    for attr in ['seq','clusterSize','domain']:
        attr_dict = dict(zip(filtered_clustering_table.loc[centroids_indexs,'seed'],
                             filtered_clustering_table.loc[centroids_indexs,attr]))
        nx.set_node_attributes(G, name=attr, values=attr_dict)

    nx.set_node_attributes(G, name='compressed', values=0)

    return G
Example #56
0
def get_interactions():
    """Write the gene pairs whose mutual information beats a permutation null.

    Reads the gene-by-population table, computes the observed pairwise mutual
    information with ``get_mutual_information_binary_matrix`` and compares it
    against ``n_simulations`` randomised matrices from ``get_random_matrix``.
    The empirical p-value of a pair is ``(#null > observed + 1) / (n + 1)``;
    p-values are Benjamini-Hochberg corrected and pairs with a corrected
    p-value below 0.01 are written, as a symmetric gene-by-gene table of
    observed mutual information (0 where not significant), to a tab-separated
    file.

    NOTE(review): ``multipletests`` is called with ``alpha=0.05`` while the
    selection uses a 0.01 cut-off on the corrected p-values; the cut-off is
    what decides the output.

    Returns:
        None.  The result is written to disk.
    """
    df_path = '/Users/wrshoemaker/Desktop/ParEvol_test/data/Tenaillon_et_al/gene_by_pop.txt'
    df = pandas.read_csv(df_path, sep = '\t', header = 'infer', index_col = 0)
    df_np = numpy.transpose(df.values)
    genes = df.columns.to_list()

    # itertools.combinations enumerates pairs in the same row-major order as
    # the strict upper triangle returned by numpy.triu_indices(k=1), so the
    # flattened matrices below are indexed by position in ``gene_pairs``.
    gene_pairs = list(itertools.combinations(genes, 2))

    mutal_info_matrix = get_mutual_information_binary_matrix(df_np)
    #mutal_info_matrix = numpy.cov(df_np)
    mutal_info_matrix_flat = mutal_info_matrix[
        numpy.triu_indices(mutal_info_matrix.shape[0], k = 1)]

    n_simulations = 10000

    # Only the number of null values exceeding the observed value is needed,
    # so keep a running count instead of storing every simulated value.
    exceed_counts = numpy.zeros(mutal_info_matrix_flat.shape, dtype=int)

    for i in range(n_simulations):

        if ( i % 1000 == 0) and (i>0):

            print("%d simulations complete!" % i)

        df_np_null = get_random_matrix(df_np)
        null_mutal_info_matrix = get_mutual_information_binary_matrix(df_np_null)
        #null_mutal_info_matrix = numpy.cov(df_np_null)
        null_mutal_info_matrix_flat = null_mutal_info_matrix[
            numpy.triu_indices(null_mutal_info_matrix.shape[0], k = 1)]

        exceed_counts += null_mutal_info_matrix_flat > mutal_info_matrix_flat

    # The +1 terms keep the empirical p-value strictly positive.
    p_values = (exceed_counts + 1.0) / (n_simulations + 1)

    # 63190 tests
    pvals_corrected = multitest.multipletests(p_values, alpha=0.05, method='fdr_bh')[1]

    significanat_interaction_dict = {}
    for gene_pair_idx, (gene_a, gene_b) in enumerate(gene_pairs):

        if pvals_corrected[gene_pair_idx] >= 0.01:
            continue

        observed_mutual_info = mutal_info_matrix_flat[gene_pair_idx]

        # Store both directions so the resulting table is symmetric.
        significanat_interaction_dict.setdefault(gene_a, {})[gene_b] = observed_mutual_info
        significanat_interaction_dict.setdefault(gene_b, {})[gene_a] = observed_mutual_info

    df_significant = pandas.DataFrame.from_dict(significanat_interaction_dict)

    df_significant = df_significant.fillna(0)
    df_out = '/Users/wrshoemaker/Desktop/ParEvol_test/data/Tenaillon_et_al/significant_mutual_information_tenaillon.txt'
    df_significant.to_csv(df_out, sep = '\t', index = True)
def stats_test(results_dir, signal_root):
    """Adjust the per-signal p-values, save the tables and plot the results.

    The raw p-value table is read from ``signal_pvalues.csv`` in
    ``results_dir`` when that file exists, otherwise it is produced by
    ``calc_signal_pvalues``.  Every column is Benjamini-Hochberg corrected
    independently (NaN entries are left out of the correction and stay NaN),
    a row-wise ``min`` column is added to both the raw and the corrected
    table, and both are written sorted by that minimum.  Finally a PDF with
    Manhattan plots and signal plots is written.

    Parameters:
        results_dir: path-like object supporting ``/`` and ``.exists()`` on
            the result (used like a ``pathlib.Path``); input and output
            directory.
        signal_root: passed through to ``calc_signal_pvalues`` and
            ``plot_signal_at_signif_loci``.

    Returns:
        None.
    """
    signal_pvalues_df_path = results_dir / "signal_pvalues.csv"
    signal_adjusted_pvalues_df_path = results_dir / "signal_adjusted_pvalues.csv"

    if signal_pvalues_df_path.exists():
        signal_pvalues_df = pd.read_csv(signal_pvalues_df_path, index_col=0)
    else:
        signal_pvalues_df = calc_signal_pvalues(signal_pvalues_df_path, signal_root)

    print("Processed hypothesis:", len(signal_pvalues_df))
    print(signal_pvalues_df.head(10))

    # Index the table by hypothesis name and keep only the p-value columns.
    signal_pvalues_df.index = signal_pvalues_df.name
    signal_pvalues_df.drop("name", inplace=True, axis=1)

    print("Not corrected pval, first 10 lowerest pvalues:")
    signal_pvalues_df["min"] = signal_pvalues_df.min(axis=1)
    signal_pvalues_df_sorted_by_min = signal_pvalues_df.sort_values(by="min")
    signal_pvalues_df_sorted_by_min.to_csv(str(results_dir / "signal_pvalues_sorted.csv"))
    print(signal_pvalues_df_sorted_by_min.head(10).to_string(line_width=300))

    # P-values correction
    # see: http://www.statsmodels.org/dev/_modules/statsmodels/stats/multitest.html
    print("Adjust pvalues..")
    signal_pvalues_bh_df = signal_pvalues_df.copy().drop("min", axis=1)
    for c in signal_pvalues_bh_df.columns:
        pvalues = signal_pvalues_bh_df.loc[:, c]
        pvalues_not_nan_mask = ~np.isnan(pvalues)
        if not pvalues_not_nan_mask.any():
            # Nothing to correct: multipletests divides by the number of
            # tests and cannot handle an empty input.
            continue
        _reject, pvalues_corrected, *_ = multipletests(
            pvals=pvalues[pvalues_not_nan_mask],
            # fdr_bh, holm-sidak, bonferroni
            alpha=0.05, method="fdr_bh"
        )
        signal_pvalues_bh_df.loc[pvalues_not_nan_mask, c] = pvalues_corrected
    signal_pvalues_bh_df["min"] = signal_pvalues_bh_df.min(axis=1, skipna=True)
    signal_pvalues_bh_sorted_df = signal_pvalues_bh_df.sort_values(by="min")
    signal_pvalues_bh_sorted_df.to_csv(str(signal_adjusted_pvalues_df_path))

    # Passing FDR correction
    signal_pvalues_bh_sorted_df_005 = signal_pvalues_bh_sorted_df[
        signal_pvalues_bh_sorted_df["min"] < 0.05]
    print("Passing FDR 0.05 by any metric:", len(signal_pvalues_bh_sorted_df_005))
    print("Corrected, first 10 lowerest pvalues:")
    print(signal_pvalues_bh_sorted_df.head(10).to_string(line_width=300))
    print("Same records, but original pvalues:")
    print(signal_pvalues_df.loc[signal_pvalues_bh_sorted_df.head(10).index, :].to_string(
        line_width=300))

    # Plots: for every column (including "min") a Manhattan plot of the raw
    # and of the corrected p-values; the signal plots skip the "min" column.
    with PdfPages(str(results_dir / "signal_pvalues.pdf")) as pdf:
        for col in signal_pvalues_df.columns:
            loir.manhattan_plot(
                signal_pvalues_df.sort_values(by="min"), col,
                "Signal [{}] ODS vs YDS: Mann whitney u test pvalues".format(col),
                correction="Uncorrected",
                save_to=pdf
            )
            loir.manhattan_plot(
                signal_pvalues_bh_sorted_df.sort_values(by="min"), col,
                "Signal [{}] ODS vs YDS: Mann whitney u test pvalues".format(col),
                correction="Benjamini–Hochberg corrected",
                save_to=pdf
            )

            if col != "min":
                plot_signal_at_signif_loci("Uncorrected",
                                           signal_pvalues_df,
                                           col, pdf, signal_root)
                plot_signal_at_signif_loci("Benjamini–Hochberg corrected",
                                           signal_pvalues_bh_sorted_df, col, pdf, signal_root)
Example #58
0
    def start(self):
        """Compare the discovery interaction eQTLs with the Bryois replication set.

        Steps, in order: load both tables with ``self.load_file``; filter and
        re-index the discovery table; translate, allele-align and prefix the
        replication table; left-merge the replication onto the discovery
        table; compute a Benjamini-Hochberg FDR on the replication p-values
        for every shared cell type; reorder and rename the columns; write the
        merged table with ``self.save_file``; plot with ``self.plot`` and
        print summary statistics.

        Reads ``self.discovery_path``, ``self.bryois_path``,
        ``self.bryois_ct_trans``, ``self.bryois_n`` and ``self.outdir``.
        Returns nothing.
        """
        self.print_arguments()

        print("Loading data")
        discovery_df = self.load_file(self.discovery_path,
                                      header=0,
                                      index_col=None)
        replication_df = self.load_file(self.bryois_path,
                                        header=0,
                                        index_col=0)

        print(discovery_df)
        print(replication_df)

        print("Pre-process the discovery data.")
        # Drop rows whose SNP identifier contains "nors".
        discovery_df = discovery_df.loc[
            ~discovery_df["SNP"].str.contains("nors"), :]
        # Key each row as "<gene up to the first '.'>_<third ':'-field of SNP>".
        # presumably this matches the index of the replication file - verify.
        discovery_df.index = discovery_df["Gene"].str.split(
            ".", expand=True)[0] + "_" + discovery_df["SNP"].str.split(
                ":", expand=True)[2]
        # Keep the first row for every duplicated key.
        discovery_df = discovery_df.loc[~discovery_df.index.duplicated(), :]
        # Cell types are taken from the first word of every "pvalue" column,
        # before the columns get their "MetaBrain " prefix below.
        discovery_cell_types = [
            x.split(" ")[0] for x in discovery_df.columns if "pvalue" in x
        ]
        discovery_aa_dict = dict(
            zip(discovery_df.index, discovery_df["Allele assessed"]))

        # Identifier columns keep their name; everything else is prefixed.
        discovery_index_columns = [
            "Gene", "Gene symbol", "SNP", "Alleles", "Allele assessed"
        ]
        discovery_df.columns = [
            "MetaBrain " + col if col not in discovery_index_columns else col
            for col in discovery_df.columns
        ]

        print("Pre-process the replication data.")
        # Translate the cell types: the first key of self.bryois_ct_trans
        # found in a column name is replaced by its mapped value; columns
        # without a match keep their name.
        colnames = []
        for col in replication_df.columns:
            found = False
            for bryois_ct, metabrain_ct in self.bryois_ct_trans.items():
                if found:
                    break

                if bryois_ct in col:
                    colnames.append(col.replace(bryois_ct, metabrain_ct))
                    found = True

            if not found:
                colnames.append(col)
        replication_df.columns = colnames

        # Add the discovery affect allele. Keys missing from the discovery
        # table map to NaN.
        replication_df["discovery_aa"] = replication_df.index.map(
            discovery_aa_dict)

        # Flip the sign of the betas where the replication effect allele
        # differs from the discovery allele. Rows without a discovery allele
        # (NaN) compare as different and are flipped too; the left merge
        # below only keeps keys present in the discovery table.
        replication_df["flip"] = replication_df[
            "effect_allele"] != replication_df["discovery_aa"]
        replication_cell_types = [
            x.replace(" p-value", "") for x in replication_df
            if x.endswith(" p-value")
        ]
        for ct in replication_cell_types:
            replication_df.loc[:, "{} beta".format(ct)] = replication_df[
                "{} beta".format(ct)] * replication_df["flip"].map({
                    True: -1,
                    False: 1
                })

        # Remove unwanted columns.
        replication_df.drop(["flip", "SNP", "effect_allele", "discovery_aa"],
                            axis=1,
                            inplace=True)

        # Change the column names.
        replication_df.columns = [
            "Bryois " + col for col in replication_df.columns
        ]

        # Add the sample size.
        replication_df["Bryois N"] = self.bryois_n

        print("Merging data.")
        # Left join: every discovery row is kept, replication columns are NaN
        # where the key has no replication entry.
        df = discovery_df.merge(replication_df,
                                left_index=True,
                                right_index=True,
                                how="left")
        print(df)

        print("Adding BH-FDR for the replication.")
        overlap_ct = list(
            set(discovery_cell_types).intersection(
                set(replication_cell_types)))
        overlap_ct.sort()
        for ct in overlap_ct:
            print("\t{}".format(ct))
            df["Bryois {} BH-FDR".format(ct)] = np.nan
            # Only test effects that are significant in the discovery set
            # (BH-FDR <= 0.05) and that have a replication p-value.
            discovery_mask = (df["MetaBrain {} BH-FDR".format(ct)] <=
                              0.05).to_numpy()
            print("\t  Discovery N-ieqtls: {:,}".format(
                np.sum(discovery_mask)))
            replication_mask = (
                ~df["Bryois {} p-value".format(ct)].isna()).to_numpy()
            mask = np.logical_and(discovery_mask, replication_mask)
            n_overlap = np.sum(mask)
            # The correction is skipped for fewer than two tests, leaving the
            # FDR column at NaN for this cell type.
            if n_overlap > 1:
                df.loc[
                    mask,
                    "Bryois {} BH-FDR".format(ct)] = multitest.multipletests(
                        df.loc[mask, "Bryois {} p-value".format(ct)],
                        method='fdr_bh')[1]
            n_replicating = df.loc[
                df["Bryois {} BH-FDR".format(ct)] <= 0.05, :].shape[0]
            # NOTE(review): n_overlap == 0 makes the percentage below a
            # division by zero - confirm this cannot happen or guard it.
            print("\t  Replication N-ieqtls: {:,} / {:,} [{:.2f}%]".format(
                n_replicating, n_overlap, (100 / n_overlap) * n_replicating))

        print("Reordering columns")
        # columns_of_interest holds the current column names to select,
        # colnames the output names, position by position.
        columns_of_interest = discovery_index_columns.copy() + [
            "MetaBrain N", "MetaBrain HW pval", "MetaBrain Minor allele",
            "MetaBrain MAF", "Bryois N"
        ]
        for ct in overlap_ct:
            columns_of_interest.append("MetaBrain {} pvalue".format(ct))
            columns_of_interest.append("MetaBrain {} BH-FDR".format(ct))
            columns_of_interest.append(
                "MetaBrain {} interaction beta".format(ct))
        colnames = columns_of_interest.copy()
        for ct in replication_cell_types:
            # "p-value" is renamed to "pvalue" and "beta" to "eQTL beta".
            columns_of_interest.append("Bryois {} p-value".format(ct))
            colnames.append("Bryois {} pvalue".format(ct))

            if ct in overlap_ct:
                columns_of_interest.append("Bryois {} BH-FDR".format(ct))
                colnames.append("Bryois {} BH-FDR".format(ct))

            columns_of_interest.append("Bryois {} beta".format(ct))
            colnames.append("Bryois {} eQTL beta".format(ct))
        df = df.loc[:, columns_of_interest].copy()
        df.columns = colnames
        print(df)

        print("Saving output")
        # The Excel export leaves these columns out; the text export has all.
        exclude_in_excel = [
            "MetaBrain N", "MetaBrain HW pval", "MetaBrain Minor allele",
            "MetaBrain MAF", "MetaBrain Overall z-score", "Bryois N"
        ]
        self.save_file(df=df,
                       outpath=os.path.join(self.outdir,
                                            "bryois_replication.txt.gz"),
                       index=False)
        self.save_file(
            df=df.
            loc[:, [col for col in df.columns if col not in exclude_in_excel]],
            outpath=os.path.join(self.outdir, "bryois_replication.xlsx"),
            index=False,
            sheet_name="Bryois et al. 2021")

        # df = self.load_file(os.path.join(self.outdir, "bryois_replication.txt.gz"),
        #                     header=0,
        #                     index_col=None)

        print("Visualizing")
        # Recompute the shared cell types from the final column names: the
        # second word of every FDR column of either data set.
        discovery_ct = set([
            x.split(" ")[1] for x in df.columns
            if "MetaBrain" in x and "FDR" in x
        ])
        replication_ct = set([
            x.split(" ")[1] for x in df.columns if "Bryois" in x and "FDR" in x
        ])
        overlap_ct = list(discovery_ct.intersection(replication_ct))
        overlap_ct.sort()

        replication_stats_df = self.plot(df=df, cell_types=overlap_ct)
        self.save_file(df=replication_stats_df,
                       outpath=os.path.join(self.outdir,
                                            "replication_stats.txt.gz"))

        # replication_stats_df = self.load_file(os.path.join(self.outdir,  "replication_stats.txt.gz"),
        #                                       header=0,
        #                                       index_col=0)

        print("Replication stats")
        # Per label: the mean of every variable, then the concordance over the
        # summed "N concordant" and "N" values.
        for label in replication_stats_df["label"].unique():
            print("\t{}".format(label))
            stats_df = replication_stats_df.loc[
                replication_stats_df["label"] == label, :]
            stats_df_mean = stats_df[["variable",
                                      "value"]].groupby("variable").mean()
            for index, row in stats_df_mean.iterrows():
                print("\t  {}: {:.2f}".format(index, row["value"]))

            stats_df_sum = stats_df[["variable",
                                     "value"]].groupby("variable").sum()
            print("\t  Overall concordance: {:,}/{:,} [{:.2f}%]".format(
                stats_df_sum.loc["N concordant",
                                 "value"], stats_df_sum.loc["N", "value"],
                (100 / stats_df_sum.loc["N", "value"]) *
                stats_df_sum.loc["N concordant", "value"]))
            print("")
Example #59
0
def call_interactions(indir, outdir, chrom_lens, binsize, dist, neighborhood_limit_lower = 3, \
                      neighborhood_limit_upper = 5, rank = 0, n_proc = 1, max_mem = 2, logger = None):
    """Compute interaction significances per chromosome and write them out.

    For every chromosome assigned to this rank by ``get_proc_chroms``, the
    combined bedpe file in ``indir`` is converted to dense sub-matrices,
    ``compute_significances`` is run on each of them, and the p-values are
    Benjamini-Hochberg corrected twice: within each ``j - i`` distance
    (``fdr_dist``) and over the whole chromosome (``fdr_chrom``).  The result
    is merged onto the first seven input columns and written to
    ``significances.<chrom>.bedpe`` in ``outdir``.

    Parameters:
        indir: directory holding ``<chrom>.normalized.combined.bedpe`` and
            the matching ``.cells.hdf`` files.
        outdir: output directory; created when missing.
        chrom_lens: mapping from chromosome name to its length.
        binsize: bin size; bin indices are multiplied by it on output.
        dist: maximum distance; ``dist // binsize`` is the largest bin offset.
        neighborhood_limit_lower, neighborhood_limit_upper: neighbourhood
            sizes passed to ``compute_significances``; the upper limit is
            also the margin trimmed from both ends of the index range.
        rank, n_proc: index of this worker and number of workers, used to
            pick the chromosomes to process.
        max_mem: passed to ``determine_dense_matrix_size``.
        logger: object providing ``set_rank`` and ``write``.  It is used
            unconditionally, so despite the ``None`` default a logger must
            be supplied.

    Returns:
        None.  One file per processed chromosome is written.
    """
    logger.set_rank(rank)
    # exist_ok covers the directory already being there (for example created
    # by another rank); real failures such as missing permissions now
    # surface here instead of after the whole computation.
    os.makedirs(outdir, exist_ok=True)

    def compute_fdr_by_dist(group):
        # BH correction within one group of equal j - i distance.
        fdrs = multipletests(list(group['pvalue']), method='fdr_bh')[1]
        group.loc[:, 'fdr_dist'] = fdrs
        return group

    proc_chroms = get_proc_chroms(chrom_lens, rank, n_proc)
    for chrom in proc_chroms:
        logger.write(f'\tprocessor {rank}: computing for chromosome {chrom}',
                     verbose_level=1,
                     allow_all_ranks=True)
        chrom_filename = os.path.join(
            indir, ".".join([chrom, "normalized", "combined", "bedpe"]))
        # The number of cells is the second dimension of the per-chromosome
        # dataset in the companion HDF file.
        with h5py.File(chrom_filename + ".cells.hdf", 'r') as ifile:
            num_cells = ifile[chrom].shape[1]
        logger.write(f'\tprocessor {rank}: detected {num_cells} cells for chromosome {chrom}', \
                             append_time = False, allow_all_ranks = True, verbose_level = 2)
        d = pd.read_csv(chrom_filename, sep="\t", header=None)
        matrix_max_size = determine_dense_matrix_size(num_cells, dist, binsize,
                                                      max_mem)
        submatrices = convert_sparse_dataframe_to_dense_matrix(d, matrix_max_size, \
                                                               dist, binsize, neighborhood_limit_upper, \
                                                               num_cells, chrom_lens[chrom], chrom_filename)
        max_distance_bin = dist // binsize
        results = []
        for i, (submatrix, start_index) in enumerate(submatrices):
            logger.write(f'\tprocessor {rank}: computing background for batch {i} of {chrom}, start index = {start_index}', \
                              verbose_level = 3, allow_all_ranks = True, append_time = False)
            if i > 0:
                # Keep only the rows of the previous batch that lie before
                # the start of this one.
                results[-1] = results[-1][results[-1]['i'] < start_index]
            submat_result = compute_significances(submatrix, neighborhood_limit_upper, \
                                                  neighborhood_limit_lower, num_cells, start_index, \
                                                  max_distance_bin)
            results.append(submat_result)
        results = pd.concat(results, axis=0)

        # Trim a margin of neighborhood_limit_upper bins at both ends of the
        # index range.
        min_index = 0
        max_index = results['j'].max()
        results = results[(results['i'] >= min_index + neighborhood_limit_upper) & \
                          (results['j'] <= max_index - neighborhood_limit_upper)]

        results.reset_index(drop=True, inplace=True)
        results = results.groupby(results['j'] - results['i'],
                                  as_index=False).apply(compute_fdr_by_dist)
        results.loc[:, 'fdr_chrom'] = multipletests(list(results['pvalue']),
                                                    method='fdr_bh')[1]
        # Convert bin indices to coordinates so they can be matched against
        # the x1 / y1 columns of the input.
        results.loc[:, 'i'] = (results['i'] * binsize).astype(int)
        results.loc[:, 'j'] = (results['j'] * binsize).astype(int)

        d = d.iloc[:, list(range(7))]
        d.columns = ['chr1', 'x1', 'x2', 'chr2', 'y1', 'y2', 'outlier_count']
        # Inner join: only input rows with a computed significance are kept.
        d = d.merge(results, left_on=['x1', 'y1'], right_on=['i', 'j'])
        d.drop(['i', 'j'], axis=1, inplace=True)
        logger.write(f'\tprocessor {rank}: computation for {chrom} completed. writing to file.', \
                             append_time = False, allow_all_ranks = True, verbose_level = 2)
        d.to_csv(os.path.join(outdir,
                              ".".join(["significances", chrom, "bedpe"])),
                 sep="\t",
                 index=False)
Example #60
0
def adjust_pvalues(pvalues, FDR=0.05):
    """Correct p values with the Benjamini-Hochberg correction method.

    Parameters:
        pvalues: sequence of ``(label, pvalue)`` pairs.
        FDR: the ``alpha`` passed to ``multipletests``.

    Returns:
        A list of ``(label, adjusted_pvalue)`` tuples in the input order; an
        empty list for empty input.
    """
    labels = [entry[0] for entry in pvalues]
    values = [entry[1] for entry in pvalues]
    if not values:
        # multipletests divides by the number of tests, so it cannot be
        # called without any p-value.
        return []
    adjusted_values = multipletests(values, alpha=FDR, method="fdr_bh")[1]
    return list(zip(labels, adjusted_values))