Example #1
def calc_overlap_stats(test_set, geneset_dict, total_genes):
    """Get the overlaps and compute hypergeometric stats."""

    overlaps = get_overlaps(test_set, geneset_dict)

    p = overlaps.apply(lambda x: (
        scipy.stats.hypergeom.sf(
            x["match_count"] - 1,  # number of differentially expressed genes in set
            total_genes,           # total number of genes
            x["size of set"],      # number of genes in current set
            len(test_set))),       # total number of genes in test set
        axis=1)
    p = p.to_frame("hypergeom p-val")

    overlaps = overlaps[overlaps["match_count"] > 0]
    overlaps = overlaps.merge(p,
                              left_index=True,
                              right_index=True).sort_values("hypergeom p-val", ascending=True)

    if len(overlaps.index) > 0:
        overlaps["bonferroni"] = multicomp.multipletests(overlaps["hypergeom p-val"],
                                                         method="bonferroni")[1]
        overlaps["b-h fdr adj pval"] = multicomp.multipletests(
            overlaps["hypergeom p-val"].fillna(1.0),
            method="fdr_bh")[1]

    return overlaps.sort_values("hypergeom p-val", ascending=True)
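
The pattern above — a hypergeometric survival-function p-value per gene set followed by Bonferroni and Benjamini-Hochberg adjustment — can be reproduced in a few self-contained lines. A minimal sketch with made-up counts, using the current statsmodels.stats.multitest import path rather than the multicomp alias assumed above:

import numpy as np
from scipy.stats import hypergeom
from statsmodels.stats.multitest import multipletests

# Each tuple is (overlap k, total genes M, genes in set n, genes in test list N); toy values.
overlaps = [(12, 20000, 150, 300), (3, 20000, 400, 300), (40, 20000, 500, 300)]
pvals = np.array([hypergeom.sf(k - 1, M, n, N) for k, M, n, N in overlaps])

bonferroni = multipletests(pvals, method="bonferroni")[1]  # family-wise error control
fdr_bh = multipletests(pvals, method="fdr_bh")[1]          # false discovery rate control
print(pvals, bonferroni, fdr_bh)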
Example #2
    def adjustPvalue(s, preAdjusted, layoutAware):

        if layoutAware:
            idx = 2
        else:
            idx = 1

        try:
            # Some hosts do not have this library. If not we don't adjust
            import statsmodels.sandbox.stats.multicomp as multicomp
            adjust = True

        except Exception:
            adjust = False

        with open(s.file, 'w') as f:
            f = csv.writer(f, delimiter='\t')

            preAdjVals = []
            for i, row in enumerate(preAdjusted):

                if not adjust:

                    # No adjustment will happen so just write a NaN value to
                    # the file so the UI won't try to display this value
                    f.writerow(row + [float('NaN')])
                    continue

                # Extract the p-values from the data.
                # Translate NaNs to one so the stats routine will take it.
                if math.isnan(row[idx]):
                    preAdjVals.append(1)
                else:
                    preAdjVals.append(row[idx])
            
            if not adjust:
                return

            try:
                # Benjamini-Hochberg FDR correction for p-values returns:
                #   [reject, p_vals_corrected, alphacSidak, alphacBonf]
                # http://statsmodels.sourceforge.net/devel/generated/statsmodels.sandbox.stats.multicomp.multipletests.html
                reject, adjPvals, alphacSidak, alphacBonf = multicomp.multipletests(preAdjVals, alpha=0.05, method='fdr_bh')

            except Exception:
                adjPvals = [1 for x in preAdjusted]

            try:
                # Bonferroni correction for p-values returns:
                #   [reject, p_vals_corrected, alphacSidak, alphacBonf]
                # http://statsmodels.sourceforge.net/devel/generated/statsmodels.sandbox.stats.multicomp.multipletests.html
                reject, adjPvalsB, alphacSidak, alphacBonf = multicomp.multipletests(preAdjVals, alpha=0.05, method='bonferroni')

            except Exception:
                adjPvalsB = [1 for x in preAdjusted]

            for i, row in enumerate(preAdjusted):
                f.writerow(row + [sigDigs(adjPvals[i]), sigDigs(adjPvalsB[i])])
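
The comments above spell out what multipletests returns: a 4-tuple of the rejection mask, the corrected p-values, and the Sidak- and Bonferroni-corrected alphas. A minimal sketch of unpacking it, with the same guarded-import idea and made-up p-values (using the current statsmodels.stats.multitest path rather than the legacy sandbox module):

try:
    from statsmodels.stats.multitest import multipletests
    adjust = True
except ImportError:
    adjust = False

pvals = [0.001, 0.02, 0.2, 0.6]
if adjust:
    reject, pvals_corrected, alphacSidak, alphacBonf = multipletests(
        pvals, alpha=0.05, method='fdr_bh')
    print(list(zip(pvals, pvals_corrected, reject)))
else:
    # Mirror the NaN fallback above when the library is unavailable.
    print([(p, float('nan')) for p in pvals])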
def __enrich_counts(db, to_r, query, pval_cutoff):
    to_r["pval"] = to_r.apply(compute_p, axis=1, M=db.bicluster_info.count(), N=query.shape[0])
    to_r["qval_BH"] = multipletests(to_r.pval, method='fdr_bh')[1]
    to_r["qval_bonferroni"] = multipletests(to_r.pval, method='bonferroni')[1]
    to_r = to_r.sort_values(["pval","counts"], ascending=True)

    # only return below pval cutoff
    to_r = to_r.loc[to_r.pval <= pval_cutoff, :]
    to_r.index = [int(i) for i in to_r.index]  # make sure GRE ids are integers
    return to_r
    def hypergeometric_significant_celltypes(self):
        '''
        hypergeometric test for significance of celltype enrichment.
        '''
        print('Testing celltype enrichment....')
        sigcelltype = self.sigCelltypedf
        cellgroup = self.cellgenedf.groupby(self.cellgenedf['celltype'])
        totalgenes = self.occurrencedf.shape[0]

        allsiggenes = self.cellgenedf
        allsiggenes = allsiggenes[allsiggenes['FDR'] <= 0.05]
        allsiggenes = len(set(allsiggenes['gene']))

        sigcelltype.loc[:, 'hyper_pval'] = 1
        col = sigcelltype.columns.get_loc('hyper_pval')
        for index, row in sigcelltype.iterrows():
            #print(row['celltype'])
            #print(row['genecluster'], totalgenes, len(cellgroup.get_group(row['celltype'])), allsiggenes)
            ## stats.hypergeom.sf(x, M, n, N)
            hyper_pval = stats.hypergeom.sf(row['genecluster']-1, totalgenes, allsiggenes, len(cellgroup.get_group(row['celltype'])))
            #print(hyper_pval)
            sigcelltype.iloc[index, col] = hyper_pval

        sigcelltype.loc[:, 'hyper_FDR'] = 1
        #ind_fdr = sigcelltype.columns.get_loc('hyper_FDR')
        sigcelltype = sigcelltype.sort_values('hyper_pval', ascending=True)
        sigcelltype.index = range(len(sigcelltype))

        pvals = sigcelltype['hyper_pval'].values
        corr_pvals = statsmodels.multipletests(pvals=pvals, alpha=0.05, method='fdr_bh')
        sigcelltype['hyper_FDR'] = corr_pvals[1]
        self.sigCelltypedf = sigcelltype
    def celltype_overrepresntation_list(self, enrichmentdf):
        '''
        This method will save the result of significance in one DF.
        '''
        significance = 1
        column = ['celltype', 'gene', 'enrichment', 'binom_pval', 'FDR']
        rows = []
        #print(self.binom_pval_df.head())
        for gene, celltype in self.binom_pval_df.iterrows():
            for cell, pval in celltype.items():
                if pval < significance:
                    rows.append([cell, gene, enrichmentdf.loc[gene, cell], pval, 0])
        #print(cellgenedf.head(10))
        cellgenedf = pd.DataFrame(rows, columns=column)
        cellgenedf = cellgenedf.sort_values(['celltype', 'binom_pval'], ascending=[True, True])
        cellgenedf.index = range(len(cellgenedf))

        pvals = cellgenedf['binom_pval'].values
        corr_pvals = statsmodels.multipletests(pvals=pvals, alpha=0.05, method='fdr_bh')
        #print(pvals)
        #print(corr_pvals)
        cellgenedf['FDR'] = 0
        cellgenedf['FDR'] = corr_pvals[1]
        '''
        for ind, row in cellgenedf.iterrows():
            fdr = (row['Binom p-val'] * len(cellgenedf)) / (ind + 1)
            cellgenedf.iloc[ind, 4] = fdr
        '''
        print('cellgenedf shape:', cellgenedf.shape)
        #cellgenedf = self.filter_df(cellgenedf)
        self.cellgenedf = cellgenedf
        print('cellgenedf shape after:', cellgenedf.shape)
        #self.filter_cellgenedf()  # Filter single cell multigene enrichment
        self.overall_significant_celltypes()
Example #6
 def run(study, pop, gene_set, adjust='fdr_bh'):
     '''
     Run a Over-represent analysis toward a gene set
     
     :param study: the significant gene set
     :param pop:  the background gene set
     :param gene_set: the function set
     :param adjust: the adjust method in the multiple tests, 
         details in http://www.statsmodels.org/dev/generated/statsmodels.sandbox.stats.multicomp.multipletests.html
     :return: the ORA analysis result
     '''
     gene_sets = gene_set if type(gene_set) == dict else GMTUtils.parse_gmt_file(gene_set)
     mapped = {k: list(set(v) & set([str(x) for x in pop])) for k, v in gene_sets.items()}
     s_mapped = {k: list(set(v) & set([str(x) for x in study])) for k, v in gene_sets.items()}
     result = {}
     for k, v in mapped.items():
         result[k] = stats.hypergeom.sf(len(s_mapped[k]) - 1, len(pop), len(mapped[k]), len(study))
     _, o, _, _ = multicomp.multipletests(list(result.values()), method=adjust)
     rfdr = {list(result.keys())[i]: o[i] for i in range(len(list(result.keys())))}
     # !
     df_result = {'name': [], 'mapped': [], 'number in study': [], 'p-value': [], 'fdr': []}
     for k, v in mapped.items():
         df_result['name'].append(k)
         df_result['mapped'].append(len(mapped[k]))
         df_result['number in study'].append(len(s_mapped[k]))
         df_result['p-value'].append(result[k])
         df_result['fdr'].append(rfdr[k])
     df = pd.DataFrame(df_result)
     df = df[['name', 'mapped', 'number in study', 'p-value', 'fdr']]
     return ORA(df, study, pop, adjust)
Example #7
	def combine(self, results):
		"""
		Stouffer combination of zscores
		:param results:
		:return:
		"""

		zscores = results.sum(axis=1) / np.sqrt(results.count(axis=1))

		size = zscores.size
		is_nan = zscores.mask
		valid_indices = np.where(~is_nan)
		invalid_indices = np.where(is_nan)

		pv = stats.norm.sf(zscores[valid_indices])

		pvalues = np.empty(size)
		pvalues[valid_indices] = pv
		pvalues[invalid_indices] = np.nan

		if pv.size != 0:
			qv = multipletests(pv, method='fdr_bh')[1]
		else:
			qv = np.array([])
		qvalues = np.empty(size)
		qvalues[valid_indices] = qv
		qvalues[invalid_indices] = np.nan

		return np.array([zscores, pvalues, qvalues])
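
Stouffer's combination sums the per-test z-scores and rescales by the square root of the number of tests; scipy.stats.combine_pvalues performs the same combination starting from p-values. A minimal sketch with made-up numbers:

import numpy as np
from scipy import stats

z = np.array([1.2, 2.1, 0.4])            # per-test z-scores for one feature (toy values)
z_comb = z.sum() / np.sqrt(z.size)       # Stouffer-combined z-score
p_comb = stats.norm.sf(z_comb)           # one-sided combined p-value, as in the code above

p_each = stats.norm.sf(z)                # equivalent route starting from one-sided p-values
stat, p_scipy = stats.combine_pvalues(p_each, method='stouffer')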
Example #8
def stat_test(f, test_name, test, fdr):
    print('Testing', test_name, f, 'fdr', fdr)
    df = pd.read_csv(f, sep='\t')
    # Drop contigs
    df = df.loc[[bool(re.match('chr[0-9XYM]+$', c)) for c in df['chr']]]
    ods = [c for c in df.columns.values if is_od(c)]
    yds = [c for c in df.columns.values if is_yd(c)]
    pvals = np.array([test(row[ods], row[yds]) for _, row in df.iterrows()])
    res = multipletests(pvals, fdr, "fdr_bh")
    h0_rejects = res[0]
    pvals_adj = res[1]
    df['pval'] = pvals
    df['pval_adj'] = pvals_adj
    df['od_mean'] = df[ods].mean(axis=1).to_frame('od_mean')['od_mean']
    df['yd_mean'] = df[yds].mean(axis=1).to_frame('yd_mean')['yd_mean']
    df['logfc'] = np.log(df['od_mean'] / df['yd_mean'])
    # Sort by pvalue
    pvals_order = pvals.argsort()
    df = df.iloc[pvals_order]
    h0_rejects = h0_rejects[pvals_order]

    # Save results
    results = re.sub(r'\.tsv', '_{}.tsv'.format(test_name), f)
    df[['chr', 'start', 'end', 'yd_mean', 'od_mean', 'logfc', 'pval', 'pval_adj']] \
        .to_csv(results, sep='\t', index=None, header=True)
    print('Saved test results to', results)

    # Save significant results
    if sum(h0_rejects) > 0:
        results_fdr = re.sub(r'\.tsv', '_{}_diff_fdr_{}.bed'.format(test_name, fdr), f)
        df.loc[h0_rejects][['chr', 'start', 'end']] \
            .to_csv(results_fdr, sep='\t', index=None, header=True)
        print('Saved {} significant results at FDR={} to {}'.format(
            sum(h0_rejects), fdr, results_fdr))
Example #9
 def fit(self, df_X, df_y):
     if not df_y.shape[0] == df_X.shape[0]:
         raise ValueError("number of regions is not equal")
     if df_y.shape[1] != 1:
         raise ValueError("y needs to have 1 label column")
     
     # calculate Mann-Whitney U p-values
     pvals = []
     clusters  =  df_y[df_y.columns[0]].unique()
     for cluster in clusters:
         pos = df_X[df_y.iloc[:,0] == cluster]
         neg = df_X[df_y.iloc[:,0] != cluster]
         p = []
         for m in pos:
             try:
                 p.append(mannwhitneyu(pos[m], neg[m], alternative="greater")[1])
             except Exception as e:
                 sys.stderr.write(str(e) + "\n")
                 sys.stderr.write("motif {} failed, setting to p = 1\n".format(m))
                 p.append(1)
         pvals.append(p)
     
     # correct for multiple testing
     pvals = np.array(pvals)
     fdr = multipletests(pvals.flatten(), 
             method="fdr_bh")[1].reshape(pvals.shape)
     
     # create output DataFrame
     self.act_ = pd.DataFrame(-np.log10(pvals.T), 
             columns=clusters, index=df_X.columns)
Example #10
	def combine(self, results):
		"""
		Fisher's combination of pvalues
		:param results:
		:return:
		"""

		results = np.copy(results)
		results[results < PVALUE_EPSILON] = PVALUE_EPSILON

		log = np.ma.log(results)
		s = log.sum(axis=1)
		count = log.count(axis=1)

		size = s.size
		is_nan = s.mask
		valid_indices = np.where(~is_nan)
		invalid_indices = np.where(is_nan)

		pv = 1.0 - stats.chi2.cdf(-2.0 * s[valid_indices], 2 * count[valid_indices])
		pvalues = np.empty(size)
		pvalues[valid_indices] = pv
		pvalues[invalid_indices] = np.nan

		if pv.size != 0:
			qv = multipletests(pv, method='fdr_bh')[1]
		else:
			qv = np.array([])
		qvalues = np.empty(size)
		qvalues[valid_indices] = qv
		qvalues[invalid_indices] = np.nan

		return np.array([pvalues, qvalues])
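
Fisher's method takes the chi-square tail of -2 * sum(log p) with 2k degrees of freedom, which is exactly what the masked-array code above computes; scipy.stats.combine_pvalues gives the same answer directly. A minimal sketch with made-up p-values:

import numpy as np
from scipy import stats

p = np.array([0.01, 0.20, 0.03])                 # toy per-test p-values
x2 = -2.0 * np.log(p).sum()                      # Fisher statistic
p_fisher = stats.chi2.sf(x2, 2 * p.size)         # same as the 1 - cdf used above

stat, p_scipy = stats.combine_pvalues(p, method='fisher')  # identical combination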
Example #11
def perform_multiple_comparison_stat(data1,data2, alpha=0.05):
    """

    :param data1:
    :param data2:
    :return: True if they are statistically different
    """
    mat1 = np.array(data1)
    mat2 = np.array(data2)
    comparisons = len(data1[0])
    pvals = [ttest_ind(mat1[:,i].tolist(),mat2[:,i])[1] for i in range(comparisons)]

    mult_comparison = multipletests(pvals, alpha=alpha)
    #print(mult_comparison)
    print(mult_comparison[0])
    """Version where just once is enough
    for val in mult_comparison[0]:
        if val == True:
            return True
    return False
    """
    # Version where the number of trues must exceed alpha (useful when you have A LOT of elements)
    true_counter = 0
    for val in mult_comparison[0]:
        if val == True:
            true_counter += 1

    return true_counter / len(mult_comparison[0]) >= alpha
    def binom_significant_celltypes(self):
        '''
        Binomial test for significance of celltype enrichment.
        '''
        print('Testing celltype enrichment....')
        sigcelltype = self.sigCelltypedf
        cellgroup = self.cellgenedf.groupby(self.cellgenedf['celltype'])

        binom_prob_occu = self.binom_prob_occu

        sigcelltype.loc[:, 'binom_pval'] = 1
        col = sigcelltype.columns.get_loc('binom_pval')
        for index, row in sigcelltype.iterrows():
            #print(row['celltype'])
            #print(row['genecluster'], totalgenes, len(cellgroup.get_group(row['celltype'])), allsiggenes)

            bprob_ind = binom_prob_occu[binom_prob_occu['celltype'] == row['celltype']].index[0]
            #print(bprob_ind)
            background_prob = binom_prob_occu.loc[bprob_ind, 'background_prob']
            #print(background_prob)
            binom_pval = stats.binom_test(row['genecluster']-1, len(cellgroup.get_group(row['celltype'])), background_prob, alternative='two-sided')
            sigcelltype.iloc[index, col] = binom_pval

        sigcelltype.loc[:, 'binom_FDR'] = 1
        sigcelltype = sigcelltype.sort_values('binom_pval', ascending=True)
        sigcelltype.index = range(len(sigcelltype))

        pvals = sigcelltype['binom_pval'].values
        corr_pvals = statsmodels.multipletests(pvals=pvals, alpha=0.05, method='fdr_bh')
        #print(pvals)
        #print(corr_pvals)
        sigcelltype['binom_FDR'] = corr_pvals[1]
        self.sigCelltypedf = sigcelltype
Example #13
def test_multi_pvalcorrection():
    # test against R package multtest mt.rawp2adjp
    # because of sort this doesn't check correct sequence - TODO: rewrite DONE
    rmethods = {
        "rawp": (0, "pval"),
        "Bonferroni": (1, "b"),
        "Holm": (2, "h"),
        "Hochberg": (3, "sh"),
        "SidakSS": (4, "s"),
        "SidakSD": (5, "hs"),
        "BH": (6, "fdr_i"),
        "BY": (7, "fdr_n"),
    }

    for k, v in rmethods.items():
        if v[1] in ["b", "s", "sh", "hs", "h", "fdr_i", "fdr_n"]:
            # pvalscorr = np.sort(multipletests(pval0, alpha=0.1, method=v[1])[1])
            r_sortindex = [6, 8, 9, 7, 5, 1, 2, 4, 0, 3]
            pvalscorr = multipletests(pval0, alpha=0.1, method=v[1])[1][r_sortindex]
            assert_almost_equal(pvalscorr, res_multtest[:, v[0]], 15)

    pvalscorr = np.sort(fdrcorrection0(pval0, method="n")[1])
    assert_almost_equal(pvalscorr, res_multtest[:, 7], 15)
    pvalscorr = np.sort(fdrcorrection0(pval0, method="i")[1])
    assert_almost_equal(pvalscorr, res_multtest[:, 6], 15)
 def calculate_quantile_pvalue(
         self, quantile, minvalues=10
     ):
     # Check arguments
     if isinstance(quantile, float):
         quantile = [quantile]
     elif isinstance(quantile, list):
         for q in quantile:
             if not isinstance(q, float):
                 raise TypeError('quantile list must contain floats')
     else:
         raise TypeError('quantile must be float or list of floats')
     # Create colnames for output dataframe
     colNames = []
     for condition in self.matrices:
         for sample in self.matrices[condition]:
             colNames.append('{}_{}_no'.format(condition, sample))
             colNames.append('{}_{}_mean'.format(condition, sample))
     for condition in self.matrices:
         colNames.append('{}_no'.format(condition))
         colNames.append('{}_mean'.format(condition))
     colNames.extend(['pvalue', 'fdr'])
     # Create output dataframe
     outDF = pd.DataFrame(index=quantile, columns=colNames)
     outDF = outDF.sort_index()
     # Extract quantile distance data
     quantData = self.extract_dist_quantile(quantile)
     splitQuant = quantData.groupby('quan')
     for q, data in splitQuant:
         # Extract data for conditions and samples
         condValues = []
         for cond in self.matrices:
             # Extract data for condition
             condData = data[data['cond'] == cond]
             condDist = condData['dist']
             condValues.append(condDist)
             # Add condition data to output
             colPrefix = '{}_'.format(cond)
             outDF.loc[q, colPrefix + 'no'] = condDist.size
             outDF.loc[q, colPrefix + 'mean'] = condDist.mean()
             for smpl in self.matrices[cond]:
                 # Extract data for sample
                 smplData = condData[condData['smpl'] == smpl]
                 smplDist = smplData['dist']
                 # Add sample data to output
                 colPrefix = '{}_{}_'.format(cond, smpl)
                 outDF.loc[q, colPrefix + 'no'] = smplDist.size
                 outDF.loc[q, colPrefix + 'mean'] = smplDist.mean()
         # Calculate pvalues
         dist1, dist2 = condValues
         if dist1.size >= minvalues and dist2.size >= minvalues:
             outDF.loc[q, 'pvalue'] = mannwhitneyu(dist1, dist2)[1]
     # Add fdr and return
     pvalueIndex = outDF.index[~outDF['pvalue'].isnull()]
     outDF.loc[pvalueIndex, 'fdr'] = multipletests(
         outDF.loc[pvalueIndex, 'pvalue'], method='fdr_bh')[1]
     return(outDF)
Example #15
def calculate_gene_expression_similarity(reduced_stat_map_data, mask="full"):
    store_file = "/ahba_data/store_max1_reduced.h5"
    subcortex_mask = "/ahba_data/subcortex_mask.npy"

    results_dfs = []
    with pd.HDFStore(store_file, 'r') as store:
        for donor_id in store.keys():
            print "Loading expression data (%s)" % donor_id
            expression_data = store.get(donor_id.replace(".", "_"))

            print "Getting statmap values (%s)" % donor_id
            nifti_values = reduced_stat_map_data[expression_data.columns]

            print "Removing missing values (%s)" % donor_id
            na_mask = np.isnan(nifti_values)
            if mask == "subcortex":
                na_mask = np.logical_or(na_mask,
                    np.isnan(np.load(subcortex_mask)[expression_data.columns]))
            elif mask == "cortex":
                na_mask = np.logical_or(na_mask, np.logical_not(np.isnan(
                    np.load(subcortex_mask)[expression_data.columns])))
            else:
                assert mask == "full"

            nifti_values = np.array(nifti_values)[np.logical_not(na_mask)]
            expression_data.drop(expression_data.columns[na_mask], axis=1, inplace=True)

            print "z scoring (%s)" % donor_id
            expression_data = pd.DataFrame(zscore(expression_data, axis=1), columns=expression_data.columns,
                                           index=expression_data.index)
            nifti_values = zscore(nifti_values)

            print "Calculating linear regressions (%s)" % donor_id
            regression_results = np.linalg.lstsq(np.c_[nifti_values, np.ones_like(nifti_values)], expression_data.T)
            results_df = pd.DataFrame({"slope": regression_results[0][0]}, index=expression_data.index)

            results_df.columns = pd.MultiIndex.from_tuples([(donor_id[1:], c,) for c in results_df.columns],
                                                           names=['donor_id', 'parameter'])

            results_dfs.append(results_df)

        print "Concatenating results"
        results_df = pd.concat(results_dfs, axis=1)
        del results_dfs

    t, p = ttest_1samp(results_df, 0.0, axis=1)
    group_results_df = pd.DataFrame({"t": t, "p": p}, columns=['t', 'p'], index=expression_data.index)
    _, group_results_df["p (FDR corrected)"], _, _ = multipletests(group_results_df.p, method='fdr_bh')
    group_results_df["variance explained (mean)"] = (results_df.xs('slope', axis=1, level=1) ** 2 * 100).mean(axis=1)
    group_results_df["variance explained (std)"] = (results_df.xs('slope', axis=1, level=1) ** 2 * 100).std(axis=1)
    del results_df
    probe_info = pd.read_csv("/ahba_data/probe_info_max1.csv", index_col=0).drop(['chromosome', "gene_id"], axis=1)
    group_results_df = group_results_df.join(probe_info)
    group_results_df = group_results_df[["gene_symbol", "entrez_id.1", "gene_name","t", "p", "p (FDR corrected)",
                                         "variance explained (mean)", "variance explained (std)"]]

    return group_results_df
Example #16
def bhCorrection(s, n=None):
    s = s.fillna(1.)
    if n is not None and n > len(s):
        p_vals = list(s) + [1] * (n - len(s))
    else:
        p_vals = list(s)
    q = multicomp.multipletests(p_vals, method='fdr_bh')[1][:len(s)]
    q = pd.Series(q[:len(s)], s.index, name='p_adj')
    return q
Example #17
 def qvalues(self, below=0.1):
     if not self._pvalues:
         self.pvalues()
     pvals = [x[1] for x in self._pvalues]
     qvals = list(multipletests(pvals, method='fdr_bh')[1])
     res = [(q,p,x) for (q, (x,p)) in zip(qvals, self._pvalues)
         if q < below]
     self._qvalues = res
     log.notice('got %d peaks with qvalue below %.2f. From %d possible.' % (
         len(res), below, len(pvals)))
     return res
Example #18
    def adjust_score(self, score):
        """
        Returns a list of adjusted p-values. Currently only the Benjamini-Hochberg method is supported.

        :param score: the list of p-values to adjust
        :return: the list of adjusted p-values
        """
        if self.args_dict['pvaladjust'] is None:
            return score
        else:
            return multipletests(score, alpha=self.args_dict['threshold'], method=self.args_dict['pvaladjust'])[1]
Example #19
def bhCorrection(s, n=None):
    """
    Benjamini-Hochberg correction for a Series of p-values.
    """
    s = s.fillna(1.)
    if n is not None and n > len(s):
        p_vals = list(s) + [1] * (n - len(s))
    else:
        p_vals = list(s)
    q = multicomp.multipletests(p_vals, method='fdr_bh')[1][:len(s)]
    q = pd.Series(q[:len(s)], s.index, name='p_adj')
    return q
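
A minimal usage sketch for bhCorrection, assuming the function and its multicomp import are in scope; the Series values are made up. Passing n pads the p-value list with ones up to the nominal number of tests before adjusting:

import numpy as np
import pandas as pd

pvals = pd.Series([0.001, 0.04, np.nan, 0.30], index=['g1', 'g2', 'g3', 'g4'])
q = bhCorrection(pvals)                # NaN is treated as 1.0 before adjustment
q_padded = bhCorrection(pvals, n=10)   # as if 10 tests were run, the extra 6 at p = 1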
Example #20
def pvalues(vect, vals, side="two-sided", significance_threshold=0.05, multi_tests_cor_method="fdr_bh"):
    """
    Computes the p-values of given values in a vector.

    :param vect: the vector used to compute the pvalue
    :param vals: the values used to compute the pvalue (iterable)
    :param side: the side to find the pvalue (can be 'two-sided', 'left' or 'right'). Default is 'two-sided'
    :param significance_threshold: the significance threshold. Default is 0.05
    :param multi_tests_cor_method: multiple testing correction method. Default is 'fdr_bh'

    :returns: The multiple testing corrected pvalues results as dictionary key:val, values: {corrected_pvalue, description, corrected_signif}, Sidak corrected significance threshold (alpha), Bonferroni corrected alpha

    :raises: IOError: if vect or vals are not of an appropriate type or if side is not one of: 'two-sided', 'left' or 'right'

    .. note::
        multi_tests_cor_method parameter accepts the following values:

        - **bonferroni** : one-step correction
        - **sidak** : one-step correction
        - **holm-sidak** : step down method using Sidak adjustments
        - **holm** : step-down method using Bonferroni adjustments
        - **simes-hochberg** : step-up method  (independent)
        - **hommel** : closed method based on Simes tests (non-negative)
        - **fdr_bh** : Benjamini/Hochberg  (non-negative)
        - **fdr_by** : Benjamini/Yekutieli (negative)
        - **fdr_tsbh** : two stage fdr correction (non-negative)
        - **fdr_tsbky** : two stage fdr correction (non-negative)

    """
    pvals = [None] * len(vals)
    for i in range(len(vals)):
        val = vals[i]
        p, d, s = pvalue(vect, val, side=side, significance_threshold=significance_threshold)
        pvals[i] = dict(pvalue=p, description=d, significance=s)
    # Multiple testing correction
    if multi_tests_cor_method is not None:
        s, cp, alphacSidak, alphacBonf = multipletests(
            [pvals[i]["pvalue"] for i in range(len(vals))], alpha=significance_threshold, method=multi_tests_cor_method
        )

        for i in range(len(vals)):
            pvals[i]["uncorrected_pvalue"] = pvals[i]["pvalue"]
            pvals[i]["pvalue"] = cp[i]
            pvals[i]["significance"] = s[i]
            d = pvals[i]["description"]
            pvals[i]["description"] = "%s %.3e" % (d[: d.rindex("=") + 1], cp[i])

        if multi_tests_cor_method == "bonferroni":
            return pvals, alphacSidak
        if multi_tests_cor_method == "sidak" or multi_tests_cor_method == "holm-sidak":
            return pvals, alphacSidak
    return pvals
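
The method strings listed in the docstring are passed straight through to multipletests; a minimal sketch comparing a few of them on the same made-up p-values:

from statsmodels.stats.multitest import multipletests

raw = [0.004, 0.009, 0.02, 0.07, 0.30]
for method in ('bonferroni', 'holm', 'fdr_bh', 'fdr_by'):
    reject, adjusted, alphacSidak, alphacBonf = multipletests(raw, alpha=0.05, method=method)
    print(method, [round(float(p), 3) for p in adjusted], reject.tolist())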
 def calculate_dist_pvalue(self, rmzero=True, minvalues=10):
     # Extract distances for input matrices
     distProb = self.extract_dist_prob()
     splitDist = distProb.groupby('dist')
     # Create output columns
     colNames = []
     for condition in self.matrices:
         for sample in self.matrices[condition]:
             colNames.append('{}_{}_no'.format(condition, sample))
             colNames.append('{}_{}_mean'.format(condition, sample))
     for condition in self.matrices:
         colNames.append('{}_no'.format(condition))
         colNames.append('{}_mean'.format(condition))
     colNames.extend(['pvalue', 'fdr'])
     # Create output dataframe
     outDF = pd.DataFrame(
         columns = colNames, index = splitDist.groups.keys())
     outDF = outDF.sort_index()
     # Loop through data and calculate results
     for dist, data in splitDist:
         # Remove zero values
         if rmzero:
             data = data[data['prob'] > 0]
         # Extract data for conditions and samples
         condValues = []
         for cond in self.matrices:
             # Extract data for condition
             condData = data[data['cond'] == cond]
             condProb = condData['prob']
             condValues.append(condProb)
             # Add condition data to output
             colPrefix = '{}_'.format(cond)
             outDF.loc[dist, colPrefix + 'no'] = condProb.size
             outDF.loc[dist, colPrefix + 'mean'] = condProb.mean()
             for smpl in self.matrices[cond]:
                 # Extract data for sample
                 smplData = condData[condData['smpl'] == smpl]
                 smplProb = smplData['prob']
                 # Add sample data to output
                 colPrefix = '{}_{}_'.format(cond, smpl)
                 outDF.loc[dist, colPrefix + 'no'] = smplProb.size
                 outDF.loc[dist, colPrefix + 'mean'] = smplProb.mean()
         # Calculate pvalues
         prob1, prob2 = condValues
         if prob1.size >= minvalues and prob2.size >= minvalues:
             outDF.loc[dist, 'pvalue'] = mannwhitneyu(prob1, prob2)[1]
     # Sort data, add fdr and return
     pvalueIndex = outDF.index[~outDF['pvalue'].isnull()]
     outDF.loc[pvalueIndex, 'fdr'] = multipletests(
         outDF.loc[pvalueIndex, 'pvalue'], method='fdr_bh')[1]
     return(outDF)
Example #22
def t_test(X, group):
    """
    Simple two-group comparison with (unpaired) t-test. 
    """
    R = pd.DataFrame.from_records([], index=X.index)
    R["logFC"] = fold_change(X, group, log=2)
    R["logFC"] = R["logFC"].fillna(0)
    Xm = X.to_numpy()
    ix = group.to_numpy()
    t, p = ttest_ind(Xm[:,ix], Xm[:,~ix], axis=1)
    R["t"] = t
    R["p"] = p
    R["FDR"] = multipletests(R["p"], method="fdr_bh")[1]
    return R
Example #23
def set_fisher(sets1, sets2, allgenes = None):

    if allgenes is None:
        allgenes = set()
        for k1, s1 in sets1.items():
            allgenes |= set(s1)
        for k2, s2 in sets2.items():
            allgenes |= set(s2)
    else:
        allgenes = set(allgenes)

    rv = []
    for k1, s1 in sets1.items():
        s1 = set(s1) & allgenes
        for k2, s2 in sets2.items():
            s2 = set(s2) & allgenes

            a = s1 & s2
            b = s1 - a
            c = s2 - a
            d = allgenes - (s1 | s2)
            oddsratio, pval = fisher_exact(
                [[len(a), len(b)],
                 [len(c), len(d)]],
                 alternative='two-sided')
            rv.append(pd.Series(dict(
                a=len(a), b=len(b), c=len(c), d=len(d),
                len1 = len(s1), s1=k1,
                len2=len(s2), s2=k2,
                reference = len(allgenes),
                oddsratio = oddsratio, pval=pval
            )))

    rv = pd.DataFrame(rv)
    rv['padj_bh'] = multipletests(rv['pval'], method='fdr_bh')[1]
    rv['padj_bonf'] = multipletests(rv['pval'], method='bonferroni')[1]
    return rv
Example #24
def filterExptsByPseudoCountDistr( ddict ):
    
    # remove experiments where the pseudocount is high
    # relative to the other pseudocounts
    pseudodict     = { k : ddict[k]['PSEUDO'] for k in ddict }
    pskeys         = list(pseudodict.keys())
    pslogvals      = np.log10(list(pseudodict.values()))
    pslogmad       = mad(pslogvals)
    pslogmedian    = np.percentile(pslogvals,50)
    pslvps_hi      = 1-norm.cdf((pslogvals-pslogmedian)/pslogmad)
    rejected_ds_hi = multipletests( pslvps_hi, alpha=0.05 )[0]

    # return data in a dictionary
    filteredExpts  = {  pskeys[i] : rejected_ds_hi[i] for i in range(len(pskeys))}
    return filteredExpts
Example #25
def fdr_qvals(obs, mc):
    '''
    compute pvalues and fdr correct them.
    '''
    def compute_pvalues(obs, mc):
        for o in obs:
            h0_greaterequal = len(mc) - bisect_left(mc, o)
            pvalue = float(h0_greaterequal + 1) / (len(mc) + 1)
            yield pvalue

    obs = np.sort(obs)[::-1]
    mc = np.sort(mc)
    pvals = list(compute_pvalues(obs, mc))
    qvals = list(multipletests(pvals, method='fdr_bh')[1])
    return dict(pvals=pvals, qvals=qvals)
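
The helper above ranks each observation against a sorted Monte-Carlo null and converts the rank to an empirical p-value, (count >= obs + 1) / (m + 1), before BH adjustment. A minimal usage sketch with a simulated null, assuming fdr_qvals and its imports (bisect_left, numpy, multipletests) are in scope:

import numpy as np

rng = np.random.default_rng(0)
mc = rng.normal(size=10000)            # Monte-Carlo null statistics (toy)
obs = np.array([3.5, 1.0, 0.2])        # observed statistics (toy)

res = fdr_qvals(obs, mc)
# Note: the returned lists follow obs sorted in descending order, not input order.
print(res['pvals'], res['qvals'])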
def row_wise_anova(mat, categories, method='fdr_bh'):
	'''Apply one-way ANOVA to each row of mat, and adjust p-values.
	'''
	uniq_cats = np.unique(categories)
	pvals = np.ones(mat.shape[0], dtype=float)
	masks = [np.in1d(categories, [cat]) for cat in uniq_cats]
	for i in range(mat.shape[0]):
		row = mat[i]
		grouped_row = [ row[mask] for mask in masks ]
		fval, pval = f_oneway(*grouped_row)
		pvals[i] = pval

	_, qvals, _, _ = multipletests(pvals, method=method)
	return pvals, qvals
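
A minimal usage sketch for row_wise_anova with a small random matrix and three groups (all values made up), assuming f_oneway and multipletests are imported as in the function above:

import numpy as np

rng = np.random.default_rng(1)
mat = rng.normal(size=(50, 12))                      # 50 features x 12 samples
categories = np.array(['a'] * 4 + ['b'] * 4 + ['c'] * 4)

pvals, qvals = row_wise_anova(mat, categories, method='fdr_bh')
print(pvals[:5], qvals[:5])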

	
def GLM (file, score, stat, ind_var, Level, betas=1):

    # Create pandas dataframe
    df_final= pd.DataFrame(columns=['Score', 'stat', 'beta', 'tvalue', 'pvalue' , 'pval_bonferroni', 'signi_bonferonni', 'Rsquare', 'std'])
    db = pd.read_csv(file)

    # Get rid of rows with null values for given columns
    db = db[db[score].notnull()]

	# Select Variables
    Y = np.array(db[score])
    X = np.array(db[ind_var])

    #Run the GLM
    model = sm.OLS(Y, X).fit()
    pvals = model.pvalues

    pvals_fwer = multicomp.multipletests(pvals, alpha = 0.05, method = 'fdr_bh')
     
    # Save it into csv file
    df_final.loc[len(df_final)] = [score, stat, model.params, model.tvalues, model.pvalues, pvals_fwer[1], pvals_fwer[0], model.rsquared, model.bse]
    df_final.to_csv(os.path.join(stat,score, score + "_" + stat + "_" + Level + ".csv"))
    
    # # check quickly if there is significant data
    # for  idx, i in enumerate(model.pvalues):
    #     if model.pvalues[i] < 0.05:
    #         print (score+ " " + stat + " "+ Level + ind_var[idx] )
    #         print (model.pvalues[idx])

    betas_component = model.params[0:betas]

    # ## plot the Betas 
    # Select the variable
    y = model.tvalues

    x = np.array(range(len(ind_var)))
    
    # plot data
    plt.plot(x, y, linestyle="dashed", marker="o", color="green")
    plt.xticks(x, ind_var)
    plt.ylabel(score + "_" + stat)
    plt.xlabel("Rsquare %s" % (model.rsquared))
    #plt.savefig(os.path.join(stat, score, score + "_" + stat + "_" + ".png"))
    plt.close()

    return df_final, db, betas_component , pvals
Example #28
def parse_hwe(f, alpha, vcf_file):
    """
    Parses a hardy-weinberg output file, corrects p-values according to a FDR
    and generates several plots to visualize the hwe results
    """

    vcf_outfile = vcf_file.split(".")[0] + "_filtered.vcf"

    snp_pos = []
    pvals = []
    het_deficit = []
    het_excess = []

    with open(f) as fh:
        #Skip header
        next(fh)

        for line in fh:
            fields = line.strip().split()
            snp_pos.append((fields[0], fields[1]))
            pvals.append(float(fields[5]))
            het_deficit.append(float(fields[6]))
            het_excess.append(float(fields[7]))

    fdr_bool_list, fdr_pvalue_list, alpha_S, alpha_B = \
        multi_correction.multipletests(pvals, alpha=float(alpha),
                                       method="fdr_bh")

    snp_pvals = OrderedDict()
    for pos, pval in zip(snp_pos, fdr_pvalue_list):
        snp_pvals["-".join(pos)] = pval

    with open(vcf_file) as vcf_fh, open(vcf_outfile, "w") as ofh:
        for line in vcf_fh:
            if line.startswith("#"):
                ofh.write(line)
            elif line.strip() != "":
                fields = line.split()
                # Check pval for locus
                pos = "-".join(fields[0], fields[1])
                if snp_pvals[pos] <= 0.05:
                    ofh.write(line)
Example #29
def lr_tests(sample_info, expression_matrix, full_model, reduced_model='expression ~ 1'):
    tmp = sample_info.copy()

    fit_results = pd.DataFrame(index=expression_matrix.index)

    gene = expression_matrix.index[0]
    tmp['expression'] = expression_matrix.loc[gene]
    m1 = smf.ols(full_model, tmp).fit()
    m2 = smf.ols(reduced_model, tmp).fit()

    for param in m1.params.index:
        fit_results['full ' + param] = np.nan

    params = m1.params.add_prefix('full ')
    fit_results.loc[gene, params.index] = params

    for param in m2.params.index:
        fit_results['reduced ' + param] = np.nan

    params = m2.params.add_prefix('reduced ')
    fit_results.loc[gene, params.index] = params

    fit_results['pval'] = np.nan

    fit_results.loc[gene, 'pval'] = m1.compare_lr_test(m2)[1]

    for gene in tqdm(expression_matrix.index[1:]):
        tmp['expression'] = expression_matrix.loc[gene]

        m1 = smf.ols(full_model, tmp).fit()
        params = m1.params.add_prefix('full ')
        fit_results.loc[gene, params.index] = params

        m2 = smf.ols(reduced_model, tmp).fit()
        params = m2.params.add_prefix('reduced ')
        fit_results.loc[gene, params.index] = params

        fit_results.loc[gene, 'pval'] = m1.compare_lr_test(m2)[1]

    fit_results['qval'] = multipletests(fit_results['pval'], method='b')[1]
    
    return fit_results
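
The per-gene likelihood-ratio test above amounts to fitting a full and a reduced OLS model and calling compare_lr_test, which returns the LR statistic, its p-value, and the degrees-of-freedom difference. A minimal single-gene sketch with made-up data:

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(2)
tmp = pd.DataFrame({'condition': ['ctrl'] * 10 + ['treat'] * 10})
tmp['expression'] = rng.normal(size=20) + (tmp['condition'] == 'treat') * 1.5

m1 = smf.ols('expression ~ condition', tmp).fit()   # full model
m2 = smf.ols('expression ~ 1', tmp).fit()           # reduced model
lr_stat, p_value, df_diff = m1.compare_lr_test(m2)
print(lr_stat, p_value)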
Example #30
def get_state(statename, winner, d):

    '''return a table with statistically significant results for each state'''

    if len(winner[winner['state'] == statename]) < 3: return
    # get average post price and standard deviation
    state_mean = winner[winner['state'] == statename]['price'].mean()
    state_sd = winner[winner['state'] == statename]['price'].std()

    # get sample tests that have size larger than 20 and average post price larger than state mean
    # 20 is to make sure it's ok to perform a hypothesis test
    df = d[(d['state'] == statename) & (d['price']['count'] > 20)&(d['price']['mean'] > state_mean)]

    # If the distribution of all state prices is normal, then we can use a z test. If not, use a non-parametric hypothesis test
    # if the population size is larger than 5000, assume normal, if not, run stats.shapiro to test normality.
    if len(winner[winner['state'] == statename]) < 5000:
        stat, pval = stats.shapiro(winner[winner['state'] == statename]['price'])
        if pval > 0.05:
#             print "it's normal"
            z_test(state_mean, state_sd, df)
        else:
#             print "it's non-normal"
            stat_list = []
            df.apply(lambda row: go_nonpar(row['county'].values[0], row['state'].values[0], stat_list, state_mean, winner), axis=1)
            df["pval"] = stat_list
    else:
#         print "it's normal"
        z_test(state_mean, state_sd, df)
    alpha = 0.05
    df["reject_naive"] = 1*(df["pval"] < alpha)
    try:
        df["reject_bc"] = 1*(df["pval"] < alpha / len(df))
        is_reject, corrected_pvals, _, _ = multipletests(df["pval"], alpha=0.1, method='fdr_bh')
        df["reject_fdr"] = 1*is_reject
        df["pval_fdr"] = corrected_pvals
    except:
        pass

    return df
def test_lee_et_al(n=300,
                   p=100,
                   s=10,
                   signal=3.5,
                   rho=0.,
                   sigma=1.,
                   cross_validation=True,
                   condition_on_CVR=False,
                   lam_frac=0.6,
                   glmnet=True,
                   X=None,
                   check_screen=True,
                   intervals=False):

    print(n, p, s)

    if X is None:
        X, y, beta, truth, sigma = gaussian_instance(n=n,
                                                     p=p,
                                                     s=s,
                                                     signal=signal,
                                                     sigma=sigma,
                                                     scale=True,
                                                     center=True)
    else:
        beta = np.zeros(p)
        beta[:s] = signal
        y = X.dot(beta) + np.random.standard_normal(n) * sigma

    truth = np.nonzero(beta != 0)[0]

    if cross_validation:
        cv = CV_view(rr.glm.gaussian(X, y),
                     loss_label="gaussian",
                     lasso_randomization=None,
                     epsilon=None,
                     scale1=None,
                     scale2=None)
        # views.append(cv)
        cv.solve(glmnet=glmnet and have_glmnet)
        lam = cv.lam_CVR
        print("minimizer of CVR", lam)

        if condition_on_CVR:
            cv.condition_on_opt_state()
            lam = np.true_divide(lam + cv.one_SD_rule(direction="up"), 2)
            #lam = cv.one_SD_rule(direction="up")
            print("one SD rule lambda", lam)
    else:
        lam = lam_frac * np.fabs(
            X.T.dot(np.random.normal(1, 1. / 2, (n, 1000)))).max()

    L = lasso.gaussian(X, y, lam, sigma=sigma)
    soln = L.fit()

    active = soln != 0
    nactive = active.sum()
    print("nactive", nactive)
    if nactive == 0:
        return None

    active_signs = np.sign(soln[active])

    if (check_screen == False) or (set(truth).issubset(np.nonzero(active)[0])):

        active_set = np.nonzero(active)[0]
        print("active set", active_set)
        true_vec = beta[active]
        active_var = np.zeros(nactive, dtype=bool)

        # Lee et al. using sigma
        pvalues = np.zeros(nactive)
        sel_length = np.zeros(nactive)
        sel_covered = np.zeros(nactive)

        naive_pvalues = np.zeros(nactive)
        naive_length = np.zeros(nactive)
        naive_covered = np.zeros(nactive)

        C = L.constraints

        if C is not None:
            one_step = L.onestep_estimator
            for i in range(one_step.shape[0]):
                eta = np.zeros_like(one_step)
                eta[i] = active_signs[i]
                alpha = 0.1

                def naive_inference():
                    obs = (eta * one_step).sum()
                    sd = np.sqrt(np.dot(eta.T, C.covariance.dot(eta)))
                    Z = obs / sd
                    # use Phi truncated to [-5,5]
                    _pval = ndist.cdf(obs / sigma)
                    _pval = 2 * min(_pval, 1 - _pval)
                    _interval = (obs - ndist.ppf(1 - alpha / 2) * sd,
                                 obs + ndist.ppf(1 - alpha / 2) * sd)
                    return _pval, _interval

                if C.linear_part.shape[0] > 0:  # there were some constraints
                    L, Z, U, S = C.bounds(eta, one_step)
                    _pval = pivot(L, Z, U, S)
                    # two-sided
                    _pval = 2 * min(_pval, 1 - _pval)

                    if intervals == True:
                        if _pval < 10**(-8):
                            return None
                        L, Z, U, S = C.bounds(eta, one_step)
                        _interval = equal_tailed_interval(L,
                                                          Z,
                                                          U,
                                                          S,
                                                          alpha=alpha)
                        _interval = sorted([
                            _interval[0] * active_signs[i],
                            _interval[1] * active_signs[i]
                        ])
                else:
                    obs = (eta * one_step).sum()
                    ## jelena: should be this sd = np.sqrt(np.dot(eta.T, C.covariance.dot(eta))), no?
                    sd = np.sqrt((eta * C.covariance.dot(eta)))
                    Z = obs / sd
                    _pval = 2 * (ndist.sf(min(np.fabs(Z))) -
                                 ndist.sf(5)) / (ndist.cdf(5) - ndist.cdf(-5))
                    if intervals == True:
                        _interval = (obs - ndist.ppf(1 - alpha / 2) * sd,
                                     obs + ndist.ppf(1 - alpha / 2) * sd)

                pvalues[i] = _pval

                naive_pvalues[i], _naive_interval = naive_inference()

                #print(_naive_interval)

                def coverage(LU):
                    L, U = LU[0], LU[1]
                    _length = U - L
                    _covered = 0
                    if (L <= true_vec[i]) and (U >= true_vec[i]):
                        _covered = 1
                    return _covered, _length

                if intervals == True:
                    sel_covered[i], sel_length[i] = coverage(_interval)
                    naive_covered[i], naive_length[i] = coverage(
                        _naive_interval)

                active_var[i] = active_set[i] in truth
        else:
            return None

        print(pvalues)
        q = 0.2
        BH_decisions = multipletests(pvalues, alpha=q, method="fdr_bh")[0]
        return  pvalues, sel_covered, sel_length, \
                naive_pvalues, naive_covered, naive_length, active_var, BH_decisions
Example #32
def get_adjusted_pvals(df):
    pvals = df['P-value'].tolist()
    adpvals = multipletests(pvals, 0.1, method='fdr_bh')
    df['Adjusted P-value'] = adpvals[1]
    return df
def FunctionalExamination(client,
                          SL_or_SDL,
                          database,
                          input_genes,
                          percentile_threshold,
                          cn_threshold,
                          adj_method,
                          fdr_level,
                          tissues,
                          input_mutations=None):
    '''
    Description: Gene expression, Copy Number Alteration (CNA), and Somatic Mutations (optional) are used to decide whether a gene is inactive.
    SL/SDL pairs are detected according to the difference in gene effect/dependency score when one gene is inactive vs. not inactive.

    Inputs:
     client: BigQueryClient, the BigQuery client that will run the function.
     SL_or_SDL: string, Synthetic Lethal or Synthetic Dosage Lethal, valid values: 'SL', 'SDL'
     database: string, the data resource the analysis will be performed on, valid values: "CRISPR", "shRNA"
     input_genes: list of strings, the list of genes whose SL/SDL partners will be sought
     percentile_threshold: double, the threshold for gene expression (for deciding whether a gene is inactive)
     cn_threshold: double, the threshold for copy number alteration (for deciding whether a gene is inactive)
     adj_method: string, optional, p-value correction method, valid values: bonferroni, sidak, holm-sidak, holm, simes-hochberg, hommel, fdr_bh, fdr_by, fdr_tsbh, fdr_tsbky
     fdr_level: string, the data that will be considered while doing p-value adjustment, valid values: "gene_level", "analysis_level"
     tissues: list of strings, the tissues that the analysis will be performed on.
     input_mutations: list of strings, optional, valid values: "Missense_Mutation", "Nonsense_Mutation", "Translation_Start_Site", "Frame_Shift_Ins", "Splice_Site",
     "In_Frame_Del", "Frame_Shift_Del", "Nonstop_Mutation", "In_Frame_Ins"

    Output:
        A dataframe of SL/SDL pairs
    '''

    if database == 'CRISPR':

        dep_score_table = 'isb-cgc-bq.DEPMAP.Achilles_gene_effect_DepMapPublic_current'
        sample_id = 'DepMap_ID'
        gene_exp = 'TPM'
        effect = 'Gene_Effect'
        symbol = 'Hugo_Symbol'
        selected_samples = RetrieveSamples(client, 'CRISPR', 'func_ex',
                                           tissues)
        ccle_samples = selected_samples
        ccle_sample_id = 'DepMap_ID'
        cid = "DepMap_ID"

    elif database == 'shRNA':
        dep_score_table = 'isb-cgc-bq.DEPMAP.Combined_gene_dep_score_DEMETER2_current'
        sample_id = 'CCLE_ID'
        gene_exp = 'TPM'
        effect = 'Combined_Gene_Dep_Score'
        symbol = 'Hugo_Symbol'
        selected_samples = RetrieveSamples(client, 'shRNA', 'func_ex', tissues)
        ccle_samples = selected_samples['DepMap_ID']
        shRNA_samples = selected_samples['CCLE_Name']
        ccle_sample_id = 'DepMap_ID'
        cid = "CCLE_Name"

    else:
        print("The database name can be either CRISPR or shRNA")
        return ()

    mutation_table = 'isb-cgc-bq.DEPMAP.CCLE_mutation_DepMapPublic_current'
    gene_exp_table = 'isb-cgc-bq.DEPMAP.CCLE_gene_expression_DepMapPublic_current'
    cn_table = 'isb-cgc-bq.DEPMAP.CCLE_gene_cn_DepMapPublic_current'
    sample_info_table = 'isb-cgc-bq.synthetic_lethality.sample_info_TCGAlabels_DepMapPublic_20Q3'
    cn_threshold = np.log2(2**(cn_threshold) + 1)
    gene_mapping = ProcessGeneAlias(client, input_genes, 'DepMap')

    min_sample_size = 20
    if len(selected_samples) < (min_sample_size + 1):
        print("Sample size needs to be greater than " + str(min_sample_size) +
              ", it is " + str(len(selected_samples)))
        return ()
    sql_without_mutation = """
    WITH
    table1 AS (
    (SELECT   symbol, Barcode FROM
    (SELECT GE.__SYMBOL__ AS symbol, GE.__CCLE_SAMPLE_ID__ AS Barcode ,
    PERCENT_RANK () over (partition by __SYMBOL__ order by __GENE_EXPRESSION__ asc) AS Percentile
    FROM  __GENE_EXP_TABLE__ GE
    WHERE GE.__SYMBOL__ in (__GENELIST__) AND __CCLE_SAMPLE_ID__ in (__SAMPLE_LIST_CCLE__) AND __GENE_EXPRESSION__ is not null ) AS NGE
    WHERE NGE.Percentile __GENE_CMP_STR__

    INTERSECT DISTINCT

    SELECT symbol,  Barcode FROM
    (SELECT CN.__SYMBOL__ AS symbol, CN.__CCLE_SAMPLE_ID__ AS Barcode,
    CN.CNA AS NORM_CN
    FROM  __CN_TABLE__ CN
    WHERE CN.__SYMBOL__ in (__GENELIST__) AND __CCLE_SAMPLE_ID__ in (__SAMPLE_LIST_CCLE__) and    CN.CNA is not null) AS NC
    WHERE NC.NORM_CN __CN_CMP_STR__  )"""

    sql_mutation_part = """  

    UNION DISTINCT
    SELECT M.__SYMBOL__  AS symbol , M.__CCLE_SAMPLE_ID__ AS Barcode
    FROM __MUTATION_TABLE__ M
    WHERE __SYMBOL__ IN (__GENELIST__) AND
    M.Variant_Classification IN (__MUTATIONLIST__) AND __CCLE_SAMPLE_ID__ in (__SAMPLE_LIST_CCLE__))"""

    rest_of_the_query = """
     , table2 AS (
    SELECT
        S.DepMap_ID Barcode,   __SYMBOL__ symbol,
        (RANK() OVER (PARTITION BY __SYMBOL__ ORDER BY __EFFECT__ ASC)) + (COUNT(*) OVER ( PARTITION BY __SYMBOL__, CAST(__EFFECT__ as STRING)) - 1)/2.0  AS rnkdata
    FROM
       __ACHILLES_TABLE__ A, __SAMPLE_INFO_TABLE__ S  
       where __SYMBOL__ IS NOT NULL AND __EFFECT__ IS NOT NULL AND  S.__REL_SAMPLE_ID__=A.__SAMPLE_ID__ AND S.DepMap_ID in (__SAMPLE_LIST_CCLE__)
       ),
summ_table AS (
SELECT
   n1.symbol as symbol1,
   n2.symbol as symbol2,
   COUNT( n1.Barcode) as n_1,
   SUM( n2.rnkdata )  as sumx_1,
FROM
   table1 AS n1
INNER JOIN
   table2 AS n2
ON
   n1.Barcode = n2.Barcode
GROUP BY
    symbol1, symbol2 ),

statistics AS (
SELECT symbol1, symbol2, n1, n, U1,
       (U1 - n1n2/2.0)/den as zscore
FROM (
   SELECT  symbol1, symbol2, n_t as n,
       n_1 as n1,
       sumx_1 - n_1 *(n_1 + 1) / 2.0 as U1,
       n_1 * (n_t - n_1 ) as n1n2,
       SQRT( n_1 * (n_t - n_1 )*(n_t + 1) / 12.0 ) as den
   FROM  summ_table as t1
   LEFT JOIN ( SELECT symbol, COUNT( Barcode ) as n_t
            FROM table2
            GROUP BY symbol)  t2
   ON symbol2 = symbol
   WHERE n_t > 20 and n_1>5
)
WHERE den > 0
)
SELECT symbol1, symbol2, n1, n, U1,
    `cgc-05-0042.functions.jstat_normal_cdf`(zscore, 0.0, 1.0 ) as pvalue
FROM statistics
GROUP BY 1,2,3,4,5,6
#HAVING pvalue <= 0.01
ORDER BY pvalue ASC """

    genes_intermediate_representation = [
        "'" + str(x) + "'" for x in input_genes
    ]
    input_genes_query = ','.join(genes_intermediate_representation)
    included_samples = ["'" + str(x) + "'" for x in selected_samples]
    included_samples = ','.join(included_samples)
    included_samples_ccle = ["'" + str(x) + "'" for x in ccle_samples]
    included_samples_ccle = ','.join(included_samples_ccle)

    if SL_or_SDL == 'SDL' or input_mutations is None:
        sql_func_ex = sql_without_mutation + ')' + ' ' + rest_of_the_query
    else:
        mutations_intermediate_representation = [
            "'" + x + "'" for x in input_mutations
        ]
        input_mutations_for_query = ','.join(
            mutations_intermediate_representation)
        sql_func_ex = sql_without_mutation + ' ' + sql_mutation_part + ' ' + rest_of_the_query
        sql_func_ex = sql_func_ex.replace('__MUTATION_TABLE__', mutation_table)
        sql_func_ex = sql_func_ex.replace('__MUTATIONLIST__',
                                          input_mutations_for_query)

    sql_func_ex = sql_func_ex.replace('__GENELIST__', input_genes_query)
    sql_func_ex = sql_func_ex.replace('__CUTOFFPRC__',
                                      str(percentile_threshold / 100))
    sql_func_ex = sql_func_ex.replace('__CUTOFFSCNA__', str(cn_threshold))
    sql_func_ex = sql_func_ex.replace('__CN_TABLE__', cn_table)
    sql_func_ex = sql_func_ex.replace('__GENE_EXP_TABLE__', gene_exp_table)
    sql_func_ex = sql_func_ex.replace('__SAMPLE_ID__', sample_id)
    sql_func_ex = sql_func_ex.replace('__SYMBOL__', symbol)
    sql_func_ex = sql_func_ex.replace('__ACHILLES_TABLE__', dep_score_table)
    sql_func_ex = sql_func_ex.replace('__GENE_EXPRESSION__', gene_exp)
    sql_func_ex = sql_func_ex.replace('__EFFECT__', effect)
    sql_func_ex = sql_func_ex.replace('__SAMPLE_LIST__', included_samples)
    sql_func_ex = sql_func_ex.replace('__SAMPLE_LIST_CCLE__',
                                      included_samples_ccle)
    sql_func_ex = sql_func_ex.replace('__SAMPLE_INFO_TABLE__',
                                      sample_info_table)
    sql_func_ex = sql_func_ex.replace('__CCLE_SAMPLE_ID__', ccle_sample_id)
    sql_func_ex = sql_func_ex.replace('__REL_SAMPLE_ID__', cid)

    if SL_or_SDL == "SL":
        comp_str = "<" + str(cn_threshold)
        com_gene_th = "<" + str(percentile_threshold / 100)

    elif SL_or_SDL == "SDL":
        comp_str = ">" + str(cn_threshold)
        com_gene_th = ">" + str(percentile_threshold / 100)

    sql_func_ex = sql_func_ex.replace('__CN_CMP_STR__', comp_str)
    sql_func_ex = sql_func_ex.replace('__GENE_CMP_STR__', com_gene_th)

    results = client.query(sql_func_ex).result().to_dataframe()
    if results.shape[0] < 1:
        print("Functional examimation inference procedure applied on " +
              database + " did not find candidate " + SL_or_SDL + " pairs.")
        return (results)

    report = results[['symbol1', 'symbol2', 'n1', 'n', 'pvalue']]
    report = report.dropna()
    report.columns = [
        'InactiveDB', 'SL_Candidate', '#InactiveSamples', '#Samples', 'PValue'
    ]
    report['Inactive'] = report['InactiveDB'].map(gene_mapping)

    if fdr_level == "gene_level":
        inactive_genes = list(report["Inactive"].unique())
        for i in range(len(inactive_genes)):
            report.loc[report["Inactive"] == inactive_genes[i],
                       'FDR'] = multipletests(
                           report.loc[report["Inactive"] == inactive_genes[i],
                                      'PValue'],
                           method=adj_method,
                           is_sorted=False)[1]

    elif fdr_level == "analysis_level":
        FDR = multipletests(report['PValue'],
                            method=adj_method,
                            is_sorted=False)[1]
        report['FDR'] = FDR
    else:
        print("FDR level can be either gene_level or analysis_level")
        return ()

    report['Tissue'] = str(tissues)
    cols = [
        'Inactive', 'InactiveDB', 'SL_Candidate', '#InactiveSamples',
        '#Samples', 'PValue', 'FDR', 'Tissue'
    ]
    report = report[cols]
    if SL_or_SDL == "SDL":
        report.columns = [
            'Overactive', 'OveractiveDB', 'SL_Candidate', '#Overactive',
            '#Samples', 'PValue', 'FDR', 'Tissue'
        ]
    return report
Example #34
 def run(de, all, organism='hsa', nB=2000, beta=None, combine='fisher'):
     for x in IdMapping.SPECIES:
         if organism in x:
             organism = x[3]
             break
     else:
         raise Exception("Unknown organism")
     if organism not in ['hsa', 'mmu']:
         raise Exception("The organism not contained in the prepared data")
     if type(de) == str:
         de, all = SPIA._load_de(de, all)
     else:
         de = {int(k): float(v) for k, v in de.items()}
         all = [int(x) for x in all]
     datpT_ALL, id2name = SPIA.load_json_data(organism)
     rel = [
         "activation", "compound", "binding/association", "expression",
         "inhibition", "activation_phosphorylation", "phosphorylation",
         "inhibition_phosphorylation", "inhibition_dephosphorylation",
         "dissociation", "dephosphorylation",
         "activation_dephosphorylation", "state change",
         "activation_indirect effect", "inhibition_ubiquination",
         "ubiquination", "expression_indirect effect",
         "inhibition_indirect effect", "repression",
         "dissociation_phosphorylation", "indirect effect_phosphorylation",
         "activation_binding/association", "indirect effect",
         "activation_compound", "activation_ubiquination"
     ]
     inter_value = beta or [
         1, 0, 0, 1, -1, 1, 0, -1, -1, 0, 0, 1, 0, 1, -1, 0, 1, -1, -1, 0,
         0, 1, 0, 1, 1
     ]
     rel_dict = {rel[i]: inter_value[i] for i in range(len(rel))}
     datp_ALL = {}
     for k, v in datpT_ALL.items():
         sizem = len(v[rel[0]][0])
         s, con = np.zeros((sizem, sizem)), np.zeros((sizem, sizem))
         for kk, vv in rel_dict.items():
             con += v[kk] * abs(vv)
             s += v[kk] * vv
         zz = np.reshape(np.repeat(con.sum(axis=0), sizem), (sizem, sizem))
         z = np.transpose(zz)
         z[z == 0] = -1
         r = np.divide(s, z)
         datp_ALL[k] = r
     smPFS, tAraw, tA, pNDE, pb, pG, status = {}, {}, {}, {}, {}, {}, {}
     # calculate the Ac
     for k, v in datp_ALL.items():
         row_names = datpT_ALL[k]['row_names']
         # let's first calculate the pNDE
         noMy = len(set(row_names) & set(de.keys()))
         pNDE[k] = stats.hypergeom.sf(noMy - 1, len(all),
                                      len(set(row_names) & set(all)),
                                      len(de))
         # then calculate the Ac and pPERT
         M = np.eye(v.shape[0]) * -1 + v
         if np.linalg.det(M) == 0:
             smPFS[k], tAraw[k], tA[k], pb[
                 k] = np.nan, np.nan, np.nan, np.nan
             continue
         X = []
         for x in row_names:
             if x in de:
                 X.append(de[x])
             else:
                 X.append(0)
         pfs = np.linalg.solve(M, -np.array(X))
         smPFS[k] = sum(pfs - X)
         tAraw[k] = smPFS[k]
         pfstmp = []
         de_sample = list(de.values())
         all_sample = [i for i, x in enumerate(row_names) if x in all]
         length = len(X)
         for i in range(nB):  # nB
             x = np.zeros(length)
             sp = random.sample(de_sample, noMy)
             idx = random.sample(all_sample, noMy)
             x[idx] = sp
             tt = np.linalg.solve(M, -x)
             pfstmp.append(sum(tt - x))
         tA[k] = tAraw[k] - np.median(np.array(pfstmp))
         if tA[k] > 0:
             status[k] = "Activated"
         else:
             status[k] = "Inhibited"
         ob = tA[k]
         pfstmp = np.array(pfstmp) - np.median(np.array(pfstmp))
         if ob > 0:
             pb[k] = sum([1 for pf in pfstmp if pf >= ob]) / len(pfstmp) * 2
             if pb[k] <= 0:
                 pb[k] = 1 / nB / 100
             elif pb[k] >= 1:
                 pb[k] = 1
         elif ob < 0:
             pb[k] = sum([1 for pf in pfstmp if pf <= ob]) / len(pfstmp) * 2
             if pb[k] <= 0:
                 pb[k] = 1 / nB / 100
             elif pb[k] >= 1:
                 pb[k] = 1
         else:
             pb[k] = 1
         if combine == 'fisher':
             c = pNDE[k] * pb[k]
             pG[k] = c - c * math.log(c)
         else:
             # comb = pnorm((qnorm(p1) + qnorm(p2)) / sqrt(2))
             pG[k] = norm.cdf(
                 (norm.ppf(pNDE[k]) + norm.ppf(pb[k])) / math.sqrt(2))
         # print('id: ', k, '\ttA:', tA[k], '\tpNDE: ', pNDE[k], '\t pPERT: ', pb[k], '\tPG: ', pG[k])
     _, o, _, _ = multicomp.multipletests(list(pG.values()),
                                          method='fdr_bh')
     pGfdr = {list(pG.keys())[i]: o[i] for i in range(len(list(pG.keys())))}
     _, o, _, _ = multicomp.multipletests(list(pNDE.values()),
                                          method='fdr_bh')
     pNDEfdr = {
         list(pNDE.keys())[i]: o[i]
         for i in range(len(list(pNDE.keys())))
     }
     _, o, _, _ = multicomp.multipletests(list(pG.values()),
                                          method='bonferroni')
     pGbf = {list(pG.keys())[i]: o[i] for i in range(len(list(pG.keys())))}
     df = pd.DataFrame([id2name, pNDE, pb, pG, pGfdr, pGbf, status]).T
     df.columns = [
         'name', 'pNDE', 'pPERT', 'pG', 'pGfdr', 'pGFWER', 'status'
     ]
     df = df.sort_values(by='pGFWER')
     return SPIA(df, de, all, organism, nB, beta, combine)
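# The two combination rules used in the loop above, checked in isolation on
# arbitrary values: Fisher's product method for two p-values (c - c*ln(c),
# c = p1*p2) and the normal/Stouffer-style combination from the comment.
import math
from scipy.stats import norm

p_nde, p_pert = 0.01, 0.03

c = p_nde * p_pert
p_g_fisher = c - c * math.log(c)

p_g_norm = norm.cdf((norm.ppf(p_nde) + norm.ppf(p_pert)) / math.sqrt(2))

print(p_g_fisher, p_g_norm)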
Exemple #35
0
    pvals = np.zeros((mdl['X'].shape[1], len(pairs)))

    # save mean connectivity for each group pair, and their differences
    output_list = []
    for j, p in enumerate(pairs):
        t1 = int(p.split(',')[0])
        t2 = int(p.split(',')[1])
        for k in np.arange(mdl['X'].shape[1]):
            #pvals[k, j] = kruskalwallis(group_data[t1][:, k], group_data[t2][:, k]).pvalue
            pvals[k, j] = ttest_ind(group_data[t1][:, k], group_data[t2][:,
                                                                         k])[1]

        # binarize with FDR correction or an uncorrected threshold
        if fdr:
            corrected = multipletests(np.ravel(pvals[:, j]),
                                      alpha=0.05,
                                      method='fdr_bh')
            passed = corrected[0]
            pvals_corrected = corrected[1]
        else:
            threshold = 1
            passed = pvals[:, j] < threshold

        # skip comparisons with no significant contrasts
        if not np.any(passed):
            print('SKIPPING DUE TO NO SIG. DIFFERENCES')
            continue

        try:
            threshold = np.max(pvals[passed])
            print(
    df = elem_df.merge(barc_df.drop(samp_drop_cols, axis=1),
                       on=["unique_id", "element"],
                       how="left")
    df = df.drop_duplicates()
    all_grp_dfs.append(df)

# In[27]:

# correct p values
all_corr_dfs = []
for df in all_grp_dfs:
    pval_cols = [x for x in df.columns if "_pval" in x]
    for col in pval_cols:
        sub_df = df[~pd.isnull(df[col])][["unique_id", "element", col]]
        new_pvals = multicomp.multipletests(sub_df[col],
                                            method="bonferroni")[1]
        padj_col = "rna_%s_padj" % (col.split("_")[1])
        sub_df[padj_col] = new_pvals
        sub_df.drop(col, axis=1, inplace=True)
        df = df.merge(sub_df, on=["unique_id", "element"], how="left")
    all_corr_dfs.append(df)

# ## 4. use stouffer's method to combine p-values across replicates

# in this case, combine the *uncorrected* pvalues and *then adjust* using stouffer's method

# In[28]:

all_names = [
    "POOL1__pMPRA1__HeLa", "POOL1__pMPRA1__HepG2", "POOL1__pMPRA1__K562",
    "POOL1__pNoCMVMPRA1__HeLa", "POOL1__pNoCMVMPRA1__HepG2",
Exemple #37
0
def p_adj_bh(x):
    '''Adjust p values using Benjamini/Hochberg method'''
    return multipletests(x, method='fdr_bh', returnsorted = False)[1]
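# Usage sketch for the helper above (it only needs numpy plus the
# multipletests import assumed by the function); the input values are arbitrary.
import numpy as np

raw_p = np.array([0.001, 0.02, 0.04, 0.3, 0.8])
print(p_adj_bh(raw_p))  # BH-adjusted p-values, in the same order as the input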
Exemple #38
0
def pcaller(M,
            cM,
            biases,
            IR,
            chromLen,
            Diags,
            cDiags,
            num,
            pw=2,
            ww=5,
            sig=0.05,
            maxww=20,
            maxapart=2000000,
            res=10000):

    # Necessary Modules
    from scipy.stats import poisson
    from statsmodels.sandbox.stats.multicomp import multipletests

    logger = logging.getLogger()

    extDiags = {}
    for w in range(ww, maxww + 1):
        temp = []
        for i in xrange(num):
            OneDArray = Diags[i]
            extODA = np.zeros(chromLen - i + w * 2)
            extODA[w:-w] = OneDArray
            temp.append(extODA)
        extDiags[w] = temp

    x = np.arange(ww, num)
    predictE = IR.predict(x)
    predictE[predictE < 0] = 0
    EDiags = []
    for i in xrange(x.size):
        OneDArray = np.ones(chromLen - x[i]) * predictE[i]
        EDiags.append(OneDArray)

    EM = sparse.diags(EDiags, x, format='csr')

    extCDiags = {}
    extEDiags = {}
    for w in range(ww, maxww + 1):
        tempC = []
        tempE = []
        for i in xrange(x.size):
            extODA_E = np.zeros(chromLen - x[i] + w * 2)
            extODA_E[w:-w] = EDiags[i]
            tempE.append(extODA_E)
            extODA_C = np.zeros(chromLen - x[i] + w * 2)
            extODA_C[w:-w] = cDiags[i]
            tempC.append(extODA_C)
        extCDiags[w] = tempC
        extEDiags[w] = tempE

    ps = 2 * pw + 1  # Peak Size

    Pool_Diags = {}
    Pool_EDiags = {}
    Pool_cDiags = {}
    Offsets_Diags = {}
    Offsets_EDiags = {}

    for w in range(ww, maxww + 1):
        ws = 2 * w + 1  # Window size
        ss = range(ws)
        Pool_Diags[w] = {}
        Pool_EDiags[w] = {}
        Pool_cDiags[w] = {}
        Offsets_Diags[w] = {}
        Offsets_EDiags[w] = {}
        for i in ss:
            for j in ss:
                Pool_Diags[w][(i, j)] = []
                Pool_EDiags[w][(i, j)] = []
                Pool_cDiags[w][(i, j)] = []
                Offsets_Diags[w][(i, j)] = np.arange(num) + (i - j)
                Offsets_EDiags[w][(i, j)] = x + (i - j)
                for oi in np.arange(num):
                    if Offsets_Diags[w][(i, j)][oi] >= 0:
                        starti = i
                        endi = i + chromLen - Offsets_Diags[w][(i, j)][oi]
                    else:
                        starti = i - Offsets_Diags[w][(i, j)][oi]
                        endi = starti + chromLen + Offsets_Diags[w][(i, j)][oi]
                    Pool_Diags[w][(i, j)].append(extDiags[w][oi][starti:endi])
                for oi in xrange(x.size):
                    if Offsets_EDiags[w][(i, j)][oi] >= 0:
                        starti = i
                        endi = i + chromLen - Offsets_EDiags[w][(i, j)][oi]
                    else:
                        starti = i - Offsets_EDiags[w][(i, j)][oi]
                        endi = starti + chromLen + Offsets_EDiags[w][(i,
                                                                      j)][oi]
                    Pool_EDiags[w][(i,
                                    j)].append(extEDiags[w][oi][starti:endi])
                    Pool_cDiags[w][(i,
                                    j)].append(extCDiags[w][oi][starti:endi])

    ## Peak Calling ...
    xi, yi = M.nonzero()
    Mask = ((yi - xi) >= ww) & ((yi - xi) <= (maxapart // res))
    xi = xi[Mask]
    yi = yi[Mask]
    bSV = np.zeros(xi.size)
    bEV = np.zeros(xi.size)

    logger.info('Observed Contact Number: %d', xi.size)

    RefIdx = np.arange(xi.size)
    RefMask = np.ones_like(xi, dtype=bool)

    iniNum = xi.size

    logger.info('Calculate the expected matrix ...')
    for w in range(ww, maxww + 1):
        ws = 2 * w + 1
        bS = sparse.csr_matrix((chromLen, chromLen))
        bE = sparse.csr_matrix((chromLen, chromLen))
        Reads = sparse.csr_matrix((chromLen, chromLen))
        logger.info('    Current window width: %s' % w)
        P1 = set([(i, j) for i in range(w - pw, ps + w - pw)
                  for j in range(w - pw, ps + w - pw)])
        P_1 = set([(i, j) for i in range(w + 1, ws) for j in range(w)])
        P_2 = set([(i, j) for i in range(w + 1, ps + w - pw)
                   for j in range(w - pw, w)])
        P2 = P_1 - P_2
        for key in Pool_Diags[w]:
            if (key[0] != w) and (key[1] != w) and (key not in P1):
                bS = bS + sparse.diags(
                    Pool_cDiags[w][key], Offsets_EDiags[w][key], format='csr')
                bE = bE + sparse.diags(
                    Pool_EDiags[w][key], Offsets_EDiags[w][key], format='csr')
            if key in P2:
                Reads = Reads + sparse.diags(
                    Pool_Diags[w][key], Offsets_Diags[w][key], format='csr')

        Txi = xi[RefIdx]
        Tyi = yi[RefIdx]
        RNums = np.array(Reads[Txi, Tyi]).ravel()
        EIdx = RefIdx[RNums >= 16]
        logger.info('    Valid Contact Number: %d', EIdx.size)
        Valid_Ratio = EIdx.size / float(iniNum)
        logger.info('    Valid Contact Ratio: %.3f', Valid_Ratio)
        Exi = xi[EIdx]
        Eyi = yi[EIdx]
        bSV[EIdx] = np.array(bS[Exi, Eyi]).ravel()
        bEV[EIdx] = np.array(bE[Exi, Eyi]).ravel()
        RefIdx = RefIdx[RNums < 16]

        iniNum = RefIdx.size

        if Valid_Ratio < 0.1:
            logger.info(
                '    Ratio of valid contact is too small, break the loop ...')
            break

        logger.info('    Continue ...')
        logger.info('    %d Contacts will get into next loop ...', RefIdx.size)

    RefMask[RefIdx] = False

    Mask = np.logical_and((bEV != 0), RefMask)
    xi = xi[Mask]
    yi = yi[Mask]
    bRV = bSV[Mask] / bEV[Mask]

    bR = sparse.csr_matrix((chromLen, chromLen))
    bR[xi, yi] = bRV

    ## Corrected Expected Matrix
    cEM = EM.multiply(bR)

    logger.info('Construct Poisson Models ...')
    ## Poisson Models
    xi, yi = cEM.nonzero()
    Evalues = np.array(cEM[xi, yi]).ravel() * biases[xi] * biases[yi]
    Mask = (Evalues > 0)
    Evalues = Evalues[Mask]
    xi = xi[Mask]
    yi = yi[Mask]
    Poisses = poisson(Evalues)
    logger.info('Number of Poisson Models: %d', Evalues.size)
    logger.info('Assign a p-value for each Observed Contact Frequency ...')
    Ovalues = np.array(M[xi, yi]).ravel()
    pvalues = 1 - Poisses.cdf(Ovalues)
    Fold = Ovalues / Evalues

    # Multiple Tests
    logger.info('Benjamini-Hochberg correcting for multiple tests ...')
    cResults = multipletests(pvalues, alpha=sig, method='fdr_bh')
    reject = cResults[0]
    cP = cResults[1]  # Corrected Pvalue
    xpos = xi[reject]
    ypos = yi[reject]
    pvalues = pvalues[reject]
    qvalues = cP[reject]
    Ovalues = Ovalues[reject]
    Fold = Fold[reject]

    # Remove Gap Effect
    logger.info('Remove Gap Effects ...')
    gaps = set(np.where(np.array(M.sum(axis=1)).ravel() == 0)[0])
    if len(gaps) > 0:
        fIdx = []
        for i in xrange(xpos.size):
            lower = (xpos[i] - 5) if (xpos[i] > 5) else 0
            upper = (xpos[i] +
                     5) if ((xpos[i] + 5) < chromLen) else (chromLen - 1)
            cregion_1 = range(lower, upper)
            lower = (ypos[i] - 5) if (ypos[i] > 5) else 0
            upper = (ypos[i] +
                     5) if ((ypos[i] + 5) < chromLen) else (chromLen - 1)
            cregion_2 = range(lower, upper)
            cregion = set(cregion_1) | set(cregion_2)
            intersect = cregion & gaps
            if len(intersect) == 0:
                fIdx.append(i)

        xpos = xpos[fIdx]
        ypos = ypos[fIdx]
        pvalues = pvalues[fIdx]
        qvalues = qvalues[fIdx]
        Ovalues = Ovalues[fIdx]
        Fold = Fold[fIdx]

    return xpos, ypos, Ovalues, Fold, pvalues, qvalues
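# The core significance step of pcaller in isolation: a Poisson upper-tail
# p-value per contact followed by Benjamini-Hochberg correction. The observed
# and expected counts below are made up.
import numpy as np
from scipy.stats import poisson
from statsmodels.sandbox.stats.multicomp import multipletests

observed = np.array([12, 30, 5, 18])        # observed contact frequencies
expected = np.array([6.0, 7.5, 4.8, 6.2])   # corrected expected values

pvalues = 1 - poisson(expected).cdf(observed)
reject, qvalues, _, _ = multipletests(pvalues, alpha=0.05, method='fdr_bh')
print(pvalues, qvalues, reject)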
                    #expression[gene + "_" + sample] = [float(geneExpression[sampleInd]), np.mean(negativeExpr)]
                    expression[gene + "_" + sample] = [
                        float(geneExpression[sampleInd]), negativeExpr
                    ]

cosmicGenePValuesOneSided = np.array(cosmicGenePValuesOneSided, dtype="object")
sortedInd = cosmicGenePValuesOneSided[:, 1].argsort()
cosmicGenePValuesOneSided = cosmicGenePValuesOneSided[sortedInd]
cosmicGeneTStat = np.array(cosmicGeneTStat, dtype="object")
cosmicGeneTStat = cosmicGeneTStat[sortedInd]

print cosmicGenePValuesOneSided
print cosmicGeneTStat

reject, pAdjusted, _, _ = multipletests(
    cosmicGenePValuesOneSided[:,
                              1], method='bonferroni')  #fdr_bh or bonferroni

import matplotlib.pyplot as plt

print "Significant COSMIC genes after bonferroni and 1-sided: "
filteredPValues = []
signGenes = []
for pValueInd in range(0, len(cosmicGenePValuesOneSided)):

    if reject[pValueInd] == True and np.sign(cosmicGeneTStat[pValueInd,
                                                             1]) == 1:

        pValue = pAdjusted[pValueInd]
        filteredPValues.append(
            [cosmicGenePValuesOneSided[pValueInd, 0], pValue])
import pandas as pd
import numpy as np
from statsmodels.sandbox.stats.multicomp import multipletests

df = pd.read_csv('timeseries_significance_qvalues.csv', sep=",")
corrected = multipletests(df['p_values'].values, alpha=0.05, method='bonferroni')[1]
df['bonferroni_corrected'] = corrected
df.to_csv('corrected.txt', index=False)
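# If both corrections are wanted side by side, a self-contained sketch; the
# 'p_values' column name mirrors the file read above, the numbers are made up.
import pandas as pd
from statsmodels.sandbox.stats.multicomp import multipletests

toy = pd.DataFrame({'p_values': [0.001, 0.02, 0.2, 0.9]})
for col, method in [('bonferroni_corrected', 'bonferroni'),
                    ('bh_corrected', 'fdr_bh')]:
    toy[col] = multipletests(toy['p_values'].values, alpha=0.05, method=method)[1]
print(toy)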
Exemple #41
0
    def get_test(self):
        """
        :param model_type: for which we want to extract
        :return:
        """

        print(f"Calculating sign test for subset: {self.subset_type}")

        # Load all models predictions for each phase
        for phase in self.phase_list:
            phase_all_models_ndcg_list = []  # ndcg list
            model_type_list = []  # Name of the model

            for model_type in self.models_list:
                ndcg_path = self._get_ndcg_path(self.model_preds_root,
                                                model_type,
                                                phase=phase,
                                                subset_type=self.subset_type)
                ndcg_list = self.read_file_as_list(ndcg_path)
                print(f"Total samples {model_type}: {len(ndcg_list)}")
                phase_all_models_ndcg_list.append(ndcg_list)
                model_type_list.append(model_type)

            # We form combinations of all indices
            index_models_list = list(range(len(model_type_list)))
            # pairwise
            combination_set = combinations(index_models_list, 2)

            for combination_indices in combination_set:
                model1_preds = phase_all_models_ndcg_list[
                    combination_indices[0]]
                model2_preds = phase_all_models_ndcg_list[
                    combination_indices[1]]
                model1_name = model_type_list[combination_indices[0]]
                model2_name = model_type_list[combination_indices[1]]

                stat, p = mannwhitneyu(model1_preds, model2_preds)
                print(
                    f'Mannwhitneyu - For phase: {phase} - models: {model1_name} vs'
                    f' {model2_name} : stat={stat:.4f}, p={p:.4f}')

                stat, p = wilcoxon(model1_preds, model2_preds)
                print(
                    f'Wilcoxon - For phase: {phase} - models: {model1_name} vs'
                    f' {model2_name} : stat={stat:.4f}, p={p:.4f}')

            # Checking for equivalence of *args
            # stat, p = f_oneway(phase_all_models_ndcg_list[0],
            # phase_all_models_ndcg_list[1], phase_all_models_ndcg_list[2],
            # phase_all_models_ndcg_list[3])
            # stat, p = f_oneway(*phase_all_models_ndcg_list)
            # stat, p = mannwhitneyu(*phase_all_models_ndcg_list)
            # stat, p = wilcoxon(*phase_all_models_ndcg_list)
            stat, p = kruskal(*phase_all_models_ndcg_list)
            print(f'Kruskal - For phase: {phase}: stat={stat:.4f}, p={p:.4f}')

            bonferroni_correction = multipletests([p], method='bonferroni')
            # print(bonferroni_correction)
            # (reject, pvals_corrected, alphacSidak, alphacBonf)
            action = str(bonferroni_correction[0][0])  # np array
            new_p_value = bonferroni_correction[1][0]
            print(
                f'Kruskal - bonferroni - For phase: {phase}: p={new_p_value:.4f}, '
                f'action: {str(action)}')

            stat, p = friedmanchisquare(*phase_all_models_ndcg_list)
            print(
                f'Friedmanchisquare - For phase: {phase}: stat={stat:.4f}, p={p:.4f}'
            )
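# When several pairwise tests are run as above, the usual pattern is to collect
# all of their p-values and correct them as one family; a sketch under that
# assumption (the function name and inputs are illustrative, not part of the
# class above).
from itertools import combinations
from scipy.stats import mannwhitneyu
from statsmodels.sandbox.stats.multicomp import multipletests

def pairwise_corrected(groups, method='bonferroni'):
    """Mann-Whitney U on every pair of score lists, corrected jointly."""
    pairs = list(combinations(range(len(groups)), 2))
    raw = [mannwhitneyu(groups[i], groups[j])[1] for i, j in pairs]
    adj = multipletests(raw, method=method)[1]
    return list(zip(pairs, raw, adj))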
def modbindevalscorer(modules, binding):
    modules = modules.filter_size(5)
    if len(modules) == 0:
        aucodds = 0

        odds = pd.DataFrame()
        pvals = pd.DataFrame()
        qvals = pd.DataFrame()
    else:
        modmem = modules.cal_membership(G=binding.index)
        binmem = binding

        modsizes = modmem.sum()
        binsizes = binmem.sum()

        tps = modmem.T.dot(binmem.astype(int))
        fps = binsizes - tps
        fns = (modsizes - tps.T).T
        tns = binmem.shape[0] - tps - fps - fns

        odds = ((tps * tns) / (fps * fns))

        values = np.array([
            odds.values.flatten(),
            tps.values.flatten(),
            fps.values.flatten(),
            fns.values.flatten(),
            tns.values.flatten()
        ])

        pvals = np.apply_along_axis(filterfisher, 0, values)
        qvals = []
        for pvalrow in pvals.reshape(tps.shape):
            _, qvalrow, _, _ = np.array(multipletests(pvalrow), dtype=object)
            qvals.append(qvalrow)
        qvals = pd.DataFrame(qvals, index=tps.index, columns=tps.columns)
        pvals = pd.DataFrame(pvals.reshape(tps.shape),
                             index=tps.index,
                             columns=tps.columns)
        if binding.columns.nlevels > 1:
            pvals = pvals.T.groupby(level=0).min()
            qvals = qvals.T.groupby(level=0).min()  # group by regulator
            odds = odds.T.groupby(level=0).max()  # group by regulator
        else:
            pvals = pvals.T
            qvals = qvals.T
            odds = odds.T

        ## auc odds

        odds_filtered = odds.copy()
        odds_filtered.values[(qvals > 0.05).values.astype(bool)] = 0
        odds_max = odds_filtered.max(1)

        if len(odds_max) == 0:
            aucodds = 0
        else:
            cutoffs = np.linspace(0, 3, 100)

            stillenriched = [
                (np.log10(odds_max) >= cutoff).sum() / len(odds_max)
                for cutoff in cutoffs
            ]
            aucodds = np.trapz(stillenriched,
                               cutoffs) / (cutoffs[-1] - cutoffs[0])

    scores = {"aucodds": aucodds}

    return scores
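# The aucodds summary above integrates, over log10 cutoffs from 0 to 3, the
# fraction of regulators whose q-value-filtered odds ratio stays above the
# cutoff; a self-contained sketch of that curve on made-up odds values.
import numpy as np

odds_max = np.array([1.0, 3.2, 10.0, 50.0, 1200.0])  # best odds per regulator

cutoffs = np.linspace(0, 3, 100)
stillenriched = [(np.log10(odds_max) >= c).sum() / len(odds_max) for c in cutoffs]
aucodds = np.trapz(stillenriched, cutoffs) / (cutoffs[-1] - cutoffs[0])
print(aucodds)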
Exemple #43
0
for dataset in datasetids:
    print(dataset),
    ## Read dataset
    df, meta = read_dataset_files(dataset, datadir)

    for metric in ['shannon', 'chao1', 'simpson']:
        alpha = make_alpha_df(df, meta, dataset, metric)
        alphas.append(alpha)

alphasdf = pd.concat(alphas, ignore_index=True)

alphasdf.to_csv(args.alphas_out, sep='\t', index=False)

# Because I'm using the entire OTU table, some of these samples don't have disease metadata.
# I don't want to compare "NaN" labeled samples with anything because they mean nothing.
alphasdf = alphasdf.query('DiseaseState != " "').dropna(
    subset=['DiseaseState'])

pvals = []
for g, subdf in alphasdf.groupby('alpha_metric'):
    pval = get_layered_pvals(subdf, 'DiseaseState', 'alpha', 'study')
    pval = pd.DataFrame.from_dict(pval).stack().reset_index()
    pval.columns = ['comparison', 'study', 'p']
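    # note: with no method argument, multipletests defaults to Holm-Sidak ('hs'), not fdr_bh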
    pval['q'] = multipletests(pval['p'])[1]
    pval['alpha_metric'] = g
    pvals.append(pval)

pvalsdf = pd.concat(pvals)
pvalsdf.to_csv(args.pvals_out, sep='\t', index=False)
def anova_modt(df, columns, design):
    """ Runs ANOVA on the subset of df defined by columns with the specified design matrix, using the moderated T linear model.

    Args:
        df (Pandas DataFrame): DataFrame with one column per measurement, rows=proteins
        columns (list(columns)): list of column names in df which have data
        design (Pandas DataFrame): design matrix (see limma documentation for details)
            Note that the column names of design are the returned coefficient names

    Returns:
        (res_df, result)
        res_df (Pandas DataFrame): DataFrame with one row per protein  
            - Same order as df
            - One column for each best fit coefficient
            - Has columns 'F_<COEF>' and 'PVal_<COEF>' for each coefficient
        result (R object)
    """
    coefs = list(design.columns)
    data = df[columns]
    # Note that result is an R object
    # We can't do much with it directly except call topTable
    result = r['moderated.t'](data, design=design)

    # Now obtain best estimates for each coefficient
    res_coef = pandas2ri.ri2py(r['topTable'](
        result, number=data.shape[0], sort_by='none')).iloc[:, :len(coefs) + 3]
    # Adjust overall p-value
    res_coef['P.Value.Adj'] = multipletests(res_coef['P.Value'],
                                            alpha=ALPHA,
                                            method='fdr_bh')[1]
    # F-test for significance for terms OTHER than intercept and PlexB
    # Do this iteratively and obtain a p-value and F-value for every coefficient
    coefs.remove('Intercept')
    # Create mapping of coefficients to F and PVal columns
    coef_col_map = {
        c: ['F_%s' % c, 'PVal_%s' % c,
            'PVal_%s_Adj' % c]
        for c in coefs
    }
    result_colnames = [col for cols in coef_col_map.values() for col in cols]
    # Create empty pvalue df
    res_f = pd.DataFrame(index=np.arange(data.shape[0]),
                         columns=result_colnames,
                         dtype=float)
    for c in coef_col_map.keys():
        # Find F and pvals/adj_pvals for each coefficient
        F_pv = pandas2ri.ri2py(r['topTable'](result,
                                             coef=c,
                                             number=data.shape[0],
                                             sort_by='none'))[['t', 'P.Value']]
        _, pv_adj, _, _ = multipletests(F_pv['P.Value'],
                                        alpha=ALPHA,
                                        method='fdr_bh')
        res_f[coef_col_map[c]] = np.concatenate(
            (F_pv.values, pv_adj[:, np.newaxis]), axis=1)

    # Now bind together everything into one df
    aux_data = df.drop(columns, axis=1).reset_index(drop=True)
    data.reset_index(drop=True, inplace=True)
    res_f.reset_index(drop=True, inplace=True)
    res_coef.reset_index(drop=True, inplace=True)
    res_df = pd.concat((data, res_coef, res_f, aux_data), axis=1)
    return res_df, result
def run_psm(data, comp1, comp2, plex='both'):
    """
    Use PSM and roll up to the level of unique accession_number
    """
    start = time.time()
    c1, c2 = validate_comp_subset_data(data, comp1, comp2)

    if plex == 'A' or plex == 'B':
        # Filter to corresponding plex
        c1 = c1[[col for col in c1.columns if col[-2] == plex]]
        c2 = c2[[col for col in c2.columns if col[-2] == plex]]
    elif plex == 'both':
        # Do nothing
        pass
    else:
        raise ValueError('Invalid specification of plex')

    pvals = do_stat_tests_protein(c1, c2, data.accession_number)
    # Delete pvals which are all NaN, i.e. skipped
    # Otherwise adjust pvals
    for c in pvals.columns:
        if pvals[c].isnull().all():
            del pvals[c]
        elif c == u'fold_change_med':
            continue  # Don't adjust the pval for fold change
        elif np.issubdtype(pvals[c].dtype, np.number):
            # Mask NaN values so we don't bias the adjusted test
            pv = pvals[c]
            mask = np.isfinite(pv)
            pv_corr = np.full(pv.shape, np.nan)
            pv_corr[mask] = multipletests(pv[mask],
                                          alpha=0.05,
                                          method='fdr_bh')[1]
            pvals[c + '_adj'] = pv_corr

    pvals.rename(columns={
        'protein_id': 'accession_number',
        'fold_change_med': 'fold_change'
    },
                 inplace=True)
    # Make auxiliary info
    aux_info = data[[
        'accession_number',
        'geneSymbol',
    ]]
    aux_info.drop_duplicates(inplace=True)

    def get_group_counts(x):
        return pd.Series({
            'n_pep': len(x),
            'n_valid': np.sum(1 - np.isnan(x).values)
        })

    tmp = (pd.concat((c1, c2), axis=1).groupby(
        data.accession_number).apply(get_group_counts).reset_index())

    out = pd.merge(pvals, aux_info, on='accession_number')
    out = pd.merge(out, tmp, on='accession_number')

    print(time.time() - start)
    return out
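# The NaN-masking pattern used above is worth keeping as a standalone recipe,
# since multipletests does not treat NaNs specially; the values are arbitrary.
import numpy as np
from statsmodels.sandbox.stats.multicomp import multipletests

pv = np.array([0.01, np.nan, 0.20, 0.03, np.nan])

mask = np.isfinite(pv)
pv_adj = np.full(pv.shape, np.nan)
pv_adj[mask] = multipletests(pv[mask], alpha=0.05, method='fdr_bh')[1]
print(pv_adj)  # NaNs stay NaN, only observed p-values are corrected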
Exemple #46
0
    def fit(self, ids, ids2=None, voxel_thresh=0.01, q=0.05, corr='FWE',
            n_iters=5000, prior=0.5, n_cores=4):
        self.voxel_thresh = voxel_thresh
        self.corr = corr
        self.n_iters = n_iters
        self.ids = ids
        if ids2 is None:
            ids2 = list(set(self.coordinates['id'].values) - set(self.ids))
        self.ids2 = ids2
        all_ids = self.ids + self.ids2
        red_coords = self.coordinates.loc[self.coordinates['id'].isin(all_ids)]

        k_est = self.kernel_estimator(red_coords, self.mask)
        ma_maps1 = k_est.transform(self.ids, masked=True,
                                   **self.kernel_arguments)
        ma_maps2 = k_est.transform(self.ids2, masked=True,
                                   **self.kernel_arguments)

        # Calculate different count variables
        eps = np.spacing(1)
        n_selected = len(self.ids)
        n_unselected = len(self.ids2)
        n_mappables = n_selected + n_unselected

        # Transform MA maps to 1d arrays
        ma_maps_all = np.vstack((ma_maps1, ma_maps2))

        n_selected_active_voxels = np.sum(ma_maps1, axis=0)
        n_unselected_active_voxels = np.sum(ma_maps2, axis=0)

        # Nomenclature for variables below: p = probability,
        # F = feature present, g = given, U = unselected, A = activation.
        # So, e.g., pAgF = p(A|F) = probability of activation
        # in a voxel if we know that the feature is present in a study.
        pF = (n_selected * 1.0) / n_mappables
        pA = np.array(np.sum(ma_maps_all, axis=0) / n_mappables).squeeze()

        # Conditional probabilities
        pAgF = n_selected_active_voxels * 1.0 / n_selected
        pAgU = n_unselected_active_voxels * 1.0 / n_unselected
        pFgA = pAgF * pF / pA

        # Recompute conditionals with uniform prior
        pAgF_prior = prior * pAgF + (1 - prior) * pAgU
        pFgA_prior = pAgF * prior / pAgF_prior

        # One-way chi-square test for consistency of activation
        pAgF_chi2_vals = one_way(np.squeeze(n_selected_active_voxels),
                                 n_selected)
        pAgF_p_vals = special.chdtrc(1, pAgF_chi2_vals)
        pAgF_sign = np.sign(n_selected_active_voxels -
                            np.mean(n_selected_active_voxels))
        pAgF_z = p_to_z(pAgF_p_vals, tail='two') * pAgF_sign

        # Two-way chi-square for specificity of activation
        cells = np.squeeze(
            np.array([[n_selected_active_voxels, n_unselected_active_voxels],
                      [n_selected - n_selected_active_voxels,
                       n_unselected - n_unselected_active_voxels]]).T)
        pFgA_chi2_vals = two_way(cells)
        pFgA_p_vals = special.chdtrc(1, pFgA_chi2_vals)
        pFgA_p_vals[pFgA_p_vals < 1e-240] = 1e-240
        pFgA_sign = np.sign(pAgF - pAgU).ravel()
        pFgA_z = p_to_z(pFgA_p_vals, tail='two') * pFgA_sign
        images = {
            'pA': pA,
            'pAgF': pAgF,
            'pFgA': pFgA,
            ('pAgF_given_pF=%0.2f' % prior): pAgF_prior,
            ('pFgA_given_pF=%0.2f' % prior): pFgA_prior,
            'consistency_z': pAgF_z,
            'specificity_z': pFgA_z,
            'consistency_chi2': pAgF_chi2_vals,
            'specificity_chi2': pFgA_chi2_vals}

        if corr == 'FWE':
            iter_dfs = [red_coords.copy()] * n_iters
            null_ijk = np.vstack(np.where(self.mask.get_data())).T
            rand_idx = np.random.choice(null_ijk.shape[0],
                                        size=(red_coords.shape[0], n_iters))
            rand_ijk = null_ijk[rand_idx, :]
            iter_ijks = np.split(rand_ijk, rand_ijk.shape[1], axis=1)

            params = zip(iter_dfs, iter_ijks, range(n_iters))

            with mp.Pool(n_cores) as p:
                perm_results = list(tqdm(p.imap(self._perm, params), total=self.n_iters))
            pAgF_null_chi2_dist, pFgA_null_chi2_dist = zip(*perm_results)

            # pAgF_FWE
            pAgF_null_chi2_dist = np.squeeze(pAgF_null_chi2_dist)
            np.savetxt('null_dist.txt', pAgF_null_chi2_dist)
            pAgF_p_FWE = np.empty_like(pAgF_chi2_vals).astype(float)
            for voxel in range(pFgA_chi2_vals.shape[0]):
                pAgF_p_FWE[voxel] = null_to_p(pAgF_chi2_vals[voxel],
                                              pAgF_null_chi2_dist,
                                              tail='upper')
            # Crop p-values of 0 or 1 to nearest values that won't evaluate to
            # 0 or 1. Prevents inf z-values.
            pAgF_p_FWE[pAgF_p_FWE < eps] = eps
            pAgF_p_FWE[pAgF_p_FWE > (1. - eps)] = 1. - eps
            pAgF_z_FWE = p_to_z(pAgF_p_FWE, tail='two') * pAgF_sign
            images['consistency_p_FWE'] = pAgF_p_FWE
            images['consistency_z_FWE'] = pAgF_z_FWE

            # pFgA_FWE
            pFgA_null_chi2_dist = np.squeeze(pFgA_null_chi2_dist)
            pFgA_p_FWE = np.empty_like(pFgA_chi2_vals).astype(float)
            for voxel in range(pFgA_chi2_vals.shape[0]):
                pFgA_p_FWE[voxel] = null_to_p(pFgA_chi2_vals[voxel],
                                              pFgA_null_chi2_dist,
                                              tail='upper')
            # Crop p-values of 0 or 1 to nearest values that won't evaluate to
            # 0 or 1. Prevents inf z-values.
            pFgA_p_FWE[pFgA_p_FWE < eps] = eps
            pFgA_p_FWE[pFgA_p_FWE > (1. - eps)] = 1. - eps
            pFgA_z_FWE = p_to_z(pFgA_p_FWE, tail='two') * pFgA_sign
            images['specificity_p_FWE'] = pFgA_p_FWE
            images['specificity_z_FWE'] = pFgA_z_FWE
        elif corr == 'FDR':
            _, pAgF_p_FDR, _, _ = multipletests(pAgF_p_vals, alpha=0.05,
                                                method='fdr_bh',
                                                is_sorted=False,
                                                returnsorted=False)
            pAgF_z_FDR = p_to_z(pAgF_p_FDR, tail='two') * pAgF_sign
            images['consistency_z_FDR'] = pAgF_z_FDR

            _, pFgA_p_FDR, _, _ = multipletests(pFgA_p_vals, alpha=0.05,
                                                method='fdr_bh',
                                                is_sorted=False,
                                                returnsorted=False)
            pFgA_z_FDR = p_to_z(pFgA_p_FDR, tail='two') * pFgA_sign
            images['specificity_z_FDR'] = pFgA_z_FDR

        self.results = MetaResult(self, mask=self.mask, **images)
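# The FDR branch above corrects the chi-square p-values and converts them back
# to signed z-maps. The p-to-z step can be approximated with scipy for a
# two-tailed p; this is a sketch, not the package's own p_to_z helper.
import numpy as np
from scipy.stats import norm
from statsmodels.sandbox.stats.multicomp import multipletests

p_vals = np.array([1e-6, 0.003, 0.2, 0.9])
signs = np.array([1, -1, 1, -1])

_, p_fdr, _, _ = multipletests(p_vals, alpha=0.05, method='fdr_bh')
z_fdr = norm.isf(p_fdr / 2) * signs  # two-tailed p back to a signed z value
print(z_fdr)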
Exemple #47
0
        
        fisherResults[id_value] = {id_label: id_value,
                            termlabel: termname,
                            'p-Value': pvalue,
                            '#Test': ct.loc['Test_set', ct.columns[0]],
                            '#Ref': ct.loc['Reference_set', ct.columns[0]],
                            '#notAnnotTest': ct.loc['Test_set', ct.columns[1]],
                            '#notAnnotRef': ct.loc['Reference_set', ct.columns[1]],
                            'Over/Under': sig,
                            'TestSeqs': genelist_test,
                            'RefSeqs': genelist_ref }

    
    fr = pd.DataFrame(fisherResults).T

    benjamini = sm.multipletests(fr['p-Value'], method = 'fdr_bh', alpha=args.thresh)

    fr = pd.concat([fr, pd.Series(benjamini[1], name='FDR', index=fr.index)], axis=1) #p-adjusted
    
    fr = pd.concat([fr, pd.Series(benjamini[0], name='FDR_TEST', index=fr.index)], axis=1) #is_rejected
    
#     sns.set(color_codes=True)
#     sns.distplot(fr['p-Value'], kde=False, bins=20)
#     sns.plt.show()
     
    fr_filtered = fr[fr['p-Value'] <= args.thresh]
    
    fr_filtered.to_csv(os.path.join(basedir, outfile + "ix"), 
                       columns=[termlabel,'FDR','p-Value','#Test','#Ref','#notAnnotTest','#notAnnotRef','Over/Under','TestSeqs','RefSeqs'], 
                       header=True, index_label=id_label, sep='\t')
    
Exemple #48
0
def p_roi_masking(substitution, ts_file_template, beta_file_template, p_file_template, design_file_template, event_file_template, p_level, brain_mask):
	"""Apply a substitution pattern to timecourse, beta, and design file templates - and mask the data of the former two according to a roi. Subsequently scale the design by the mean beta.

	Parameters
	----------

	substitution : dict
	A dictionary containing the template replacement fields as keys and identifiers as values.

	ts_file_template : string
	Timecourse file template with replacement fields. The file should be in NIfTI format.

	beta_file_template : string
	Beta file template with replacement fields. The file should be in NIfTI format.

	design_file_template : string
	Design file template with replacement fields. The file should be in CSV format.

	roi_path : string
	Path to the region of interest file based on which to create a mask for the time course and beta files. The file should be in NIfTI format.

	brain_mask : string
	Path to a mask file in the *exact same* coordinate space as the input image. This is very important, as the mask is needed to crop out artefactual p=0 values. These cannot simply be filtered out numerically, since it is possible that the GLM returns p=0 for the most significant results.

	Returns
	-------

	timecourse : array_like
	Numpy array containing the mean timecourse in the region of interest.

	design : array_like
	Numpy array containing the regressor scaled by the mean beta value of the region of interest.

	mask_map : data
	Nibabel image of the mask

	subplot_title : string
	Title for the subplot, computed from the substitution fields.
	"""

	ts_file = path.abspath(path.expanduser(ts_file_template.format(**substitution)))
	beta_file = path.abspath(path.expanduser(beta_file_template.format(**substitution)))
	p_file = path.abspath(path.expanduser(p_file_template.format(**substitution)))
	design_file = path.abspath(path.expanduser(design_file_template.format(**substitution)))
	event_file = path.abspath(path.expanduser(event_file_template.format(**substitution)))
	brain_mask = path.abspath(path.expanduser(brain_mask))
	try:
		img = nib.load(p_file)
		brain_mask = nib.load(brain_mask)
	except (FileNotFoundError, nib.py3k.FileNotFoundError):
		return None,None,None,None,None
	data = img.get_data()
	brain_mask = brain_mask.get_data()
	header = img.header
	affine = img.affine
	shape = data.shape
	data = data.flatten()
	brain_mask = brain_mask.flatten()
	brain_mask = brain_mask.astype(bool)
	brain_data = data[brain_mask]
	reject, nonzero_data, _, _ = multipletests(brain_data, p_level, method="fdr_bh")
	brain_mask[brain_mask]=reject
	brain_mask = brain_mask.astype(int)
	mask = brain_mask.reshape(shape)
	mask_map = nib.Nifti1Image(mask, affine, header)
	masker = NiftiMasker(mask_img=mask_map)
	try:
		timecourse = masker.fit_transform(ts_file).T
		betas = masker.fit_transform(beta_file).T
	except ValueError:
		return None,None,None,None,None
	subplot_title = "\n ".join([str(substitution["subject"]),str(substitution["session"])])
	timecourse = np.mean(timecourse, axis=0)
	design = pd.read_csv(design_file, skiprows=5, sep="\t", header=None, index_col=False)
	design = design*np.mean(betas)
	event_df = pd.read_csv(event_file, sep="\t")

	return timecourse, design, mask_map, event_df, subplot_title
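# The brain_mask[brain_mask] = reject idiom above writes the FDR decision back
# into the full-volume boolean mask; a toy illustration on a flat array.
import numpy as np
from statsmodels.sandbox.stats.multicomp import multipletests

p_map = np.array([0.001, 0.5, 0.02, 0.9, 0.04, 0.7])
brain_mask = np.array([True, False, True, False, True, False])

reject = multipletests(p_map[brain_mask], 0.05, method="fdr_bh")[0]
brain_mask[brain_mask] = reject  # keep only the voxels that survive FDR
print(brain_mask)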
        for t in TARGETS_CLIN_BL for r in REGRESSORS_OI] + \
    ['%s~%s+AGE_AT_INCLUSION+SEX+EDUCATION' % (t, r)
        for t in TARGETS_NI for r in REGRESSORS_OI]

mod = MULM(data=data, formulas=formulas_all_simple)
stats_all_simple = mod.t_test(contrasts=1, out_filemane=None)

mod = MULM(data=data, formulas=formulas_all_covars)
stats_all_covars = mod.t_test(contrasts=1, out_filemane=None)

mod = MULM(data=data, formulas=formulas_all)
stats_all = mod.t_test(contrasts=1, out_filemane=None)

mod = MULM(data=data, formulas=formulas_oi)
stats_oi = mod.t_test(contrasts=1, out_filemane=None)
stats_oi["Corrected P value"] = multipletests(stats_oi.pvalue,
                                              method='fdr_bh')[1]

summary = stats_oi.copy()

summary["Variable"] = summary.target.replace({
    'TMTB_TIME': 'TMTB',
    "MDRS_TOTAL": "MDRS",
    "MRS": "mRS"
})
summary["PC"] = summary.contrast.replace({
    'pc1__tvl1l2': 1,
    'pc2__tvl1l2': 2,
    'pc3__tvl1l2': 3
})
summary["P value"] = summary.pvalue
summary["t statistic"] = summary.tvalue
def CoexpressionAnalysis(client, SL_or_SDL, data_resource, input_genes,
                         adj_method, fdr_level, tissues):
    '''
    Description: Gene expression correlation is used to detect SL/SDL pairs.

    Inputs:
     client: BigQueryClient, the BigQuery client that will run the function.
     SL_or_SDL: string, Synthetic Lethal or Synthetic Dosage Lethal, valid values: 'SL', 'SDL'
     data_resource: string, the data resource the analysis will be performed on, valid values: "CCLE", "PanCancerAtlas"
     input_genes: list of strings, the genes whose SL/SDL partners will be sought
     adj_method: string, optional, p-value correction method, valid values: bonferroni, sidak, holm-sidak, holm, simes-hochberg, hommel, fdr_bh, fdr_by, fdr_tsbh, fdr_tsbky
     fdr_level: string, the scope used for p-value adjustment, valid values: "gene_level", "analysis_level"
     tissues: the tissues the analysis will be performed on.

    Output:
     A dataframe of SL/SDL pairs
    '''

    if data_resource == 'PanCancerAtlas':
        table_name = 'isb-cgc-bq.pancancer_atlas.Filtered_EBpp_AdjustPANCAN_IlluminaHiSeq_RNASeqV2_genExp'
        gene_col_name = 'Symbol'
        entrez_col_name = 'Entrez'
        exp_name = 'normalized_count'
        sample_barcode = 'SampleBarcode'
        selected_samples = RetrieveSamples(client, 'PanCancerAtlas',
                                           'correlation', tissues)
        gene_mapping = ProcessGeneAlias(client, input_genes, 'PanCancerAtlas')

    elif data_resource == 'CCLE':
        table_name = 'isb-cgc-bq.DEPMAP.CCLE_gene_expression_DepMapPublic_current'
        gene_col_name = 'Hugo_Symbol'
        exp_name = 'TPM'
        sample_barcode = 'DepMap_ID'
        entrez_col_name = 'Entrez_ID'
        selected_samples = RetrieveSamples(client, 'CCLE', 'correlation',
                                           tissues)
        gene_mapping = ProcessGeneAlias(client, input_genes, 'DepMap')

    else:
        print("The database name can be either PanCancerAtlas or CCLE")
        return ()

    min_sample_size = 20
    if len(selected_samples) < (min_sample_size + 1):
        print("Sample size needs to be greater than " + str(min_sample_size) +
              ", it is " + str(len(selected_samples)))
        return ()
    sql_correlation = """ CREATE TEMPORARY FUNCTION tscore_to_p(a FLOAT64, b FLOAT64, c FLOAT64)
     RETURNS FLOAT64
    LANGUAGE js AS
    \"\"\"
    return jStat.ttest(a,b,c); //jStat.ttest( tscore, n, sides)
    \"\"\"
    OPTIONS (
     library="gs://javascript-lib/jstat.min.js"
    );

    WITH
    table1 AS (
    SELECT
    symbol,
   (RANK() OVER (PARTITION BY symbol ORDER BY data ASC)) + (COUNT(*) OVER ( PARTITION BY symbol, CAST(data as STRING)) -  1)/2.0 AS rnkdata,
   ParticipantBarcode
	FROM (
   SELECT
   __GENE_SYMBOL__  symbol,
      AVG( __EXP_NAME__)  AS data,
      __SAMPLE_ID__ AS ParticipantBarcode
   FROM `__TABLE_NAME__`
   WHERE  __GENE_SYMBOL__   IN (__GENE_LIST__) # labels
         AND __EXP_NAME__ IS NOT NULL  AND __SAMPLE_ID__ in (__SAMPLE_LIST__)
   GROUP BY
      ParticipantBarcode, symbol
       )
    )
    ,
    table2 AS (
    SELECT
    symbol,
   (RANK() OVER (PARTITION BY symbol ORDER BY data ASC)) + (COUNT(*) OVER ( PARTITION BY symbol, CAST(data as STRING)) - 1)/2.0 AS rnkdata,
   ParticipantBarcode
    FROM (
   SELECT
      __GENE_SYMBOL__    symbol,
      AVG(__EXP_NAME__)  AS data,
      __SAMPLE_ID__ AS ParticipantBarcode
   FROM `__TABLE_NAME__`
   WHERE  __GENE_SYMBOL__ IS NOT NULL  # labels
         AND __EXP_NAME__ IS NOT NULL AND __SAMPLE_ID__ in (__SAMPLE_LIST__)
   GROUP BY
      ParticipantBarcode, symbol
       )
    )
,
summ_table AS (
SELECT
   n1.symbol as symbol1,
   n2.symbol as symbol2,
   COUNT( n1.ParticipantBarcode ) as n,
   CORR(n1.rnkdata , n2.rnkdata) as correlation

FROM
   table1 AS n1
INNER JOIN
   table2 AS n2
ON
   n1.ParticipantBarcode = n2.ParticipantBarcode
   AND n2.symbol  NOT IN (__GENE_LIST__)

GROUP BY
   symbol1, symbol2
UNION ALL
SELECT
   n1.symbol as symbol1,
   n2.symbol as symbol2,
   COUNT( n1.ParticipantBarcode ) as n,
   CORR(n1.rnkdata , n2.rnkdata) as correlation

FROM
   table1 AS n1
INNER JOIN
   table1 AS n2
ON
   n1.ParticipantBarcode = n2.ParticipantBarcode
   AND n1.symbol <  n2.symbol
GROUP BY
   symbol1, symbol2
)
SELECT *,
   tscore_to_p( ABS(correlation)*SQRT( (n-2)/((1+correlation)*(1-correlation))) ,n-2, 2) as pvalue
   #`cgc-05-0042.Auxiliary.significance_level_ttest2`(n-2, ABS(correlation)*SQRT( (n-2)/((1+correlation)*(1-correlation)))) as alpha
FROM summ_table
WHERE n > 20
#AND correlation > __COR_THRESHOLD__
GROUP BY 1,2,3,4,5
#HAVING pvalue <= __P_THRESHOLD__
ORDER BY symbol1 ASC, correlation DESC """

    input_genes = ["'" + str(x) + "'" for x in input_genes]
    input_genes_for_query = ','.join(input_genes)

    included_samples = ["'" + str(x) + "'" for x in selected_samples]
    included_samples = ','.join(included_samples)

    sql_correlation = sql_correlation.replace('__GENE_LIST__',
                                              input_genes_for_query)
    sql_correlation = sql_correlation.replace('__TABLE_NAME__', table_name)
    sql_correlation = sql_correlation.replace('__GENE_SYMBOL__', gene_col_name)
    sql_correlation = sql_correlation.replace('__EXP_NAME__', exp_name)
    sql_correlation = sql_correlation.replace('__SAMPLE_ID__', sample_barcode)
    sql_correlation = sql_correlation.replace('__SAMPLE_LIST__',
                                              included_samples)

    results = client.query(sql_correlation).result().to_dataframe()
    if results.shape[0] < 1:
        print("Coexpression inference procedure applied on " + data_resource +
              " did not find candidate " + SL_or_SDL + " pairs.")
        return (results)

    report = results[['symbol1', 'symbol2', 'n', 'correlation', 'pvalue']]
    report = report.dropna()
    report.columns = [
        'InactiveDB', 'SL_Candidate', '#Samples', 'Correlation', 'PValue'
    ]
    report['Inactive'] = report['InactiveDB'].map(gene_mapping)
    if fdr_level == "gene_level":
        inactive_genes = list(report["Inactive"].unique())
        for i in range(len(inactive_genes)):
            report.loc[report["Inactive"] == inactive_genes[i],
                       'FDR'] = multipletests(
                           report.loc[report["Inactive"] == inactive_genes[i],
                                      'PValue'],
                           method=adj_method,
                           is_sorted=False)[1]

    elif fdr_level == "analysis_level":
        FDR = multipletests(report['PValue'],
                            method=adj_method,
                            is_sorted=False)[1]
        report['FDR'] = FDR
    else:
        print("FDR level can be either gene_level or analysis_level")
        return ()

    report['Tissue'] = str(tissues)
    cols = [
        'Inactive', 'InactiveDB', 'SL_Candidate', '#Samples', 'Correlation',
        'PValue', 'FDR', 'Tissue'
    ]
    report = report[cols]
    if SL_or_SDL == "SDL":
        report.columns = [
            'Overactive', 'OveractiveDB', 'SL_Candidate', '#Samples',
            'Correlation', 'PValue', 'FDR', 'Tissue'
        ]
    return report
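# The in-query tscore_to_p call converts a Spearman correlation into a p-value
# via t = |r| * sqrt((n-2) / (1-r^2)). A rough Python counterpart of that
# arithmetic, using the standard n-2 degrees of freedom for a correlation test
# (the jStat call in the query may use a different df convention); the values
# are arbitrary.
import numpy as np
from scipy import stats

r, n = 0.42, 120
t = abs(r) * np.sqrt((n - 2) / ((1 + r) * (1 - r)))
p = 2 * stats.t.sf(t, df=n - 2)  # two-sided p-value
print(t, p)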
Exemple #51
0
                valueable_differencies_count += 1
            if min_pvalue > wilcox.pvalue:
                min_pvalue = wilcox.pvalue
                min_first = first
                min_second = second
comparison_frame = pd.DataFrame(comparison_result, columns=["Names", "Statistic", "p-value"])
comparison_frame
#%%
print "\nMost different classifiers: \"%s\" and \"%s\" with p-value: %f" % (min_first, min_second, min_pvalue)
# How many statistically significant differences at the 0.05 level did we find?
#%%
print "Statistically significant differences count: %i" % valueable_differencies_count

# Comparing the 4 classifiers with each other, we tested 6 hypotheses.
# Let's correct for multiple testing, starting with the Holm method.
# How many hypotheses can be rejected at the 0.05 significance level after this correction?
#%%
from statsmodels.sandbox.stats.multicomp import multipletests 
reject_holm, p_corrected_holm, a1_holm, a2_holm = multipletests(comparison_frame["p-value"],
                                                                alpha = 0.05,
                                                                method = 'holm')
print "Hypotheses rejected after Holm correction: %i" % len(filter(lambda whether_reject: whether_reject, reject_holm))

# How many hypotheses can be rejected at the 0.05 significance level after
# the Benjamini-Hochberg correction?
#%%
reject_fdr, p_corrected_fdr, a1_fdr, a2_fdr = multipletests(comparison_frame["p-value"],
                                                            alpha = 0.05,
                                                            method = 'fdr_bh')
print "Hypotheses rejected after Benjamini-Hochberg (FDR) correction: %i" % len(filter(lambda whether_reject: whether_reject, reject_fdr))
Exemple #52
0
beds = []
for i in bed_paths:
    beds.append(pd.read_table(i, sep="\t", names=["Region", "Start", "End", "Name", "Score", "Strand"]))

vdfs = []
for _i,i in enumerate(df_paths):
    _ = pd.read_table(i, names=["POS", "REF", "ALT", "AD", "REV", "DP", "QUAL"], skiprows = 1)
    vdfs.append(_)

# Fisher's Exact Test
#           | AD | DP  |
# Variant   |    |     |
# Threshold | 3  | 100 |
for vdf in vdfs:
    pvals = vdf.apply(lambda x: fe([[x["AD"], x["DP"]], [(freq/100)*x["DP"], x["DP"]]], "greater"), axis = 1)
    vdf["threshold_"+str(freq)+"%_pval"] = multipletests([i[1] for i in pvals], method="fdr_bh")[1]
    vdf["threshold_"+str(freq)+"%_oddsratio"] = [i[0] for i in pvals]

threshold = freq
col = "threshold_"+str(threshold)+"%"
pval_threshold = 0.05
_ = vdfs
# _ = [i[i[col+"_pval"]<=pval_threshold] for i in _]
df = _[0]
for _i, i in enumerate(_[1:]):
    df = df.merge(i, how='inner', on=['POS', 'REF', 'ALT'], suffixes = ("_0", "_"+str(_i+1)))

cols = df.columns[df.columns.str.match(r"\b"+col+"\b*_pval")]
df = df.ix[df[cols].apply(lambda x: any([i<= pval_threshold for i in x]), axis = 1)]

masked = []
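# The per-variant test above is Fisher's exact test against a frequency-derived
# pseudo-count, followed by BH adjustment ('fe' above is assumed to be
# scipy.stats.fisher_exact). A minimal standalone version of that pattern on
# made-up whole-number counts:
import numpy as np
from scipy.stats import fisher_exact
from statsmodels.sandbox.stats.multicomp import multipletests

tables = [
    [[8, 100], [3, 100]],
    [[2, 150], [5, 150]],
    [[20, 80], [4, 80]],
]
odds, pvals = zip(*(fisher_exact(t, alternative="greater") for t in tables))
qvals = multipletests(pvals, method="fdr_bh")[1]
print(list(odds), list(qvals))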
def SurvivalOfFittest(client,
                      SL_or_SDL,
                      data_source,
                      input_genes,
                      percentile_threshold,
                      cn_threshold,
                      adj_method,
                      fdr_level,
                      tissues,
                      input_mutations='None'):
    '''
    Description: Gene expression, Copy Number Alteration (CNA), and somatic mutations are used to decide whether a gene is inactive.
    SL pairs are detected from the difference in CNA between samples in which the gene is inactive and samples in which it is not.
    Inputs:
     client: BigQueryClient, the BigQuery client that will run the function.
     SL_or_SDL: string, Synthetic Lethal or Synthetic Dosage Lethal, valid values: 'SL', 'SDL'
     data_source: string, the data resource the analysis will be performed on, valid values: "CCLE", "PanCancerAtlas"
     input_genes: list of strings, the genes whose SL/SDL partners will be sought
     percentile_threshold: double, the gene expression threshold (for deciding whether a gene is inactive)
     cn_threshold: double, the copy number alteration threshold (for deciding whether a gene is inactive)
     adj_method: string, optional, p-value correction method, valid values: bonferroni, sidak, holm-sidak, holm, simes-hochberg, hommel, fdr_bh, fdr_by, fdr_tsbh, fdr_tsbky
     fdr_level: string, the scope used for p-value adjustment, valid values: "gene_level", "analysis_level"
     tissues: the tissues the analysis will be performed on.
     input_mutations: list of strings, optional, valid values: Missense_Mutation, Nonsense_Mutation, Translation_Start_Site, Frame_Shift_Ins, Splice_Site, In_Frame_Del, Frame_Shift_Del, Nonstop_Mutation, In_Frame_Ins

    Output:
        A dataframe of SL/SDL pairs
    '''
    if data_source == 'PanCancerAtlas':
        gene_exp_table = 'isb-cgc-bq.pancancer_atlas.Filtered_EBpp_AdjustPANCAN_IlluminaHiSeq_RNASeqV2_genExp'
        mutation_table = 'isb-cgc-bq.pancancer_atlas.Filtered_MC3_MAF_V5_one_per_tumor_sample'
        cn_table = 'isb-cgc-bq.pancancer_atlas.Filtered_all_CNVR_data_by_gene'

        sample_id = 'SampleBarcode'
        gene_col_name = 'Symbol'
        gene_exp = 'normalized_count'
        cn_gene_name = 'Gene_Symbol'
        mutation_gene_name = 'Hugo_Symbol'
        mutation_sample_id = 'Tumor_SampleBarcode'
        cn_gistic = 'GISTIC_Calls'
        entrez_id = 'Entrez'
        selected_samples = RetrieveSamples(client, 'PanCancerAtlas', 'sof',
                                           tissues)
        gene_mapping = ProcessGeneAlias(client, input_genes, 'PanCancerAtlas')
    elif data_source == 'CCLE':
        mutation_table = 'isb-cgc-bq.DEPMAP.CCLE_mutation_DepMapPublic_current'
        gene_exp_table = 'isb-cgc-bq.DEPMAP.CCLE_gene_expression_DepMapPublic_current'
        cn_table = 'isb-cgc-bq.DEPMAP.CCLE_gene_cn_DepMapPublic_current'
        sample_id = 'DepMap_ID'
        gene_col_name = 'Hugo_Symbol'
        gene_exp = 'TPM'
        cn_gene_name = 'Hugo_Symbol'
        mutation_gene_name = 'Hugo_Symbol'
        mutation_sample_id = 'Tumor_Sample_Barcode'
        cn_gistic = 'CNA'
        cn_threshold = np.log2(2**(cn_threshold) + 1)
        entrez_id = 'Entrez_ID'
        selected_samples = RetrieveSamples(client, 'CCLE', 'sof', tissues)
        gene_mapping = ProcessGeneAlias(client, input_genes, 'DepMap')

    else:
        print("The data source name can be either PanCancerAtlas or CCLE")
        return ()
    min_sample_size = 20
    if len(selected_samples) < (min_sample_size + 1):
        print("Sample size needs to be greater than " + str(min_sample_size) +
              ", it is " + str(len(selected_samples)))
        return ()

    sql_without_mutation = '''
    WITH
    table1 AS (
    (SELECT   symbol, Barcode FROM
    (SELECT GE.__EXP_GENE_NAME__ AS symbol, GE.__SAMPLE_ID__ AS Barcode ,
    PERCENT_RANK () over (partition by __EXP_GENE_NAME__ order by __GENE_EXPRESSION__ asc) AS Percentile
    FROM  __GENE_EXP_TABLE__ GE
    WHERE GE.__EXP_GENE_NAME__ in (__GENELIST__)  AND __SAMPLE_ID__ in (__SAMPLE_LIST__) AND GE.__GENE_EXPRESSION__ is not null
    )
    AS NGE
    WHERE NGE.Percentile  __GENE_CMP_STR__

    INTERSECT DISTINCT

    SELECT symbol ,  Barcode FROM
    (SELECT CN.__CN_GENE_NAME__ AS symbol, CN.__SAMPLE_ID__ AS Barcode,
    CN.__CN_GISTIC__ AS NORM_CN
    FROM  __CN_TABLE__ CN
    WHERE CN.__CN_GENE_NAME__ in (__GENELIST__)  AND __SAMPLE_ID__ in (__SAMPLE_LIST__)  and CN.__CN_GISTIC__ is not null
    ) AS NC
    WHERE NC.NORM_CN __CN_CMP_STR__
    )'''

    if data_source == 'CCLE':
        sql_mutation_part = '''

        UNION DISTINCT
        SELECT M.__MUTATION_GENE_NAME__  AS symbol , M.__MUTATION_SAMPLE_ID__ AS Barcode
        FROM __MUTATION_TABLE__ M
        WHERE __MUTATION_GENE_NAME__ IN (__GENELIST__) AND
        M.Variant_Classification IN (__MUTATIONLIST__) AND __MUT_SAMPLE_ID__ in (__SAMPLE_LIST__)
        )'''

    elif data_source == 'PanCancerAtlas':
        sql_mutation_part = '''
         UNION DISTINCT
        SELECT M.__MUTATION_GENE_NAME__  AS symbol , M.__MUTATION_SAMPLE_ID__ AS Barcode
        FROM __MUTATION_TABLE__ M
        WHERE __MUTATION_GENE_NAME__ IN (__GENELIST__) AND
        M.Variant_Classification IN (__MUTATIONLIST__) AND __MUT_SAMPLE_ID__ in (__SAMPLE_LIST__) AND Filter="PASS"
        )'''

    rest_of_the_query = '''
     , table2 AS (
    SELECT
        __SAMPLE_ID__ Barcode,  __CN_GENE_NAME__ symbol,
        (RANK() OVER (PARTITION BY __CN_GENE_NAME__ ORDER BY __CN_GISTIC__ ASC)) + (COUNT(*) OVER ( PARTITION BY __CN_GENE_NAME__, CAST(__CN_GISTIC__ as STRING)) - 1)/2.0  AS rnkdata
    FROM
       __CN_TABLE__
       where __CN_GENE_NAME__ IS NOT NULL  AND  __SAMPLE_ID__ in (__SAMPLE_LIST__) AND __CN_GISTIC__ is not null 
       ),
summ_table AS (
SELECT
   n1.symbol as symbol1,
   n2.symbol as symbol2,
   COUNT( n1.Barcode) as n_1,
   SUM( n2.rnkdata )  as sumx_1,
FROM
   table1 AS n1
INNER JOIN
   table2 AS n2
ON
   n1.Barcode = n2.Barcode
GROUP BY
    symbol1, symbol2 ),

statistics AS (
SELECT symbol1, symbol2, n1, n, U1,
      (n1n2/2.0 - U1)/den as zscore

FROM (
   SELECT  symbol1, symbol2, n_t as n,
       n_1 as n1,
       sumx_1 - n_1 *(n_1 + 1) / 2.0 as U1,
       n_1 * (n_t - n_1 ) as n1n2,
       SQRT( n_1 * (n_t - n_1 )*(n_t + 1) / 12.0 ) as den
   FROM  summ_table as t1
   LEFT JOIN ( SELECT symbol, COUNT( Barcode ) as n_t
            FROM table2
            GROUP BY symbol)  t2
   ON symbol2 = symbol
   WHERE n_t > 20 and n_1>5
)
WHERE den > 0
)
SELECT symbol1, symbol2, n1, n, U1,
    `cgc-05-0042.functions.jstat_normal_cdf`(zscore, 0.0, 1.0 ) as pvalue
FROM statistics
GROUP BY 1,2,3,4,5,6
#HAVING pvalue <= 0.01
ORDER BY pvalue ASC '''

    input_genes = ["'" + str(x) + "'" for x in input_genes]
    input_genes_query = ','.join(input_genes)

    included_samples = ["'" + str(x) + "'" for x in selected_samples]
    included_samples = ','.join(included_samples)

    if SL_or_SDL == 'SDL' or input_mutations is None:
        sql_sof = sql_without_mutation + ')' + ' ' + rest_of_the_query
    else:
        mutations_intermediate_representation = [
            "'" + x + "'" for x in input_mutations
        ]
        input_mutations_for_query = ','.join(
            mutations_intermediate_representation)
        sql_sof = sql_without_mutation + ' ' + sql_mutation_part + ' ' + rest_of_the_query
        sql_sof = sql_sof.replace('__MUTATION_TABLE__', mutation_table)
        sql_sof = sql_sof.replace('__MUTATION_SAMPLE_ID__', mutation_sample_id)
        sql_sof = sql_sof.replace('__MUTATIONLIST__',
                                  input_mutations_for_query)

    sql_sof = sql_sof.replace('__GENELIST__', input_genes_query)
    # sql_sof = sql_sof.replace('__CUTOFFPRC__', str(percentile_threshold/100))
    # sql_sof = sql_sof.replace('__CUTOFFSCNA__', str(cn_threshold))
    sql_sof = sql_sof.replace('__CN_TABLE__', cn_table)
    sql_sof = sql_sof.replace('__GENE_EXP_TABLE__', gene_exp_table)
    sql_sof = sql_sof.replace('__SAMPLE_ID__', sample_id)
    sql_sof = sql_sof.replace('__MUT_SAMPLE_ID__', mutation_sample_id)
    sql_sof = sql_sof.replace('__ENTREZ_ID__', entrez_id)
    sql_sof = sql_sof.replace('__GENE_EXPRESSION__', gene_exp)
    sql_sof = sql_sof.replace('__CN_GISTIC__', cn_gistic)
    sql_sof = sql_sof.replace('__EXP_GENE_NAME__', gene_col_name)
    sql_sof = sql_sof.replace('__CN_GENE_NAME__', cn_gene_name)
    sql_sof = sql_sof.replace('__MUTATION_GENE_NAME__', mutation_gene_name)
    sql_sof = sql_sof.replace('__SAMPLE_LIST__', included_samples)

    if SL_or_SDL == "SL":
        comp_str = "<" + str(cn_threshold)
        com_gene_th = "<" + str(percentile_threshold / 100)

    elif SL_or_SDL == "SDL":
        comp_str = ">" + str(cn_threshold)
        sql_sof = sql_sof.replace('__CN_CMP_STR__', comp_str)
        com_gene_th = ">" + str(percentile_threshold / 100)
        sql_sof = sql_sof.replace('__GENE_CMP_STR__', com_gene_th)

    sql_sof = sql_sof.replace('__CN_CMP_STR__', comp_str)
    sql_sof = sql_sof.replace('__GENE_CMP_STR__', com_gene_th)

    results = client.query(sql_sof).result().to_dataframe()

    if results.shape[0] < 1:
        print("SOF inference procedure applied on " + data_resource +
              " did not find candidate " + SL_or_SDL + " pairs.")
        return (results)
    report = results[['symbol1', 'symbol2', 'n1', 'n', 'U1', 'pvalue']]
    report = report.dropna()
    report.columns = [
        'InactiveDB', 'SL_Candidate', '#InactiveSamples', '#Samples', 'U1',
        'PValue'
    ]
    report['Inactive'] = report['InactiveDB'].map(gene_mapping)

    if fdr_level == "gene_level":
        inactive_genes = list(report["Inactive"].unique())
        for i in range(len(inactive_genes)):
            report.loc[report["Inactive"] == inactive_genes[i],
                       'FDR'] = multipletests(
                           report.loc[report["Inactive"] == inactive_genes[i],
                                      'PValue'],
                           method=adj_method,
                           is_sorted=False)[1]

    elif fdr_level == "analysis_level":
        FDR = multipletests(report['PValue'],
                            method=adj_method,
                            is_sorted=False)[1]
        report['FDR'] = FDR
    else:
        print("FDR level can be either gene_level or analysis_level")
        return ()
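    # Note on the two FDR levels: "gene_level" adjusts p-values separately within each
    # inactive gene (the BH family is that gene's candidate partners), whereas
    # "analysis_level" treats every pair returned by the query as a single family.
    # A vectorized sketch equivalent to the gene_level loop (same adj_method):
    #   report['FDR'] = report.groupby('Inactive')['PValue'].transform(
    #       lambda p: multipletests(p, method=adj_method)[1])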

    report['Tissue'] = str(tissues)

    cols = [
        'Inactive', 'InactiveDB', 'SL_Candidate', '#InactiveSamples',
        '#Samples', 'PValue', 'FDR', 'Tissue'
    ]
    report = report[cols]
    if SL_or_SDL == "SDL":
        report.columns = [
            'Overactive', 'OveractiveDB', 'SL_Candidate', '#Overactive',
            '#Samples', 'PValue', 'FDR', 'Tissue'
        ]
    return report
# statsmodels' multiple-testing helpers, used below as `stats.multipletests`
from statsmodels.stats import multitest as stats

# containers for the parsed module/gene-type keys and their raw p-values
keys = []
P_vals = []
uncorrected_dict = {}
BH_dict = {}
BO_dict = {}

Files = conf.inputs
for File in Files:
    with open(File) as f:
        Lines = f.readlines()
        for line in Lines:
            split_line = line.split()
            Module_ID = str(split_line[0])
            Gene_type = str(split_line[1])
            P_value = float(split_line[2])
            key = "_".join([Module_ID, Gene_type])
            keys.append(key)
            P_vals.append(P_value)
            uncorrected_dict[key] = P_value

FDR = conf.FDR
Benjamini_Pval_array = stats.multipletests(P_vals, alpha=FDR, method='fdr_bh',
                                           is_sorted=False, returnsorted=False)

Bonferroni_Pval_array = stats.multipletests(P_vals, alpha=FDR,
                                            method='bonferroni',
                                            is_sorted=False,
                                            returnsorted=False)

for i, key in enumerate(keys):
    P_value_BH = Benjamini_Pval_array[1][i]
    BH_dict[key] = P_value_BH
    P_value_BO = Bonferroni_Pval_array[1][i]
    BO_dict[key] = P_value_BO
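
# A minimal standalone sketch of the tuple returned by multipletests (assumes
# statsmodels is installed); the adjusted p-values used above are element [1]:
from statsmodels.stats.multitest import multipletests as _multipletests
_reject, _adj_p, _alphac_sidak, _alphac_bonf = _multipletests(
    [0.001, 0.02, 0.04, 0.30], alpha=0.05, method='fdr_bh')
# _reject -> boolean array: hypotheses rejected at the requested alpha after correction
# _adj_p  -> BH-adjusted p-values, in the same order as the input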

# -----------------------------------------------------
Exemple #55
0
            # Correction for multiple comparisons with Bonferroni
            pvals_GR = model_GRT.pvalues
            pvals_GR_fwer = multicomp.multipletests(pvals_GR, alpha=0.05, method='bonferroni')
           
        """

            X1 = np.array(df_drop[df_drop.label==r][['AgeAtDiagnosis', 'mean']])
            #X1 = np.array(df_drop[df_drop.label==r]["mean"])
            Y1 = np.array(df_drop[df_drop.label==r][VD])
            ## Fit and summary:
            model_SGRT = sm.OLS(Y1, X1).fit()
            print(model_SGRT.summary())

            # Correction for multiple comparisons with Bonferroni
            pvals = model_SGRT.pvalues
            pvals_fwer = multicomp.multipletests(pvals, alpha=0.05, method='bonferroni')

            """
            # PLot the significant roi without global mean RT effect
            pval_roi = pvals_GR_fwer[0]
            
            if pval_roi[1:2].astype(str) == "True":
                
                mask, roi_nii = get_roi_mask(atlas_nii, label_number)
                output= os.path.join(maskfile,"%s.png"%(label_number))
                plotting.plot_roi(roi_nii, anat_nii, output_file= output, title="plot_roi %s"%(label_number))
            
            else:
                continue
            """
            # concatenate all the results into a pd dataframe
    def run_analysis(self):
        zdf = flex_array.standard_df(self.par['zscore_file'])

        f = params.file_IO(
            '../ref_seq/pep_against_human_viruses.tblastn.species.noseg.WS3.max_target_seqs100000.180625 (1).m8',
            '\t')
        orig_aln = f.flat_file_to_df([0, 1, 15])
        f = params.file_IO(
            '../ref_seq/new_pep_against_human_viruses.tblastn.species.noseg.WS3.max_target_seqs100000.180625 (1).m8',
            '\t')
        new_aln = f.flat_file_to_df([0, 1, 15])

        orig_aln.index = [i.split('_')[1] for i in orig_aln.index]
        aln_df = pd.concat([orig_aln, new_aln])
        aln_df.fillna(0, inplace=True)

        binary_b = aln_df[aln_df >= 80].fillna(0)
        binary_b = pd.DataFrame(index=binary_b.index,
                                columns=binary_b.columns,
                                data=binary_b.values,
                                dtype=bool)
        binary_b = flex_array.array(binary_b).filter_aln(
            ref_seq=self.par['dir_ref_seq'])
        binary_b = binary_b.reindex(zdf.index).fillna(0)
        aln_df = aln_df.loc[:, binary_b.columns]

        #binary_b = flex_array.sparse_aln_df(self.par['file_aln'])
        #binary_b = flex_array.array(binary_b).filter_aln(ref_seq=self.par['dir_ref_seq'])

        sum_df = pd.DataFrame(0, index=list(binary_b), columns=list(zdf))
        glob_unique = pd.DataFrame(0, index=list(binary_b), columns=list(zdf))
        pep_df = pd.DataFrame(np.nan, index=list(binary_b), columns=list(zdf))
        p_df = pd.DataFrame(index=list(binary_b), columns=list(zdf))
        n_df = pd.DataFrame(index=list(binary_b), columns=list(zdf))
        padjust_df = pd.DataFrame(index=list(binary_b), columns=list(zdf))
        orig_p = pd.DataFrame(index=list(binary_b), columns=list(zdf))
        filter_df = pd.DataFrame(index=list(binary_b), columns=list(zdf))

        hits_series = pd.Series(index=list(zdf))
        nonoverlap_hits_series = pd.Series(index=list(zdf))
        samples = list(zdf.columns)

        nonoverlap_dict = {}

        parallel_dict1 = {}
        parallel_dict2 = {}

        for sample_name, column in zdf.items():
            hits = column[column >= self.par['Z_threshold']].copy()
            if self.par['use_filter']:
                nonoverlap_hits = flex_array.gen_ind_hits(
                    hits, self.dependent_pep, self.par['graph_dir'],
                    samples.index(sample_name))
                input_num = len(nonoverlap_hits)
            elif not self.par['use_filter']:
                nonoverlap_hits = hits.copy()
                input_num = len(
                    flex_array.gen_ind_hits(hits, self.dependent_pep,
                                            self.par['graph_dir'],
                                            samples.index(sample_name)))
            hits_series[sample_name] = len(hits)
            nonoverlap_hits_series[sample_name] = input_num
            nonoverlap_dict[sample_name] = list(nonoverlap_hits.index)
            print("%s:\thits=%s, nonoverlapped=%s" %
                  (sample_name, len(hits), input_num))

            if input_num > 0:
                zb_df = aln_df.loc[nonoverlap_hits.index]
                parallel_dict1[sample_name] = zb_df
                parallel_dict2[sample_name] = nonoverlap_hits
                '''
				collapse_zb, glob_array, sim_tag, p_series, orig_pseries, filter_series = flex_array.array(zb_df).binom_reassign(
						nonoverlap_hits, self.dependent_pep, self.par['dir_ref_seq'], self.par['p_threshold'], self.par['x_threshold'], self.par['organism'])
				sum_df[sample_name]=collapse_zb.apply(sum, axis=0) + sim_tag
				glob_unique[sample_name] = glob_array.apply(sum, axis=0) + sim_tag
				pep_df[sample_name]=collapse_zb.apply(lambda x: flex_array.array(x).names_string(0.001),axis=0)
				p_df[sample_name]=p_series
				orig_p[sample_name]=orig_pseries
				filter_df[sample_name]=filter_series
				'''
        #parallel_dict1 = pd.Series(parallel_dict1)
        #parallel_dict2 = pd.Series(parallel_dict2)
        #parallel_dict2 = parallel_dict.loc[parallel_dict1.index]
        list1 = list(parallel_dict1.keys())  #sample names
        list2 = list(parallel_dict1.values())  #zb_df
        list3 = [parallel_dict2[i] for i in list1]  #hits series
        zipped = zip(list2, list3, list1)

        results = Parallel(n_jobs=-1)(
            delayed(flex_array.binom_reassign)
            (zb_df, nonoverlap_hits, sample_name, self.dependent_pep,
             self.par['dir_ref_seq'], self.par['p_threshold'],
             self.par['x_threshold'], self.par['organism'])
            for zb_df, nonoverlap_hits, sample_name in zipped)

        r1, r2, r3, r4, r5, r6, r7, r8 = zip(*results)
        for i in range(len(r7)):
            sample_name = r7[i]
            collapse_zb = r1[i]
            glob_array = r2[i]
            sim_tag = r3[i]
            p_series = r4[i]
            orig_pseries = r5[i]
            filter_series = r6[i]
            n_series = r8[i]
            sum_df[sample_name] = collapse_zb.apply(sum, axis=0) + sim_tag
            glob_unique[sample_name] = glob_array.apply(sum, axis=0) + sim_tag
            pep_df[sample_name] = collapse_zb.apply(
                lambda x: flex_array.array(x).names_string(0.001), axis=0)
            n_df[sample_name] = n_series
            p_df[sample_name] = p_series
            orig_p[sample_name] = orig_pseries
            filter_df[sample_name] = filter_series

        file_head = self.par['sub_dir'] + self.par['zscore_file'].split(
            '/')[-1].split('.')[0]  #Removes file path and extension
        if self.par['organism']:
            file_head += '_organism_'
        else:
            file_head += '_species_'

        #Write log file
        params.file_IO(self.par['sub_dir'] + 'parameters.log',
                       sep='=').dict_to_file(self.par)

        #Write analysis files
        sum_df.to_csv(file_head + 'total-counts.txt',
                      sep='\t',
                      header=True,
                      index_label='Specie')
        glob_unique.to_csv(file_head + 'unique-counts.txt',
                           sep='\t',
                           header=True,
                           index_label='Specie')
        pep_df.to_csv(file_head + 'peptides.txt',
                      sep='\t',
                      header=True,
                      index_label='Specie')
        p_df.to_csv(file_head + 'p-values.txt',
                    sep='\t',
                    header=True,
                    index_label='Specie')
        orig_p.to_csv(file_head + 'orig-p-values.txt',
                      sep='\t',
                      header=True,
                      index_label='Specie')
        filter_df.to_csv(file_head + 'virus-filter.txt',
                         sep='\t',
                         header=True,
                         index_label='Specie')
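        # multipletests does not accept NaN inputs, and the size of the BH family is the
        # number of p-values passed to it, so each column below is adjusted over its
        # finite entries only; positions that were NaN remain NaN in padjust_df.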

        for i in p_df.columns:
            pvals = np.array(p_df[i].values)
            if not pd.isnull(pvals).all():
                mask = [j for j in np.where(np.isfinite(pvals))[0]]
                pval_corrected = np.empty(pvals.shape)
                pval_corrected.fill(np.nan)
                pval_corrected[mask] = multipletests(pvals[mask],
                                                     method='fdr_bh')[1]
                padjust_df[i] = pval_corrected
        padjust_df.to_csv(file_head + 'p-adjusted.txt',
                          sep='\t',
                          header=True,
                          index_label='Specie')

        #Write independent peptides file
        f = open(self.par['sub_dir'] + 'independent_peptides.txt', 'w')
        for i in samples:
            f.write(i)
            for j in nonoverlap_dict[i]:
                f.write('\t' + str(j))
            f.write('\n')
        f.close()

        #Write summary file
        f = open(file_head + 'results_summary.txt', 'w')
        f.write(
            "Sample name\tVirus\tBH p-value\tRaw p-value\tOrig p-value\tAssigned counts\tFiltered Assigned Counts\t"
        )
        f.write(
            "Assigned peptides\tTotal significant peptides\tRanking N\tTotal sample hits\tTotal filtered sample hits\n"
        )
        for i in samples:
            BH = padjust_df[i]
            BH = BH[BH < self.par['bh_threshold']]
            p_value = p_df[i]
            n_value = n_df[i]
            n_value = n_value[BH.index]
            p_value = p_value[BH.index]
            filter_value = filter_df[i]
            filter_value = filter_value[BH.index]
            orig_pvalue = orig_p[i]
            orig_pvalue = orig_pvalue[BH.index]
            counts = sum_df[i]
            counts = counts[BH.index]
            peptides = pep_df[i]
            peptides = peptides[BH.index]

            for j in BH.index:
                if filter_value[j] > self.par['x_threshold']:
                    f.write(i + '\t')
                    f.write(j + '\t' + str(BH[j]) + '\t')
                    f.write(
                        str(p_value[j]) + '\t' + str(orig_pvalue[j]) + '\t')
                    f.write(
                        str(counts[j]) + '\t' + str(filter_value[j]) + '\t' +
                        str(peptides[j]) + '\t')
                    #write number of peptides
                    pep_set = set()
                    for k in BH.index:
                        pep_list = peptides[k].split(';')
                        pep_set = pep_set.union(set(pep_list))
                    f.write(str(len(pep_set)) + '\t')
                    f.write(str(n_value[j]) + '\t')
                    f.write(
                        str(hits_series[i]) + '\t' +
                        str(nonoverlap_hits_series[i]) + '\n')
        f.close()
        print("End of run.")
        return None
def alpha_group_significance(output_dir: str, alpha_diversity: pd.Series,
                             metadata: qiime2.Metadata) -> None:
    # Filter metadata to only include IDs present in the alpha diversity data.
    # Also ensures every alpha diversity ID is present in the metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)

    # Metadata column filtering could be done in one pass, but this visualizer
    # displays separate warnings for non-categorical columns, and categorical
    # columns that didn't satisfy the requirements of the statistics being
    # computed.
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='categorical')
    non_categorical_columns = pre_filtered_cols - set(metadata.columns)

    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(drop_all_unique=True,
                                       drop_zero_variance=True,
                                       drop_all_missing=True)
    filtered_columns = pre_filtered_cols - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata does not contain any columns that satisfy this "
            "visualizer's requirements. There must be at least one metadata "
            "column that contains categorical data, isn't empty, doesn't "
            "consist of unique values, and doesn't consist of exactly one "
            "value.")

    metric_name = alpha_diversity.name

    # save out metadata for download in viz
    alpha_diversity.index.name = 'id'
    alpha = qiime2.Metadata(alpha_diversity.to_frame())
    md = metadata.merge(alpha)
    md.save(os.path.join(output_dir, 'metadata.tsv'))

    filenames = []
    filtered_group_comparisons = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column)
        metadata_column = metadata_column.drop_missing_values()

        initial_data_length = alpha_diversity.shape[0]
        data = pd.concat(
            [alpha_diversity, metadata_column.to_series()],
            axis=1,
            join='inner')
        filtered_data_length = data.shape[0]

        names = []
        groups = []
        for name, group in data.groupby(metadata_column.name):
            names.append('%s (n=%d)' % (name, len(group)))
            groups.append(list(group[metric_name]))

        escaped_column = quote(column)
        escaped_column = escaped_column.replace('/', '%2F')
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        # perform Kruskal-Wallis across all groups
        kw_H_all, kw_p_all = scipy.stats.mstats.kruskalwallis(*groups)

        # perform pairwise Kruskal-Wallis across all pairs of groups and
        # correct for multiple comparisons
        kw_H_pairwise = []
        for i in range(len(names)):
            for j in range(i):
                try:
                    H, p = scipy.stats.mstats.kruskalwallis(
                        groups[i], groups[j])
                    kw_H_pairwise.append([names[j], names[i], H, p])
                except ValueError:
                    filtered_group_comparisons.append([
                        '%s:%s' % (column, names[i]),
                        '%s:%s' % (column, names[j])
                    ])
        kw_H_pairwise = pd.DataFrame(
            kw_H_pairwise, columns=['Group 1', 'Group 2', 'H', 'p-value'])
        kw_H_pairwise.set_index(['Group 1', 'Group 2'], inplace=True)
        kw_H_pairwise['q-value'] = multipletests(kw_H_pairwise['p-value'],
                                                 method='fdr_bh')[1]
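        # The BH family here is the set of pairwise comparisons within this metadata
        # column, so q-values are comparable within a column but not across columns.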
        kw_H_pairwise.sort_index(inplace=True)
        pairwise_fn = 'kruskal-wallis-pairwise-%s.csv' % escaped_column
        pairwise_path = os.path.join(output_dir, pairwise_fn)
        kw_H_pairwise.to_csv(pairwise_path)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            series = pd.Series(groups, index=names)

            fh.write("load_data('%s'," % column)
            series.to_json(fh, orient='split')
            fh.write(",")
            json.dump(
                {
                    'initial': initial_data_length,
                    'filtered': filtered_data_length
                }, fh)
            fh.write(",")
            json.dump({'H': kw_H_all, 'p': kw_p_all}, fh)
            fh.write(",'")
            table = q2templates.df_to_html(kw_H_pairwise)
            fh.write(table.replace('\n', '').replace("'", "\\'"))
            fh.write("','%s', '%s');" % (quote(pairwise_fn), metric_name))

    index = os.path.join(TEMPLATES, 'alpha_group_significance_assets',
                         'index.html')
    q2templates.render(
        index,
        output_dir,
        context={
            'columns': [quote(fn) for fn in filenames],
            'non_categorical_columns':
            ', '.join(sorted(non_categorical_columns)),
            'filtered_columns':
            ', '.join(sorted(filtered_columns)),
            'filtered_group_comparisons':
            '; '.join([' vs '.join(e) for e in filtered_group_comparisons])
        })

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_group_significance_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
Exemple #58
0
def fdr(x):
    return multipletests(x, method='fdr_bh',
                         alpha=0.05 / pvals.shape[1])[1]
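
# A minimal standalone usage sketch for the helper above (toy data; assumes pandas,
# numpy and statsmodels are available, and that `pvals` is a DataFrame of raw p-values
# with one column per contrast, which is why alpha is divided by the column count):
from statsmodels.stats.multitest import multipletests
import numpy as np
import pandas as pd

pvals = pd.DataFrame(np.random.uniform(size=(50, 3)), columns=['c1', 'c2', 'c3'])
adjusted = pvals.apply(fdr, axis=0)  # BH adjustment within each column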
Exemple #59
0
# print dof
# print expected
import statsmodels.stats.multicomp
# print frame['a']
# print pd.crosstab(frame['a'],frame['b'])

# crossFD = pd.crosstab(frame['a'],frame['b'])

# print chi2_contingency([crossFD['a'],crossFD['b']],False)

# print chi2_contingency([frame['a'],frame['b']],True)
from scipy.stats import chisquare
# chisquare_value, race_pvalue = chisquare(frame['a'], frame['b'])
# print chisquare_value, race_pvalue

# print(chisquare(f_obs=frame['a'], f_exp=frame['b']))[1]
import scipy
from scipy.stats import chisquare

# print chisquare(frame['a'], f_exp=frame['b'])
#
#
# print chisquare(frame['a'], f_exp=frame['b'], ddof=1)
#
# print chisquare(frame['a'])

# print scipy.stats.chi2_contingency([frame['a'],frame['b'] ])
from statsmodels.sandbox.stats.multicomp import multipletests
print multipletests(frame["treat1"])

# print multipletests(frame["block1"])
def GLM(file, score, stat, ind_var, Level, betas=1):

    # Create pandas dataframe
    df_final = pd.DataFrame(columns=[
        'Score', 'stat', 'beta', 'tvalue', 'pvalue', 'pval_bonferroni',
        'signi_bonferonni', 'Rsquare', 'std'
    ])
    db = pd.read_csv(file)

    ## Standarized scores
    # scaler = StandardScaler()
    # for var in ['age', 'age_at_chirurgie']:
    #     db[var] = scaler.fit_transform(db[var])
    # Get rid of rows with null values for given columns
    db = db[db[score].notnull()]

    # Select Variables
    Y = np.array(db[score])
    X = np.array(db[ind_var])

    # Cross validation GLM LOOCV
    """tras = train accuracy test; teas=test accuray set"""
    kf = KFold(Y.shape[0], n_folds=Y.shape[0])
    predictions = []
    rsquares = []
    tras = []
    confus = []

    cm_shape_max = int(np.max(db[score]) + 1)

    for train_index, test_index in kf.split(X):
        olsmodel = sm.OLS(Y[train_index], X[train_index])
        results = olsmodel.fit()
        pred = np.dot(X[train_index], results.params)
        pred = np.round(pred)
        pred[pred < 0] = 0
        # No kids had more than 5 in P2
        # pred[pred > 5] = 5
        ta = np.sum(Y[train_index] == pred) / float(len(Y[train_index]))
        tras.append(ta)
        cm = confusion_matrix(Y[train_index], pred)
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        cm[np.isnan(cm)] = 0
        if cm.shape[0] == cm_shape_max:
            confus.append(cm)

        rsquares.append(results.rsquared)
        prediction = np.dot(X[test_index], results.params)
        predictions.append(prediction)
    predictions = np.ravel(predictions)

    confus = np.mean(confus, axis=0)
    plot_confusion_matrix(confus,
                          title="Mean confusion matrix_" + stat + "_for_" +
                          score)
    plt.savefig(
        os.path.join(stat, score,
                     'Mean_confusion_matrix_' + stat + "_" + score + ".png"))
    plt.close()

    predictions = np.round(predictions)
    predictions[predictions < 0] = 0
    #predictions[predictions>5]=5

    cvrsq = 1 - (np.sum((Y - predictions)**2) / np.sum((Y - np.mean(Y))**2))
    # print(stat)
    # print(score)
    # print(cvrsq )
    # rsquares = np.ravel(rsquares)
    # print(rsquares)
    # plt.scatter(predictions, Y)
    # plt.plot([min(Y), max(Y)], [min(Y), max(Y)])
    # plt.xlabel( " time of day prediction for"+" "+ stat)
    # plt.ylabel("time of day score for"+ " " + stat)
    # plt.title("Cross validation Rsquare"+ str(cvrsq))
    # plt.savefig(stat+ ".png")
    # plt.close()

    #Compute confusion matrix

    cm = confusion_matrix(Y, predictions)
    np.set_printoptions(precision=2)
    print('Confusion matrix, without normalization')
    print(cm)
    teas = np.sum(Y == predictions) / float(len(Y))
    somme = []
    diagonal = np.diagonal(cm)

    for i in range(len(diagonal)):
        somme.append((diagonal[i] / np.sum(cm[:, i])) * 100)
    category = np.array(somme)

    NanValue = np.isnan(category)
    category[NanValue] = 0
    # # plt.figure()
    # # plot_confusion_matrix(cm, title='confusion matrix_'+ stat + "_"+ score)
    # # plt.savefig(os.path.join(stat, score,"confusion_matrix_" + stat +"_"+ score + ".png"))
    # # plt.close()

    # # Normalized confusion matrix
    # cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    # print('Normalized confusion matrix')
    # print(cm_normalized)
    # plt.figure()
    # plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix_' + stat + "_"+ score)
    # plt.savefig(os.path.join(stat, score, 'Normalized_confusion_matrix' + stat +"_"+ score + "_.png"))
    # plt.close()

    # Fit the GLM on the full data and correct its p-values for multiple comparisons.
    # Note that the correction method here is Benjamini-Hochberg ('fdr_bh'), although
    # the output columns below are labelled "bonferroni".
    model = sm.OLS(Y, X).fit()
    pvals = model.pvalues

    pvals_fwer = multicomp.multipletests(pvals, alpha=0.05, method='fdr_bh')
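    # multipletests returns (reject, pvals_corrected, alphacSidak, alphacBonf), so
    # pvals_fwer[1] holds the adjusted p-values and pvals_fwer[0] the boolean reject
    # decisions at alpha = 0.05; both are written into df_final below.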

    #Save it into csv file
    df_final.loc[len(df_final)] = [
        score, stat, model.params, model.tvalues, model.pvalues, pvals_fwer[1],
        pvals_fwer[0], model.rsquared, model.bse
    ]
    df_final.to_csv(
        os.path.join(stat, score, score + "_" + stat + "_" + Level + ".csv"))

    #check quickly if there is significant data
    for idx, pval in enumerate(model.pvalues):
        if pval < 0.05:
            print(score + " " + stat + " " + Level + " " + ind_var[idx])
            print(pval)

    betas_component = model.params[0:betas]

    ### PLOT the T SCORES
    # Select the variable
    y = model.tvalues
    x = np.array(range(len(ind_var)))

    # plot figure
    fig = plt.figure()
    ax = fig.add_subplot(111)
    width = 0.35

    ## the bars
    rec = ax.bar(x, y, width, color='green')
    plt.subplots_adjust(bottom=0.45)
    plt.xticks(x, ind_var, rotation='vertical')
    plt.ylabel(score + "_" + stat)
    plt.xlabel("Rsquare %s" % (model.rsquared))

    rects = rec.patches
    # Plot the pvalues
    labels = ["p = %f" % i for i in model.pvalues]
    for rect, label in zip(rects, labels):
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width() / 2,
                height + 2,
                label,
                ha='center',
                va='bottom',
                weight='light',
                size='xx-small')
    plt.savefig(
        os.path.join(stat, score,
                     score + "_" + stat + "_" + brain_type + "_" + ".png"))
    plt.close()

    return df_final, db, betas_component, pvals, cvrsq, tras, category, teas