def diffexpressed(self, _subset, _factor, qval_limit, verbose=True):
        """Returns an array of probes that are differentially expressed according
        to the following method:

        1)  Perform an independent t-test on the probe values for the specified
            subset against the probe values for the non-subset samples.
        2)  To correct for multiple testing errors, calculate the
            Benjamini-Hochberg FDR q-value for each p-value.
        3)  Filter probes where the q-value is above the cutoff.

        Arguments:
            _subset:    the subset to test for expressed genes
            _factor:    the factor the subset belongs to
            qval_limit: the FDR q-value representing the upper limit for results
            verbose:    print a summary line when True
        """
        if not self.filtered():
            print("Warning: Finding differentially expressed genes on an unfiltered matrix may fail. Run dataset.filter().")

        matrix = self.matrix
        probes = self.probes
        samples = self.factors[_factor][_subset]

        inA = numpy.array([x in samples for x in self.header[2:]])
        A = numpy.transpose(matrix[:, inA])
        B = numpy.transpose(matrix[:, numpy.invert(inA)])

        t, pvals = stats.ttest_ind(A, B)
        rejected, qvals = multitest.fdrcorrection(pvals, alpha=qval_limit)

        # probe values are [probe_name, entrez_id] form (hence x[0])
        diffexp = [x[0] for i, x in enumerate(probes) if qvals[i] < qval_limit]
        if verbose:
            print("%d samples, %d differentially expressed genes in %s: %s" % (len([x for x in inA if x]), len(diffexp), _factor, _subset))
        return diffexp
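
For orientation, the three documented steps can be reproduced end to end with plain NumPy, SciPy, and statsmodels. The following is a minimal, self-contained sketch on random toy data (the matrix shape, group mask, and cutoff are invented), not the class method above:

import numpy as np
from scipy import stats
from statsmodels.stats.multitest import fdrcorrection

rng = np.random.default_rng(0)
matrix = rng.normal(size=(1000, 20))             # 1000 probes x 20 samples (toy data)
in_group = np.array([True] * 10 + [False] * 10)  # hypothetical subset mask

# 1) per-probe independent t-test, subset vs. the rest
t, pvals = stats.ttest_ind(matrix[:, in_group], matrix[:, ~in_group], axis=1)

# 2) Benjamini-Hochberg FDR q-values
rejected, qvals = fdrcorrection(pvals, alpha=0.05)

# 3) keep probes below the q-value cutoff
diffexp = np.flatnonzero(qvals < 0.05)
print("%d probes pass the cutoff" % len(diffexp))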
def compute_regression(input_cancer_type):

    if input_cancer_type == "CCRCC":
        cancer = cptac.Ccrcc()
    elif input_cancer_type == "Endometrial":
        cancer = cptac.Endometrial()
    elif input_cancer_type == "LUAD":
        cancer = cptac.Luad()
    elif input_cancer_type == "HNSCC":
        cancer = cptac.Hnscc()
    elif input_cancer_type == "LSCC":
        cancer = cptac.Lscc()
    elif input_cancer_type == "PDAC":
        cancer = cptac.Pdac()

    df = dc.get_prot_trans_df(cancer)
    results = df.groupby('Gene').apply(regression)
    reg_df = pd.DataFrame(list(results))
    reg_df.index = results.index
    reg_df.reset_index(inplace=True)
    reg_df = reg_df.dropna()
    reg_df['interaction_FDR'] = ssm.fdrcorrection(
        reg_df['interaction_pval'])[1]
    reg_df['condition_FDR'] = ssm.fdrcorrection(reg_df['condition_pval'])[1]
    reg_df['intercept_FDR'] = ssm.fdrcorrection(reg_df['intercept_pval'])[1]
    reg_df['Cancer'] = [input_cancer_type] * len(reg_df)

    file_name = input_cancer_type + '_regressions.csv'
    reg_df.to_csv(file_name, index=False)
Example #3
def test_multi_pvalcorrection():
    #test against R package multtest mt.rawp2adjp
    #because of sort this doesn't check correct sequence - TODO: rewrite DONE
    rmethods = {
        'rawp': (0, 'pval'),
        'Bonferroni': (1, 'b'),
        'Holm': (2, 'h'),
        'Hochberg': (3, 'sh'),
        'SidakSS': (4, 's'),
        'SidakSD': (5, 'hs'),
        'BH': (6, 'fdr_i'),
        'BY': (7, 'fdr_n')
    }

    for k, v in rmethods.items():
        if v[1] in ['b', 's', 'sh', 'hs', 'h', 'fdr_i', 'fdr_n']:
            #pvalscorr = np.sort(multipletests(pval0, alpha=0.1, method=v[1])[1])
            r_sortindex = [6, 8, 9, 7, 5, 1, 2, 4, 0, 3]
            pvalscorr = multipletests(pval0, alpha=0.1,
                                      method=v[1])[1][r_sortindex]
            assert_almost_equal(pvalscorr, res_multtest[:, v[0]], 15)

    pvalscorr = np.sort(fdrcorrection(pval0, method='n')[1])
    assert_almost_equal(pvalscorr, res_multtest[:, 7], 15)
    pvalscorr = np.sort(fdrcorrection(pval0, method='i')[1])
    assert_almost_equal(pvalscorr, res_multtest[:, 6], 15)
Example #4
    def test_multi_pvalcorrection(self):
        # test against R package multtest mt.rawp2adjp

        res_multtest = self.res2
        pval0 = res_multtest[:, 0]

        pvalscorr = np.sort(fdrcorrection(pval0, method='n')[1])
        assert_almost_equal(pvalscorr, res_multtest[:, 7], 15)
        pvalscorr = np.sort(fdrcorrection(pval0, method='i')[1])
        assert_almost_equal(pvalscorr, res_multtest[:, 6], 15)
Example #6
def make_bed(qc_data, positive_dir, negative_dir, bed):
    bed = pd.read_table(bed)
    target_loc = pd.read_table(qc_data)
    target_loc = target_loc.loc[target_loc["Average_Total_Reads"] >= 30]
    target_loc = target_loc.loc[target_loc["Average_Number_Peaks"] >= 1.5]
    positive_samples = glob("{0}/*_MSIscore.xls".format(positive_dir))
    negative_samples = glob("{0}/*_MSIscore.xls".format(negative_dir))

    positive_s = []
    for s in positive_samples:
        data = pd.read_table(s)
        data = data.loc[data["MSID"].isin(target_loc["MSID"].tolist())]
        data = data[["MSID", "Normalized_Number_of_Peaks"]]
        data.columns = ["MSID", s.split("/")[-1].split("_")[0] + "_positive"]
        target_loc = pd.merge(target_loc, data, on="MSID", how="inner")
        positive_s.append(s.split("/")[-1].split("_")[0] + "_positive")

    negative_s = []
    for s in negative_samples:
        data = pd.read_table(s)
        data = data.loc[data["MSID"].isin(target_loc["MSID"].tolist())]
        data = data[["MSID", "Normalized_Number_of_Peaks"]]
        data.columns = ["MSID", s.split("/")[-1].split("_")[0] + "_negative"]
        target_loc = pd.merge(target_loc, data, on="MSID", how="inner")
        negative_s.append(s.split("/")[-1].split("_")[0] + "_negative")


    target_loc["pval"] = [ranksums(i, ii).pvalue for i, ii in zip(target_loc[positive_s].as_matrix(), target_loc[negative_s].as_matrix())]
    fdr = target_loc["pval"]
    reject, pvals_corrected = mul.fdrcorrection(fdr)
    target_loc['FDR_bh'] = pvals_corrected
    target_loc = target_loc.loc[target_loc["pval"] <= 0.01]
    target_loc.to_csv("peaks.txt", sep="\t", index=False)
    bed = bed.loc[bed["MSID"].isin(target_loc["MSID"].tolist())]
    bed.to_csv("bed.txt", sep="\t", index=False)
def do_FDR_correction(df):
    """
    Do FDR correction and add results to dataframe
    # code from gonenrich module of goenrich package by Jan Rudolph (jdrudolph)
    # https://github.com/jdrudolph/goenrich/blob/master/goenrich/enrich.py
    :param df <pd.DataFrame>: GO expression data
    :return <pd.DataFrame>: GO expression data w/ FDR results
    """

    _p = np.array(df['pval'])
    # create array of len corresponding to p
    padj = _p.copy()
    rej = _p.copy()
    # list of bools not nan
    mask = ~np.isnan(_p)
    # remove false entries
    p = _p[mask]
    _rej, _padj = fdrcorrection(p)
    # only change values not nan values
    rej[mask] = _rej
    padj[mask] = _padj

    df['padj'] = padj
    df['rejected'] = rej

    return df
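
A quick usage sketch of the NaN-masking pattern: only finite p-values reach fdrcorrection, and NaN rows pass through unchanged (values invented; do_FDR_correction as defined above):

import numpy as np
import pandas as pd

df = pd.DataFrame({'pval': [0.001, 0.04, np.nan, 0.2, 0.8]})
df = do_FDR_correction(df)
print(df)   # the NaN row keeps NaN in both 'padj' and 'rejected'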
Example #9
def FDR(p_values, fdr, total=None):
    """
    Runs false detection correction for a table of statistics

    Parameters
    ----------
    p_values : ~pandas.DataFrame
        DataFrame with a 'pvalue' column
    fdr : float
        False detection rate
    total : int
        Total number of tests (for multi-enrichment)

    Returns
    -------
    ~pandas.DataFrame
        Table containing entries that passed multiple hypothesis correction
    """

    if total is not None:
        pvals = p_values.pvalue.values.tolist() + [1] * (total - len(p_values))
    else:
        pvals = p_values.pvalue.values

    keep, qvals = fdrcorrection(pvals, alpha=fdr)

    result = p_values.copy()
    result["qvalue"] = qvals[:len(p_values)]
    result = result[keep[:len(p_values)]]
    return result.sort_values("qvalue")
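
A hedged usage sketch of the `total` padding trick: unreported tests enter the BH correction as p = 1, which makes the q-values more conservative (numbers invented; FDR as defined above):

import pandas as pd

table = pd.DataFrame({'pvalue': [0.0005, 0.01, 0.03, 0.2]})
passed = FDR(table, fdr=0.05, total=10)   # pretend 10 tests were run in total
print(passed)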
Example #10
    def get_list_enrichment(gene_list: pd.Series, alpha: float = 0.05, hide_rejected: bool = False) -> pd.DataFrame:
        print("{} genes in gene list {} are not part of the backgroud".format(
            gene_list[~gene_list.isin(annotated.index)].shape[0], gene_list.name),
            file=sys.stderr)

        list_cluster_dedup = annotated[annotated.index.isin(gene_list)].drop_duplicates('match_id')
        list_cluster_size = list_cluster_dedup.groupby('#pattern name').size()

        def cluster_fisher(row):
            return fisher_exact(
                [[row[0], row[1] - row[0]],
                 [list_cluster_dedup.shape[0] - row[0],
                  ann_dedup.shape[0] - list_cluster_dedup.shape[0] - row[1] + row[0]]],
                alternative='greater')[1]

        p_values = pd.concat([list_cluster_size, cluster_size],
                             axis=1).fillna(0).apply(cluster_fisher, axis=1).sort_values()
        reject, adj_p = fdrcorrection(p_values, alpha=alpha, is_sorted=True)

        if hide_rejected:
            p_values = p_values[reject]
            adj_p = adj_p[reject]

        adj_p = pd.Series(adj_p, index=p_values.index)
        return pd.concat([p_values, adj_p], axis=1).rename(columns={0: 'p', 1: 'adj_p'})
Example #11
def pairwise_comp(data, cty_prop, prop_list, params, sig_level=0.05):
    """
    Pairwise comparison of parameters between cell-types 
    """

    diff_param_list = []
    p_val_list = []

    for param in params:
        for comb in combinations(prop_list, 2):
            cty_x, cty_y = comb
            paramx = data.loc[data[cty_prop] == cty_x, param].values
            paramy = data.loc[data[cty_prop] == cty_y, param].values
            _, p_val_x = mannwhitneyu(paramx, paramy, alternative='less')
            _, p_val_y = mannwhitneyu(paramy, paramx, alternative='less')
            comp_type = '%s<%s' % (
                cty_x, cty_y) if p_val_x < p_val_y else '%s<%s' % (cty_y, cty_x)
            p_val = min(p_val_x, p_val_y)
            sig_dict = {'Comp_type': comp_type,
                        'param': param}
            diff_param_list.append(sig_dict)
            p_val_list.append(p_val)

    # FDR correction for multiple comparison
    _, p_val_corrected = fdrcorrection(p_val_list)

    diff_param_df = pd.DataFrame(diff_param_list)
    diff_param_df['p_val'] = p_val_corrected
    diff_param_df['sig_level'] = diff_param_df['p_val'].apply(
        lambda x: man_utils.pval_to_sig(x))

    return diff_param_df
Example #12
def dist2atlas_reg(bfp_path, ref_atlas, sub_files, reg_var, len_time=235):
    """ Perform regression stats based on square distance to atlas """
    print('dist2atlas_reg assumes that the data is normalized')

    num_vert = ref_atlas.shape[1]
    num_sub = len(sub_files)

    # Take absolute value of difference from the mean
    # for the IQ measure
    reg_var = sp.absolute(reg_var - sp.mean(reg_var))

    diff = sp.zeros((num_vert, num_sub))

    # Compute distance to atlas
    for ind in tqdm(range(num_sub)):
        sub_data = spio.loadmat(sub_files[ind])['dtseries'].T
        sub_data, _, _ = normalizeData(sub_data[:len_time, :])
        Y2, _ = brainSync(X=ref_atlas, Y=sub_data)
        diff[:, ind] = sp.sum((Y2 - ref_atlas)**2, axis=0)

    corr_pval = sp.zeros(num_vert)
    for vrt in tqdm(range(num_vert)):
        _, corr_pval[vrt] = sp.stats.pearsonr(diff[vrt, :], reg_var)

    corr_pval[sp.isnan(corr_pval)] = .5

    lab = spio.loadmat(bfp_path + '/supp_data/USCBrain_grayord_labels.mat')
    labs = lab['labels'].squeeze()

    corr_pval_fdr = sp.zeros(num_vert)
    _, pv = fdrcorrection(corr_pval[labs > 0])
    corr_pval_fdr[labs > 0] = pv

    return corr_pval, corr_pval_fdr
def multitest_correction(dataset, ontology, annotation_files):
    annotation_years = (json.load(open(f)) for f in annotation_files)
    factor = 'disease state'
    db = get_connection(100)
    for annotations in annotation_years:
        year = annotations['meta']['year']
        # If we didn't shuffle the annotations, the shuffle level is 0
        shuffled = annotations['meta'].get('shuffled', 0.0)
        for subset in dataset.factors[factor]:
            print"[%s]-[%s]-[%s]-[%s]-[%f]:" % (dataset.id, year, 
                ontology, subset, shuffled),
            with closing(db.cursor()) as c:
                _id = (dataset.id, subset, year, ontology, shuffled)
                print "selecting pvals... ",
                c.execute(select_pvals_sql, _id)
                # list of tuples [(_subid, pval), ...]
                results = list(c.fetchall())
            pvals = [x[1] for x in results] 
            subids = [_id + (x[0],) for x in results]
            print "calculating FDR... ",
            rejected, qvals = multitest.fdrcorrection(pvals)
            results = [(qvals[i],) + subids[i] for i, v in enumerate(qvals)]
            with closing(db.cursor()) as c:
                print "inserting %d qvals... " % len(results),
                c.executemany(insert_qval_sql, results)
                db.commit()
            print "done."
    db.close()
 def compute_latency(self, visual_hfb, image_id, visual_channels):
     """
     Compute latency response of visual channels"
     """
     A_postim = self.crop_stim_hfb(visual_hfb, image_id, tmin=0, tmax=1.5)
     A_prestim = self.crop_stim_hfb(visual_hfb, image_id, tmin=-0.4, tmax=0)
     A_baseline = np.mean(A_prestim, axis=-1) #No
     
     pval = [0]*A_postim.shape[2]
     tstat = [0]*A_postim.shape[2]
     latency_response = [0]*len(visual_channels)
     
     for i in range(0, len(visual_channels)):
         for t in range(0,np.size(A_postim,2)):
             tstat[t] = spstats.wilcoxon(A_postim[:,i,t], A_baseline[:,i],
                                         zero_method=self.zero_method)
             pval[t] = tstat[t][1]
             
         reject, pval_correct = fdrcorrection(pval, alpha=self.alpha) # correct for multiple hypotheses
         
         for t in range(0,np.size(A_postim,2)):
             if np.all(reject[t:t+50])==True :
                 latency_response[i]=t/500*1e3
                 break 
             else:
                 continue
     return latency_response
 def pval_series(self, visual_hfb, image_id, visual_channels):
     """
     Return pvalue of postimulus visual responsivity along observations
     """
     nchan = len(visual_channels)
     A_postim = self.crop_stim_hfb(visual_hfb, image_id, tmin=0, tmax=1.5)
     A_prestim = self.crop_stim_hfb(visual_hfb, image_id, tmin=-0.4, tmax=-0.1)
     A_baseline = np.mean(A_prestim, axis=-1)
     nobs = A_postim.shape[2]
     
     pval = [0]*nobs
     tstat = [0]*nobs
     
     reject = np.zeros((nchan, nobs))
     pval_correct = np.zeros((nchan, nobs))
     
     for i in range(0, nchan):
         for t in range(0,nobs):
             tstat[t] = spstats.wilcoxon(A_postim[:,i,t], A_baseline[:,i], 
                                         zero_method=self.zero_method)
             pval[t] = tstat[t][1]
             
         reject[i,:], pval_correct[i, :] = fdrcorrection(pval, alpha=self.alpha) # correct for multiple hypotheses
         
     return reject, pval_correct
 def multiple_wilcoxon_test(self, A_postim, A_prestim, alternative='two-sided'):
     """
     Wilcoxon test hypothesis of no difference between prestimulus and postimulus amplitude
     Correct for multilple hypothesis test.
     ----------
     Parameters
     ----------
     A_postim: (...,times) array
             Postimulus amplitude
     A_prestim: (...,times) array
                 Presimulus amplitude
     alpha: float
         significance threshold to reject the null
     From scipy.stats.wilcoxon:
     alternative: {“two-sided”, “greater”, “less”}, optional
     zero_method: {“pratt”, “wilcox”, “zsplit”}, optional
     """
     A_postim = np.mean(A_postim, axis=-1)
     A_prestim = np.mean(A_prestim, axis=-1)
     # Initialise uncorrected p values
     nchans = A_postim.shape[1]
     pval = [0]*nchans
     tstat = [0]*nchans
     # Compute uncorrected Wilcoxon stats (suited to non-normal distributions)
     for i in range(0,nchans):
         tstat[i], pval[i] = spstats.wilcoxon(A_postim[:,i], A_prestim[:,i],
                                              zero_method=self.zero_method, 
                                              alternative=self.alternative) 
     # Correct for multiple testing    
     reject, pval_correct = fdrcorrection(pval, alpha=self.alpha)
     w_test = reject, pval_correct, tstat
     return w_test
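
The same Wilcoxon-then-FDR pattern as a standalone sketch on synthetic amplitudes (the shapes and the 'zsplit' choice mirror the snippets here; the data is random):

import numpy as np
from scipy import stats as spstats
from statsmodels.stats.multitest import fdrcorrection

rng = np.random.default_rng(1)
n_trials, n_chans = 40, 8
A_prestim = rng.normal(0.0, 1.0, (n_trials, n_chans))
A_postim = rng.normal(0.3, 1.0, (n_trials, n_chans))   # shifted on purpose

pval = np.empty(n_chans)
for i in range(n_chans):
    # paired non-parametric test per channel
    _, pval[i] = spstats.wilcoxon(A_postim[:, i], A_prestim[:, i],
                                  zero_method='zsplit')

reject, pval_correct = fdrcorrection(pval, alpha=0.05)
print(reject, pval_correct.round(3))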
Example #17
def corr_perm_test(X_pairs, Y_pairs, reg_var, num_sub, nperm=1000):
    # X: nsub x vertices
    # Y: cognitive scores nsub X 1

    X, _, _ = normalizeData(X_pairs)

    num_pairs = X.shape[0]
    Y_pairs, _, _ = normalizeData(Y_pairs[:, None])
    rho_orig = np.sum(X * Y_pairs, axis=0)
    max_null = np.zeros(nperm)
    n_count = np.zeros(X.shape[1])

    print('Permutation testing')
    for ind in tqdm(range(nperm)):
        pairs, _ = gen_rand_pairs(num_sub=num_sub, num_pairs=num_pairs)
        pairs = np.array(pairs)
        Y = sp.square(reg_var[pairs[:, 0]] - reg_var[pairs[:, 1]])

        Y, _, _ = normalizeData(Y[:, None])

        rho_perm = np.sum(X * Y, axis=0)
        max_null[ind] = np.amax(rho_perm)
        n_count += np.float32(rho_perm >= rho_orig)

    pval_max = np.sum(rho_orig[:, None] <= max_null[None, :], axis=1) / nperm

    pval_perm = n_count / nperm

    _, pval_perm_fdr = fdrcorrection(pval_perm)

    return pval_max, pval_perm_fdr, pval_perm
Example #18
def LinReg_corr(subTest_diff, subTest_varmain, subTest_varc1, subTest_varc2):
    print('regressing out 1st covariate')
    diff_resid1 = sp.zeros(subTest_diff.shape)
    numV = subTest_diff.shape[0]
    for nv in tqdm(range(numV)):
        diff_resid1[nv, :] = LinReg_resid(subTest_varc1, subTest_diff[nv, :])

    print('regressing out 2nd covariate')
    diff_resid2 = sp.zeros(subTest_diff.shape)
    for nv in tqdm(range(numV)):
        diff_resid2[nv, :] = LinReg_resid(subTest_varc2, diff_resid1[nv, :])

    print('computing correlation against main variable')
    rval = sp.zeros(numV)
    pval = sp.zeros(numV)
    for nv in tqdm(range(numV)):
        _, _, rval[nv], pval[nv], _ = sp.stats.linregress(
            subTest_varmain, diff_resid2[nv, :])

    a = spio.loadmat('supp_data/USCBrain_grayordinate_labels.mat')
    labs = a['labels'].squeeze()
    labs[sp.isnan(labs)] = 0
    pval_fdr = sp.zeros(numV)
    _, pv = fdrcorrection(pval[labs > 0])
    pval_fdr[labs > 0] = pv

    return rval, pval, pval_fdr
Example #19
def fdrcorrection_matrix(arr, include_diagonal=True):
    """Apply FDR correction for matrix elements including diagonal entries.
    
    Args:
        arr (np.array):
            Matrix containing p-values.
        include_diagonal (bool, optional):
            Whether diagonal elements should also be corrected. Defaults to 
            True.
            
    Returns:
        Matrix containing corrected p-values.    
    """
    n = arr.shape[0]
    k = 0 if include_diagonal else 1

    # Vectorize
    v_triu = arr[np.triu_indices(n, k=k)]

    # Restore 2D matrix
    new = np.zeros((n, n))
    new[np.triu_indices(n, k=k)] = fdrcorrection(v_triu)[1]
    new = new + np.tril(new.T, k=-1)

    return new
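
A small usage check for the helper above on a symmetric 3x3 p-value matrix (values invented; fdrcorrection_matrix and its fdrcorrection import as defined above):

import numpy as np

p = np.array([[0.01, 0.20, 0.03],
              [0.20, 0.04, 0.50],
              [0.03, 0.50, 0.02]])
q = fdrcorrection_matrix(p, include_diagonal=True)
print(q.round(3))   # symmetric matrix of BH-adjusted p-values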
Example #20
def calculatePvalue(features): 
    df = pd.Series(data=features)
    df = df.value_counts().rename_axis('feature').reset_index(name='TestStatistic')
    df['pvalues'] = [st.binom_test(x, 20, 1/20, alternative='greater') for x in df.TestStatistic]
    fdrs = correct.fdrcorrection(df.pvalues,  method='negcorr' )[1]
    df['FDR']= fdrs
    return(df)
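
Worth flagging: method='negcorr' selects the Benjamini-Yekutieli variant of the correction, which remains valid under arbitrary dependence between tests. A minimal illustration (counts invented; SciPy's binomtest stands in for the deprecated binom_test used above):

from scipy.stats import binomtest
from statsmodels.stats import multitest as correct

counts = [5, 3, 2, 1]   # hypothetical per-feature test statistics
pvals = [binomtest(x, 20, 1/20, alternative='greater').pvalue
         for x in counts]
qvals = correct.fdrcorrection(pvals, method='negcorr')[1]   # BY correction
print(qvals)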
def test_enrichment(temp_K0_res, rel_kos_df, rel_p):
    ko_enrichment_res = pd.DataFrame(
        index=rel_p,
        columns=['pval', 'natural_pval', 'fdr', 'in_fraction_of_KOs'])

    for pathway in rel_p:
        in_pathway = rel_kos_df[rel_kos_df[pathway] == 1].index
        not_in_pathway = rel_kos_df[rel_kos_df[pathway] == 0].index
        _, p0 = mannwhitneyu(temp_K0_res.loc[in_pathway]['rank'],
                             temp_K0_res.loc[not_in_pathway]['rank'],
                             alternative='less')
        _, p1 = mannwhitneyu(temp_K0_res.loc[in_pathway]['rank'],
                             temp_K0_res.loc[not_in_pathway]['rank'],
                             alternative='greater')
        if p0 < p1:
            ko_enrichment_res.loc[pathway, 'natural_pval'] = p0
            ko_enrichment_res.loc[pathway, 'pval'] = -(np.log10(p0))
        else:
            ko_enrichment_res.loc[pathway, 'natural_pval'] = p1
            ko_enrichment_res.loc[pathway, 'pval'] = np.log10(p1)

        ko_enrichment_res.loc[pathway,
                              'in_fraction_of_KOs'] = rel_kos_df[pathway].sum(
                              ) / rel_kos_df.shape[0]
    ko_enrichment_res['fdr'] = fdrcorrection(
        ko_enrichment_res['natural_pval'])[1]
    ko_enrichment_res = ko_enrichment_res.sort_values('fdr')
    return ko_enrichment_res
def ANOVA_test(meta_data,df_otu_counts,Group_factor,Group_list,P_cut):
    '''Conduct a one-way ANOVA test to find significantly enriched OTUs in each group
       dict_enriched_OTU_group: enriched OTUs in each group
       df_ANOVA_results_group: F, p, and p_adj for each enriched OTU within each group
    '''
    
    meta_data_filter = meta_data.loc[df_otu_counts.columns]
    dict_group = dict(zip(Group_list,[list(meta_data_filter[meta_data_filter[Group_factor] == x].index) for x in Group_list]))

    ANOVA_results = {}
    for group in Group_list:
        Others = list(set(df_otu_counts.columns) - set(dict_group[group]))
        ANOVA_results[group] = [f_oneway(list(df_otu_counts[dict_group[group]].loc[x]), list(df_otu_counts[Others].loc[x]))[0:2] for x in df_otu_counts.index] 
            
    df_ANOVA_results_group = {}
    for group in Group_list:
        df_ANOVA_results_group[group] = pd.DataFrame(ANOVA_results[group],index = df_otu_counts.index, columns=['F_ratio','p_value'])
        df_ANOVA_results_group[group]['P_adj'] = list(fdrcorrection(df_ANOVA_results_group[group]['p_value'])[1])
        df_ANOVA_results_group[group] = df_ANOVA_results_group[group].sort_values(by = 'F_ratio',ascending = False)        
    

    # Target the enriched ones for each group of interest
    Enrich_ratio = pd.DataFrame(index = df_otu_counts.index)
    for group in Group_list:
        Other_group = list(set(df_otu_counts.columns) - set(dict_group[group]))
        Enrich_ratio[group] = df_otu_counts[dict_group[group]].mean(axis = 1) - df_otu_counts[Other_group].mean(axis = 1)
    
    dict_enriched_OTU_group = {}
    for group in Group_list:
        df_temp = df_ANOVA_results_group[group]
        pos_list = set(df_temp.index).intersection(Enrich_ratio[Enrich_ratio[group]>0][group].index)
        dict_enriched_OTU_group[group] = df_temp.loc[pos_list][df_temp.P_adj<P_cut].index
    
    return dict_enriched_OTU_group,df_ANOVA_results_group
Example #23
 def _transform(self, result):
     p = result.maps['p']
     _, p_corr = mc.fdrcorrection(p, alpha=self.q, method=self.method,
                                  is_sorted=False)
     corr_maps = {'p': p_corr}
     self._generate_secondary_maps(result, corr_maps)
     return corr_maps
Example #24
def MW_U(data):
    """
    Mann Whitney test corrected with fdrcorrection
    Arguments:
    ----------
    data: neuro_data + clinical_data
    Returns:
    -------
    pandas dataframe with the information related to the Mann Whitney test.
    """
    patients_fa, controls_fa, feats = stats_data(data)
    rows = []
    for attr in feats:
        stat, p = mannwhitneyu(patients_fa[attr], controls_fa[attr])
        rows.append({'ROI': attr, 'U': stat, 'pvalue': p})
    MannWhitney_tests = pd.DataFrame(rows, columns=['ROI', 'U', 'pvalue'])

    test, p_corr = fdrcorrection(MannWhitney_tests["pvalue"],
                                 alpha=0.05,
                                 method="indep",
                                 is_sorted=False)
    MannWhitney_tests["Rejected"] = test
    MannWhitney_tests["p_corr"] = p_corr

    return MannWhitney_tests
Example #25
def stretchFinder(profile, l, m=10**4):
    """
    implementation of strechFinder as described in : "Synonymous site conservation in the HIV-1 genome"
    :param profile: a vector of entropy values
    :param l: the window size
    :param m: number of permutations
    :return:
    """
    start_index = []
    p_values = []

    #create a per-profile distribution of averages, then sample
    avgs = np.array([])
    for j in range(m):
        new_profile = profile
        cur_avg = np.mean(new_profile[np.random.choice(len(new_profile), size=l, replace=False)])
        avgs = np.insert(avgs, avgs.searchsorted(cur_avg), cur_avg)

    for i in tqdm(range(0,len(profile) - l)):
        # get the current window and its average value
        w = profile[i:i+l]
        avg = np.mean(w)

        # sort average in order to get the p value
        idx = np.searchsorted(avgs, avg)
        p_value = idx/m
        p_values.append(p_value)
        start_index.append(i)

    data =  pd.DataFrame({'start':start_index, 'p_value':p_values, 'l':l})

    # correct for multiple tests
    data['corrected_pvalue'] = multi.fdrcorrection(data['p_value'])[1]

    return data
def detect_visual(A_pr, A_po, HFB_db):
    
    M1 = np.mean(A_pr,axis=2)
    M2 = np.mean(A_po,axis=2)
    # Get rid of infinity 
    M1[M1 == -np.inf] = 0
    M2[M2 == -np.inf] = 0
    # Compute inflated p values
    pval = [0]*len(HFB_db.info['ch_names'])
    degf = [0]*len(HFB_db.info['ch_names'])
    tstat = [0]*len(HFB_db.info['ch_names'])
    for i in range(0,len(HFB_db.info['ch_names'])):
        tstat[i], pval[i] = spstats.wilcoxon(M1[:,i], M2[:,i], zero_method='zsplit') # non-normal distribution
    # Correct for multiple testing
    reject, pval_correct = fdrcorrection(pval, alpha=0.05)
    
    # Compute effect size: Cohen d 
    MC1 = np.mean(M1, axis=0)
    MC2 = np.mean(M2, axis=0)
    std1 = np.std(M1, axis=0)
    std2 = np.std(M2, axis=0)
    n1 = M1.shape[0]  # pooled SD uses the number of trials per condition
    n2 = M2.shape[0]
    std = np.sqrt(np.divide((n1-1)*std1**2+(n2-1)*std2**2,(n1+n2-2)))
    cohen = np.divide(MC1-MC2, std)
    # Return visual channels
    idx = np.where(reject==True)
    idx = idx[0]
    visual_chan = []
    visual_cohen = []
    for i in list(idx):
        visual_chan.append(HFB_db.info['ch_names'][i])
        visual_cohen.append(np.abs(cohen[i]))
    return reject, pval_correct, visual_chan
Example #27
def multiple_testing_correction(ps,
                                alpha=0.05,
                                method='benjamini-hochberg',
                                **kwargs):
    """ correct pvalues for multiple testing and add corrected `q` value
    
    :param ps: list of pvalues
    :param alpha: significance level default : 0.05
    :param method: multiple testing correction method [bonferroni|benjamini-hochberg]
    :returns (q, rej): two lists of q-values and rejected nodes
    """
    _p = np.array(ps)
    q = _p.copy()
    rej = _p.copy()
    mask = ~np.isnan(_p)
    p = _p[mask]
    if method == 'bonferroni':
        q[mask] = np.minimum(p * len(p), 1.0)
        rej[mask] = q[mask] < alpha
    elif method == 'benjamini-hochberg':
        _rej, _q = fdrcorrection(p, alpha)
        rej[mask] = _rej
        q[mask] = _q
    else:
        raise ValueError(method)
    return q, rej
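
A usage sketch exercising both branches of the helper above on a list containing a NaN (values invented):

import numpy as np

ps = [0.001, 0.02, np.nan, 0.3]
q_bh, rej_bh = multiple_testing_correction(ps, method='benjamini-hochberg')
q_bf, rej_bf = multiple_testing_correction(ps, method='bonferroni')
print(q_bh, rej_bh)   # the NaN entry passes through untouched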
def pg_ttest(data,
             group_col,
             group1,
             group2,
             fdr=0.05,
             value_col='MS signal [Log2]'):
    '''
    data: long data format with ProteinID as index, one column of protein levels, other columns of grouping.
    '''
    df = data.copy()
    proteins = data.index.unique()
    results = []
    for i in proteins:
        df_ttest = df.loc[i]
        x = df_ttest[df_ttest[group_col] == group1][value_col]
        y = df_ttest[df_ttest[group_col] == group2][value_col]
        difference = y.mean() - x.mean()
        result = pg.ttest(x=x, y=y)
        result['protein'] = i
        result['difference'] = difference
        results.append(result)
    scores = pd.concat(results, ignore_index=True)
    scores = scores.assign(new_column=lambda x: -np.log10(scores['p-val']))
    scores = scores.rename({'new_column': '-Log pvalue'}, axis=1)

    #FDR correction
    reject, qvalue = multi.fdrcorrection(scores['p-val'],
                                         alpha=fdr,
                                         method='indep')
    scores['qvalue'] = qvalue
    scores['rejected'] = reject
    scores = scores.set_index('protein')
    return scores
Example #29
def main(infile, val_col, sep, tail, outfile):
    """
    P values were estimated based on Z-transformed values using the standard normal distribution,
    and were further corrected by multiple testing using the Benjamini–Hochberg false discovery rate (FDR) method
    """
    if not sep:
        df = pd.read_csv(infile, sep='\t', dtype={val_col: float})
    else:
        df = pd.read_csv(infile, sep=sep, dtype={val_col: float})
    print(f'data loaded: {df.shape[0]} rows, {df.shape[1]} columns')
    df = df.dropna(subset=[val_col])
    print(f'data after dropna: {df.shape[0]} rows, {df.shape[1]} columns')
    mean = df[val_col].mean()
    std = df[val_col].std()
    print(f'mean: {mean}, std: {std}')
    df['Z-score'] = zscore(df[val_col].values)
    if tail == 'right':
        df['Pvalue'] = df['Z-score'].apply(
            lambda x: norm.sf(x)
        )  # Survival function (also defined as 1 - cdf, but sf is sometimes more accurate).
    elif tail == 'left':
        df['Pvalue'] = df['Z-score'].apply(
            lambda x: norm.cdf(x))  # Cumulative distribution function.


    # alternative: df['FDR'] = multicomp(df['Pvalue'].values, method='fdr_bh')[1]
    df['FDR'] = fdrcorrection(df['Pvalue'].values,
                              alpha=0.05,
                              method='indep',
                              is_sorted=False)[1]
    df.to_csv(outfile, sep='\t', index=False)
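
The statistical core of this script fits in a few lines; a self-contained sketch on random data (pick the tail that matches the hypothesis):

import numpy as np
from scipy.stats import zscore, norm
from statsmodels.stats.multitest import fdrcorrection

rng = np.random.default_rng(2)
values = rng.normal(size=100)   # toy data

z = zscore(values)
p_right = norm.sf(z)            # right tail: P(Z >= z)
p_left = norm.cdf(z)            # left tail: P(Z <= z)
fdr = fdrcorrection(p_right, alpha=0.05, method='indep')[1]
print(fdr.min())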
Example #30
 def global_fdr(self, df, alpha_fdr):
     """Determine the global_padj values through FDR multiple testing 
     correction over all gene - GO annotation pairs present in the output
     file.
     """
     global_stats = {
         'global_padj': [],
         'cilow_global_padj': [],
         'ciupp_global_padj': []
     }
     colloc = {
         'global_padj': 8,
         'cilow_global_padj': 4,
         'ciupp_global_padj': 4
     }
     ids = df[~df['pval_rep0'].isna()].index
     qvals = np.empty((len(ids), len(self.nvs)))
     qvals[:] = np.nan
     for i in range(len(self.nvs)):
         _, qvals[:, i] = fdrcorrection(df['pval_rep' + str(i)][ids],
                                        alpha=alpha_fdr,
                                        method='indep')
     for i in range(qvals.shape[0]):
         mean_padj, low_padj, upp_padj = self.log_stats(qvals[i, :])
         global_stats['global_padj'].append(mean_padj)
         global_stats['cilow_global_padj'].append(low_padj)
         global_stats['ciupp_global_padj'].append(upp_padj)
     for key in global_stats.keys():
         df.insert((len(df.columns) - colloc[key] - len(self.nvs)), key,
                   np.nan)
         df.loc[ids, key] = global_stats[key]
     return df
Example #31
def simper_mothur(fn, order, meta, tax, rng):
    with open(order, 'r') as f:
        rows = []
        for r in csv.reader(f):
            rows.append(r)
    simp_order, cont = [], []
    for row in range(len(rows)):
        if row > 0 and row < 6:
            simp_order.append(rows[row][0])
            cont.append(float(rows[row][1])*100)
    with open(tax, 'r') as f:
        rows = []
        for row in csv.reader(f):
            rows.append(row)
    tax = []
    for a in range(len(simp_order)):
        for b in range(len(rows)):
            if simp_order[a] == rows[b][0]:
                phylo = [rows[b][2], rows[b][3], rows[b][4], rows[b][5], rows[b][6]]
                if phylo[4][-12:] != 'unclassified':
                    this_tax = r'$'+str(phylo[4])+'$'
                else:
                    this_tax = phylo[4][:-13]
                tax.append(this_tax)          
    print_otus = []
    for c in simp_order:
        totu = 'OTU'
        d = 0
        while d < len(c):
            if d > 2 and c[d] != '0':
                totu += c[d:]
                d = len(c)
            d += 1
        totu += '\n'
        print_otus.append(totu)
    with open(fn, 'r') as f:
        rows = []
        for row in csv.reader(f):
            rows.append(row)
    simp_rows = []
    for e in range(len(simp_order)):
        for f in range(len(rows)):
            if rows[f][0] == simp_order[e]:
                simp_rows.append(rows[f])
    krusk, krusk_p, treat_mean, treat_sd = [], [], [], []
    for g in simp_rows:
        krusk.append(float(g[-2]))
        krusk_p.append(float(g[-1]))
        this_mean, this_sd = [], []
        for h in range(rng):
            h += 1
            if h % 2 != 0:
                this_mean.append(float(g[h])*100)
            else:
                this_sd.append(float(g[h])*100)
        treat_mean.append(this_mean)
        treat_sd.append(this_sd)
    krusk_p = smm.fdrcorrection(krusk_p)[1]
    return krusk, krusk_p, treat_mean, treat_sd, cont, print_otus, tax
def full_fdr(p_val_n):
    s = p_val_n.shape
    temp = copy.deepcopy(p_val_n)
    pval = np.ravel(temp)
    _, pval_fdr = mul.fdrcorrection(pval)
    pval_fdr_shape = pval_fdr.reshape(s)
    return pval_fdr_shape
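
The flatten-correct-reshape pattern above reduces to two lines (toy shape, random values):

import numpy as np
from statsmodels.stats.multitest import fdrcorrection

p = np.random.default_rng(3).uniform(size=(4, 5))   # toy p-value matrix
q = fdrcorrection(p.ravel())[1].reshape(p.shape)    # BH over all entries at once
print(q.shape)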
def _fdrcorrect(pvals):
    """
		Perform FDR correction with nan's.
	"""

    fdr = np.ones(pvals.shape[0])
    _, fdr[~np.isnan(pvals)] = fdrcorrection(pvals[~np.isnan(pvals)])
    return fdr
Example #35
    def test_multi_pvalcorrection(self):
        #test against R package multtest mt.rawp2adjp

        res_multtest = self.res2
        pval0 = res_multtest[:,0]

        for k, v in rmethods.items():
            if v[1] in self.methods:
                reject, pvalscorr = multipletests(pval0,
                                                  alpha=self.alpha,
                                                  method=v[1])[:2]
                assert_almost_equal(pvalscorr, res_multtest[:,v[0]], 15)
                assert_equal(reject, pvalscorr <= self.alpha)

        pvalscorr = np.sort(fdrcorrection(pval0, method='n')[1])
        assert_almost_equal(pvalscorr, res_multtest[:,7], 15)
        pvalscorr = np.sort(fdrcorrection(pval0, method='i')[1])
        assert_almost_equal(pvalscorr, res_multtest[:,6], 15)
Example #36
def multiple_testing_correction(G, pvalues, alpha=0.05, method='benjamini-hochberg', **kwargs):
    """ correct pvalues for multiple testing and add corrected `q` value
    :param alpha: significance level default : 0.05
    :param method: multiple testing correction method [bonferroni|benjamini-hochberg]
    """
    G.graph.update({'multiple-testing-correction': method,
                    'alpha': alpha})
    if method == 'bonferroni':
        n = len(pvalues)
        for term, p in pvalues.items():
            node = G.nodes[term]
            q = p * n
            node['q'] = q
            node['significant'] = q < alpha
    elif method == 'benjamini-hochberg':
        terms, ps = zip(*pvalues.items())
        rejs, qs = fdrcorrection(ps, alpha)
        for term, q, rej in zip(terms, qs, rejs):
            node = G.nodes[term]
            node['q'] = q
            node['significant'] = rej
    else:
        raise ValueError(method)
Example #38
    def calculate_enrichment(self, genes, reference=None,
                             evidence_codes=None,
                             aspect=None, use_fdr=True):
        """

        Parameters
        ----------
        genes : list
            list of genes
        reference : list
            reference list of species to calculate enrichment
        evidence_codes : list
            GO evidence codes
        use_fdr : bool
            Correct for multiple hypothesis testing

        Returns
        -------

        """

        # TODO check for alias for genes
        genes = set(genes)
        # TODO add aspects
        term_reference = self.go_to_gene.keys()
        aspect_dict = {
            'P': 'biological_process',
            'C': 'cellular_component',
            'F': 'molecular_function'
        }
        if aspect is None:
            term_reference = self.go_to_gene
            gene_reference = self.gene_to_go
        else:
            term_reference = dict()
            gene_reference = dict()

        if aspect is not None:
            for i in aspect:
                if i not in ['P', 'C', 'F']:
                    raise ValueError("Aspects are only 'P', 'C', and 'F'")
            for i in ['P', 'C', 'F']:
                if i in aspect:
                    term_reference = None

        # TODO add reference
        if reference:
            # TODO check for reference alias
            reference = set(reference)
            reference.intersection_update(set(self.gene_to_go.keys()))
        else:
            reference = set(self.gene_to_go.keys())

        # TODO add evidence_codes

        terms = set()
        for i in genes:
            if i in self.gene_to_go:
                for t in self.gene_to_go[i]:
                    terms.add(t)

        n_genes = len(genes)
        n_ref = float(len(reference))
        res = {}
        for term in terms:

            all_annotated_genes = set(self.go_to_gene[term])
            mapped_genes = genes.intersection(all_annotated_genes)
            n_mapped_genes = len(mapped_genes)

            if n_ref > len(all_annotated_genes):
                mapped_reference_genes = \
                    reference.intersection(all_annotated_genes)
            else:
                mapped_reference_genes = \
                    all_annotated_genes.intersection(reference)

            n_mapped_ref = len(mapped_reference_genes)

            prob = float(n_mapped_ref) / n_ref

            p_value = binom_test(n_mapped_genes, n_genes, prob, 'larger')

            res[term] = ([i for i in mapped_genes], p_value, n_mapped_ref)
        if use_fdr:
            res = sorted(res.items(), key=lambda x: x[1][1])
            fdr = fdrcorrection([p for _, (_, p, _) in res],
                                is_sorted=True)
            values = fdr[1]
            res = dict([(index, (genes, p, ref))
                        for (index, (genes, _, ref)), p in
                        zip(res, values)])
        return res
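
The enrichment core in isolation: a one-sided binomial test per term, then BH with is_sorted=True over the ascending p-values (counts invented; SciPy's binomtest stands in for the statsmodels binom_test used above):

from scipy.stats import binomtest
from statsmodels.stats.multitest import fdrcorrection

# per term: (hits in gene list, list size, hits in reference, reference size)
terms = {'GO:0001': (30, 100, 500, 10000),
         'GO:0002': (8, 100, 600, 10000)}

pvals = {t: binomtest(k, n, m / N, alternative='greater').pvalue
         for t, (k, n, m, N) in terms.items()}
ordered = sorted(pvals.items(), key=lambda kv: kv[1])   # is_sorted=True needs this
qvals = fdrcorrection([p for _, p in ordered], is_sorted=True)[1]
print(dict(zip([t for t, _ in ordered], qvals)))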
Example #39
def enrichment(genes,
               popfile,
               fgname,
               generegulation=None,
               myfilter=[5, 2000],
               org='hsa',
               go=None,
               kegg=None,
               pvalue=0.1,
               anno=None,
               **kwargs):
    """富集分析主程序

    Parameters
    ----------
    kwargs:其他参数
    anno:基因,Term的注释信息
    pvalue:网页文件中的Pvalue阈值
    kegg:KEGG文件夹
    go:GO文件
    org:物种
    myfilter:过滤条件
    generegulation:基因上下调情况
    fgname:分组名称
    popfile:数据库
    genes:差异基因list
    """
    dbname = os.path.basename(popfile)[:-4]
    head = (
        'Term_ID\tTerm_description\tTerm_url\tListHit\tListTotal\tPopHit\tPopTotal'
        '\tFoldEnrichment\tGenes\tGeneSymbols\tP_value\t -log10(pvalue)'
    ).split('\t')
    # genes present in the background database
    allgenes = {n.split('\t')[0]: n.strip().split('\t')[1]
                for n in open(popfile)}
    # differentially expressed genes that are annotated in the database
    listgenes = {n: allgenes.get(n) for n in genes if allgenes.get(n)}
    if len(listgenes) == 0:
        logging.warning('No differentially expressed genes are annotated in the %s database' % dbname)
        try:
            dbid = kwargs['iddb']
            genes = [dbid.get(n) for n in genes if dbid.get(n)]
            listgenes = {n: allgenes.get(n) for n in genes if allgenes.get(n)}
            logging.info('%d genes were mapped successfully after ID conversion!' % len(listgenes))
        except Exception as e:
            logging.warning('ID conversion failed, error: %s' % e)
            raise ValueError
    else:
        logging.info('%d differentially expressed genes are annotated in the %s database.' % (len(listgenes), dbname))
    poptotal, listtotal = len(allgenes), len(listgenes)
    popterms, listterms = count(allgenes), count(listgenes)
    data = []
    for term in listterms:
        listhit, pophit = len(listterms[term]), len(popterms[term])
        if isinstance(myfilter, list):
            if pophit < min(myfilter) or pophit > max(myfilter):
                continue
        else:
            if pophit < myfilter:
                continue
        table = ([listhit, listtotal - listhit], [pophit, poptotal - pophit])
        oddsratio, p_value = fisher_exact(table, 'greater')
        gene = listterms[term]
        genesy = [anno.get(n, n) for n in gene]
        url = geturl(dbname, term, gene, generegulation)
        vv = -log10(p_value)
        line = (term, anno[term], url, listhit, listtotal, pophit, poptotal,
                oddsratio, ';'.join(gene), ';'.join(genesy), p_value, str(vv))
        data.append(line)
    if len(data) == 0:
        logging.debug('The P-value calculation produced no results!')
        raise ValueError('The P-value calculation produced no results')

    df = pd.DataFrame(data, columns=head)
    df = df.sort_values(by='P_value')
    fdr = df['P_value']
    reject, pvals_corrected = mul.fdrcorrection(fdr)
    df['FDR_bh'] = pvals_corrected
    tar = kegg if "KEGG" in dbname else go
    df.to_csv(os.path.join(tar, fgname, '%s_%s.csv' % (fgname, dbname)), index=False)
    plot.plmyfig(df, dbname, fgname, tar, count=20)
    df = df[df['P_value'] <= pvalue]
    HTML.df2html(df, fgname, dbname, tar)