Exemple #1
0
 def get_gene_list(self, samples_stat):
     """Profile, convert and export the gene list of every sample.

     Samples whose ``'gene'`` entry is empty are skipped. For every other
     sample the g:Profiler result is split into the rows whose ``native``
     column contains ``GO`` and those containing ``KEGG``; each subset is
     written to CSV and handed to ``self.plot_go``. The Entrez conversion
     table and the plain gene list (one gene per line) are written next to
     them, all under ``<self.module>/<sample>/``.

     :param samples_stat: indexable by sample name; each entry exposes a
         ``'gene'`` sequence of gene identifiers.
     """
     csv_options = dict(header=True, index=False, sep=',')
     # (term label, output path template) for each exported subset.
     term_outputs = (
         ('GO', '{module}/{sample}/GO_FuncTerm.csv'),
         ('KEGG', '{module}/{sample}/KEGG_FuncTerm.csv'),
     )
     for sample in samples_stat:
         genes = samples_stat[sample]['gene']
         if len(genes) == 0:
             # Nothing to query for this sample.
             continue

         profiler = GProfiler(user_agent='ExampleTool', return_dataframe=True)
         enrichment = profiler.profile(organism='hsapiens', query=genes)
         for label, template in term_outputs:
             subset = enrichment[enrichment['native'].str.contains(label)]
             subset.to_csv(
                 template.format(module=self.module, sample=sample),
                 **csv_options)
             self.plot_go(subset, sample, label)

         converted = profiler.convert(organism='hsapiens',
                                      query=genes,
                                      target_namespace='ENTREZGENE_ACC')
         converted.to_csv(
             '{module}/{sample}/Entrez_Gene_converted.csv'.format(
                 module=self.module, sample=sample),
             **csv_options)

         gene_list_path = '{module}/{sample}/gene_list.txt'.format(
             module=self.module, sample=sample)
         with open(gene_list_path, 'wt') as handle:
             handle.write('\n'.join(genes))
Exemple #2
0
def run_gprofiler(inputfile, theargs):
    """
    Run a g:Profiler enrichment on the genes in ``inputfile`` and write the
    name of the best-ranked term to standard out.

    Terms are ranked by descending Jaccard index (derived from the
    ``precision`` and ``recall`` columns), with ties broken by ascending
    ``p_value``. When there are no genes to query or no terms come back,
    ``No terms found`` is written to standard error instead.

    :param inputfile: handed to ``read_inputfile``, whose result is parsed
                      as a comma-delimited string of genes
    :param theargs: object providing ``organism`` and ``maxpval`` attributes
    :return: 0 in every case
    """
    raw_genes = read_inputfile(inputfile)
    # Split first, then strip each token: stripping only the ends of the
    # whole string left an empty gene behind for input ending in ",\n" and
    # kept stray whitespace around individual gene names.
    genes = [token.strip() for token in raw_genes.split(',')]
    genes = [gene for gene in genes if gene]
    if not genes:
        sys.stderr.write('No terms found\n')
        return 0

    gp = GProfiler(return_dataframe=True)
    df_result = gp.profile(query=genes,
                           organism=theargs.organism,
                           user_threshold=theargs.maxpval)
    if df_result.shape[0] == 0:
        sys.stderr.write('No terms found\n')
        return 0

    # Jaccard = p*r / (p + r - p*r), written in reciprocal form.
    df_result['Jaccard'] = 1.0 / (1.0 / df_result['precision'] +
                                  1.0 / df_result['recall'] - 1)
    df_result.sort_values(['Jaccard', 'p_value'],
                          ascending=[False, True],
                          inplace=True)
    # Renumber rows so that label 0 is the best-ranked term.
    df_result.reset_index(drop=True, inplace=True)
    top_hit = df_result['name'][0]
    sys.stdout.write(top_hit)
    return 0
def run_gprofiler(inputfile, theargs):
    """
    Run a g:Profiler enrichment on the genes in ``inputfile`` and write the
    best-ranked term to standard out as a JSON object.

    Terms are ranked by descending Jaccard index (derived from the
    ``precision`` and ``recall`` columns), with ties broken by ascending
    ``p_value``. The JSON object carries the ``name``, ``source``,
    ``p_value``, ``description`` and ``intersections`` of the top term.
    When there are no genes to query or no terms come back,
    ``No terms found`` is written to standard error instead.

    :param inputfile: handed to ``read_inputfile``, whose result is parsed
                      as a comma-delimited string of genes
    :param theargs: object providing ``organism`` and ``maxpval`` attributes
    :return: 0 in every case
    """
    raw_genes = read_inputfile(inputfile)
    # Split first, then strip each token: stripping only the ends of the
    # whole string left an empty gene behind for input ending in ",\n" and
    # kept stray whitespace around individual gene names.
    genes = [token.strip() for token in raw_genes.split(',')]
    genes = [gene for gene in genes if gene]
    if not genes:
        sys.stderr.write('No terms found\n')
        return 0

    gp = GProfiler(return_dataframe=True)
    # NOTE(review): no_evidences=False is presumably what makes the
    # 'intersections' column read below available - confirm against the
    # gprofiler-official documentation.
    df_result = gp.profile(query=genes,
                           organism=theargs.organism,
                           user_threshold=theargs.maxpval,
                           no_evidences=False)
    if df_result.shape[0] == 0:
        sys.stderr.write('No terms found\n')
        return 0

    # Jaccard = p*r / (p + r - p*r), written in reciprocal form.
    df_result['Jaccard'] = 1.0 / (1.0 / df_result['precision'] +
                                  1.0 / df_result['recall'] - 1)
    df_result.sort_values(['Jaccard', 'p_value'],
                          ascending=[False, True],
                          inplace=True)
    # Renumber rows so that label 0 is the best-ranked term.
    df_result.reset_index(drop=True, inplace=True)

    reported_columns = ('name', 'source', 'p_value', 'description',
                        'intersections')
    theres = {column: df_result[column][0] for column in reported_columns}
    json.dump(theres, sys.stdout)
    sys.stdout.flush()
    return 0
def run_gProfiler(comp, org):
    """Profile the unique identifiers found in ``comp["composite"]``.

    Every distinct composite name is split on ``'|'`` and its second field
    is used as the query identifier. The query is restricted to the GO,
    KEGG and REACTOME sources with the ``annotated`` domain scope.

    :param comp: object whose ``"composite"`` entry iterates over strings
        containing at least one ``'|'``
    :param org: g:Profiler organism code, e.g. ``hsapiens``
    :return: whatever ``GProfiler.profile`` returns (``return_dataframe``
        is enabled)
    """
    profiler = GProfiler(return_dataframe=True)
    identifiers = [
        composite.split('|')[1] for composite in set(comp["composite"])
    ]
    return profiler.profile(
        organism=org,
        domain_scope="annotated",
        sources=["GO", "KEGG", "REACTOME"],
        query=identifiers,
    )
Exemple #5
0
def pathway_enrich_genes(genes, databases):
    """Return the ten lowest-p-value enrichment terms for ``genes`` in mouse.

    The query runs against ``mmusculus`` with FDR correction at a 0.05
    threshold and with evidences enabled. The result is indexed by the
    ``native`` term id, sorted by ``p_value`` and narrowed to five columns
    chosen by position (positions count after ``native`` has moved into the
    index).

    Side effect: sets the pandas display option ``display.max_colwidth``
    to 800.

    :param genes: genes to profile
    :param databases: sources forwarded to g:Profiler
    :return: DataFrame with at most ten rows
    """
    profiler = GProfiler(return_dataframe=True, user_agent='g:GOSt')
    enrichment = profiler.profile(
        organism='mmusculus',
        sources=databases,
        user_threshold=0.05,
        significance_threshold_method='fdr',
        query=genes,
        no_evidences=False,
    )

    ranked = enrichment.set_index('native').sort_values('p_value')
    selected = ranked.iloc[:, [1, 2, 5, 10, 13]]

    pd.set_option("display.max_colwidth", 800)
    return selected.head(10)
Exemple #6
0
def pathway_enrich(genes, databases):
    """Return the ten lowest-p-value enrichment terms for ``genes`` in mouse.

    The query runs against ``mmusculus`` with FDR correction at a 0.05
    threshold and the ``annotated`` domain scope. The result is indexed by
    the ``native`` term id, sorted by ``p_value`` and narrowed to five
    columns chosen by position (positions count after ``native`` has moved
    into the index).

    Side effect: sets the pandas display option ``display.max_colwidth``
    to 800.

    :param genes: genes to profile
    :param databases: sources forwarded to g:Profiler
    :return: DataFrame with at most ten rows
    """
    profiler = GProfiler(return_dataframe=True, user_agent='g:GOSt')
    enrichment = profiler.profile(
        organism='mmusculus',
        sources=databases,
        user_threshold=0.05,
        significance_threshold_method='fdr',
        domain_scope='annotated',
        query=genes,
    )

    ranked = enrichment.set_index('native').sort_values('p_value')
    selected = ranked.iloc[:, [2, 5, 7, 10, 1]]

    pd.set_option("display.max_colwidth", 800)
    return selected.head(10)
def Functional_profiling(gene_list,
                         organism='hsapiens',
                         sources=None,
                         user_threshold=0.05):
    """Run a g:Profiler functional profile with electronic annotations excluded.

    :param gene_list: genes to query
    :param organism: g:Profiler organism code
    :param sources: data sources to query; when ``None`` (the default) the
        list GO:MF, GO:CC, GO:BP, KEGG, REAC, WP, TF, MIRNA, HPA, CORUM, HP
        is used
    :param user_threshold: significance threshold forwarded to g:Profiler
    :return: whatever ``GProfiler.profile`` returns (``return_dataframe``
        is enabled)
    """
    if sources is None:
        # Built per call so no list object is shared between invocations
        # (the previous signature used a mutable list as the default).
        sources = [
            "GO:MF", "GO:CC", "GO:BP", "KEGG", "REAC", "WP",
            "TF", "MIRNA", "HPA", "CORUM", "HP"
        ]

    gp = GProfiler(return_dataframe=True)
    return gp.profile(query=gene_list,
                      organism=organism,
                      user_threshold=user_threshold,
                      no_iea=True,
                      sources=sources)
Exemple #8
0
    def execute(self):
        """Run one g:Profiler query per sheet of the markers workbook.

        Every sheet of ``<markers_spreadsheet>`` contributes a named query
        made of the first ``--max_genes`` values of its ``feature`` column.
        The combined result is written to ``<output_spreadsheet>`` without
        the index.
        """
        sheets = pd.read_excel(self.args["<markers_spreadsheet>"],
                               sheet_name=None)
        output_spreadsheet = self.args['<output_spreadsheet>']
        organism = self.args["--organism"]
        enrichment_threshold = float(self.args["--enrichment_threshold"])
        max_genes = int(self.args['--max_genes'])

        from gprofiler import GProfiler
        profiler = GProfiler(return_dataframe=True)

        # Sheet name -> leading features of that sheet.
        query = {
            sheet_name: sheets[sheet_name]['feature'].values.tolist()[:max_genes]
            for sheet_name in sheets.keys()
        }

        enrichment = profiler.profile(organism=organism,
                                      query=query,
                                      user_threshold=enrichment_threshold)
        enrichment.to_excel(output_spreadsheet, index=False)
Exemple #9
0
def pathway_enrich_plot(genes, databases, title, background_genes, name_output, save: bool = False):
    """Plot the enrichment of a gene signature as a horizontal bar graph.

    The genes are profiled against ``mmusculus`` with FDR correction at a
    0.05 threshold. Terms are sorted by p-value, the ``REAC:0000000`` term
    is dropped when present, and the first ten remaining terms are drawn
    with bar length ``-log10(p_value)``.

    # Inputs:
    #    genes              - list of genes to be probed
    #    databases          - which databases to query, more information can be found here: https://biit.cs.ut.ee/gprofiler/page/apis
    #    title              - title for figure
    #    background_genes   - forwarded to g:Profiler as the ``background`` argument
    #    name_output        - target handed to ``plt.savefig`` (PDF format) when ``save`` is true
    #    save               - whether to save the figure
    #
    # Returns the result of ``plt.show()``.
    """
    profiler = GProfiler(return_dataframe=True, user_agent='g:GOSt')
    enrichment = profiler.profile(
        organism='mmusculus',
        sources=databases,
        user_threshold=0.05,
        significance_threshold_method='fdr',
        background=background_genes,
        query=genes,
    )

    # Index by term id, most significant first; columns are picked by position.
    ranked = enrichment.set_index('native').sort_values('p_value')
    results = ranked.iloc[:, [2, 5, 7, 10, 1]]

    # Bar length: negative log10 of the p-value.
    results['-log10_p_value'] = results['p_value'].map(
        lambda p_value: -math.log(p_value, 10))

    if 'REAC:0000000' in results.index.tolist():
        results = results.drop(labels='REAC:0000000', axis=0)

    plt.rcdefaults()
    _fig, ax = plt.subplots()

    top_terms = results.head(10)
    term_names = top_terms['name']
    bar_positions = np.arange(len(term_names))

    ax.barh(bar_positions, top_terms['-log10_p_value'],
            align='center', color='black')
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(term_names)
    ax.invert_yaxis()  # most significant term on top
    ax.set_xlabel('-log10 p value')
    ax.set_title(title)

    if save:
        plt.savefig(name_output, format='pdf', bbox_inches="tight")

    return plt.show()
Exemple #10
0
def enrichment_analysis(markers: Dict[str, Dict[str, pd.DataFrame]],
                        max_genes: int = 100,
                        organism: str = 'hsapiens',
                        enrichment_threshold: float = 0.05) -> pd.DataFrame:
    """Run a multi-query gprofiler enrichment (https://biit.cs.ut.ee/gprofiler/gost).

    One query named ``<cluster>-up`` / ``<cluster>-down`` is built for each
    non-empty up/down marker table, using the table's index as gene list.
    The elapsed time is logged at info level.

    Parameters
    ----------
    markers: ``Dict[str, Dict[str, pd.DataFrame]]``
        Output from markers; each cluster maps ``'up'`` and ``'down'`` to a
        data frame indexed by gene.

    max_genes: ``int``, optional, default: 100
        Maximum number of genes, taken from the top of each table, per query.
    organism: ``str``, optional, default: ``hsapiens``
        Organism. See https://biit.cs.ut.ee/gprofiler/page/organism-list for full list.
    enrichment_threshold: ``float``, optional, default: ``0.05``
        Forwarded to gprofiler as ``user_threshold``.

    Returns
    -------
    ``pd.DataFrame``
        The gprofiler result for all queries.

    """
    start = time.perf_counter()
    from gprofiler import GProfiler
    profiler = GProfiler(return_dataframe=True)

    query = {}
    for cluster in markers.keys():
        for direction in ('up', 'down'):
            gene_names = markers[cluster][direction].index.values.tolist()
            if gene_names:
                query[cluster + '-' + direction] = gene_names[:max_genes]

    result = profiler.profile(organism=organism,
                              query=query,
                              user_threshold=enrichment_threshold)
    elapsed = time.perf_counter() - start
    logger.info(
        "Enrichment analysis is finished. Time spent = {:.2f}s.".format(
            elapsed))
    return result
Exemple #11
0
def enrich_and_simplify(sets,
                        intersections=True,
                        sources=('GO:BP', ),
                        organism='hsapiens',
                        reduce_limit=0,
                        **revigo_kwds):
    """Profile gene sets with g:Profiler and reduce large term lists via ``revigo``.

    For every query in the g:Profiler result, the terms are passed to
    ``revigo`` when there are more than ``reduce_limit`` of them and the
    reduced table is merged back onto that query's rows; otherwise the
    query's rows are kept as-is with ``eliminated`` set to 0.

    :param sets: a dict of named gene sets, or any iterable of genes (which
        is materialised into a list)
    :param intersections: when true, evidences are requested from
        g:Profiler (``no_evidences`` is the negation of this flag)
    :param sources: data sources forwarded to g:Profiler
    :param organism: g:Profiler organism code
    :param reduce_limit: minimum number of terms (exclusive) a query needs
        before ``revigo`` is applied; ``None`` skips the reduction step
        entirely
    :param revigo_kwds: extra keyword arguments forwarded to ``revigo``
    :return: ``(df, revs)`` - the (possibly reduced) result frame and a dict
        mapping each reduced query to its raw ``revigo`` output
    """
    from gprofiler import GProfiler

    if not isinstance(sets, dict):
        sets = list(sets)

    gprofiler = GProfiler(user_agent="scanpy", return_dataframe=True)
    gprofiler_kwargs = {'no_evidences': not intersections, 'sources': sources}

    df = gprofiler.profile(sets, organism=organism, **gprofiler_kwargs)
    revs = {}

    if reduce_limit is None:
        return df, revs

    dfs = []
    for q in df['query'].unique():
        df_sub = df[df['query'] == q].copy()
        go = df_sub.native.tolist()
        pvals = df_sub.p_value.tolist()

        if len(go) > reduce_limit:
            r = revigo(go, pvals, **revigo_kwds)
            revs[q] = r

            r = r.rename(columns={
                'term_ID': 'native'
            }).drop(columns='description').assign(query=q)
            dfs.append(df_sub.merge(r))
        else:
            # Keep only this query's rows; appending the full frame here
            # duplicated every query's rows in the concatenated output.
            dfs.append(df_sub.assign(eliminated=0))

    # pd.concat raises on an empty list, so an empty profile is returned
    # unchanged.
    if dfs:
        df = pd.concat(dfs, axis=0).reset_index(drop=True)

    return df, revs
Exemple #12
0
def main(args):
    """Profile a gene list against a custom GMT registered with g:Profiler.

    The gene list is read from ``args.filename`` (one gene per line, blank
    lines ignored). If ``args.gmt`` is given, that file is uploaded to the
    g:Profiler custom-GMT endpoint and the token is extracted from the
    response with ``get_token_form_response``; otherwise ``args.token`` is
    used. The token is passed as the ``organism`` of the profile query and
    the result is written to ``args.output`` as CSV.

    :param args: object with ``filename``, ``gmt``, ``token`` and ``output``
        attributes
    :raises ValueError: if neither ``args.gmt`` nor ``args.token`` is set
    """
    # Fail fast, before any file or network I/O. (The original raised a
    # misspelled ``ValuError``, which surfaced as a NameError.)
    if args.gmt is None and args.token is None:
        raise ValueError("Please supply either a token or a gmt file")

    # Context manager so the gene file handle is closed deterministically.
    with open(args.filename) as gene_file:
        genes = [line.strip() for line in gene_file if line.strip()]

    if args.gmt is not None:
        with open(args.gmt) as f:
            response = requests.post(
                'https://biit.cs.ut.ee/gprofiler/api/gost/custom/',
                json={
                    'gmt': f.read(),
                    'name': args.gmt
                })
        token = get_token_form_response(response)
    else:
        token = args.token

    gp = GProfiler(
        user_agent='gprofiler_custom_gmt',  # optional user agent
        return_dataframe=True,  # pandas dataframe instead of plain python structures
    )
    res = gp.profile(genes, organism=token)
    res.to_csv(args.output)
Exemple #13
0
# Detect communities with Infomap and profile each retained one with g:Profiler.
community = g.community_infomap()
print("Number of Communities:", len(community))

df = pd.DataFrame({'gene': g.vs['name'], 'community': community.membership})
# Order by size of communities
valuec = df['community'].value_counts()
# value_counts sorts descending by default, so the first distinct count is
# the size of the largest community.
biggest = valuec.unique()[0]
# Keep communities with at least 10 genes, plus the largest one(s) regardless
# of size.
values = valuec[(valuec >= biggest) | (valuec >= 10)]
order = values.index.tolist()
df = df[df['community'].isin(order)]
df = df.set_index('community')
df = df.loc[order].reset_index()

#! Enrichment by GO
gp = GProfiler(return_dataframe=True)
# Collect per-community results and concatenate once: DataFrame.append was
# removed in pandas 2.0 and copied the accumulated frame on every call.
community_profiles = []
print("Community id - Number of nodes")
for name, group in df.groupby('community', sort=False):
    print(name, group.shape[0])
    s = gp.profile(organism='hsapiens', query=group.gene.tolist())
    s['community'] = name
    community_profiles.append(s)
# pd.concat raises on an empty list; fall back to an empty frame as before.
enrich_communities = (pd.concat(community_profiles)
                      if community_profiles else pd.DataFrame())

#! OUTPUT
enrich_communities.to_csv(oname1, sep="\t", index=False)
df.to_csv(oname2, sep="\t", index=False)
import gprofiler
from gprofiler import GProfiler
# NOTE(review): the lines below read like a pasted interactive-session
# history rather than a script - confirm before running them as a module.
# A trailing `?` is IPython's object-help syntax; it is not valid in a plain
# Python file.
GProfiler?
gp = GProfiler(return_dataframe=True)
# NOTE(review): `genes` is only assigned further down in this snippet; this
# call presumably relied on an earlier definition in the session - verify.
gp.profile(organism='mmusculus', query=genes)
genes
# Eight ENSMUSG-prefixed identifiers, split on whitespace into a list.
genes = """ENSMUSG00000076488
ENSMUSG00000065231
ENSMUSG00000079120
ENSMUSG00000047222
ENSMUSG00000097494
ENSMUSG00000064419
ENSMUSG00000095668
ENSMUSG00000059606""".split()
# Same query as above, now with `genes` bound to the list just defined.
gp.profile(organism='mmusculus', query=genes)
import requests
def mygprofiler(namelist, organism='mmusculus'):
    """Run gProfiler using POST api with a json query body

    ``namelist`` may be any iterable of names; anything that is not already
    a list is converted to one before being sent.

    Returns a pandas DataFrame built from the ``result`` entry of the JSON
    response.

    Raises ``requests.HTTPError`` when the service answers with an error
    status.
    """
    if not isinstance(namelist, list):
        namelist = list(namelist)
    r = requests.post(
        url='https://biit.cs.ut.ee/gprofiler/api/gost/profile/',
        json={
            'organism': organism,
            'query': namelist,
        },
    )
    # Surface HTTP failures directly instead of an opaque KeyError or JSON
    # decoding error on the lines below.
    r.raise_for_status()
    return pd.DataFrame(r.json()['result'])
Exemple #15
0
def enrich(
    container: Iterable[str],
    *,
    org: str = "hsapiens",
    gprofiler_kwargs: Mapping[str, Any] = {},
) -> pd.DataFrame:
    """\
    Compute enrichment for a collection of genes (e.g. DE results).

    A thin convenience wrapper around gprofiler_: the genes are sent to
    `GProfiler.profile` and its result is returned unchanged.

    This method dispatches on the first argument, leading to the following two
    signatures::

        enrich(container, ...)
        enrich(adata: AnnData, group, key: str, ...)

    Where::

        enrich(adata, group, key, ...) = enrich(adata.uns[key]["names"][group], ...)

    .. _gprofiler: https://pypi.org/project/gprofiler-official/#description

    Parameters
    ----------
    container
        Contains genes you'd like to search.
    adata
        AnnData object whose group will be looked for.
    group
        The group whose genes should be used for enrichment.
    key
        Key in `uns` to find group under.
    {doc_org}
    gprofiler_kwargs
        Keyword arguments to pass to `GProfiler.profile`, see gprofiler_.
        Must not set `organism`; use `org` instead.

    Returns
    -------
    Dataframe of enrichment results.

    Examples
    --------
    Using `sc.queries.enrich` on a list of genes:

    >>> import scanpy as sc
    >>> sc.queries.enrich(['Klf4', 'Pax5', 'Sox2', 'Nanog'], org="hsapiens")

    Using `sc.queries.enrich` on an :class:`anndata.AnnData` object:

    >>> pbmcs = sc.datasets.pbmc68k_reduced()
    >>> sc.tl.rank_genes_groups(pbmcs, "bulk_labels")
    >>> sc.queries.enrich(pbmcs, "CD34+")
    """
    try:
        from gprofiler import GProfiler
    except ImportError:
        raise ImportError(
            "This method requires the `gprofiler-official` module to be installed."
        )

    client = GProfiler(user_agent="scanpy", return_dataframe=True)

    # Work on a copy so the caller's mapping is never touched.
    profile_kwargs = copy(gprofiler_kwargs)

    # Arguments owned by `enrich` itself may not be smuggled in via kwargs.
    reserved = ("organism",)
    clash = next(
        (name for name in reserved if profile_kwargs.get(name) is not None),
        None,
    )
    if clash is not None:
        raise ValueError(
            f"Argument `{clash}` should be passed directly through `enrich`, "
            "not through `gprofiler_kwargs`")

    genes = list(container)
    return client.profile(genes, organism=org, **profile_kwargs)
Exemple #16
0
# Gene symbols as stripped strings.
downreg = [gene.strip() for gene in downreg['Gene'].astype(str)]

# Split the methylation table by sign of the 'Methylation' column.
hyper = methyl_genes.loc[methyl_genes['Methylation'] > 0, 'Gene'].tolist()
hypo = methyl_genes.loc[methyl_genes['Methylation'] < 0, 'Gene'].tolist()

genelists = {'downreg': downreg, 'upreg': upreg, 'hyper': hyper, 'hypo': hypo}

# One GO:BP enrichment per gene list, kept in a dict keyed by list name.
# (Results used to be injected through locals(), which only works at module
# scope and hides the variable names from readers and tools.)
enrichments = {}
for label, gene_list in genelists.items():
    print("Calculating", label, "enrichment...")
    enrichment = gp.profile(gene_list,
                            organism='mmusculus',
                            significance_threshold_method='fdr',
                            measure_underrepresentation=False,
                            sources=['GO:BP'])
    enrichment.sort_values('p_value').to_csv(label + "_enrichment.csv")
    enrichments[label] = enrichment
    print("Done")

# Explicit names, so code that refers to these variables keeps working.
downreg_enrichment = enrichments['downreg']
upreg_enrichment = enrichments['upreg']
hyper_enrichment = enrichments['hyper']
hypo_enrichment = enrichments['hypo']

# Terms shared between expression and methylation changes, matched on the
# 'native' term id.
intersection_1 = pd.merge(downreg_enrichment,
                          hyper_enrichment,
                          how='inner',
                          on=['native'])
intersection_2 = pd.merge(upreg_enrichment,
                          hypo_enrichment,
                          how='inner',
                          on=['native'])



# Interactions supported by more than 100 counts.
# NOTE(review): the 'corr' column read below is only populated by the
# "#spearman" section at the end of this snippet - presumably that part was
# run first in an interactive session; confirm the intended order.
utr3_100 = utr3[utr3['count'] > 100]
cds_100 = cds[cds['count'] > 100]
df = pd.read_table('/Users/nate/Projects/EBV_interactome/bl/discovery_cohort_spearman.tsv', index_col=0)
mrnas = set(utr3_100[utr3_100['corr'] < -.27]['mrna']) | set(cds_100[cds_100['corr'] < -.27]['mrna'])
# Select the matching columns with a boolean mask: pandas no longer accepts a
# set as an indexer, and the mask keeps the columns in file order.
mm = df.loc[:, df.columns.isin(mrnas)]



from gprofiler import GProfiler
gp = GProfiler(user_agent='ExampleTool', return_dataframe=True)

# Profile the 500 index labels with the highest 'CSDE1' values.
# NOTE(review): 'GO:MP' does not look like a g:Profiler source id (GO:MF was
# probably intended) - confirm against the g:Profiler source list.
a2 = gp.profile(query=list(mm['CSDE1'].sort_values().index[-500:]),sources=['GO:BP','GO:MP','REAC','KEGG','GO:CC'])




#spearman
# Correlation statistic for every (miRNA, mRNA) pair, looked up row-wise in
# `mir` and `bl`.
cor = [
    spearmanr(bl.loc[mr], mir.loc[mi])[0]
    for mi, mr in zip(utr3['mir'], utr3['mrna'])
]
utr3['corr'] = cor

cor = [
    spearmanr(bl.loc[mr], mir.loc[mi])[0]
    for mi, mr in zip(cds['mir'], cds['mrna'])
]
cds['corr'] = cor
# Build the GO annotation graphs for the proteins that have SaintExpress score > 0.5 and BFDR < 0.01
'''
This code asks whether you have one condition or more than one. If you have one, it produces a single
horizontal bar graph for that condition; otherwise, it compares the bar graphs across conditions.

To handle 2 conditions, you need to run this code 2 times with different outputs!
'''

if ask_user == "YES":
    if number_of_conditions == 1:
        # Single condition: profile the selected gene names and split the
        # result by GO namespace.
        gp = GProfiler(return_dataframe=True)
        profiler = gp.profile(organism='hsapiens', query=gene_names)

        source_column = profiler["source"]
        BP_profiler = profiler[source_column == "GO:BP"]
        CC_profiler = profiler[source_column == "GO:CC"]
        MF_profiler = profiler[source_column == "GO:MF"]

        # Most significant terms first within each namespace.
        BP_profiled = BP_profiler.sort_values(by=["p_value"])
        CC_profiled = CC_profiler.sort_values(by=["p_value"])
        MF_profiled = MF_profiler.sort_values(by=["p_value"])

        # Top ten biological-process terms and their -log10 p-values.
        location_BP = BP_profiled["name"].head(10).to_list()
        p_BP = BP_profiled["p_value"].head(10).to_list()
        logged_p_BP = [-log(p_value, 10) for p_value in p_BP]
def run_gProfiler_gOST( gene_symbols_list:List[str], 
                        organism:str='hsapiens') -> pd.DataFrame:
    """Profile ``gene_symbols_list`` with g:Profiler and return the result.

    :param gene_symbols_list: genes to query
    :param organism: g:Profiler organism code
    :return: the ``GProfiler.profile`` result (``return_dataframe`` is
        enabled)
    """
    profiler = GProfiler(return_dataframe=True)
    return profiler.profile(query=gene_symbols_list, organism=organism)