Beispiel #1
0
def run_gprofiler(inputfile, theargs):
    """
    Run a g:Profiler enrichment query and print the name of the best term.

    The gene list is obtained from ``read_inputfile`` and is expected to be
    a comma delimited string. Result rows are ranked by descending Jaccard
    index (computed from the ``precision`` and ``recall`` columns), ties
    are broken by ascending ``p_value``, and the ``name`` of the first row
    is written to standard output.

    :param inputfile: value handed to ``read_inputfile`` to get the
                      comma delimited gene string
    :param theargs: object with ``organism`` and ``maxpval`` attributes,
                    forwarded to ``GProfiler.profile``
    :return: always 0; when the query yields no rows the message
             "No terms found" is written to standard error instead
    """
    raw_genes = read_inputfile(inputfile)
    gp = GProfiler(return_dataframe=True)
    # Split first, then clean every entry: stripping ',' before '\n' left a
    # dangling empty gene for input such as "A,B,\n".
    genes = [gene.strip() for gene in raw_genes.split(',') if gene.strip()]
    df_result = gp.profile(query=genes,
                           organism=theargs.organism,
                           user_threshold=theargs.maxpval)
    if df_result.shape[0] == 0:
        sys.stderr.write('No terms found\n')
        return 0

    # 1 / (1/precision + 1/recall - 1) equals intersection / union when
    # precision and recall are both ratios of the same intersection.
    df_result['Jaccard'] = 1.0 / (1.0 / df_result['precision'] +
                                  1.0 / df_result['recall'] - 1)
    df_result.sort_values(['Jaccard', 'p_value'],
                          ascending=[False, True],
                          inplace=True)
    # Renumber rows so that label 0 is the best ranked term.
    df_result.reset_index(drop=True, inplace=True)
    top_hit = df_result['name'][0]
    sys.stdout.write(top_hit)
    return 0
def run_gprofiler(inputfile, theargs):
    """
    Run a g:Profiler enrichment query and print the best term as JSON.

    The gene list is obtained from ``read_inputfile`` and is expected to be
    a comma delimited string. Result rows are ranked by descending Jaccard
    index (computed from the ``precision`` and ``recall`` columns), ties
    are broken by ascending ``p_value``. The ``name``, ``source``,
    ``p_value``, ``description`` and ``intersections`` values of the first
    row are dumped to standard output as one JSON object.

    :param inputfile: value handed to ``read_inputfile`` to get the
                      comma delimited gene string
    :param theargs: object with ``organism`` and ``maxpval`` attributes,
                    forwarded to ``GProfiler.profile``
    :return: always 0; when the query yields no rows the message
             "No terms found" is written to standard error instead
    """
    raw_genes = read_inputfile(inputfile)
    gp = GProfiler(return_dataframe=True)
    # Split first, then clean every entry: stripping ',' before '\n' left a
    # dangling empty gene for input such as "A,B,\n".
    genes = [gene.strip() for gene in raw_genes.split(',') if gene.strip()]
    # no_evidences=False is required to get the 'intersections' column
    # that is reported below.
    df_result = gp.profile(query=genes,
                           organism=theargs.organism,
                           user_threshold=theargs.maxpval,
                           no_evidences=False)
    if df_result.shape[0] == 0:
        sys.stderr.write('No terms found\n')
        return 0

    # 1 / (1/precision + 1/recall - 1) equals intersection / union when
    # precision and recall are both ratios of the same intersection.
    df_result['Jaccard'] = 1.0 / (1.0 / df_result['precision'] +
                                  1.0 / df_result['recall'] - 1)
    df_result.sort_values(['Jaccard', 'p_value'],
                          ascending=[False, True],
                          inplace=True)
    # Renumber rows so that label 0 is the best ranked term.
    df_result.reset_index(drop=True, inplace=True)

    best = df_result.iloc[0]
    theres = {
        'name': best['name'],
        'source': best['source'],
        'p_value': best['p_value'],
        'description': best['description'],
        'intersections': best['intersections']
    }
    json.dump(theres, sys.stdout)
    sys.stdout.flush()
    return 0
Beispiel #3
0
def add_gene_name_gprofiler(data_df, col, organism):
    """Annotate ``data_df`` with gene names looked up through g:Convert.

    The identifiers in column ``col`` are converted with target namespace
    'UNIPROTSWISSPROT'. The 'name' and 'namespaces' columns of the answer
    are left-merged onto ``data_df`` as 'gene_name' and 'gene_name_bank'.

    :param data_df: DataFrame holding the identifiers to convert
    :param col: name of the identifier column in ``data_df``
    :param organism: g:Profiler organism id forwarded to ``convert``
    :return: a new DataFrame, ``data_df`` plus the two added columns
    """
    identifiers = data_df[col].tolist()
    print(type(identifiers))
    gp = GProfiler(return_dataframe=True)

    res = gp.convert(organism=organism,
                     query=identifiers,
                     target_namespace='UNIPROTSWISSPROT')

    # Rename on a fresh object instead of in place: the column selection is
    # a slice of `res`, and mutating it in place triggers pandas'
    # chained-assignment warning.
    res_f = res[['incoming', 'name', 'namespaces']].rename(
        columns={"incoming": col,
                 "name": "gene_name",
                 "namespaces": "gene_name_bank"})

    # Whole-cell replacement of the two namespace combinations by a label.
    res_f = res_f.replace({'UNIPROTSWISSPROT,UNIPROT_GN_ACC': 'Swiss-Prot',
                           'UNIPROTSPTREMBL,UNIPROT_GN_ACC': 'TrEMBL'})

    df = data_df.merge(res_f, how='left', on=col)

    # TODO check if the converted name is concordant with the description
    print(df)

    return df
Beispiel #4
0
 def get_gene_list(self, samples_stat):
     """Write g:Profiler result files for every sample that has genes.

     For each key of ``samples_stat`` whose 'gene' entry is non-empty the
     genes are profiled against 'hsapiens'. Rows whose 'native' id
     contains 'GO' or 'KEGG' are written to CSV and passed to
     ``self.plot_go``; the genes are also converted to 'ENTREZGENE_ACC'
     and the raw gene list is stored as text. All files go below
     ``{self.module}/{sample}/``. Nothing is returned.
     """
     term_outputs = (
         ('GO', '{module}/{sample}/GO_FuncTerm.csv'),
         ('KEGG', '{module}/{sample}/KEGG_FuncTerm.csv'),
     )
     for sample in samples_stat:
         gene = samples_stat[sample]['gene']
         if len(gene) == 0:
             # Nothing to profile for this sample.
             continue

         def target(template):
             # Fill the per-sample output location into a path template.
             return template.format(module=self.module, sample=sample)

         gp = GProfiler(user_agent='ExampleTool', return_dataframe=True)
         enrichment = gp.profile(organism='hsapiens', query=gene)
         for tag, template in term_outputs:
             subset = enrichment[enrichment['native'].str.contains(tag)]
             subset.to_csv(target(template),
                           header=True,
                           index=False,
                           sep=',')
             self.plot_go(subset, sample, tag)

         converted = gp.convert(organism='hsapiens',
                                query=gene,
                                target_namespace='ENTREZGENE_ACC')
         converted.to_csv(
             target('{module}/{sample}/Entrez_Gene_converted.csv'),
             header=True,
             index=False,
             sep=',')

         with open(target('{module}/{sample}/gene_list.txt'), 'wt') as f:
             f.write('\n'.join(gene))
def get_gene_names(geneList):
    """Look up names and short descriptions for 'athaliana' gene ids.

    :param geneList: identifiers forwarded as the g:Convert query
    :return: DataFrame with the columns 'incoming', 'name' and
             'description', where each description is cut at the first
             '[' and then at the first ';'
    """
    gp = GProfiler(return_dataframe=True)
    converted = gp.convert(organism='athaliana', query=geneList)
    # Copy the selection so the column assignment below does not write
    # into a slice of `converted`.
    df = converted[['incoming', 'name', 'description']].copy()
    # Column-wise map instead of a row-wise apply: on an empty result the
    # row-wise apply hands back a whole frame, which cannot be assigned to
    # a single column.
    df['description'] = df['description'].map(
        lambda text: text.split('[')[0].split(';')[0])
    return df
Beispiel #6
0
def add_gene_name_gprofiler(data_df: pd.DataFrame, col: str,
                            organism: str) -> pd.DataFrame:
    """Annotate ``data_df`` with gene names looked up through g:Convert.

    The identifiers in column ``col`` are converted with target namespace
    'UNIPROTSWISSPROT'. The 'name' and 'namespaces' columns of the answer
    are left-merged onto ``data_df`` as 'gene_name' and 'gene_name_bank',
    then only the first row per 'Accession' value is kept.

    :param data_df: DataFrame holding the identifiers to convert; it must
                    also contain an 'Accession' column
    :param col: name of the identifier column in ``data_df``
    :param organism: g:Profiler organism id forwarded to ``convert``
    :return: a new DataFrame, ``data_df`` plus the two added columns
    """
    gp = GProfiler(return_dataframe=True)
    protein_list = data_df[col].tolist()

    # Result layout: see https://pypi.org/project/gprofiler-official/
    res = gp.convert(organism=organism,
                     query=protein_list,
                     target_namespace='UNIPROTSWISSPROT')

    # Rename on a fresh object instead of in place: the column selection is
    # a slice of `res`, and mutating it in place triggers pandas'
    # chained-assignment warning.
    res_f = res[['incoming', 'name', 'namespaces']].rename(
        columns={
            "incoming": col,
            "name": "gene_name",
            "namespaces": "gene_name_bank"
        })

    # Whole-cell replacement of the two namespace combinations by a label.
    res_f = res_f.replace({
        'UNIPROTSWISSPROT,UNIPROT_GN_ACC': 'Swiss-Prot',
        'UNIPROTSPTREMBL,UNIPROT_GN_ACC': 'TrEMBL'
    })

    df = data_df.merge(res_f, how='left', on=col)

    # gProfiler returns one line for each alias of the gene (as in the alias
    # section in Uniprot): keep only the first one.
    # NOTE(review): the key is hard-coded to 'Accession' although the merge
    # key is `col` -- confirm that callers always provide that column.
    df = df[~df['Accession'].duplicated(keep='first')]

    return df
Beispiel #7
0
def gene_name_annotation_short(genes):
    """Convert mouse gene ids and return a compact annotation table.

    The genes are converted for organism 'mmusculus' with target namespace
    'ENTREZGENE_ACC'. A 'short_description' column is derived from
    'description' by deleting everything between the first '[' and the
    last ']', and the listed bookkeeping columns are dropped.

    :param genes: identifiers forwarded as the g:Convert query
    :return: the reduced annotation DataFrame
    """
    gp = GProfiler(return_dataframe=True)
    gene_annot = gp.convert(organism='mmusculus',
                            query=genes,
                            target_namespace='ENTREZGENE_ACC')

    # Raw string: '\[' in a plain literal is an invalid escape sequence,
    # which newer Python versions warn about.
    bracketed = re.compile(r'\[.+\]')
    gene_annot['short_description'] = gene_annot['description'].map(
        lambda text: bracketed.sub('', text))
    gene_annot = gene_annot.drop(
        ['description', 'name', 'converted', 'n_incoming', 'n_converted',
         'namespaces', 'query'],
        axis=1)
    return gene_annot
Beispiel #8
0
def gene_name_annotation_long(genes):
    """Convert mouse gene ids and display the full annotation table.

    Same conversion and clean-up as the short variant (organism
    'mmusculus', target namespace 'ENTREZGENE_ACC', bracketed text removed
    from the description), but the table is passed to ``display`` with the
    pandas row and column display limits lifted.

    :param genes: identifiers forwarded as the g:Convert query
    :return: whatever ``display`` returns
    """
    gp = GProfiler(return_dataframe=True)
    gene_annot = gp.convert(organism='mmusculus',
                            query=genes,
                            target_namespace='ENTREZGENE_ACC')

    # Raw string: '\[' in a plain literal is an invalid escape sequence,
    # which newer Python versions warn about.
    bracketed = re.compile(r'\[.+\]')
    gene_annot['short_description'] = gene_annot['description'].map(
        lambda text: bracketed.sub('', text))
    gene_annot = gene_annot.drop(
        ['description', 'name', 'converted', 'n_incoming', 'n_converted',
         'namespaces', 'query'],
        axis=1)
    # Lift the display limits only for this call so every line is shown.
    with pd.option_context('display.max_rows', None,
                           'display.max_columns', None):
        return display(gene_annot)
def run_gProfiler(comp, org):
    """Profile the ids embedded in the 'composite' names of ``comp``.

    Every distinct value of ``comp["composite"]`` is split on '|' and its
    second field is used as a query id. The query is restricted to the
    "annotated" domain scope and the sources GO, KEGG and REACTOME.

    :param comp: mapping/DataFrame with a "composite" entry of
                 '|'-separated names
    :param org: g:Profiler organism id (example: hsapiens)
    :return: the DataFrame returned by ``GProfiler.profile``
    """
    # return_dataframe=True: pandas DataFrame instead of plain structures
    profiler = GProfiler(return_dataframe=True)
    distinct_names = set(comp["composite"])
    list_id = [composite.split('|')[1] for composite in distinct_names]
    return profiler.profile(organism=org,
                            domain_scope="annotated",
                            sources=["GO", "KEGG", "REACTOME"],
                            query=list_id)
Beispiel #10
0
def pathway_enrich_genes(genes, databases):
    """Return the ten most significant enrichment rows for ``genes``.

    The query runs against 'mmusculus' with an FDR threshold of 0.05 and
    with evidence columns enabled. Rows are indexed by 'native', sorted by
    ascending 'p_value' and reduced to five columns selected by position.

    Side effect: sets the global pandas option "display.max_colwidth"
    to 800.

    :param genes: list of genes to query
    :param databases: value forwarded as ``sources``
    :return: DataFrame with at most ten rows
    """
    profiler = GProfiler(return_dataframe=True, user_agent='g:GOSt')

    enrichment = profiler.profile(organism='mmusculus',
                                  sources=databases,
                                  user_threshold=0.05,
                                  significance_threshold_method='fdr',
                                  query=genes,
                                  no_evidences=False)
    ranked = enrichment.set_index('native').sort_values('p_value')
    # Positional selection; it depends on the column order of the result.
    selected = ranked.iloc[:, [1, 2, 5, 10, 13]]

    pd.set_option("display.max_colwidth", 800)
    return selected.head(10)
Beispiel #11
0
 def command(self, gene_list, n_top):
     """Return (name, p-value) pairs of the first ``n_top`` 'MF' rows.

     The genes are sent as an ordered query with FDR correction. Only
     result rows whose column 9 equals 'MF' are kept; the third-from-last
     column is reported as the name and column 2 as the p-value.

     :param gene_list: genes forwarded to ``gprofile``
     :param n_top: maximum number of rows to report
     :return: numpy array with one (name, p-value) row per term
     """
     from gprofiler import GProfiler
     import numpy as np
     profiler = GProfiler("")
     table = np.array(
         profiler.gprofile(gene_list,
                           correction_method=GProfiler.THR_FDR,
                           ordered=True))
     is_mf = table[:, 9] == 'MF'
     mf_rows = table[is_mf]
     names = mf_rows[0:n_top, -3]
     p_values = mf_rows[0:n_top, 2]
     return np.array(list(zip(names, p_values)))
Beispiel #12
0
def pathway_enrich(genes, databases):
    """Return the ten most significant enrichment rows for ``genes``.

    The query runs against 'mmusculus' with an FDR threshold of 0.05 and
    the "annotated" domain scope. Rows are indexed by 'native', sorted by
    ascending 'p_value' and reduced to five columns selected by position.

    Side effect: sets the global pandas option "display.max_colwidth"
    to 800.

    :param genes: list of genes to query
    :param databases: value forwarded as ``sources``
    :return: DataFrame with at most ten rows
    """
    profiler = GProfiler(return_dataframe=True, user_agent='g:GOSt')

    enrichment = profiler.profile(organism='mmusculus',
                                  sources=databases,
                                  user_threshold=0.05,
                                  significance_threshold_method='fdr',
                                  domain_scope='annotated',
                                  query=genes)

    ranked = enrichment.set_index('native').sort_values('p_value')
    # Positional selection; it depends on the column order of the result.
    selected = ranked.iloc[:, [2, 5, 7, 10, 1]]
    pd.set_option("display.max_colwidth", 800)
    return selected.head(10)
Beispiel #13
0
    def command(self, gene_list, n_top):
        """Return a table of the first ``n_top`` 'MF' enrichment rows.

        The genes are sent as an ordered query with FDR correction. Only
        result rows whose column 9 equals 'MF' are kept; the
        third-from-last column is reported as "Name" and column 2 as
        "p-values".

        :param gene_list: genes forwarded to ``gprofile``
        :param n_top: maximum number of rows to report
        :return: DataFrame with the columns "Name" and "p-values"
        """
        from gprofiler import GProfiler
        import numpy as np
        profiler = GProfiler("")
        table = np.array(
            profiler.gprofile(gene_list,
                              correction_method=GProfiler.THR_FDR,
                              ordered=True))
        is_mf = table[:, 9] == 'MF'
        mf_rows = table[is_mf]
        names = mf_rows[0:n_top, -3]
        p_values = mf_rows[0:n_top, 2]
        pairs = np.array(list(zip(names, p_values)))

        return pd.DataFrame(data=pairs, columns=["Name", "p-values"])
Beispiel #14
0
def Functional_profiling(gene_list,
                         organism='hsapiens',
                         sources=(
                             "GO:MF", "GO:CC", "GO:BP", "KEGG", "REAC", "WP",
                             "TF", "MIRNA", "HPA", "CORUM", "HP"
                         ),
                         user_threshold=0.05):
    """Run a g:Profiler enrichment query without electronic annotations.

    :param gene_list: genes forwarded as the query
    :param organism: g:Profiler organism id
    :param sources: data sources to query; the default is an immutable
                    tuple so that no state is shared between calls
    :param user_threshold: significance threshold forwarded to ``profile``
    :return: the DataFrame returned by ``GProfiler.profile``
    """
    gp = GProfiler(return_dataframe=True)

    # Hand the library a fresh list for tuple input (including the
    # default); any other caller-supplied value is passed through untouched.
    if isinstance(sources, tuple):
        sources = list(sources)

    gp_result_df = gp.profile(query=gene_list,
                              organism=organism,
                              user_threshold=user_threshold,
                              no_iea=True,
                              sources=sources)

    return gp_result_df
Beispiel #15
0
def profile_genes_with_active_sites(enriched_genes,
                                    background=None) -> DataFrame:
    """Profile ``enriched_genes`` and return the answer as a DataFrame.

    An empty DataFrame is returned when there are no genes to query or
    when the service answers with nothing. Otherwise the first row of the
    answer provides the column names and the remaining rows the data.

    :param enriched_genes: sized collection of genes to profile
    :param background: forwarded to ``gprofile`` as ``custom_bg``
    """
    if len(enriched_genes) == 0:
        return DataFrame()

    client = GProfiler('ActiveDriverDB', want_header=True)
    answer = client.gprofile(enriched_genes, custom_bg=background)

    if answer:
        # want_header=True: the first row carries the column names.
        column_names, *records = answer
        return DataFrame(records, columns=column_names)

    return DataFrame()
Beispiel #16
0
    def execute(self):
        """Run one enrichment query per sheet of the markers spreadsheet.

        Every sheet of ``<markers_spreadsheet>`` becomes a named query
        made of the first ``--max_genes`` values of its 'feature' column.
        The combined result is written to ``<output_spreadsheet>`` without
        the index column.
        """
        sheets = pd.read_excel(self.args["<markers_spreadsheet>"],
                               sheet_name=None)
        output_spreadsheet = self.args['<output_spreadsheet>']
        organism = self.args["--organism"]
        enrichment_threshold = float(self.args["--enrichment_threshold"])
        max_genes = int(self.args['--max_genes'])
        from gprofiler import GProfiler
        profiler = GProfiler(return_dataframe=True)
        # sheet name -> leading features of that sheet
        query = {
            sheet_name: sheet['feature'].values.tolist()[0:max_genes]
            for sheet_name, sheet in sheets.items()
        }

        enrichment = profiler.profile(organism=organism,
                                      query=query,
                                      user_threshold=enrichment_threshold)
        enrichment.to_excel(output_spreadsheet, index=False)
Beispiel #17
0
def pathway_enrich_plot(genes, databases, title, background_genes, name_output, save: bool = False):
    """Plot the ten most significant enriched terms as a horizontal bar chart.

    The query runs against 'mmusculus' with an FDR threshold of 0.05. Rows
    are indexed by 'native' and sorted by ascending 'p_value'; the row
    labelled 'REAC:0000000' is removed when present. Bar length is the
    negative base-10 logarithm of the p-value.

    Inputs:
        genes              - list of genes to be probed
        databases          - which databases to query, more information can
                             be found here:
                             https://biit.cs.ut.ee/gprofiler/page/apis
        title              - title for the figure
        background_genes   - forwarded to the query as ``background``
        name_output        - target file of the PDF when ``save`` is true
        save               - whether to save the figure

    Returns whatever ``plt.show()`` returns.
    """
    profiler = GProfiler(return_dataframe=True, user_agent='g:GOSt')

    enrichment = profiler.profile(organism='mmusculus',
                                  sources=databases,
                                  user_threshold=0.05,
                                  significance_threshold_method='fdr',
                                  background=background_genes,
                                  query=genes)

    ranked = enrichment.set_index('native').sort_values('p_value')
    # Positional selection; it depends on the column order of the result.
    cluster_enrichment_results = ranked.iloc[:, [2, 5, 7, 10, 1]]

    # New column holding the negative log10 p-value.
    cluster_enrichment_results['-log10_p_value'] = cluster_enrichment_results['p_value'].map(
        lambda p: -math.log(p, 10))

    if 'REAC:0000000' in cluster_enrichment_results.index:
        cluster_enrichment_results = cluster_enrichment_results.drop(
            labels='REAC:0000000', axis=0)

    plt.rcdefaults()
    fig, ax = plt.subplots()

    top_terms = cluster_enrichment_results.head(10)
    labels = top_terms['name']
    bar_lengths = top_terms['-log10_p_value']
    positions = np.arange(len(labels))

    ax.barh(positions, bar_lengths, align='center', color='black')
    ax.set_yticks(positions)
    ax.set_yticklabels(labels)
    ax.invert_yaxis()  # labels read top-to-bottom
    ax.set_xlabel('-log10 p value')
    ax.set_title(title)

    if save:
        plt.savefig(name_output, format='pdf', bbox_inches="tight")

    return plt.show()
Beispiel #18
0
def find_orth_gene(gene, organism, target):
    """Find orthologous gene(s) via g:Profiler.

    :param gene: gene (or genes) forwarded as the query
    :param organism: organism the query gene belongs to
    :param target: organism to find orthologs in
    :return: list of gene name(s); ``[gene]`` when ``organism`` equals
             ``target``, an empty list when the lookup yields no names
    """
    if organism == target:
        # Do not search if the original gene is already in the target
        # organism; this path needs neither pandas nor gprofiler.
        return [gene]

    import pandas as pd
    from gprofiler import GProfiler
    gp = GProfiler()

    results = pd.DataFrame(
        gp.orth(query=gene, organism=organism, target=target))
    if "name" not in results.columns:
        # Empty answer: there is no 'name' column to read from.
        return []

    # dropna() returns a new frame; the result used to be discarded, which
    # let NaN names through to the caller.
    results = results.dropna(subset=["name"], axis=0)
    return [name for name in results["name"] if name != "N/A"]
Beispiel #19
0
def enrichment_analysis(markers: Dict[str, Dict[str, pd.DataFrame]],
                        max_genes: int = 100,
                        organism: str = 'hsapiens',
                        enrichment_threshold: float = 0.05) -> pd.DataFrame:
    """Perform enrichment analysis using gprofiler (https://biit.cs.ut.ee/gprofiler/gost).

    One named query is built per cluster and direction: the index values
    of the 'up' table become ``<cluster>-up`` and those of the 'down'
    table become ``<cluster>-down``. Empty tables are left out.

    Parameters
    ----------
    markers: ``Dict[str, Dict[str, pd.DataFrame]]``
        Output from markers.
    max_genes: ``int``, optional, default: 100
        Maximum number of genes to use in each enrichment query.
    organism: ``str``, optional, default: ``hsapiens``
        Organism. See https://biit.cs.ut.ee/gprofiler/page/organism-list for full list.
    enrichment_threshold: ``float``, optional, default: ``0.05``
        Forwarded to gprofiler as ``user_threshold``.

    Returns
    -------
    ``pd.DataFrame``
        The result of ``GProfiler.profile``.
    """
    start = time.perf_counter()
    from gprofiler import GProfiler
    profiler = GProfiler(return_dataframe=True)
    directions = (('up', '-up'), ('down', '-down'))
    query = {}
    for cluster, tables in markers.items():
        for direction, suffix in directions:
            gene_ids = tables[direction].index.values.tolist()
            if len(gene_ids) > 0:
                query[cluster + suffix] = gene_ids[0:max_genes]
    result = profiler.profile(organism=organism,
                              query=query,
                              user_threshold=enrichment_threshold)
    elapsed = time.perf_counter() - start
    logger.info(
        "Enrichment analysis is finished. Time spent = {:.2f}s.".format(
            elapsed))
    return result
Beispiel #20
0
def enrich_and_simplify(sets,
                        intersections=True,
                        sources=('GO:BP', ),
                        organism='hsapiens',
                        reduce_limit=0,
                        **revigo_kwds):
    """Run a g:Profiler enrichment and condense each query with ``revigo``.

    :param sets: a dict of named gene sets, or any iterable of genes
                 (converted to a list)
    :param intersections: when true, evidence columns are requested
                          (``no_evidences`` is the negation)
    :param sources: data sources forwarded to ``profile``
    :param organism: g:Profiler organism id
    :param reduce_limit: a query is passed to ``revigo`` only when it has
                         more terms than this; ``None`` disables the
                         reduction step entirely
    :param revigo_kwds: extra keyword arguments forwarded to ``revigo``
    :return: ``(df, revs)`` -- the enrichment table and a dict mapping
             each reduced query to the raw ``revigo`` output
    """
    from gprofiler import GProfiler

    if not isinstance(sets, dict):
        sets = list(sets)

    gprofiler = GProfiler(user_agent="scanpy", return_dataframe=True)
    gprofiler_kwargs = {'no_evidences': not intersections, 'sources': sources}

    df = gprofiler.profile(sets, organism=organism, **gprofiler_kwargs)
    revs = {}

    if reduce_limit is not None:
        dfs = []
        for q in df['query'].unique():
            df_sub = df[df['query'] == q].copy()
            go = df_sub.native.tolist()
            pvals = df_sub.p_value.tolist()

            if len(go) > reduce_limit:
                r = revigo(go, pvals, **revigo_kwds)
                revs[q] = r

                # Align the revigo table with the enrichment rows of this
                # query so that merge() joins on the shared columns.
                r = r.rename(columns={
                    'term_ID': 'native'
                }).drop(columns='description').assign(query=q)
                dfs.append(df_sub.merge(r))
            else:
                # Keep only this query's rows; appending the whole `df`
                # here duplicated every other query in the output.
                dfs.append(df_sub.assign(eliminated=0))

        # pd.concat() raises on an empty list, which happens when the
        # enrichment returned no rows at all.
        if dfs:
            df = pd.concat(dfs, axis=0).reset_index(drop=True)

    return df, revs
Beispiel #21
0
def main(args):
    """Profile a gene list against a custom GMT file and save the result.

    The genes are read from ``args.filename``, one per line. When
    ``args.gmt`` is given, the GMT file is uploaded to g:Profiler and the
    token extracted from the response is used as the organism; otherwise
    ``args.token`` is used directly. The result is written to
    ``args.output`` as CSV.

    :param args: object with the attributes ``filename``, ``gmt``,
                 ``token`` and ``output``
    :raises ValueError: when neither ``gmt`` nor ``token`` is supplied
    """
    # Validate up front so that a bad invocation fails before any file or
    # network access. The original raised a misspelled `ValuError`, which
    # surfaced as a NameError.
    if args.gmt is None and args.token is None:
        raise ValueError("Please supply either a token or a gmt file")

    gp = GProfiler(
        user_agent='gprofiler_custom_gmt',  # optional user agent
        return_dataframe=True,  # pandas dataframe, not plain structures
    )
    # Close the gene file deterministically and skip blank lines, which
    # would otherwise end up as empty genes in the query.
    with open(args.filename) as gene_file:
        genes = [line.strip() for line in gene_file if line.strip()]

    if args.gmt is not None:
        with open(args.gmt) as f:
            response = requests.post(
                'https://biit.cs.ut.ee/gprofiler/api/gost/custom/',
                json={
                    'gmt': f.read(),
                    'name': args.gmt
                })
        token = get_token_form_response(response)
    else:
        token = args.token
    res = gp.profile(genes, organism=token)
    res.to_csv(args.output)
Beispiel #22
0
def make_tcga_gtex_id_mapping_file(tcga_gtex_id_df, tcga_gtex_id_addr):
    """Add gene id and gene symbol columns and write the table as TSV.

    The part of 'sample' before the first '.' is stored in the new
    'ensembl_gene' column. These ids are converted through g:Convert
    (organism 'hsapiens', target namespace 'ENSG') and the returned 'name'
    is stored in the new 'gene_symbol' column. ``tcga_gtex_id_df`` is
    modified in place.

    :param tcga_gtex_id_df: DataFrame with a 'sample' column
    :param tcga_gtex_id_addr: output path of the tab separated file
    """
    ensembl_id = tcga_gtex_id_df['sample'].str.split(".", n=1, expand=True)
    tcga_gtex_id_df['ensembl_gene'] = ensembl_id[0]

    gp = GProfiler(return_dataframe=True)
    ensembl_2_symbol = gp.convert(
        organism='hsapiens',
        query=tcga_gtex_id_df['ensembl_gene'].tolist(),
        target_namespace='ENSG')

    # Look each id up in an id -> name table instead of assigning a merge
    # result back by index: the merge renumbers its rows and gains extra
    # rows whenever an id occurs more than once in the conversion table,
    # which shifted every following symbol onto the wrong row.
    symbol_by_id = ensembl_2_symbol.drop_duplicates(
        subset='incoming', keep='first').set_index('incoming')['name']
    tcga_gtex_id_df['gene_symbol'] = tcga_gtex_id_df['ensembl_gene'].map(
        symbol_by_id)

    tcga_gtex_id_df.to_csv(tcga_gtex_id_addr, sep='\t', index=False)
Beispiel #23
0
def gsea_connected_components(G, outdir):
    """
    Perform Gene Set Enrichment Analysis on the connected components in G
    using GProfiler.

    A directed graph is converted to an undirected one first. Component
    number ``i`` is written to ``<outdir>/enrich_<i>.tsv``; an existing
    file is kept and the query for that component is skipped.

    Returns
    -------
    rv : list of (set, str)
        tuples of gene set that was queried for enrichment and the
        enrichment output file
    """
    profiler = GProfiler("FluPath/0.1")
    graph = G.to_undirected() if nx.is_directed(G) else G
    components = list(nx.connected_components(graph))

    rv = []
    for comp_no, comp in enumerate(components):
        # TODO how are http errors handled?
        enrich_out_fp = os.path.join(outdir, "enrich_{}.tsv".format(comp_no))
        already_done = os.path.exists(enrich_out_fp)
        if not already_done:
            write_enrich(profiler.gprofile(comp, src_filter=['GO:BP']),
                         enrich_out_fp)
        rv.append((comp, enrich_out_fp))
    return rv
Beispiel #24
0
def add_ensembl_gene_into_string_info(
        string_info_addr,
        output_addr="/Users/woochanghwang/PycharmProjects/CIMR/Data/STRING/10090.protein.info.v11.0.ensembl.txt"):
    """Join a STRING protein info table with a g:Profiler lookup result.

    The part of 'protein_external_id' after the first '.' is stored as
    'protein_ensembl.protein' and used as the lookup query. Both tables
    are indexed by that protein id, concatenated column-wise and written
    as a tab separated file.

    :param string_info_addr: path of the tab separated STRING info file
    :param output_addr: where to write the combined table; defaults to the
                        location that used to be hard-coded
    """
    string_info_df = pd.read_csv(string_info_addr, sep='\t')
    protein_tax_ensembl = string_info_df['protein_external_id'].str.split(
        ".", n=1, expand=True)
    string_info_df['protein_ensembl.protein'] = protein_tax_ensembl[1]

    protein_ensembl = protein_tax_ensembl[1].tolist()
    print(protein_ensembl)

    from gprofiler import GProfiler
    gp = GProfiler(return_dataframe=True)

    # NOTE(review): 'ENSG' is passed as the orthology *target*; it looks
    # like a namespace rather than an organism id -- confirm whether
    # gp.convert(target_namespace='ENSG') was intended.
    ensembl_protein_to_gene_df = gp.orth(organism='mmusculus',
                                         target='ENSG',
                                         query=protein_ensembl)

    ensembl_protein_to_gene_df = ensembl_protein_to_gene_df.reset_index()
    ensembl_protein_to_gene_df = ensembl_protein_to_gene_df.set_index(
        'incoming')
    string_info_df = string_info_df.set_index('protein_ensembl.protein')

    print(string_info_df.head())
    print(ensembl_protein_to_gene_df.head())
    # Column-wise concat aligns both tables on the protein id index.
    string_info_ensembl_df = pd.concat(
        [string_info_df, ensembl_protein_to_gene_df], axis=1, sort=False)

    string_info_ensembl_df.to_csv(output_addr, sep='\t')
# Move every entry of `files` from the `source` directory to `destination`.
for f in files:
    shutil.move(os.path.join(source, f), destination)

# Now build the GO annotation graphs for the proteins that have a SaintExpress score > 0.5 and BFDR < 0.01
'''
This code asks you if you have one or more than one conditions. In case, you have one, it gives you only one conditional 
horizontal bar graphs. If not, it would compare the bar graphs.  

To have 2 conditions, you need to run this code 2 times with different outputs! 
'''

if ask_user == "YES":
    if number_of_conditions == 1:
        # Only one condition! getting GO annotation profiles of proteins that have >0.5 saint score and <0.01 BFDR score

        gp = GProfiler(return_dataframe=True)

        profiler = gp.profile(organism='hsapiens', query=gene_names)

        BP_profiler = profiler[profiler["source"] == "GO:BP"]
        CC_profiler = profiler[profiler["source"] == "GO:CC"]
        MF_profiler = profiler[profiler["source"] == "GO:MF"]

        BP_profiled = BP_profiler.sort_values(by=["p_value"])
        CC_profiled = CC_profiler.sort_values(by=["p_value"])
        MF_profiled = MF_profiler.sort_values(by=["p_value"])

        location_BP = BP_profiled["name"].to_list()[0:10]
        p_BP = BP_profiled["p_value"].to_list()[0:10]
        logged_p_BP = []
        for i in p_BP:
Beispiel #26
0
# Detect communities with Infomap, keep the large ones and run a
# g:Profiler enrichment for each of them.
community = g.community_infomap()
print("Number of Communities:", len(community))

df = pd.DataFrame({'gene': g.vs['name'], 'community': community.membership})
# Order by size of communities
valuec = df['community'].value_counts()
# value_counts() sorts descending, so the first unique count is the size
# of the largest community.
biggest = valuec.unique()[0]
# Keep the largest community/communities plus every community with at
# least 10 members.
values = valuec[(valuec >= biggest) |
                (valuec >= 10)]  # Values of communities filtered
order = values.index.tolist()
df = df[df['community'].isin(order)]
df = df.set_index('community')
df = df.loc[order].reset_index()
# print("community id - Number of Genes")
# print(valuec)

#! Enrichment by GO
gp = GProfiler(return_dataframe=True)
# Collect the per-community tables and concatenate once:
# DataFrame.append() was removed in pandas 2.0 and copied the accumulated
# table on every iteration.
enrich_frames = []
print("Community id - Number of nodes")
for name, group in df.groupby('community', sort=False):
    print(name, group.shape[0])
    s = gp.profile(organism='hsapiens', query=group.gene.tolist())
    s['community'] = name
    enrich_frames.append(s)
# pd.concat() rejects an empty list; fall back to an empty table.
enrich_communities = (pd.concat(enrich_frames)
                      if enrich_frames else pd.DataFrame())

#! OUTPUT
enrich_communities.to_csv(oname1, sep="\t", index=False)
df.to_csv(oname2, sep="\t", index=False)
Beispiel #27
0
class GOEnrichmentTester():
    """Compare g:Profiler terms of a gene list with terms of known disease genes.

    ``loadCOSMIC`` reads a tab separated table with the columns
    "Gene Symbol", "Tumour Types(Somatic)" and "Tumour Types(Germline)",
    groups the genes per tumour type and fetches the enriched terms for
    every tumour type through g:Profiler.
    """

    def __init__(self):
        self.gp = GProfiler("COSSY++/1.5")

    def getGoTerms(self, genelist):
        """Profile ``genelist`` and return one dict per result row.

        Each dict has the keys "pvalue", "id", "category" and "term",
        taken from positions 2, 8, 9 and 11 of the result row. Rows of
        every category are kept.
        """
        res = self.gp.gprofile(query=genelist)
        return [{"pvalue": row[2], "id": row[8], "category": row[9],
                 "term": row[11]} for row in res]

    def readTSV(self, fname):
        """Read a tab separated file into a list of ``{header: value}`` dicts.

        The line starting with "Gene Symbol" supplies the headers for the
        lines after it. Double quotes are removed from every field and
        empty lines are skipped.
        """
        records = []
        headers = []
        with open(fname) as reader:
            for line in reader:
                # Drop the line terminator; it used to stay attached to the
                # last header and to the last value of every row.
                line = line.rstrip("\r\n")
                if not line:
                    continue

                values = [x.replace("\"", "") for x in line.split("\t")]
                if line.startswith("Gene Symbol"):
                    headers = values
                    continue

                # zip() tolerates rows with fewer fields than headers.
                records.append(dict(zip(headers, values)))

        return records

    def loadCOSMIC(self, fname):
        """Group the genes of ``fname`` per tumour type and fetch their terms.

        Fills ``self.result`` (tumour type -> gene symbols, separately for
        "somatic" and "germline"), ``self.diseaseList`` (tumour types in
        order of first appearance) and, through ``makeGOList``,
        ``self.GOList``.
        """
        self.result = {"somatic": {}, "germline": {}}
        self.diseaseList = []

        columns = (("somatic", "Tumour Types(Somatic)"),
                   ("germline", "Tumour Types(Germline)"))

        for rec in self.readTSV(fname=fname):
            geneSymbol = rec["Gene Symbol"]
            for kind, column in columns:
                for tumorType in rec[column].strip().split(","):
                    tumorType = tumorType.strip()
                    if tumorType == "":
                        continue

                    self.result[kind].setdefault(tumorType, []).append(
                        geneSymbol)

                    if tumorType not in self.diseaseList:
                        self.diseaseList.append(tumorType)

        self.makeGOList()

    def getGenes(self, disease):
        """Return the somatic genes followed by the germline genes of ``disease``."""
        somaticGenes = self.result["somatic"].get(disease, [])
        germlineGenes = self.result["germline"].get(disease, [])
        return somaticGenes + germlineGenes

    def makeGOList(self):
        """Fetch the terms of every known tumour type, sorted by p-value."""
        self.GOList = {}

        for tumorType in self.diseaseList:
            print(".")  # progress marker, one per tumour type
            genes = self.getGenes(tumorType)
            goTerms = self.getGoTerms(genes)

            # A key function gives the same ordering as comparing with
            # pvaluecomp; sorted() lost its cmp= argument in Python 3.
            self.GOList[tumorType] = sorted(goTerms, key=itemgetter("pvalue"))

    def writeCOSMICGO(self, fname):
        """Dump ``self.GOList`` to ``fname`` as indented JSON."""
        with open(fname, "w") as w:
            json.dump(self.GOList, w, indent=4)

    @staticmethod
    def _ranksByTerm(terms):
        """Return the first position of each term, ordered by term."""
        firstRank = {}
        for rank, term in enumerate(terms):
            firstRank.setdefault(term, rank)
        return [firstRank[term] for term in sorted(terms)]

    def corr(self, genes, disease):
        """Cross-correlate the term rankings of ``genes`` and ``disease``.

        Terms are ranked by ascending p-value. The disease ranking only
        contains terms that also occur for ``genes``.

        :return: the result of ``np.correlate(..., "same")``
        """
        inputGO = sorted(self.getGoTerms(genes), key=itemgetter("pvalue"))
        inputGO_terms = [x["term"] for x in inputGO]
        knownTerms = set(inputGO_terms)

        answerGO = sorted(
            [x for x in self.GOList[disease] if x["term"] in knownTerms],
            key=itemgetter("pvalue"))
        answerGO_terms = [x["term"] for x in answerGO]

        # NOTE(review): this fails exactly when both term lists have the
        # same length, which looks inverted -- confirm the intent.
        assert(len(inputGO_terms) != len(answerGO_terms))

        inputGO_ranks = self._ranksByTerm(inputGO_terms)
        answerGO_ranks = self._ranksByTerm(answerGO_terms)

        # The value used to be computed and then thrown away.
        return np.correlate(inputGO_ranks, answerGO_ranks, "same")

    def pvaluecomp(self, a, b):
        """Three-way comparison of two term dicts by their 'pvalue'."""
        x = a['pvalue']
        y = b['pvalue']
        if x > y:
            return 1
        elif x < y:
            return -1
        else:
            return 0
def lookup_enrichment(gene_set):
    """Profile the entries of ``gene_set`` that are not ``None``.

    :param gene_set: iterable of genes, possibly containing ``None``
    :return: whatever ``GProfiler.gprofile`` returns for the cleaned list
    """
    known_genes = list(filter(lambda gene: gene is not None, gene_set))
    profiler = GProfiler("GTEx/wj")
    return profiler.gprofile(known_genes)
import gprofiler
from gprofiler import GProfiler
# Interactive (IPython) session history, kept in the order it was typed.
# The trailing `?` is IPython's help lookup and is not valid plain Python.
GProfiler?
gp = GProfiler(return_dataframe=True)
# NOTE(review): `genes` is only assigned further down; when run top to
# bottom this call and the bare `genes` expression raise a NameError.
gp.profile(organism='mmusculus', query=genes)
genes
# Whitespace-separated identifiers turned into a list for the query below.
genes = """ENSMUSG00000076488
ENSMUSG00000065231
ENSMUSG00000079120
ENSMUSG00000047222
ENSMUSG00000097494
ENSMUSG00000064419
ENSMUSG00000095668
ENSMUSG00000059606""".split()
gp.profile(organism='mmusculus', query=genes)
import requests
def mygprofiler(namelist, organism='mmusculus'):
    """Run gProfiler using POST api with a json query body

    :param namelist: genes to query; anything that is not already a list
                     is converted to one
    :param organism: g:Profiler organism id
    :return: a pandas DataFrame built from the 'result' entry of the
             JSON response
    :raises requests.HTTPError: when the service answers with an error
                                status
    """
    if not isinstance(namelist, list):
        namelist = list(namelist)
    r = requests.post(
        url='https://biit.cs.ut.ee/gprofiler/api/gost/profile/',
        json={
            'organism': organism,
            'query': namelist,
        }
    )
    # Report HTTP errors as such instead of failing later with an obscure
    # JSON decoding error or a missing 'result' key.
    r.raise_for_status()
    df = pd.DataFrame(r.json()['result'])
    return df
Beispiel #30
0
 def __init__(self):
     """Create the g:Profiler client stored on ``self.gp``."""
     # Identification string handed to GProfiler as its first argument.
     client_id = "COSSY++/1.5"
     self.gp = GProfiler(client_id)
Beispiel #31
0
def enrich(
    container: Iterable[str],
    *,
    org: str = "hsapiens",
    gprofiler_kwargs: Mapping[str, Any] = {},
) -> pd.DataFrame:
    """\
    Get enrichment for DE results.

    This is a thin convenience wrapper around the very useful gprofiler_.

    This method dispatches on the first argument, leading to the following two
    signatures::

        enrich(container, ...)
        enrich(adata: AnnData, group, key: str, ...)

    Where::

        enrich(adata, group, key, ...) = enrich(adata.uns[key]["names"][group], ...)

    .. _gprofiler: https://pypi.org/project/gprofiler-official/#description

    Parameters
    ----------
    container
        Contains genes you'd like to search.
    adata
        AnnData object whose group will be looked for.
    group
        The group whose genes should be used for enrichment.
    key
        Key in `uns` to find group under.
    {doc_org}
    gprofiler_kwargs
        Keyword arguments to pass to `GProfiler.profile`, see gprofiler_.

    Returns
    -------
    Dataframe of enrichment results.

    Examples
    --------
    Using `sc.queries.enrich` on a list of genes:

    >>> import scanpy as sc
    >>> sc.queries.enrich(['Klf4', 'Pax5', 'Sox2', 'Nanog'], org="hsapiens")

    Using `sc.queries.enrich` on an :class:`anndata.AnnData` object:

    >>> pbmcs = sc.datasets.pbmc68k_reduced()
    >>> sc.tl.rank_genes_groups(pbmcs, "bulk_labels")
    >>> sc.queries.enrich(pbmcs, "CD34+")
    """
    # gprofiler is an optional dependency, hence the function-level import.
    try:
        from gprofiler import GProfiler
    except ImportError:
        raise ImportError(
            "This method requires the `gprofiler-official` module to be installed."
        )
    client = GProfiler(user_agent="scanpy", return_dataframe=True)
    # Work on a copy so the caller's mapping (or the shared default) is
    # never handed out directly.
    extra_kwargs = copy(gprofiler_kwargs)
    # Options that have a dedicated parameter must not be passed twice.
    reserved = ("organism",)
    clashing = [name for name in reserved if extra_kwargs.get(name) is not None]
    if clashing:
        k = clashing[0]
        raise ValueError(
            f"Argument `{k}` should be passed directly through `enrich`, "
            "not through `gprofiler_kwargs`")
    return client.profile(list(container), organism=org, **extra_kwargs)
Beispiel #32
0
# -*- coding: utf-8 -*-
from gprofiler import GProfiler
gp = GProfiler(return_dataframe=True)
import numpy as np
import scipy as sp
import pandas as pd
import os
# All data paths below are relative to the project checkout.
os.chdir("/home/conor/Documents/Git_Repositories/MSc_Project")

DEG_list = pd.read_csv('data/DEG_list.csv')
methyl_genes = pd.read_csv('data/Methylation_genes.csv')

# Differentially expressed genes: adjusted p-value of at most 0.1 and an
# absolute log fold change of at least 0.1; names are whitespace-trimmed.
upreg = [
    gene.strip()
    for gene in DEG_list.loc[(DEG_list['adj.P.Val'] <= 0.1)
                             & (DEG_list['logFC'] >= 0.1),
                             'Gene'].astype(str).tolist()
]

downreg = [
    gene.strip()
    for gene in DEG_list.loc[(DEG_list['adj.P.Val'] <= 0.1)
                             & (DEG_list['logFC'] <= -0.1),
                             'Gene'].astype(str).tolist()
]

# Methylation genes split by the sign of the 'Methylation' value.
hyper = methyl_genes.loc[methyl_genes['Methylation'] > 0, 'Gene'].tolist()

hypo = methyl_genes.loc[methyl_genes['Methylation'] < 0, 'Gene'].tolist()

genelists = {'downreg': downreg, 'upreg': upreg, 'hyper': hyper, 'hypo': hypo}

for i in genelists:
    print("Calculating", i, "enrichment...")