Python GProfiler Beispiele, gprofiler.GProfiler Python Beispiele

Beispiel #1

0

Datei anzeigen

def run_gprofiler(inputfile, theargs):
    """
    Run a g:Profiler enrichment query and write the best term name to stdout.

    The gene string returned by ``read_inputfile`` is treated as a
    comma-delimited list. Result terms are ranked by Jaccard index
    (descending), ties broken by p-value (ascending), and the name of the
    first term is written to standard output.

    :param inputfile: passed to ``read_inputfile`` to obtain the gene string
    :param theargs: object providing ``organism`` and ``maxpval`` attributes
    :return: 0, both when a term was written and when no terms were found
    """
    raw_genes = read_inputfile(inputfile)
    gp = GProfiler(return_dataframe=True)
    # Split first, then trim every token: stripping ',' before '\n' left a
    # trailing comma (and therefore an empty gene) on input ending in ",\n".
    genes = [gene.strip() for gene in raw_genes.split(',') if gene.strip()]
    df_result = gp.profile(query=genes,
                           organism=theargs.organism,
                           user_threshold=theargs.maxpval)
    if df_result.shape[0] == 0:
        sys.stderr.write('No terms found\n')
        return 0

    # Jaccard index expressed through precision and recall:
    # J = 1 / (1/precision + 1/recall - 1)
    df_result['Jaccard'] = 1.0 / (1.0 / df_result['precision'] +
                                  1.0 / df_result['recall'] - 1)
    df_result = df_result.sort_values(['Jaccard', 'p_value'],
                                      ascending=[False, True])
    df_result = df_result.reset_index(drop=True)
    top_hit = df_result['name'][0]
    sys.stdout.write(top_hit)
    return 0

Beispiel #2

0

Datei anzeigen

Datei: run.py Projekt: Ceofy/communitydetection-rest-server

def run_gprofiler(inputfile, theargs):
    """
    Run a g:Profiler enrichment query and dump the best term as JSON to stdout.

    The gene string returned by ``read_inputfile`` is treated as a
    comma-delimited list. Result terms are ranked by Jaccard index
    (descending), ties broken by p-value (ascending). The first term is
    written as a JSON object with the keys ``name``, ``source``, ``p_value``,
    ``description`` and ``intersections``.

    :param inputfile: passed to ``read_inputfile`` to obtain the gene string
    :param theargs: object providing ``organism`` and ``maxpval`` attributes
    :return: 0, both when a term was written and when no terms were found
    """
    raw_genes = read_inputfile(inputfile)
    gp = GProfiler(return_dataframe=True)
    # Split first, then trim every token: stripping ',' before '\n' left a
    # trailing comma (and therefore an empty gene) on input ending in ",\n".
    genes = [gene.strip() for gene in raw_genes.split(',') if gene.strip()]
    df_result = gp.profile(query=genes,
                           organism=theargs.organism,
                           user_threshold=theargs.maxpval,
                           no_evidences=False)
    if df_result.shape[0] == 0:
        sys.stderr.write('No terms found\n')
        return 0

    # Jaccard index expressed through precision and recall:
    # J = 1 / (1/precision + 1/recall - 1)
    df_result['Jaccard'] = 1.0 / (1.0 / df_result['precision'] +
                                  1.0 / df_result['recall'] - 1)
    df_result = df_result.sort_values(['Jaccard', 'p_value'],
                                      ascending=[False, True])
    df_result = df_result.reset_index(drop=True)

    # After reset_index the best-ranked term carries label 0.
    reported_fields = ('name', 'source', 'p_value', 'description',
                       'intersections')
    theres = {field: df_result[field][0] for field in reported_fields}
    json.dump(theres, sys.stdout)
    sys.stdout.flush()
    return 0

Beispiel #3

0

Datei anzeigen

def add_gene_name_gprofiler(data_df, col, organism):
    """
    Left-join g:Profiler conversion results onto ``data_df``.

    The identifiers in ``data_df[col]`` are converted with
    ``target_namespace='UNIPROTSWISSPROT'``. The ``name`` and ``namespaces``
    columns of the result are added to the frame as ``gene_name`` and
    ``gene_name_bank``.

    :param data_df: frame holding the identifiers to convert
    :param col: name of the identifier column in ``data_df``
    :param organism: organism id forwarded to ``GProfiler.convert``
    :return: merged frame (also printed to stdout)
    """
    query_ids = data_df[col].tolist()
    print(type(query_ids))
    gp = GProfiler(return_dataframe=True)

    # details of what returns the following function : caleydo.org/tools/
    res = gp.convert(organism=organism,
                     query=query_ids,
                     target_namespace='UNIPROTSWISSPROT')

    # Rename on the selection instead of in place: an in-place rename of a
    # column slice of `res` triggers pandas' SettingWithCopyWarning.
    res_f = res[['incoming', 'name', 'namespaces']].rename(
        columns={"incoming": col,
                 "name": "gene_name",
                 "namespaces": "gene_name_bank"})

    # Collapse the namespace combinations into readable database labels.
    res_f = res_f.replace({'UNIPROTSWISSPROT,UNIPROT_GN_ACC': 'Swiss-Prot',
                           'UNIPROTSPTREMBL,UNIPROT_GN_ACC': 'TrEMBL'})

    df = data_df.merge(res_f, how='left', on=col)

    # TODO check if concordant with description
    print(df)

    return df

Beispiel #4

0

Datei anzeigen

 def get_gene_list(self, samples_stat):
     """Profile each sample's genes and write the per-sample result files.

     For every key of ``samples_stat`` with a non-empty ``'gene'`` entry,
     this runs a g:Profiler enrichment query (organism ``hsapiens``), writes
     the rows whose ``native`` id contains ``GO`` / ``KEGG`` to CSV and passes
     them to ``self.plot_go``, writes the ENTREZGENE_ACC conversion table,
     and finally writes the plain gene list. All files go under
     ``{self.module}/{sample}/``.
     """
     csv_options = dict(header=True, index=False, sep=',')
     for sample in samples_stat:
         genes = samples_stat[sample]['gene']
         if len(genes) == 0:
             # Nothing to profile for this sample.
             continue

         where = dict(module=self.module, sample=sample)
         client = GProfiler(user_agent='ExampleTool', return_dataframe=True)
         enrichment = client.profile(organism='hsapiens', query=genes)

         go_terms = enrichment[enrichment['native'].str.contains('GO')]
         go_terms.to_csv(
             '{module}/{sample}/GO_FuncTerm.csv'.format(**where),
             **csv_options)
         self.plot_go(go_terms, sample, 'GO')

         kegg_terms = enrichment[enrichment['native'].str.contains('KEGG')]
         kegg_terms.to_csv(
             '{module}/{sample}/KEGG_FuncTerm.csv'.format(**where),
             **csv_options)
         self.plot_go(kegg_terms, sample, 'KEGG')

         converted = client.convert(organism='hsapiens',
                                    query=genes,
                                    target_namespace='ENTREZGENE_ACC')
         converted.to_csv(
             '{module}/{sample}/Entrez_Gene_converted.csv'.format(**where),
             **csv_options)

         list_path = '{module}/{sample}/gene_list.txt'.format(**where)
         with open(list_path, 'wt') as handle:
             handle.write('\n'.join(genes))

Beispiel #5

0

Datei anzeigen

Datei: feature_selection.py Projekt: SirSharpest/RNA-Seq-Analysis

def get_gene_names(geneList):
    """Convert identifiers with g:Profiler and return id, name and short description.

    The query runs against organism ``athaliana``. Each description is cut at
    the first ``[`` and then at the first ``;``.

    :param geneList: identifiers forwarded as the ``query`` of ``GProfiler.convert``
    :return: DataFrame with the columns ``incoming``, ``name``, ``description``
    """
    gp = GProfiler(return_dataframe=True)
    converted = gp.convert(organism='athaliana', query=geneList)
    # Work on an explicit copy so the column assignment below does not write
    # into a slice of the conversion result (SettingWithCopyWarning).
    df = converted[['incoming', 'name', 'description']].copy()
    # Series.map keeps a Series result even for an empty frame, where the
    # row-wise DataFrame.apply returned a DataFrame instead.
    df['description'] = df['description'].map(
        lambda text: text.split('[')[0].split(';')[0])
    return df

Beispiel #6

0

Datei anzeigen

def add_gene_name_gprofiler(data_df: pd.DataFrame, col: str,
                            organism: str) -> pd.DataFrame:
    """
    Left-join g:Profiler conversion results onto ``data_df``.

    The identifiers in ``data_df[col]`` are converted with
    ``target_namespace='UNIPROTSWISSPROT'``. The ``name`` and ``namespaces``
    columns of the result are added as ``gene_name`` and ``gene_name_bank``,
    then only the first row per ``Accession`` value is kept.

    :param data_df: frame holding the identifiers to convert
    :param col: name of the identifier column in ``data_df``
    :param organism: organism id forwarded to ``GProfiler.convert``
    :return: merged, de-duplicated frame
    """
    gp = GProfiler(return_dataframe=True)
    protein_list = data_df[col].tolist()

    # details of what returns the following function : https://pypi.org/project/gprofiler-official/
    # TODO : documentation
    res = gp.convert(organism=organism,
                     query=protein_list,
                     target_namespace='UNIPROTSWISSPROT')

    # Rename on the selection instead of in place: an in-place rename of a
    # column slice of `res` triggers pandas' SettingWithCopyWarning.
    res_f = res[['incoming', 'name', 'namespaces']].rename(columns={
        "incoming": col,
        "name": "gene_name",
        "namespaces": "gene_name_bank"
    })

    # Collapse the namespace combinations into readable database labels.
    res_f = res_f.replace({
        'UNIPROTSWISSPROT,UNIPROT_GN_ACC': 'Swiss-Prot',
        'UNIPROTSPTREMBL,UNIPROT_GN_ACC': 'TrEMBL'
    })

    df = data_df.merge(res_f, how='left', on=col)

    # gProfiler returns one line for each alias of the gene (as in alias section in Uniprot): keep only the first one
    # NOTE(review): the column is hard-coded as 'Accession' rather than `col`;
    # this raises KeyError when data_df has no such column -- confirm intent.
    df = df[~df['Accession'].duplicated(keep='first')]

    return df

Beispiel #7

0

Datei anzeigen

def gene_name_annotation_short(genes):
    """Convert genes with g:Profiler and return a trimmed annotation table.

    The query runs against organism ``mmusculus`` with target namespace
    ``ENTREZGENE_ACC``. A ``short_description`` column is derived from
    ``description`` by deleting the bracketed text, and the bookkeeping
    columns of the conversion result are dropped.

    :param genes: identifiers forwarded as the ``query`` of ``GProfiler.convert``
    :return: DataFrame without the description/name/bookkeeping columns
    """
    gp = GProfiler(return_dataframe=True)
    gene_annot = gp.convert(organism='mmusculus',
                            query=genes,
                            target_namespace='ENTREZGENE_ACC')

    # Raw string: '\[' in a normal literal is an invalid escape sequence
    # (DeprecationWarning, SyntaxWarning on newer Pythons). The pattern is
    # greedy, so it removes everything from the first '[' to the last ']'.
    gene_annot['short_description'] = gene_annot['description'].map(
        lambda x: re.sub(r'\[.+\]', '', x))
    gene_annot = gene_annot.drop(
        ['description', 'name', 'converted', 'n_incoming', 'n_converted',
         'namespaces', 'query'],
        axis=1)
    return gene_annot

Beispiel #8

0

Datei anzeigen

def gene_name_annotation_long(genes):
    """Convert genes with g:Profiler and display the full annotation table.

    The query runs against organism ``mmusculus`` with target namespace
    ``ENTREZGENE_ACC``. A ``short_description`` column is derived from
    ``description`` by deleting the bracketed text, the bookkeeping columns
    are dropped, and the table is passed to ``display`` with the pandas
    row/column display limits lifted.

    :param genes: identifiers forwarded as the ``query`` of ``GProfiler.convert``
    :return: whatever ``display`` returns
    """
    gp = GProfiler(return_dataframe=True)
    gene_annot = gp.convert(organism='mmusculus',
                            query=genes,
                            target_namespace='ENTREZGENE_ACC')

    # Raw string: '\[' in a normal literal is an invalid escape sequence
    # (DeprecationWarning, SyntaxWarning on newer Pythons). The pattern is
    # greedy, so it removes everything from the first '[' to the last ']'.
    gene_annot['short_description'] = gene_annot['description'].map(
        lambda x: re.sub(r'\[.+\]', '', x))
    gene_annot = gene_annot.drop(
        ['description', 'name', 'converted', 'n_incoming', 'n_converted',
         'namespaces', 'query'],
        axis=1)
    # Lift the display limits only for the duration of the call.
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # print all lines
        return display(gene_annot)

Beispiel #9

0

Datei anzeigen

Datei: gProfiler.py Projekt: LisaChabrier/FusionsEvents

def run_gProfiler(comp, org):
    """Run a g:Profiler enrichment query on the ids found in ``comp["composite"]``.

    Each distinct composite name is split on ``|`` and its second field is
    used as the query id.

    :param comp: mapping/frame with a ``"composite"`` entry of ``|``-delimited names
    :param org: organism id, e.g. ``hsapiens``
    :return: result of ``GProfiler.profile`` (client created with return_dataframe=True)
    """
    # return pandas dataframe or plain python structures
    profiler = GProfiler(return_dataframe=True)
    unique_names = set(comp["composite"])
    list_id = [composite.split('|')[1] for composite in unique_names]
    return profiler.profile(
        organism=org,  # example org: hsapiens
        domain_scope="annotated",
        sources=["GO", "KEGG", "REACTOME"],
        query=list_id,
    )

Beispiel #10

0

Datei anzeigen

def pathway_enrich_genes(genes, databases):
    """Return the ten most significant g:Profiler terms for ``genes``.

    Runs an FDR-thresholded (0.05) enrichment query for ``mmusculus`` with
    evidences enabled, indexes the result by ``native`` term id, sorts by
    ``p_value`` and keeps the columns at positions 1, 2, 5, 10 and 13.
    Also sets the global pandas option ``display.max_colwidth`` to 800.

    :param genes: query genes
    :param databases: data sources forwarded as ``sources``
    :return: DataFrame with at most ten rows
    """
    profiler = GProfiler(return_dataframe=True, user_agent='g:GOSt')

    enrichment = profiler.profile(
        organism='mmusculus',
        sources=databases,
        user_threshold=0.05,
        significance_threshold_method='fdr',
        query=genes,  # "contains the list of enriched genes"
        no_evidences=False,
    )
    ranked = enrichment.set_index('native').sort_values('p_value')
    # Columns are selected by position, so this depends on the column order
    # of the profile result.
    selected = ranked.iloc[:, [1, 2, 5, 10, 13]]

    pd.set_option("display.max_colwidth", 800)
    return selected.iloc[:10, :]

Beispiel #11

0

Datei anzeigen

 def command(self, gene_list, n_top):
     """Return (name, p-value) pairs of the first ``n_top`` 'MF' result rows.

     Runs an ordered, FDR-corrected ``gprofile`` query and keeps the rows
     whose column 9 equals ``'MF'``.

     NOTE(review): judging by the variable names, column -3 holds the term
     name and column 2 the p-value -- confirm against the gprofile layout.
     """
     from gprofiler import GProfiler
     import numpy as np
     client = GProfiler("")
     table = np.array(
         client.gprofile(gene_list,
                         correction_method=GProfiler.THR_FDR,
                         ordered=True))
     mf_rows = table[table[:, 9] == 'MF']
     top_rows = mf_rows[0:n_top]
     pairs = list(zip(top_rows[:, -3], top_rows[:, 2]))
     return np.array(pairs)

Beispiel #12

0

Datei anzeigen

def pathway_enrich(genes, databases):
    """Return the ten most significant g:Profiler terms for ``genes``.

    Runs an FDR-thresholded (0.05) enrichment query for ``mmusculus`` with
    the ``annotated`` domain scope, indexes the result by ``native`` term id,
    sorts by ``p_value`` and keeps the columns at positions 2, 5, 7, 10
    and 1. Also sets the global pandas option ``display.max_colwidth`` to 800.

    :param genes: query genes
    :param databases: data sources forwarded as ``sources``
    :return: DataFrame with at most ten rows
    """
    profiler = GProfiler(return_dataframe=True, user_agent='g:GOSt')

    enrichment = profiler.profile(
        organism='mmusculus',
        sources=databases,
        user_threshold=0.05,
        significance_threshold_method='fdr',
        domain_scope='annotated',
        # background= 10000,
        query=genes,  # "contains the list of enriched genes"
    )

    ranked = enrichment.set_index('native').sort_values('p_value')
    # Columns are selected by position, so this depends on the column order
    # of the profile result.
    selected = ranked.iloc[:, [2, 5, 7, 10, 1]]
    pd.set_option("display.max_colwidth", 800)
    return selected.iloc[:10, :]

Beispiel #13

0

Datei anzeigen

    def command(self, gene_list, n_top):
        """Return a Name / p-values frame of the first ``n_top`` 'MF' result rows.

        Runs an ordered, FDR-corrected ``gprofile`` query and keeps the rows
        whose column 9 equals ``'MF'``.

        NOTE(review): judging by the variable names, column -3 holds the term
        name and column 2 the p-value -- confirm against the gprofile layout.
        """
        from gprofiler import GProfiler
        import numpy as np
        client = GProfiler("")
        table = np.array(
            client.gprofile(gene_list,
                            correction_method=GProfiler.THR_FDR,
                            ordered=True))
        mf_rows = table[table[:, 9] == 'MF']
        top_rows = mf_rows[0:n_top]
        pairs = np.array(list(zip(top_rows[:, -3], top_rows[:, 2])))

        return pd.DataFrame(data=pairs, columns=["Name", "p-values"])

Beispiel #14

0

Datei anzeigen

Datei: gprofiler_utilities.py Projekt: wchwang/Toolbox

def Functional_profiling(gene_list,
                         organism='hsapiens',
                         sources=[
                             "GO:MF", "GO:CC", "GO:BP", "KEGG", "REAC", "WP",
                             "TF", "MIRNA", "HPA", "CORUM", "HP"
                         ],
                         user_threshold=0.05):
    """Run a g:Profiler enrichment query with ``no_iea=True``.

    :param gene_list: query genes
    :param organism: organism id forwarded to ``GProfiler.profile``
    :param sources: data sources to query; the default list is shared between
        calls and is passed on unmodified
    :param user_threshold: significance threshold forwarded to the query
    :return: result of ``GProfiler.profile`` (client created with return_dataframe=True)
    """
    client = GProfiler(return_dataframe=True)
    profile_options = {
        'query': gene_list,
        'organism': organism,
        'user_threshold': user_threshold,
        'no_iea': True,
        'sources': sources,
    }
    return client.profile(**profile_options)

Beispiel #15

0

Datei anzeigen

def profile_genes_with_active_sites(enriched_genes,
                                    background=None) -> DataFrame:
    """Run ``gprofile`` on ``enriched_genes`` and return the result as a table.

    The client is created with ``want_header=True``; the first row of the
    response supplies the column names and the remaining rows the data.

    :param enriched_genes: query genes; an empty collection short-circuits
    :param background: forwarded as ``custom_bg``
    :return: result table, or an empty DataFrame when there are no input
        genes or the response is empty
    """
    if len(enriched_genes) > 0:
        client = GProfiler('ActiveDriverDB', want_header=True)
        table = client.gprofile(enriched_genes, custom_bg=background)

        if table:
            column_names, *records = table
            return DataFrame(records, columns=column_names)

    return DataFrame()

Beispiel #16

0

Datei anzeigen

Datei: Enrichment.py Projekt: slowkow/pegasus

    def execute(self):
        """Run one g:Profiler query per sheet of the markers spreadsheet.

        Every sheet of ``<markers_spreadsheet>`` contributes the first
        ``--max_genes`` entries of its ``feature`` column as a named query.
        The combined enrichment result is written to ``<output_spreadsheet>``
        without the index.
        """
        sheets = pd.read_excel(self.args["<markers_spreadsheet>"],
                               sheet_name=None)
        output_spreadsheet = self.args['<output_spreadsheet>']
        organism = self.args["--organism"]
        enrichment_threshold = float(self.args["--enrichment_threshold"])
        max_genes = int(self.args['--max_genes'])

        from gprofiler import GProfiler
        client = GProfiler(return_dataframe=True)

        # One named query per sheet, truncated to the first max_genes features.
        query = {
            sheet_name: sheet['feature'].values.tolist()[0:max_genes]
            for sheet_name, sheet in sheets.items()
        }

        enrichment = client.profile(organism=organism,
                                    query=query,
                                    user_threshold=enrichment_threshold)
        enrichment.to_excel(output_spreadsheet, index=False)

Beispiel #17

0

Datei anzeigen

def pathway_enrich_plot(genes, databases, title, background_genes, name_output, save: bool = False):
    """Plot the ten most significant enrichment terms as a horizontal bar graph.

    Inputs:
        genes              - list of genes to be probed
        databases          - which databases to query, more information can be
                             found here: https://biit.cs.ut.ee/gprofiler/page/apis
        title              - title for the figure
        background_genes   - forwarded as the ``background`` of the query
        name_output        - output path used when ``save`` is true
        save               - whether to save the figure (as PDF)

    Returns whatever ``plt.show()`` returns.
    """
    # Interpretation of differentially expressed genes in cluster 0 cells - g:profiler

    gp = GProfiler(return_dataframe=True, user_agent='g:GOSt')

    cluster_enrichment = gp.profile(organism='mmusculus', sources=databases, user_threshold=0.05,
                                    significance_threshold_method='fdr',
                                    background=background_genes,
                                    query=genes)  # "contains the list of enriched genes"

    # Columns are selected by position; copy so the new column below is not
    # written into a slice of the profile result.
    cluster_enrichment_results = (cluster_enrichment.set_index('native')
                                  .sort_values('p_value')
                                  .iloc[:, [2, 5, 7, 10, 1]]
                                  .copy())

    # New column with the negative log p-value. math.log10 is more accurate
    # than math.log(x, 10) (e.g. for exact powers of ten).
    cluster_enrichment_results['-log10_p_value'] = cluster_enrichment_results['p_value'].map(
        lambda x: -math.log10(x))

    # The 'REAC:0000000' row is excluded from the plot when present.
    if 'REAC:0000000' in cluster_enrichment_results.index.tolist():
        cluster_enrichment_results = cluster_enrichment_results.drop(labels='REAC:0000000', axis=0)

    plt.rcdefaults()
    fig, ax = plt.subplots()

    cluster_name = cluster_enrichment_results['name'].head(10)
    y_pos = np.arange(len(cluster_name))
    enrichment_value = cluster_enrichment_results['-log10_p_value'].head(10)

    ax.barh(y_pos, enrichment_value, align='center', color='black')
    ax.set_yticks(y_pos)
    ax.set_yticklabels(cluster_name)
    ax.invert_yaxis()  # labels read top-to-bottom
    ax.set_xlabel('-log10 p value')
    ax.set_title(title)

    if save:
        plt.savefig(name_output, format='pdf', bbox_inches="tight")

    return plt.show()

Beispiel #18

0

Datei anzeigen

Datei: database.py Projekt: JonETJakobsson/scConnect

def find_orth_gene(gene, organism, target):
    """Find orthologous gene(s) via g:Profiler.

    When ``organism`` equals ``target`` no lookup is made and the input gene
    is returned as a one-element list. Otherwise the ``name`` values of the
    ``orth`` result are returned, skipping missing values and ``"N/A"``.

    returns a list of gene(s)"""
    if organism == target:  # do not search if original gene is known
        return [gene]

    # Imported only when a lookup is needed, so the same-organism shortcut
    # works without gprofiler installed and without creating a client.
    import pandas as pd
    from gprofiler import GProfiler

    results = pd.DataFrame(
        GProfiler().orth(query=gene, organism=organism, target=target))
    if "name" not in results.columns:
        # Empty response: the frame has no columns at all.
        return []

    # The original called dropna() without using its result, so missing
    # names passed through the "N/A" filter below.
    names = results["name"].dropna()
    return [name for name in names if name != "N/A"]

Beispiel #19

0

Datei anzeigen

Datei: enrichment.py Projekt: slowkow/pegasus

def enrichment_analysis(markers: Dict[str, Dict[str, pd.DataFrame]],
                        max_genes: int = 100,
                        organism: str = 'hsapiens',
                        enrichment_threshold: float = 0.05) -> pd.DataFrame:
    """Run g:Profiler enrichment (https://biit.cs.ut.ee/gprofiler/gost) on marker genes.

    For every cluster, the index of its ``'up'`` and ``'down'`` frames
    supplies the gene lists; non-empty lists become the named queries
    ``<cluster>-up`` and ``<cluster>-down``.

    Parameters
    ----------
    markers: ``Dict[str, Dict[str, pd.DataFrame]]``
        Per-cluster mapping holding an ``'up'`` and a ``'down'`` frame.
    max_genes: ``int``, optional, default: 100
        Maximum number of genes taken from the start of each list.
    organism: ``str``, optional, default: ``hsapiens``
        Organism. See https://biit.cs.ut.ee/gprofiler/page/organism-list for full list.
    enrichment_threshold: ``float``, optional, default: ``0.05``
        Forwarded to the query as ``user_threshold``.

    Returns
    -------
    ``pd.DataFrame``
        Result of ``GProfiler.profile``.
    """
    start = time.perf_counter()
    from gprofiler import GProfiler
    client = GProfiler(return_dataframe=True)

    query = {}
    for cluster, tables in markers.items():
        for direction, suffix in (('up', '-up'), ('down', '-down')):
            gene_list = tables[direction].index.values.tolist()
            if len(gene_list) > 0:
                query[cluster + suffix] = gene_list[0:max_genes]

    result = client.profile(organism=organism,
                            query=query,
                            user_threshold=enrichment_threshold)
    elapsed = time.perf_counter() - start
    logger.info(
        "Enrichment analysis is finished. Time spent = {:.2f}s.".format(
            elapsed))
    return result

Beispiel #20

0

Datei anzeigen

Datei: revigo.py Projekt: gokceneraslan/sctoolkit

def enrich_and_simplify(sets,
                        intersections=True,
                        sources=('GO:BP', ),
                        organism='hsapiens',
                        reduce_limit=0,
                        **revigo_kwds):
    """Run g:Profiler on ``sets`` and reduce each query's term list with ``revigo``.

    Parameters
    ----------
    sets
        Gene sets to profile; anything that is not a dict is turned into a list.
    intersections
        When true, evidences are requested (``no_evidences=False``).
    sources
        Data sources forwarded to ``GProfiler.profile``.
    organism
        Organism id forwarded to ``GProfiler.profile``.
    reduce_limit
        Queries with more than this many terms are passed through ``revigo``;
        ``None`` skips the reduction step entirely.
    **revigo_kwds
        Extra keyword arguments for ``revigo``.

    Returns
    -------
    (df, revs)
        ``df`` is the (possibly reduced) enrichment table and ``revs`` maps
        each reduced query to its raw ``revigo`` output.
    """
    from gprofiler import GProfiler

    if not isinstance(sets, dict):
        sets = list(sets)

    gprofiler = GProfiler(user_agent="scanpy", return_dataframe=True)
    gprofiler_kwargs = {'no_evidences': not intersections, 'sources': sources}

    df = gprofiler.profile(sets, organism=organism, **gprofiler_kwargs)
    revs = {}

    if reduce_limit is not None:
        dfs = []
        for q in df['query'].unique():
            df_sub = df[df['query'] == q].copy()
            go = df_sub.native.tolist()
            pvals = df_sub.p_value.tolist()

            if len(go) > reduce_limit:
                r = revigo(go, pvals, **revigo_kwds)
                revs[q] = r

                # Align the revigo table with the enrichment table so the
                # merge joins on the shared columns.
                r = r.rename(columns={
                    'term_ID': 'native'
                }).drop(columns='description').assign(query=q)
                dfs.append(df_sub.merge(r))
            else:
                # Keep only this query's rows. The original appended the
                # whole `df` here, duplicating every query's rows once per
                # unreduced query.
                dfs.append(df_sub.assign(eliminated=0))

        # pd.concat raises on an empty list; leave df untouched when the
        # enrichment returned no queries at all.
        if dfs:
            df = pd.concat(dfs, axis=0).reset_index(drop=True)

    return df, revs

Beispiel #21

0

Datei anzeigen

def main(args):
    """Profile a gene list against a custom GMT and write the result as CSV.

    ``args`` must provide ``filename`` (one gene per line), ``output`` (CSV
    path) and either ``gmt`` (a GMT file uploaded to the g:Profiler custom
    endpoint to obtain a token) or ``token`` (a previously obtained token).
    The token is passed to ``GProfiler.profile`` as the organism.

    Raises ValueError when neither ``gmt`` nor ``token`` is supplied.
    """
    gp = GProfiler(
        user_agent='gprofiler_custom_gmt',  # optional user agent
        return_dataframe=True,  # return pandas dataframe or plain python structures
    )
    # Close the file deterministically and skip blank lines, which would
    # otherwise be sent as empty gene ids.
    with open(args.filename) as gene_file:
        genes = [line.strip() for line in gene_file if line.strip()]

    if args.gmt is not None:
        with open(args.gmt) as f:
            response = requests.post(
                'https://biit.cs.ut.ee/gprofiler/api/gost/custom/',
                json={
                    'gmt': f.read(),
                    'name': args.gmt
                })
        token = get_token_form_response(response)
    elif args.token is not None:
        token = args.token
    else:
        # Was spelled `ValuError`, which raised NameError instead.
        raise ValueError("Please supply either a token or a gmt file")
    res = gp.profile(genes, organism=token)
    res.to_csv(args.output)

Beispiel #22

0

Datei anzeigen

def make_tcga_gtex_id_mapping_file(tcga_gtex_id_df, tcga_gtex_id_addr):
    """Add Ensembl gene id and gene symbol columns and write the table as TSV.

    The ``sample`` column is split at the first ``.``; the part before it is
    stored as ``ensembl_gene`` and converted with g:Profiler (organism
    ``hsapiens``, target namespace ``ENSG``). The ``name`` of the first
    conversion row per id is stored as ``gene_symbol``.

    ``tcga_gtex_id_df`` is modified in place (two columns are added).

    :param tcga_gtex_id_df: frame with a ``sample`` column
    :param tcga_gtex_id_addr: output path of the tab-separated file
    """
    ensembl_id = tcga_gtex_id_df['sample'].str.split(".", n=1, expand=True)
    tcga_gtex_id_df['ensembl_gene'] = ensembl_id[0]

    gp = GProfiler(return_dataframe=True)
    ensembl_2_symbol = gp.convert(
        organism='hsapiens',
        query=tcga_gtex_id_df['ensembl_gene'].tolist(),
        target_namespace='ENSG')

    # Look symbols up by id instead of assigning the `name` column of a
    # merge: the merge result has a fresh 0..n-1 index (and extra rows when
    # an id converts to several rows), so index-aligned assignment put
    # symbols on the wrong rows whenever the caller's index differed.
    symbol_by_id = (ensembl_2_symbol
                    .drop_duplicates(subset='incoming')
                    .set_index('incoming')['name'])
    tcga_gtex_id_df['gene_symbol'] = tcga_gtex_id_df['ensembl_gene'].map(
        symbol_by_id)

    tcga_gtex_id_df.to_csv(tcga_gtex_id_addr, sep='\t', index=False)

Beispiel #23

0

Datei anzeigen

Datei: gsea.py Projekt: gitter-lab/influenza-pb2

def gsea_connected_components(G, outdir):
    """
    Run GProfiler enrichment (GO:BP only) on every connected component of G.

    A directed graph is converted to undirected first. Component ``i`` is
    written to ``<outdir>/enrich_<i>.tsv``; a component whose file already
    exists is not queried again.

    Returns
    -------
    rv : list of (set, str)
        one tuple per component: the gene set and its enrichment output file
    """
    rv = []
    gp = GProfiler("FluPath/0.1")
    undirected = G.to_undirected() if nx.is_directed(G) else G
    components = list(nx.connected_components(undirected))
    for comp_no, comp in enumerate(components):
        # TODO how are http errors handled?
        enrich_out_fp = os.path.join(outdir, "enrich_{}.tsv".format(comp_no))
        already_done = os.path.exists(enrich_out_fp)
        if not already_done:
            write_enrich(gp.gprofile(comp, src_filter=['GO:BP']),
                         enrich_out_fp)
        rv.append((comp, enrich_out_fp))
    return rv

Beispiel #24

0

Datei anzeigen

Datei: string_handler.py Projekt: wchwang/Toolbox

def add_ensembl_gene_into_string_info(string_info_addr):
    """Join g:Profiler ``orth`` results onto a STRING protein-info table.

    The tab-separated file at ``string_info_addr`` is read, the part of
    ``protein_external_id`` after the first ``.`` is stored as
    ``protein_ensembl.protein`` and used as the query. The query result
    (indexed by ``incoming``) and the info table (indexed by
    ``protein_ensembl.protein``) are concatenated column-wise and written to
    a hard-coded output path. Intermediate values are printed.

    :param string_info_addr: path of the STRING info file to read
    """
    string_info_df = pd.read_csv(string_info_addr, sep='\t')
    id_parts = string_info_df['protein_external_id'].str.split(
        ".", n=1, expand=True)
    string_info_df['protein_ensembl.protein'] = id_parts[1]

    protein_ensembl = id_parts[1].tolist()
    print(protein_ensembl)

    from gprofiler import GProfiler
    client = GProfiler(return_dataframe=True)

    # NOTE(review): target='ENSG' is passed where `orth` takes a target --
    # confirm this is the intended value.
    orth_result = client.orth(organism='mmusculus',
                              target='ENSG',
                              query=protein_ensembl)

    # Index both tables by protein id so the concat below aligns on it.
    orth_by_protein = orth_result.reset_index().set_index('incoming')
    string_info_df = string_info_df.set_index('protein_ensembl.protein')

    print(string_info_df.head())
    print(orth_by_protein.head())
    combined = pd.concat([string_info_df, orth_by_protein],
                         axis=1,
                         sort=False)

    combined.to_csv(
        "/Users/woochanghwang/PycharmProjects/CIMR/Data/STRING/10090.protein.info.v11.0.ensembl.txt",
        sep='\t')

Beispiel #25

0

Datei anzeigen

Datei: interactome_map_parameters.py Projekt: aliyurtsevenn/Cytoscape_Input_Interaction_map

for f in files:
    shutil.move(os.path.join(source, f), destination)

# Next, build the GO annotation graphs for the proteins that have SaintExpress score > 0.5 and BFDR < 0.01
'''
This code asks whether you have one condition or more than one. If you have one, it produces the horizontal
bar graphs for that single condition. Otherwise, it compares the bar graphs of the conditions.

To compare 2 conditions, you need to run this code 2 times with different outputs!
'''

if ask_user == "YES":
    if number_of_conditions == 1:
        # Only one condition! getting GO annotation profiles of proteins that have >0.5 saint score and <0.01 BFDR score

        gp = GProfiler(return_dataframe=True)

        profiler = gp.profile(organism='hsapiens', query=gene_names)

        BP_profiler = profiler[profiler["source"] == "GO:BP"]
        CC_profiler = profiler[profiler["source"] == "GO:CC"]
        MF_profiler = profiler[profiler["source"] == "GO:MF"]

        BP_profiled = BP_profiler.sort_values(by=["p_value"])
        CC_profiled = CC_profiler.sort_values(by=["p_value"])
        MF_profiled = MF_profiler.sort_values(by=["p_value"])

        location_BP = BP_profiled["name"].to_list()[0:10]
        p_BP = BP_profiled["p_value"].to_list()[0:10]
        logged_p_BP = []
        for i in p_BP:

Beispiel #26

0

Datei anzeigen

Datei: community-GO.py Projekt: josemaz/kirc-stages

# Detect communities with Infomap and record each gene's membership.
community = g.community_infomap()
print("Number of Communities:", len(community))

df = pd.DataFrame({'gene': g.vs['name'], 'community': community.membership})
# Order by size of communities
valuec = df['community'].value_counts()
biggest = valuec.unique()[0]  # size of the largest community (value_counts sorts descending)
# Keep the largest community (or communities tied for it) and every
# community with at least 10 genes.
values = valuec[(valuec >= biggest) |
                (valuec >= 10)]  # Values of communities filtered
order = values.index.tolist()
df = df[df['community'].isin(order)]
df = df.set_index('community')
df = df.loc[order].reset_index()
# print("community id - Number of Genes")
# print(valuec)

#! Enrichment by GO
gp = GProfiler(return_dataframe=True)
# Collect the per-community results and concatenate once:
# DataFrame.append was removed in pandas 2.0.
community_frames = []
print("Community id - Number of nodes")
for name, group in df.groupby('community', sort=False):
    print(name, group.shape[0])
    s = gp.profile(organism='hsapiens', query=group.gene.tolist())
    s['community'] = name
    community_frames.append(s)
# pd.concat raises on an empty list, so fall back to an empty frame.
enrich_communities = (pd.concat(community_frames)
                      if community_frames else pd.DataFrame())
# print(enrich_components)

#! OUTPUT
enrich_communities.to_csv(oname1, sep="\t", index=False)
df.to_csv(oname2, sep="\t", index=False)

Beispiel #27

0

Datei anzeigen

Datei: goEnrichment.py Projekt: sungjoonpark93/cossy_pp

class GOEnrichmentTester():
    """Compare g:Profiler terms of gene lists with terms of COSMIC disease gene sets.

    ``loadCOSMIC`` reads a tab-separated export, groups gene symbols by
    tumour type (somatic / germline) and fetches the enrichment terms of
    every tumour type into ``self.GOList``. The code runs unchanged on
    Python 2 and Python 3.
    """

    def __init__(self):
        """Create the g:Profiler client used for every query."""
        self.gp = GProfiler("COSSY++/1.5")

    def getGoTerms(self, genelist):
        """Return one dict (pvalue, id, category, term) per ``gprofile`` result row.

        The fields are read from positions 2, 8, 9 and 11 of each row.
        """
        return [
            {"pvalue": row[2], "id": row[8], "category": row[9], "term": row[11]}
            for row in self.gp.gprofile(query=genelist)
        ]

    def readTSV(self, fname):
        """Read a tab-separated file into a list of header-keyed dicts.

        The header is the line starting with ``Gene Symbol``. Double quotes
        are removed from every field. Blank lines and lines before the
        header are skipped; a row with fewer fields than the header simply
        lacks the trailing keys.
        """
        records = []
        headers = []
        with open(fname) as reader:
            for line in reader:
                # Drop the line terminator so it does not end up inside the
                # last header name / last field value.
                stripped = line.rstrip("\r\n")
                if not stripped.strip():
                    continue
                values = [x.replace("\"", "") for x in stripped.split("\t")]
                if line.startswith("Gene Symbol"):
                    headers = values
                    continue
                if not headers:
                    # Data before the header cannot be keyed.
                    continue
                # zip() tolerates short rows, where indexing raised IndexError.
                records.append(dict(zip(headers, values)))

        return records

    def loadCOSMIC(self, fname):
        """Group gene symbols by tumour type and build the per-disease GO lists.

        Fills ``self.result`` (``{"somatic": {...}, "germline": {...}}``) and
        ``self.diseaseList`` (tumour types in order of first appearance),
        then calls ``makeGOList``.
        """
        self.result = {"somatic": {}, "germline": {}}
        self.diseaseList = []

        origin_columns = (("somatic", "Tumour Types(Somatic)"),
                          ("germline", "Tumour Types(Germline)"))

        for rec in self.readTSV(fname=fname):
            geneSymbol = rec["Gene Symbol"]

            for origin, column in origin_columns:
                # A short row may lack the column; treat it as empty.
                tumorTypes = [x.strip() for x in rec.get(column, "").strip().split(",")]
                for tumorType in tumorTypes:
                    if tumorType == "":
                        continue

                    self.result[origin].setdefault(tumorType, []).append(geneSymbol)

                    if tumorType not in self.diseaseList:
                        self.diseaseList.append(tumorType)

        self.makeGOList()

    def getGenes(self, disease):
        """Return the somatic genes followed by the germline genes of ``disease``."""
        somaticGenes = self.result["somatic"].get(disease, [])
        germlineGenes = self.result["germline"].get(disease, [])
        return somaticGenes + germlineGenes

    def makeGOList(self):
        """Query the enrichment terms of every known tumour type, sorted by p-value.

        Prints one ``.`` per tumour type as a progress indicator.
        """
        self.GOList = {}

        for tumorType in self.diseaseList:
            print(".")
            genes = self.getGenes(tumorType)
            goTerms = self.getGoTerms(genes)

            # key= works on Python 2 and 3; cmp= exists on Python 2 only.
            goTerms = sorted(goTerms, key=itemgetter("pvalue"))

            self.GOList[tumorType] = goTerms

    def writeCOSMICGO(self, fname):
        """Write ``self.GOList`` to ``fname`` as indented JSON."""
        with open(fname, "w") as w:
            json.dump(self.GOList, w, indent=4)

    def corr(self, genes, disease):
        """Cross-correlate the term ranks of ``genes`` with those of ``disease``.

        Returns the result of ``np.correlate(..., "same")`` (the original
        computed it but returned nothing).
        """
        inputGO = sorted(self.getGoTerms(genes), key=itemgetter("pvalue"))
        inputGO_terms = [x["term"] for x in inputGO]
        inputGO_term_set = set(inputGO_terms)

        answerGO = sorted(
            (x for x in self.GOList[disease] if x["term"] in inputGO_term_set),
            key=itemgetter("pvalue"))
        answerGO_terms = [x["term"] for x in answerGO]

        # NOTE(review): this asserts that the two lists DIFFER in length,
        # which looks inverted for a rank comparison -- kept as-is, confirm.
        assert(len(inputGO_terms) != len(answerGO_terms))

        inputGO_ranks_pair = [(x, inputGO_terms.index(x)) for x in inputGO_terms]
        answerGO_ranks_pair = [(x, answerGO_terms.index(x)) for x in answerGO_terms]

        # Order both rank lists by term name so equal terms line up.
        inputGO_ranks = [x[1] for x in sorted(inputGO_ranks_pair, key=itemgetter(0))]
        answerGO_ranks = [x[1] for x in sorted(answerGO_ranks_pair, key=itemgetter(0))]

        return np.correlate(inputGO_ranks, answerGO_ranks, "same")

    def pvaluecomp(self, a, b):
        """Three-way comparison of two term dicts by their ``pvalue`` (1, -1 or 0)."""
        x = a['pvalue']
        y = b['pvalue']
        if x > y:
            return 1
        elif x < y:
            return -1
        else:
            return 0

Beispiel #28

0

Datei anzeigen

Datei: GeneOntologyAnalysis.py Projekt: willgdjones/GTEx

def lookup_enrichment(gene_set):
    """Run ``gprofile`` on ``gene_set`` after removing its ``None`` entries.

    :param gene_set: iterable of gene ids, possibly containing ``None``
    :return: whatever ``GProfiler.gprofile`` returns for the cleaned list
    """
    known_genes = list(filter(lambda gene: gene is not None, gene_set))
    client = GProfiler("GTEx/wj")
    return client.gprofile(known_genes)

Beispiel #29

0

Datei anzeigen

Datei: pynotes_gprofiler.py Projekt: UBMI-IFC/easyGprofiler

import gprofiler
from gprofiler import GProfiler
# IPython help lookup (`?` suffix): IPython-only syntax, not valid in a plain Python script.
GProfiler?
# Client that returns query results as pandas DataFrames.
gp = GProfiler(return_dataframe=True)
# NOTE(review): `genes` is first assigned further down, so the next two
# statements only work if it was already defined earlier in the session.
gp.profile(organism='mmusculus', query=genes)
genes
# Query identifiers, one per line; split() turns the text into a list of strings.
genes = """ENSMUSG00000076488
ENSMUSG00000065231
ENSMUSG00000079120
ENSMUSG00000047222
ENSMUSG00000097494
ENSMUSG00000064419
ENSMUSG00000095668
ENSMUSG00000059606""".split()
# Enrichment query for the identifiers above.
gp.profile(organism='mmusculus', query=genes)
import requests
def mygprofiler(namelist, organism='mmusculus'):
    """Run gProfiler using POST api with a json query body

    ``namelist`` may be a list, any other iterable of names, or a single
    whitespace-separated string of names.

    Returns a pandas DataFrame with the result.
    Raises ``requests.HTTPError`` when the service answers with an error status.
    """
    if isinstance(namelist, str):
        # list("abc") would explode a string into single characters.
        namelist = namelist.split()
    elif not isinstance(namelist, list):
        namelist = list(namelist)
    r = requests.post(
        url='https://biit.cs.ut.ee/gprofiler/api/gost/profile/',
        json={
            'organism': organism,
            'query': namelist,
        }
        )
    # Fail with the HTTP status instead of a confusing JSON/KeyError later.
    r.raise_for_status()
    df = pd.DataFrame(r.json()['result'])
    return df

Beispiel #30

0

Datei anzeigen

Datei: goEnrichment.py Projekt: sungjoonpark93/cossy_pp

 def __init__(self):
     """Create the GProfiler client and keep it on the instance as ``gp``."""
     client = GProfiler("COSSY++/1.5")
     self.gp = client

Beispiel #31

0

Datei anzeigen

def enrich(
    container: Iterable[str],
    *,
    org: str = "hsapiens",
    gprofiler_kwargs: Mapping[str, Any] = {},
) -> pd.DataFrame:
    """\
    Get enrichment for DE results.

    This is a thin convenience wrapper around the very useful gprofiler_.

    This method dispatches on the first argument, leading to the following two
    signatures::

        enrich(container, ...)
        enrich(adata: AnnData, group, key: str, ...)

    Where::

        enrich(adata, group, key, ...) = enrich(adata.uns[key]["names"][group], ...)

    .. _gprofiler: https://pypi.org/project/gprofiler-official/#description

    Parameters
    ----------
    container
        Contains genes you'd like to search.
    adata
        AnnData object whose group will be looked for.
    group
        The group whose genes should be used for enrichment.
    key
        Key in `uns` to find group under.
    {doc_org}
    gprofiler_kwargs
        Keyword arguments to pass to `GProfiler.profile`, see gprofiler_.

    Returns
    -------
    Dataframe of enrichment results.

    Examples
    --------
    Using `sc.queries.enrich` on a list of genes:

    >>> import scanpy as sc
    >>> sc.queries.enrich(['Klf4', 'Pax5', 'Sox2', 'Nanog'], org="hsapiens")

    Using `sc.queries.enrich` on an :class:`anndata.AnnData` object:

    >>> pbmcs = sc.datasets.pbmc68k_reduced()
    >>> sc.tl.rank_genes_groups(pbmcs, "bulk_labels")
    >>> sc.queries.enrich(pbmcs, "CD34+")
    """
    # gprofiler is an optional dependency, so import it lazily.
    try:
        from gprofiler import GProfiler
    except ImportError:
        raise ImportError(
            "This method requires the `gprofiler-official` module to be installed."
        )
    client = GProfiler(user_agent="scanpy", return_dataframe=True)
    # Work on a copy so the caller's mapping (or the shared default) is not touched.
    profile_kwargs = copy(gprofiler_kwargs)
    # Arguments that `enrich` exposes itself must not arrive via the kwargs.
    for reserved in ("organism",):
        if profile_kwargs.get(reserved) is None:
            continue
        raise ValueError(
            f"Argument `{reserved}` should be passed directly through `enrich`, "
            "not through `gprofiler_kwargs`")
    genes = list(container)
    return client.profile(genes, organism=org, **profile_kwargs)

Beispiel #32

0

Datei anzeigen

# -*- coding: utf-8 -*-
from gprofiler import GProfiler
gp = GProfiler(return_dataframe=True)
import numpy as np
import scipy as sp
import pandas as pd
import os
os.chdir("/home/conor/Documents/Git_Repositories/MSc_Project")

# Input tables, read relative to the working directory set above.
DEG_list = pd.read_csv('data/DEG_list.csv')
methyl_genes = pd.read_csv('data/Methylation_genes.csv')

# Up-regulated genes: adjusted p-value <= 0.1 and logFC >= 0.1,
# as whitespace-trimmed strings.
upreg = [
    gene.strip()
    for gene in DEG_list.loc[(DEG_list['adj.P.Val'] <= 0.1)
                             & (DEG_list['logFC'] >= 0.1), 'Gene'].astype(str)
]

# Down-regulated genes: adjusted p-value <= 0.1 and logFC <= -0.1,
# as whitespace-trimmed strings.
downreg = [
    gene.strip()
    for gene in DEG_list.loc[(DEG_list['adj.P.Val'] <= 0.1)
                             & (DEG_list['logFC'] <= -0.1), 'Gene'].astype(str)
]

# Genes with a positive / negative 'Methylation' value.
hyper = methyl_genes.loc[methyl_genes['Methylation'] > 0, 'Gene'].tolist()

hypo = methyl_genes.loc[methyl_genes['Methylation'] < 0, 'Gene'].tolist()

# Named gene lists, in the order they are processed below.
genelists = dict(downreg=downreg, upreg=upreg, hyper=hyper, hypo=hypo)

for i in genelists:
    print("Calculating", i, "enrichment...")