Exemple #1
0
    def enrichment_analysis(self, library, output):
        '''
        Run an Enrichr query on the combined gene lists and save the report.

        Every identifier in ``self.genes1`` and ``self.genes2`` is translated
        through ``self.mapping`` and the two translated lists are submitted
        to Enrichr as a single list (genes1 first, then genes2).

        Attributes:
        -----------
        library - Enrichr library to be used. Recommendations:
            - 'GO_Molecular_Function_2018'
            - 'GO_Biological_Process_2018'
            - 'GO_Cellular_Component_2018'
            (gseapy.get_library_name() lists every available library)

        output - directory name handed to gseapy as ``outdir``
        '''
        available = gseapy.get_library_name()
        assert library in available, "the library is not available, check gseapy.get_library_name() for available options"
        assert self.convert == True or self.origID == "symbol", "EnrichR accepts only gene names as an input, thus please set 'convert' to True and indicate the original gene ID"

        # Translate each group in turn so the submitted order mirrors the inputs.
        symbols = [self.mapping[gene_id] for gene_id in self.genes1]
        symbols += [self.mapping[gene_id] for gene_id in self.genes2]

        gseapy.enrichr(gene_list=symbols,
                       gene_sets=library,
                       description='pathway',
                       outdir=output,
                       cutoff=0.05)
Exemple #2
0
def gsea(genes,
         description='',
         out='./',
         sleeptime=1,
         sleep=False,
         gsets=None):
    """
    Run Enrichr on a gene list against one or more gene-set libraries.

    genes (list of str): gene symbols
    description (str): name for enrichment report
    out (str): output directory, passed to gseapy as ``outdir``
    sleeptime (int): length of wait time between each query
        (overloading server causes connection to be cut)
    sleep (bool): if True, query the libraries one at a time and wait
        ``sleeptime`` seconds before each query; if False, send every
        library in a single ``gseapy.enrichr`` call
    gsets (list of str): Enrichr library names. ``None`` (the default)
        selects GO_Biological_Process_2018, KEGG_2019_Human and
        WikiPathways_2019_Human.
    """
    if gsets is None:
        # Built per call so the default list is never shared between calls.
        gsets = [
            'GO_Biological_Process_2018', 'KEGG_2019_Human',
            'WikiPathways_2019_Human'
        ]

    if not sleep:
        gseapy.enrichr(gene_list=genes,
                       description=description,
                       gene_sets=gsets,
                       outdir=out)
        return

    for gset in gsets:
        # Throttle: pause before every single-library request.
        time.sleep(sleeptime)
        gseapy.enrichr(gene_list=genes,
                       description=description,
                       gene_sets=gset,
                       outdir=out)
    return
def enrichr_validation(gene_list,
                       gene_rank=None,
                       outdir="validation_results",
                       gene_sets='KEGG_2016'):
    """
    Perform the enrichr tool (http://amp.pharm.mssm.edu/Enrichr/)
    Enrichment of a gene list

    Args:
        -gene_list (list): Gene list to analyze
        -gene_rank (list): Ranking of the genes (according to a scoring function).
            When given, genes and scores are combined into a two-column
            ('gene', 'score') DataFrame that is submitted instead of the
            plain list.
        -outdir (str): Location to save the files
        -gene_sets (str): Gene set to use for the enrichment

    Returns:
        The rows of ``enr.res2d`` whose "Adjusted P-value" is below ``pvalue``.

    Raises:
        AssertionError: if ``gene_rank`` is given but is not a list.
    """
    if gene_rank is None:
        query = gene_list
    else:
        assert isinstance(gene_rank, list), "please provide gene_rank as a list"
        query = pd.DataFrame(np.array([gene_list, gene_rank]).T,
                             columns=['gene', 'score'])

    # Both branches honour the caller's ``gene_sets`` and ``outdir``.
    enr = gp.enrichr(gene_list=query,
                     description='pathway',
                     gene_sets=gene_sets,
                     outdir=outdir,
                     cutoff=0.05,
                     format='png')

    # NOTE(review): `pvalue` is not defined in this function; it is expected
    # to exist at module scope -- confirm against the rest of the file.
    result = enr.res2d[enr.res2d["Adjusted P-value"] < pvalue]

    return result
Exemple #4
0
def res(x_one, y_one, top):
    """
    Pair the P-values of the top Enrichr terms of one gene list with the
    P-values the same terms get for a second gene list.

    The first `top` terms reported for `x_one` (cutoff 0.05) are looked up in
    the results for `y_one` (cutoff 1.0). Any failure of either Enrichr query
    is swallowed and produces an empty result.

    Returns a list of (p_value_in_x, p_value_in_y) tuples, one per term found
    in both result tables.
    """
    pairs = []

    try:
        x_report = gp.enrichr(gene_list=x_one,
                              gene_sets=lib,
                              organism='Human',
                              cutoff=0.05)
        top_terms = x_report.results[['Term', 'P-value']].head(top).values.tolist()
    except Exception:
        return pairs

    if not top_terms:
        return pairs

    try:
        y_report = gp.enrichr(gene_list=y_one,
                              gene_sets=lib,
                              organism='Human',
                              cutoff=1.0)
        y_table = y_report.results[['Term', 'P-value']]
    except Exception:
        return pairs

    for term_name, x_pvalue in top_terms:
        matches = y_table.loc[y_table['Term'] == term_name]
        if matches.shape[0] > 0:
            # Only the first matching row contributes its P-value.
            first_match = matches.iloc[0].values.tolist()
            pairs.append((x_pvalue, first_match[1]))

    return pairs
Exemple #5
0
def enrichr(gene_list,
            description,
            out_dir,
            scan=None,
            max_terms=10,
            figsize=(12, 6),
            run_main=False):
    '''
    Run Enrichr on a gene list against a panel of libraries, save one PNG
    report per library and write the gene list to an Excel file.

    The built-in panel is KEGG, GO biological process, ChIP-X consensus TFs,
    ChEA and OMIM disease; entries of `scan` are added to it (or override an
    entry that uses the same nickname).

    Inputs
    ------
    gene_list: list of genes to perform enrichment on
    description: string description used in report names and titles
    out_dir: output directory (passed through make_folder first)
    scan: dictionary {nickname: enrichr library name} with additional enrichr
        dbs to scan (http://amp.pharm.mssm.edu/Enrichr/#stats)
    max_terms: limit return plot to this max
    figsize: change fig size
    run_main: forwarded unchanged to out_result

    Returns
    -------

    None

    '''

    out_dir = make_folder(out_dir)

    libraries = {
        'KEGG': 'KEGG_2016',
        'GO_biological_process': 'GO_Biological_Process_2017b',
        'ChIP-X_Consensus_TFs': 'ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X',
        'ChEA': 'ChEA_2016',
        'OMIM_Disease': 'OMIM_Disease'
    }
    if isinstance(scan, dict):
        libraries.update(scan)

    for nick in libraries:
        name = libraries[nick]
        gseapy.enrichr(gene_list=gene_list,
                       gene_sets=name,
                       description=f'{description}_{nick}',
                       outdir=out_dir,
                       figsize=figsize,
                       top_term=max_terms,
                       format='png')

        # Register the report image that gseapy is expected to have written.
        out_result(f'{out_dir}{nick}.{name}.enrichr.reports.png',
                   f'Enrichr: {nick} for {description}',
                   run_main=run_main)

    gene_table = pd.DataFrame({'Gene Name': gene_list},
                              index=range(len(gene_list)))
    gene_table.to_excel(f'{out_dir}{description}_genes.xlsx', index=None)
def adj_enrich_score(ranks, gene_set_file, gene_sets, cutoff=.5):
    """
    Build a (gene set x cluster) table of Enrichr adjusted P-values.

    For every column of `ranks`, the index entries whose value exceeds
    `cutoff` are submitted to Enrichr against `gene_set_file`; the
    "Adjusted P-value" column of the result, indexed by "Term", becomes that
    cluster's column in the returned frame.

    ranks: DataFrame, one column per cluster, indexed by gene
    gene_set_file: value passed to gseapy as ``gene_sets``
    gene_sets: labels used as the index of the returned DataFrame
    cutoff: score threshold a gene must exceed to be submitted

    Returns the DataFrame indexed by `gene_sets`.
    """
    # The frame starts empty; its index only fixes the pathway names.
    nes_df = pd.DataFrame(index=gene_sets)

    for cluster in ranks.columns.tolist():
        print("computing for cluster %s" % cluster)
        above_cutoff = (ranks[cluster] > cutoff).tolist()
        genes = ranks.index[above_cutoff]
        print(len(genes))

        enr = gp.enrichr(gene_list=genes.tolist(),
                         description='test_name',
                         gene_sets=gene_set_file,
                         outdir='../test/enrichr_kegg',
                         cutoff=1)
        try:
            table = enr.results
            table.index = table["Term"]
            nes_df[cluster] = table["Adjusted P-value"]
        except KeyError:
            # Expected columns are missing: show what was returned instead.
            print(enr.results.columns)

    return nes_df
def save_enrichment(x):
    """
    Run Enrichr for every cluster of six clustering result files and save
    the significant terms to 'enrich-cluster/full-results.xlsx'.

    The list of human Enrichr libraries is first dumped to 'gensets.txt'
    (one "<index> <name>" line each). Each cluster's gene list is then
    queried against a single library; the result rows are tagged with the
    model number and cluster id, concatenated, filtered to P-value < 0.05
    and written to sheet "sheet1".

    x: directory containing the gcn-*/gae-* csv files.

    A failed Enrichr query is reported and that cluster is skipped.
    """
    libraries = gp.get_library_name('Human')

    with open('gensets.txt', 'w') as f:
        for position, library_name in enumerate(libraries):
            f.write("%s %s\n" % (position, library_name))
    # NOTE(review): index 53 depends on the ordering returned by
    # get_library_name(); confirm it still selects the intended library.
    lib = libraries[53]

    files = [(1, x+"/gcn-hom-hom.csv"), (2, x+"/gcn-hom-onto.csv"),
             (3, x+"/gcn-onto-onto.csv"), (4, x+"/gae-hom-hom.csv"),
             (5, x+"/gae-hom-onto.csv"), (6, x+"/gae-onto-onto.csv")]

    frames = []
    for key, file in files:
        print(file)
        cluster_data = read_file_2(file)
        for i in cluster_data:
            try:
                enr = gp.enrichr(gene_list=list(cluster_data[i][2]),
                                 gene_sets=lib,
                                 organism='Human',
                                 cutoff=0.05).results
            except Exception as exc:
                # Skip the cluster: reusing a previous `enr` would file its
                # rows under the wrong model/cluster.
                print("enrichr failed for %s cluster %s: %s" % (file, i, exc))
                continue
            enr['model'] = key
            enr['cluster'] = i
            frames.append(enr)

    if frames:
        df = pd.concat(frames)
        df = df[(df['P-value'] < 0.05)]
    else:
        # Nothing succeeded: still produce an (empty) workbook.
        df = pd.DataFrame()

    # The context manager saves and closes the workbook.
    with pd.ExcelWriter('enrich-cluster/full-results.xlsx') as writer:
        df.to_excel(writer, sheet_name="sheet1")
Exemple #8
0
def enrichr_go_bp(symbols=None, cutoff=0.05):
    """
    Query Enrichr (GO_Biological_Process_2017b) and chart the top terms.

    symbols: gene symbols passed to gseapy as ``gene_list``
    cutoff: terms are kept when their "Adjusted P-value" is below this value

    The first ten kept terms are passed, with their -log10 adjusted
    P-values, to ``terminal_bar_chart``. gseapy's output directory is a
    throwaway and is always removed afterwards.

    Exits the process with status 1 if the Enrichr query raises.
    """
    dummy_directory = 'biopipe-enrichr-dummy'
    try:
        enrichr_result = gseapy.enrichr(
            gene_list=symbols,
            gene_sets='GO_Biological_Process_2017b',
            outdir=dummy_directory,
            no_plot=True)
    except Exception:
        print(
            'An error occurred during running enrichr. Please try again later.'
        )
        sys.exit(1)
    finally:
        # ignore_errors: the directory may never have been created when the
        # query failed early, and a cleanup error raised here would replace
        # the SystemExit above with an unrelated traceback.
        shutil.rmtree(dummy_directory, ignore_errors=True)

    result_dataframe = enrichr_result.res2d
    p_value_filtered_result = result_dataframe[
        result_dataframe['Adjusted P-value'] < cutoff]
    transformed_p_values = [
        -np.log10(p)
        for p in p_value_filtered_result['Adjusted P-value'].values
    ]
    terms = p_value_filtered_result['Term'].values

    # Only the first ten (term, score) pairs are charted.
    data = list(zip(terms, transformed_p_values))[:10]
    terminal_bar_chart(data,
                       title='%s [-log10(p)]' % enrichr_result.gene_sets,
                       sort=True)
Exemple #9
0
    def __init__(self, data:H5COUNTS, path="data/interim/",
                 threshold=0.05,
                 gene_sets=['GO_Biological_Process_2018', 'GO_Cellular_Component_2018', 'GO_Molecular_Function_2018'],
                 tumor_ids=[1, 2, 3, 4, 5, 6, 7, 8]):
        """
        Run Enrichr for every marker-gene cluster of every tumor and collect
        the adjusted P-values in ``self.gsea_table``.

        data: object providing ``id2tumor`` (tumor id -> tumor name)
        path: prefix of the "MK_genes_TUMOR<id>.csv" marker files
        threshold: when truthy, only terms with "Adjusted P-value" below it
            are kept
        gene_sets: Enrichr libraries passed to gseapy
        tumor_ids: ids of the tumors to process

        The final table has one row per (tumor, cluster) pair and one column
        per enriched term.
        """
        self.gsea_table = pd.DataFrame()
        self.data = data

        for tumor_id in tumor_ids:
            marker_file = path+"MK_genes_TUMOR{}.csv".format(tumor_id)
            markers = pd.read_csv(marker_file)
            # One "|"-joined string of unique genes per cluster.
            joined_by_cluster = markers.groupby("cluster")["gene"].apply(
                lambda genes: "|".join(genes.unique()))
            tumor_name = data.id2tumor[tumor_id]

            print("Running GSEA for tumor", tumor_name)
            for cluster, joined_genes in joined_by_cluster.items():
                column = tumor_name + "_" + str(cluster)
                enr = gp.enrichr(gene_list=joined_genes.split("|"),
                                 gene_sets=gene_sets,
                                 no_plot=True,
                                 cutoff=0.05)
                table = enr.results
                if threshold:
                    table = table[table["Adjusted P-value"] < threshold]
                by_term = table.set_index("Term")

                # Filled cell by cell so new terms extend the table's rows.
                for term in by_term.index:
                    self.gsea_table.loc[term, column] = by_term.loc[term, "Adjusted P-value"]

        # Rows become "<tumor>_<cluster>" labels, then a two-level index.
        self.gsea_table = self.gsea_table.T
        self.gsea_table.index = pd.MultiIndex.from_tuples(
            self.gsea_table.index.str.split("_", expand=True),
            names=["tumor", "cluster"])
Exemple #10
0
    def _enrichr(self, category, background=None, verbose=True):
        """
        Run Enrichr on a gene list and return the gseapy result object.

        category: either an explicit list of gene ids, or one of 'up',
            'down', 'all' to use the matching list from
            ``self.rnadiff.gene_lists``
        background: Enrichr background; falls back to ``self.background``
            when None
        verbose: forwarded to gseapy

        When ``self.mapper`` is set, the ids are translated to the unique
        values of its 'name' column before the query.
        """
        background = self.background if background is None else background

        if not isinstance(category, list):
            assert category in ['up', 'down', 'all']
            gene_list = list(self.rnadiff.gene_lists[category])
        else:
            gene_list = category

        if self.mapper is not None:
            logger.info("Input gene list of {} ids".format(len(gene_list)))
            mapped = self.mapper.loc[gene_list]['name']
            identifiers = mapped.drop_duplicates().values
            logger.info("Mapped gene list of {} ids".format(len(identifiers)))
            gene_list = list(identifiers)

        return gseapy.enrichr(gene_list=gene_list,
                              gene_sets=self.gene_sets,
                              background=background,
                              verbose=verbose,
                              outdir="test",
                              no_plot=True)
def save_enrichment_set():
    """
    Collect the enriched terms of every cluster of six clustering result
    files and write them to 'enrich-cluster/full_result_dic.csv'.

    Each cluster's gene list (truncated to its first 1000 entries) is queried
    against a single human Enrichr library. The result maps
    "<model>-<cluster>" to the list of returned terms and is handed to
    ``write_file``.

    A cluster whose query or term extraction fails is reported and skipped.
    """
    # NOTE(review): index 53 depends on the ordering returned by
    # get_library_name(); confirm it still selects the intended library.
    lib = gp.get_library_name('Human')[53]

    files = [("gcn-hom-hom", "enrich/gcn-hom-hom.csv"),
             ("gcn-hom-onto", "enrich/gcn-hom-onto.csv"),
             ("gcn-onto-onto", "enrich/gcn-onto-onto.csv"),
             ("gae-hom-hom", "enrich/gae-hom-hom.csv"),
             ("gae-hom-onto", "enrich/gae-hom-onto.csv"),
             ("gae-onto-onto", "enrich/gae-onto-onto.csv")]

    enrich_set = {}
    for key, file in files:
        print(file)
        cluster_data = read_file(file)
        for i in cluster_data:
            print(len(cluster_data[i][2]))
            try:
                enr = gp.enrichr(gene_list=list(cluster_data[i][2])[:1000],
                                 gene_sets=lib,
                                 organism='Human',
                                 cutoff=0.05).results
                name = key + "-" + str(i)
                enrich_set[name] = enr['Term'].to_list()
                print(enr)
            except Exception as exc:
                # Best effort per cluster, but Ctrl-C / SystemExit still
                # propagate and the failure is visible.
                print("enrichr failed for %s-%s: %s" % (key, i, exc))

    write_file("enrich-cluster/full_result_dic.csv", enrich_set)
Exemple #12
0
def vec_enrich(vec, gene_ids, quantile, gene_sets):
    """
    Run Enrichr on the genes whose score lies below a quantile of `vec`.

    vec: per-gene scores
    gene_ids: Ensembl gene ids aligned with `vec`; must support boolean-mask
        indexing
    quantile: quantile of `vec` used as the (strict) upper bound
    gene_sets: Enrichr libraries passed to gseapy

    The selected ids have their version suffix stripped, are mapped to gene
    symbols through a Biomart query on hsapiens_gene_ensembl, and the unique
    symbols are submitted to Enrichr. Returns the gseapy result object.
    """
    ind = np.quantile(vec, quantile) > vec
    print("... {} features selected...".format(sum(ind)))

    # Drop everything from the first "." on (the Ensembl version suffix).
    version_suffix = re.compile("\\..*$")
    genes = [version_suffix.sub("", gene_id) for gene_id in gene_ids[ind]]

    print("Mapping to gene names...")
    bm_result = Biomart().query(
        dataset="hsapiens_gene_ensembl",
        attributes=[
            "ensembl_gene_id",
            "external_gene_name",
            "entrezgene_id",
            "go_id",
        ],
        filters={"ensembl_gene_id": genes},
    )
    gene_symbols = list(bm_result["external_gene_name"].unique())

    print("Calculating enrichment...")
    return gp.enrichr(
        gene_list=gene_symbols,
        gene_sets=gene_sets,
        organism="Human",
        cutoff=0.05,
    )
Exemple #13
0
def enrichr_test(direction="pos", significance=True):
    """
    Run Enrichr on the up- or down-regulated genes of the module-level
    ranking ``rnk`` and write the report below ``OUTPATH``.

    direction: "pos" selects positive fold changes / the head of the
        ranking; any other value selects negative fold changes / the tail
    significance: if True, genes are selected by fold-change sign and
        p-value below ``alpha``; otherwise the first or last ``top`` genes
        of the ranking are used

    Relies on the module-level names rnk, alpha, top, OUTPATH, filename and
    gene_sets. Returns None.
    """
    genes = np.array(rnk["gene"])
    pvals = np.array(rnk["pval"])
    fcs = np.array(rnk["fc"])

    upregulated = direction == "pos"
    if significance:
        sign_mask = fcs > 0 if upregulated else fcs < 0
        hits = set(genes[np.logical_and(sign_mask, pvals < alpha)])
        sur = "sign05"
    else:
        hits = set(genes[:top]) if upregulated else set(genes[-top:])
        sur = "top100"

    outpath = OUTPATH + "/enrichr_%s_%s/" % (direction, sur)
    if not os.path.exists(outpath):
        os.mkdir(outpath)

    gseapy.enrichr(
        gene_list=list(hits),
        gene_sets=gene_sets,
        description="%s_%s_%s" % (filename, direction, sur),
        outdir=outpath,
        cutoff=0.5  # only used for plotting.
    )
Exemple #14
0
def get_ontology_df(
        topic,
        cutoff=0.05,
        threshhold=5e-1,
        gene_sets=[
            'GO_Molecular_Function_2018', 'GO_Biological_Process_2018',
            'GO_Cellular_Component_2018', 'Human_Phenotype_Ontology',
            'GTEx_Tissue_Sample_Gene_Expression_Profiles_up',
            'GTEx_Tissue_Sample_Gene_Expression_Profiles_down',
            'Tissue_Protein_Expression_from_Human_Proteome_Map',
            'KEGG_2019_Human', 'NCI-60_Cancer_Cell_Lines'
        ],
        background=None):
    """
    Query Enrichr for a gene list and return the terms passing a threshold.

    :param topic: list of genes; entries whose string form is 'nan' are dropped
    :param cutoff: Enrichments cutoff forwarded to gseapy
    :param threshhold: threshold on Adjusted P-value (strict upper bound)
    :param gene_sets: Enrichr library names, sent as one comma-joined string
    :param background: enrichment test background; defaults to
        'hsapiens_gene_ensembl' when None
    :return: DataFrame with the columns 'Term', 'Adjusted P-value' and
        'Gene_set'
    """
    library_spec = ','.join(gene_sets)
    reference = 'hsapiens_gene_ensembl' if background is None else background
    query_genes = [gene for gene in topic if str(gene) != 'nan']

    report = gs.enrichr(query_genes,
                        gene_sets=library_spec,
                        cutoff=cutoff,
                        background=reference)
    table = report.results
    passing = table['Adjusted P-value'] < threshhold
    return table[passing][['Term', 'Adjusted P-value', 'Gene_set']]
Exemple #15
0
def perform_GO_enrichment_analysis(inputGenes, geneDescription, threshold):
    """
    Run Enrichr (GO_Biological_Process_2015) on a gene list and write the
    report to './gene_set_enrichment_analysis'.

    inputGenes: genes passed to gseapy as ``gene_list``
    geneDescription: report description
    threshold: passed to gseapy as ``cutoff``

    Returns None.
    """
    # Other libraries that were considered: "KEGG_2016", "Reactome_2013",
    # "GO_Molecular_Function_2015", "WikiPathways_2013"
    targetGeneSet = "GO_Biological_Process_2015"
    outputDir = './gene_set_enrichment_analysis'
    if not os.path.exists(outputDir):
        os.makedirs(outputDir)

    enr = gp.enrichr(gene_list=inputGenes,
                     gene_sets=targetGeneSet,
                     description=geneDescription,
                     outdir=outputDir,
                     cutoff=threshold)
    # The preview below is computed and discarded; nothing is returned.
    enr.res2d.head()
Exemple #16
0
def corr_rep_gene(adata, rep, prefix='', dims=[0, 1], offset=1, layer=None,
                  thresh=0.5, out='./',
                  gsets=['GO_Biological_Process_2018', 'KEGG_2019_Human',
                         'WikiPathways_2019_Human']):
    """
    Save the gene correlations of selected representation components and
    run Enrichr on the strongly correlated genes.

    For every component index in `dims`, the table returned by
    ``corr_comp_gene`` is written to csv (as returned, and sorted by 'R'
    descending). Genes with R > thresh ('pos') and R < -thresh ('neg') are
    each sent to Enrichr when the group is non-empty.

    Use gseapy.get_library_name() to see more gene set option
    """
    for dim in dims:
        corr = corr_comp_gene(adata, rep, dim, offset=offset, layer=layer)
        label = dim + 1  # file names are 1-based
        corr.to_csv('{}{}_{}{}_corr_gene.csv'.format(out, rep, prefix, label))
        ranked = corr.sort_values('R', ascending=False)
        ranked.to_csv(
            '{}{}_{}{}_corr_gene_sorted.csv'.format(out, rep, prefix, label))

        by_sign = (
            ('pos', corr.loc[corr['R'] > thresh, :]),
            ('neg', corr.loc[corr['R'] < -thresh, :]),
        )
        for sign, subset in by_sign:
            if len(subset) > 0:
                gseapy.enrichr(gene_list=list(subset.index),
                               gene_sets=gsets,
                               outdir='{}{}{}_{}'.format(
                                   out, prefix, label, sign))
    return
def calcu_gsea(gene_list, gmt, bg):
    """
    Run Enrichr on `gene_list` against the term sets of `gmt` and return
    the result table.

    gene_list: iterable of genes (materialised into a list)
    gmt: object whose ``term_set`` attribute is passed as ``gene_sets``
    bg: background passed to gseapy (or the number of genes, e.g 20000)

    No output directory is used (``outdir=None``).
    """
    query = list(gene_list)
    report = gp.enrichr(gene_list=query,
                        gene_sets=gmt.term_set,
                        background=bg,
                        description='test_name',
                        outdir=None,
                        cutoff=0.5,  # only used for testing.
                        verbose=True)
    return report.results
Exemple #18
0
def gsea(homepath):
    '''
    Run a DisGeNET Enrichr analysis on the selected genes and save two plots.

    Parameters
    ----------
        `homepath` (str):
          Path where you want to save all the generated files
          and folders. The gene list is read from the first column of
          `homepath`/std_npy/unique_genes_with_frequency.csv.

    Return:
    -------
        None

    Outputs:
    --------
        Creates a directory named enrichr inside `homepath` (if missing)
        and writes a bar plot and a dot plot of the enrichment results
        into it.
    '''
    # Silences every warning for the rest of the process.
    warnings.filterwarnings("ignore")

    enrichr_dir = os.path.join(homepath, "enrichr")
    if not os.path.exists(enrichr_dir):
        os.mkdir(enrichr_dir)

    gene = pd.read_csv(homepath + "/std_npy/unique_genes_with_frequency.csv",
                       header=None)
    gl = [gene[0][row] for row in range(len(gene))]

    enr = gs.enrichr(gene_list=gl,
                     gene_sets='DisGeNET',
                     description='Disease',
                     outdir=homepath + '/enrichr')

    # Imported here, as in the original, so the plotting module is only
    # loaded once the query has run.
    from gseapy.plot import barplot, dotplot

    # Both figures share the title and cutoff; ``ofname`` makes gseapy save them.
    shared = dict(title='DisGeNET', cutoff=0.2)
    barplot(enr.res2d,
            ofname=homepath + '/enrichr/DisGeNET_barplot.png',
            **shared)
    dotplot(enr.res2d,
            cmap='viridis_r',
            ofname=homepath + '/enrichr/DisGeNET_dotplot.png',
            **shared)
Exemple #19
0
def enrichment_GO(gene_list,
                  go_mode='Bio',
                  organism='Human',
                  description='test_name',
                  outdir='enrichment_go',
                  cutoff=0.5):
    '''
    Gene enrichment analysis of GO

    Parameters
    ----------
    gene_list:list
        The gene set to be enrichment analyzed
    go_mode:str
        The module of GO include:'Bio','Cell','Mole'
    organism:str
        Select from (human, mouse, yeast, fly, fish, worm)
    description:str
        The title of enrichment
    outdir:str
        The savedir of enrichment
    cutoff:float
        Show enriched terms which Adjusted P-value < cutoff.

    Returns
    ----------
    result:pandas.DataFrame
        stores your last query

    Raises
    ----------
    ValueError
        If go_mode is not one of 'Bio', 'Cell', 'Mole'.
    '''
    go_libraries = {
        'Bio': 'GO_Biological_Process_2018',
        'Cell': 'GO_Cellular_Component_2018',
        'Mole': 'GO_Molecular_Function_2018',
    }
    # Validate up front: an unknown mode used to leave the library name
    # unbound and fail later with an unrelated UnboundLocalError.
    if go_mode not in go_libraries:
        raise ValueError(
            "go_mode must be one of 'Bio', 'Cell', 'Mole'; got %r" %
            (go_mode, ))
    geneset = go_libraries[go_mode]

    enr = gp.enrichr(
        gene_list=gene_list,
        gene_sets=geneset,
        organism=
        organism,  # don't forget to set organism to the one you desired! e.g. Yeast
        description=description,
        outdir=outdir,
        cutoff=cutoff  # test dataset, use lower value from range(0,1)
    )
    subp = dotplot(enr.res2d, title=description, cmap='seismic')
    print(subp)
    return enr.res2d
def main():
    """
    Command-line entry point: run Enrichr on a list of DE genes.

    Positional argument: a tab-separated file whose first column (the index)
    holds the DE gene names. Option ``-o/--out_file`` names the output file.

    For every entry of the module-level ``GENE_SETS`` mapping the genes are
    queried with a fixed background of 19463 genes; terms whose adjusted
    P-value is below ``GSEA_THRESH`` are kept. The kept terms of all
    collections are written, sorted by adjusted P-value, as a tab-separated
    table with the columns collection, gene_set and adjusted_p_value.
    """
    usage = ""  # TODO
    parser = OptionParser(usage=usage)
    parser.add_option("-o", "--out_file", help="OUTPUT file")
    (options, args) = parser.parse_args()

    if not args:
        # Exit with a usage error instead of an IndexError traceback.
        parser.error("missing positional argument: DE genes file")
    de_genes_f = args[0]
    out_f = options.out_file

    # Get all the DE genes
    de_genes_df = pd.read_csv(de_genes_f, sep='\t', index_col=0)
    # The cleaned list is identical for every database, so build it once.
    de_genes = [x.strip() for x in de_genes_df.index]

    # Perform gene set enrichment
    db_to_gene_sets = {}
    for db_name, db in GENE_SETS.items():
        enr = gp.enrichr(
            gene_list=de_genes,
            gene_sets=[db],
            background=19463,
            no_plot=True,
            cutoff=0.05  # test dataset, use lower value from range(0,1)
        )
        significant = enr.results[
            enr.results["Adjusted P-value"] < GSEA_THRESH]
        # Columns are read by label; positional access on a label-indexed
        # row is deprecated in recent pandas.
        db_to_gene_sets[db_name] = {
            str(term): float(pval)
            for term, pval in zip(significant['Term'],
                                  significant['Adjusted P-value'])
        }

    # Create final dataframe
    da = [(db, gene_set, pval)
          for db, gene_set_to_pval in db_to_gene_sets.items()
          for gene_set, pval in gene_set_to_pval.items()]
    df = pd.DataFrame(data=da,
                      columns=['collection', 'gene_set', 'adjusted_p_value'])
    df = df.sort_values(by='adjusted_p_value', axis=0)
    print('{} total enriched gene sets.'.format(len(df)))

    # Write output
    print('Writing to {}.'.format(out_f))
    df.to_csv(out_f, index=False, sep='\t')
    print('done')
Exemple #21
0
 def enrichr(self, gene_dict, gene_set, key, reg):
     """Run Enrichr for one sample group and return a copy of the result table.

     gene_dict maps a sample group to its enriched gene list, gene_set is the
     GO term library to query, key selects the group and reg is either
     'upreg' or 'downreg' (used in the report description).

     Returns None without querying when the group's gene list is empty."""
     genes = gene_dict[key]
     if not genes:
         return None

     # If no gene is enriched at the cutoff level, enrichr generates no output.
     report = gp.enrichr(gene_list=genes,
                         gene_sets=[gene_set],
                         organism='Human',
                         description=key + "_" + reg,
                         outdir=self.input_dir + 'enrichr/' + key + "/",
                         cutoff=0.1)
     return report.results.copy()
def run_enrichr(gene_list, gene_sets):
    '''
    Run a human Enrichr query and return the gseapy result object.

    gene_list: List
        List containing genes names used for the analysis

    gene_sets: List
        List of enrichr gene libaries to use

    Returns
    enr: Enrichr object
        Analysis output, use "enr.results" to print table of results
    '''
    query_options = {
        'description': 'pathway',
        'organism': 'Human',
        'cutoff': 0.5,
    }
    return gp.enrichr(gene_list=gene_list,
                      gene_sets=gene_sets,
                      **query_options)
Exemple #23
0
def enrichment_KEGG(gene_list,
                    gene_sets=['KEGG_2019_Human'],
                    organism='Human',
                    description='test_name',
                    outdir='enrichment_kegg',
                    cutoff=0.5):
    '''
    Gene enrichment analysis of KEGG

    Runs Enrichr, prints the object returned by the dot plot of the result
    and returns the result table.

    Parameters
    ----------
    gene_list:list
        The gene set to be enrichment analyzed
    gene_sets:list
        The gene_set of enrichr library
        Input Enrichr Libraries (https://maayanlab.cloud/Enrichr/#stats)
    organism:str
        Select from (human, mouse, yeast, fly, fish, worm)
    description:str
        The title of enrichment
    outdir:str
        The savedir of enrichment
    cutoff:float
        Show enriched terms which Adjusted P-value < cutoff.

    Returns
    ----------
    res:pandas.DataFrame
        stores your last query
    '''
    # Remember to set organism to the one you want, e.g. Yeast.
    report = gp.enrichr(gene_list=gene_list,
                        gene_sets=gene_sets,
                        organism=organism,
                        description=description,
                        outdir=outdir,
                        cutoff=cutoff)
    print(dotplot(report.res2d, title=description, cmap='seismic'))
    return report.res2d
Exemple #24
0
def save_top4_csvs():
    """
    For every name in the module-level ``files``, store the top Enrichr
    terms of each cluster.

    Each "enrich/<name>.csv" is read with ``read_file``; every cluster's
    gene list is queried against the module-level library ``lib`` and the
    set of terms in the first ten result rows is recorded per cluster. The
    mapping is written to "top_4_bio_process/<name>.csv" with
    ``write_file``.

    A cluster whose query fails is reported and left out of the mapping.
    """
    for i in files:
        file_name = "enrich/" + i + ".csv"
        clusters = read_file(file_name)

        temp = {}
        for j in clusters:
            gene_list = list(clusters[j][2])

            try:
                top_terms = gp.enrichr(
                    gene_list=gene_list,
                    gene_sets=lib,
                    organism='Human',
                    cutoff=0.05).results.head(10)['Term'].tolist()
            except Exception as exc:
                # Best effort per cluster, but Ctrl-C / SystemExit still
                # propagate and the failure is visible.
                print("enrichr failed for %s cluster %s: %s" % (i, j, exc))
                continue
            temp[j] = set(top_terms)
        write_file("top_4_bio_process/" + i + ".csv", temp)
Exemple #25
0
def enrich_tissue(tissue_name, last_num):
    """
    Run a Reactome Enrichr query for every community of a tissue and save
    the significant results.

    tissue_name: selects "results/louvain_modules_<tissue>.pkl" and
        "data/corr_<tissue>.pkl"
    last_num: communities numbered below this value are skipped, which
        allows resuming an interrupted run

    Communities with three or fewer members are ignored and not numbered.
    For each remaining community the rows with adjusted P-value < 0.05 are
    written to "results/enrichment/<tissue>_<id>_<size>.csv" when any
    exist. The function sleeps 50 seconds after every query.
    """
    # NOTE(review): pickle executes arbitrary code on load; only use on
    # files produced by this project.
    with open("results/louvain_modules_" + tissue_name + ".pkl",
              "rb") as handle:
        communities = pickle.load(handle)
    corr_mat = pd.read_pickle("data/corr_" + tissue_name + ".pkl")

    # Loop-invariant, so converted once.
    gene_names = np.array(corr_mat.columns)
    # assumes communities[0] holds one community label per column of
    # corr_mat -- TODO confirm against the code that writes the pickle
    labels = communities[0]

    community_id = 1

    for community in np.unique(labels):
        common = gene_names[labels == community]

        if len(common) <= 3:
            continue
        print("For community", community_id,
              "(len: " + str(len(common)) + ")...")

        if community_id < last_num:
            community_id += 1
            continue

        # NOTE(review): astype('<U3') truncates every name to 3 characters;
        # kept as in the original, confirm this is intended.
        enr = gp.enrichr(gene_list=list(common.astype('<U3')),
                         organism='human',
                         description=tissue_name + "_" + str(community_id),
                         gene_sets='Reactome_2016',
                         cutoff=0.05,
                         outdir='results/EnrichClass')
        if enr.results.shape[0] > 0:
            enr.results = enr.results[enr.results['Adjusted P-value'] < 0.05]
            if enr.results.shape[0] > 0:
                enr.results.to_csv("results/enrichment/" + tissue_name + "_" +
                                   str(community_id) + "_" + str(len(common)) +
                                   ".csv")
                print("Enriched!")

        community_id += 1
        sleep(
            50
        )  # just to go easy on the Enrich API... (constantly getting errors after a while)
Exemple #26
0
def _get_top_enrichr_term(gene_sets, libraries=[
    # 'GO_Biological_Process_2018',
    # 'GO_Cellular_Component_2018',
    # 'GO_Molecular_Function_2018',
                                                'KEGG_2019_Human', ],
                          cutoff=0.01, top_k=1):
    results = []

    for gene_set in gene_sets:
        try:
            enr = gp.enrichr(gene_list=gene_set,
                             gene_sets=libraries,
                             cutoff=cutoff,
                             no_plot=True, verbose=False,
                             )
            if enr.results.shape[0] > 0:
                results.append(enr.results.sort_values(by="Adjusted P-value").head_node_type(top_k))
        except Exception:
            pass
    results = [row for row in results if row is not None]
    if len(results) > 0:
        return pd.concat(results)
    else:
        return None
Exemple #27
0
import gseapy as gp

import pandas as pd

import RNA_expression_processing as rn

import matplotlib.pyplot as plt

# Script: run Enrichr on the genes listed in one csv table and plot the result.

# Prefix of the input location.
path = 'tracks/MARGE/relativeRP/'

# Read the table; its 'gene_name' column becomes the query gene list.
dt = pd.read_csv(path + 'DN_RegNetwork_TLX3.csv')

gl = list(dt['gene_name'])

# Enrichr library to query; the same string is reused as the plot title.
gs = 'RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO'

# Every other gseapy option is left at its default.
rs = gp.enrichr(gene_list=gl, gene_sets=gs)

# Plot the result table with the project's own barplot helper, then show it.
rn.barplot(rs.res2d, ttl=gs)

plt.show()
# Script: compare two gene collections and run Enrichr on their combinations.
# tlx_tss_3kb_gn and tlx_chrhmm_gn are defined outside this excerpt.
tss = set(tlx_tss_3kb_gn)

enh = set(tlx_chrhmm_gn)

#~ from matplotlib_venn import venn2
#~ venn2([tss,enh], set_labels = ('Tlx3 in Enhancer', 'Tlx3 in Promoter'))

# Set algebra on the two collections.
tss_and_enh = tss & enh  # genes in both
tss_or_enh = tss | enh  # genes in either
tss_notin_enh = tss - enh  # genes only in tss
enh_notin_tss = enh - tss  # genes only in enh
diff_enh_tss = enh ^ tss  # genes in exactly one of the two

# === Run GSEApy Enrichr
# Each combination is queried against GO_Biological_Process_2017b and written
# to its own sub-directory of gene_lists/tlx_enh_tss3k/.
gp.enrichr(gene_list=list(tss_and_enh),
           description='tss_and_enh',
           gene_sets='GO_Biological_Process_2017b',
           outdir='gene_lists/tlx_enh_tss3k/' + 'tss_and_enh_GO_BP_2017b')

gp.enrichr(gene_list=list(tss_or_enh),
           description='tss_or_enh',
           gene_sets='GO_Biological_Process_2017b',
           outdir='gene_lists/tlx_enh_tss3k/' + 'tss_or_enh_GO_BP_2017b')

gp.enrichr(gene_list=list(tss_notin_enh),
           description='tss_notin_enh',
           gene_sets='GO_Biological_Process_2017b',
           outdir='gene_lists/tlx_enh_tss3k/' + 'tss_notin_enh_GO_BP_2017b')

gp.enrichr(gene_list=list(enh_notin_tss),
           description='enh_notin_tss',
           gene_sets='GO_Biological_Process_2017b',
Exemple #29
0
for x in pd.DataFrame.from_records(r).columns:
    print("group :"+x, end = "\r")
    # rank gene by importance for clusters
    glist = pd.DataFrame.from_records(r)[x].tolist()
    bm = Biomart()
    if not os.path.exists("test"):
        os.makedirs("test")
    results = bm.query(dataset='hsapiens_gene_ensembl',
                    attributes=['external_gene_name', 'go_id'],
                    filters={'hgnc_symbol': glist},
                    # save output file
                    filename="test/query_"+x+".results.txt")

    enr = gp.enrichr(gene_list=glist,
                    description='test_name',
                    gene_sets=['KEGG_2016'],
                    outdir="test/enrichr_kegg_group"+x,
                    cutoff=0.5 # test dataset, use lower value from range(0,1)
                    )


    # to save your figure, make sure that ``ofname`` is not None
    #barplot(enr.res2d,title='',ofname="test/enrichr_kegg_group"+x+"/bar_plot.pdf")
    dotplot(enr.res2d, title='',ofname="test/enrichr_kegg_group"+x+"/dot_plot.pdf")


    #pd.DataFrame.from_records(adata.uns['rank_genes_groups']['scores'])[x]
    rnk= pd.concat([pd.DataFrame.from_records(r)[x],pd.DataFrame.from_records(adata.uns['rank_genes_groups']['scores'])[x]],axis=1)
    rnk.columns=[0,1]

    pre_res = gp.prerank(rnk=rnk, gene_sets='KEGG_2016',
                        processes=4,
Exemple #30
0
def _write_empty_results(out_f):
    """Write a header-only results table to *out_f* (no enriched gene sets)."""
    with open(out_f, 'w') as f:
        f.write('collection\tgene_set\tadjusted_p_value')


def main():
    """Filter DE genes and run Enrichr gene-set enrichment on the survivors.

    Command line::

        prog [options] de_genes_file fold_change_table

    Positional arguments:
        de_genes_file: text file with one DE gene per line. Its name is also
            used to pick the Venn diagram title ('Up' in the name selects the
            "higher" title, anything else the "lower" title).
        fold_change_table: tab-separated table indexed by gene with an 'FC'
            column; only read when the module-level FILTER_BY_FOLD is true.

    Options:
        -o/--out_file: output TSV (collection, gene_set, adjusted_p_value).
            Required.
        -t/--de_table_f, -c/--de_table_col: optional tab-separated table and
            the column in it; genes whose value in that column equals 1 are
            treated as batch-effect genes and removed.
        -k/--out_kept: optional file receiving the genes kept after
            batch-effect removal, one per line.
        -v/--out_venn_f: optional PDF path for the Venn diagram; drawn only
            when a batch-effect table was supplied.

    Processing steps:
        1. If FILTER_BY_FOLD, keep genes whose fold change is above FC_THRESH
           or below 1 / FC_THRESH; otherwise start from the DE gene list.
        2. Optionally remove batch-effect genes.
        3. Optionally draw the Venn diagram.
        4. Query Enrichr once per entry of GENE_SETS and keep terms with an
           adjusted p-value below GSEA_THRESH.
        5. Write the terms, sorted by adjusted p-value, to the output file.

    If no gene survives filtering, a header-only output file is written and
    the function returns without querying Enrichr.
    """
    usage = "%prog [options] de_genes_file fold_change_table"
    parser = OptionParser(usage=usage)
    parser.add_option("-t", "--de_table_f", help="DE table file")
    parser.add_option("-c", "--de_table_col", help="DE table column to filter")
    parser.add_option("-o", "--out_file", help="Output file")
    parser.add_option("-k", "--out_kept", help="Output kept genes file")
    parser.add_option("-v", "--out_venn_f", help="Venn diagram file")
    (options, args) = parser.parse_args()

    # Fail early with a usage message instead of an IndexError/TypeError
    # somewhere in the middle of the run.
    if len(args) < 2:
        parser.error('expected two positional arguments: '
                     'de_genes_file fold_change_table')
    if not options.out_file:
        parser.error('an output file is required (-o/--out_file)')
    if options.de_table_f and not options.de_table_col:
        parser.error('-t/--de_table_f requires -c/--de_table_col')

    de_genes_f = args[0]
    ec_f = args[1]
    out_f = options.out_file
    out_kept_f = options.out_kept
    out_venn_f = options.out_venn_f

    # Get all the DE genes
    with open(de_genes_f, 'r') as f:
        de_genes = [line.strip() for line in f]

    print('{} total DE genes'.format(len(de_genes)))

    # Map each gene to its fold change and filter by fold change
    if FILTER_BY_FOLD:
        ec_df = pd.read_csv(ec_f, sep='\t', index_col=0)
        # One vectorised conversion instead of a .loc lookup per gene.
        gene_to_fc = ec_df['FC'].to_dict()
        filtered_genes = [
            gene for gene, fc in gene_to_fc.items()
            if fc > FC_THRESH or fc < (1 / FC_THRESH)
        ]
        n_candidates = len(gene_to_fc)
        print('{}/{} remain after filtering by fold-change.'.format(
            len(filtered_genes), n_candidates))
        if not filtered_genes:
            _write_empty_results(out_f)
            return
    else:
        # Without fold-change filtering every listed DE gene is a candidate
        # (previously this path left `filtered_genes` undefined).
        filtered_genes = [gene for gene in de_genes if gene]
        n_candidates = len(filtered_genes)

    # Filter batch effect if specified in options
    batch_effect_genes = set()
    present_batch_effect_genes = set()
    if options.de_table_f:
        de_table_col = options.de_table_col
        de_table_df = pd.read_csv(options.de_table_f, sep='\t', index_col=0)
        flagged = de_table_df[de_table_col] == 1
        batch_effect_genes = set(de_table_df.loc[flagged].index)

        # Output the removed genes
        present_batch_effect_genes = batch_effect_genes & set(filtered_genes)
        print('Removed {} genes: {}'.format(len(present_batch_effect_genes),
                                            present_batch_effect_genes))

        filtered_genes = sorted(set(filtered_genes) - batch_effect_genes)
        if out_kept_f:
            with open(out_kept_f, 'w') as f:
                f.write('\n'.join(filtered_genes))

        print('{}/{} remain after filtering by batch-effect:'.format(
            len(filtered_genes), n_candidates))
        print(filtered_genes)

        if not filtered_genes:
            _write_empty_results(out_f)
            return

    # Draw Venn diagram (needs both a batch-effect set and a target path)
    if options.de_table_f and out_venn_f:
        if 'Up' in de_genes_f:
            title = r'DE genes $\bf{higher}$ in COVID-19 ICU patients'
        else:
            title = r'DE genes $\bf{lower}$ in COVID-19 ICU patients'
        fig, ax = plt.subplots(1, 1, figsize=(4.5, 3.5))
        # venn2 takes region sizes (A only, B only, A and B). `filtered_genes`
        # already excludes the overlap, so the batch-effect side must exclude
        # it as well or the overlap is counted twice in the right circle.
        venn2(subsets=(
            len(filtered_genes),
            len(batch_effect_genes - present_batch_effect_genes),
            len(present_batch_effect_genes),
        ),
              set_labels=('COVID-19 ICU\nvs. sepsis ARDS',
                          'non-ICU\nvs. sepsis non-ARDS'),
              set_colors=('b', 'y'),
              alpha=0.5,
              ax=ax)
        ax.set_title(title, fontsize=14)
        plt.tight_layout()
        plt.savefig(out_venn_f, format='pdf')
        plt.close(fig)  # release the figure once it is on disk

    # Perform gene set enrichment
    gene_list = [gene.strip() for gene in filtered_genes]
    records = []
    for db_name, db in GENE_SETS.items():
        enr = gp.enrichr(
            gene_list=gene_list,
            gene_sets=[db],
            background=19463,
            no_plot=True,
            # NOTE(review): value kept from the original call; the results
            # are filtered again with GSEA_THRESH below.
            cutoff=0.05)
        results = enr.results
        significant = results[results["Adjusted P-value"] < GSEA_THRESH]
        # Column-wise iteration avoids positional indexing into
        # label-indexed rows; the dict keeps one p-value per term.
        sig_terms = {
            str(term): float(pval)
            for term, pval in zip(significant['Term'],
                                  significant['Adjusted P-value'])
        }
        records.extend(
            (db_name, term, pval) for term, pval in sig_terms.items())

    # Create final dataframe
    df = pd.DataFrame(data=records,
                      columns=['collection', 'gene_set', 'adjusted_p_value'])
    df = df.sort_values(by='adjusted_p_value', axis=0)
    print('{} total enriched gene sets.'.format(len(df)))

    # Write output
    print('Writing to {}.'.format(out_f))
    df.to_csv(out_f, index=False, sep='\t')
    print('done')