def gsea(homepath):
    '''
    Run an Enrichr (DisGeNET) enrichment on the selected genes and plot it.

    Parameters
    ----------
    `homepath` (str): Path where you want to save all the generated
    files and folders. The gene list is read from
    ``<homepath>/std_npy/unique_genes_with_frequency.csv`` (first column,
    no header row).

    Return:
    -------
    None

    Outputs:
    --------
    Creates an ``enrichr`` directory inside ``homepath`` and writes two
    figures there: ``DisGeNET_barplot.png`` and ``DisGeNET_dotplot.png``.
    '''
    # NOTE: this silences warnings process-wide, not only inside this call.
    warnings.filterwarnings("ignore")

    # makedirs(exist_ok=True) avoids the exists()/mkdir() race and also
    # works when intermediate directories are missing.
    os.makedirs(os.path.join(homepath, "enrichr"), exist_ok=True)

    gene = pd.read_csv(homepath + "/std_npy/unique_genes_with_frequency.csv",
                       header=None)
    # The first column holds the gene symbols.
    gl = gene[0].tolist()

    enr = gs.enrichr(gene_list=gl,
                     description='Disease',
                     gene_sets='DisGeNET',
                     outdir=homepath + '/enrichr')

    # simple plotting functions
    from gseapy.plot import barplot, dotplot

    # to save your figure, make sure that ``ofname`` is not None
    barplot(enr.res2d, title='DisGeNET', cutoff=0.2,
            ofname=homepath + '/enrichr/DisGeNET_barplot.png')
    dotplot(enr.res2d, title='DisGeNET', cmap='viridis_r', cutoff=0.2,
            ofname=homepath + '/enrichr/DisGeNET_dotplot.png')
def run(self):
    """Run enrichr for one sample gene list but multi-libraries.

    Each entry returned by ``self.parse_genesets()`` is processed in turn:
    a ``dict`` entry is analysed locally through ``self.enrich``; any other
    entry is treated as an Enrichr library name and queried through
    ``self.get_results``. Every per-library table gets a leading
    ``Gene_set`` column and is accumulated into ``self.results``;
    ``self.res2d`` holds the table of the last library processed.

    When ``self._outdir`` is not None, a tab-separated report is written
    per library and, unless plotting is disabled, a bar plot next to it.
    When it is None, ``self._tmpdir`` is cleaned up at the end.

    Raises
    ------
    LookupError
        If no gene set is available to run against.
    """
    # read input file
    genes_list = self.parse_genelists()
    gss = self.parse_genesets()
    # if gmt
    self._logger.info(
        "Connecting to Enrichr Server to get latest library names")
    if len(gss) < 1:
        self._logger.error(
            "Hint: Current organism = %s, is this correct?\n" % self.organism
            + "Hint: use get_library_name() to view full list of supported names.")
        raise LookupError(
            "Not validated Enrichr library! Please provide correct organism and library name!"
        )

    self.results = pd.DataFrame()
    # Per-library frames; DataFrame.append no longer exists in pandas >= 2.0,
    # so the master table is rebuilt with pd.concat instead.
    collected = []
    for g in gss:
        if isinstance(g, dict):
            ## local mode
            res = self.enrich(g)
            shortID, self._gs = str(id(g)), "CUSTOM%s" % id(g)
            if res is None:
                self._logger.info(
                    "No hits return, for gene set: Custom%s" % shortID)
                continue
        else:
            ## online mode
            self._gs = str(g)
            self._logger.debug("Start Enrichr using library: %s" % (self._gs))
            self._logger.info('Analysis name: %s, Enrichr Library: %s'
                              % (self.descriptions, self._gs))
            shortID, res = self.get_results(genes_list)

        # Remember gene set library used
        res.insert(0, "Gene_set", self._gs)
        # Append to master dataframe
        collected.append(res)
        self.results = pd.concat(collected, ignore_index=True)
        self.res2d = res
        if self._outdir is None:
            continue

        self._logger.info('Save file of enrichment results: Job Id:'
                          + str(shortID))
        outfile = "%s/%s.%s.%s.reports.txt" % (self.outdir, self._gs,
                                               self.organism, self.module)
        self.res2d.to_csv(outfile, index=False, encoding='utf-8', sep="\t")
        # plotting
        if not self.__no_plot:
            # Swap only the trailing "txt" extension; str.replace would also
            # rewrite any "txt" inside the directory or library name.
            figure_name = outfile[:-len("txt")] + self.format
            msg = barplot(df=res,
                          cutoff=self.cutoff,
                          figsize=self.figsize,
                          top_term=self.__top_term,
                          color='salmon',
                          title=self._gs,
                          ofname=figure_name)
            if msg is not None:
                self._logger.warning(msg)
        self._logger.info('Done.\n')

    # clean up tmpdir
    if self._outdir is None:
        self._tmpdir.cleanup()

    return
def run(self):
    """Run enrichr for one sample gene list but multi-libraries.

    The requested gene sets are de-duplicated and filtered against the
    names returned by ``get_library_name()``. For every remaining library
    the gene list is submitted through ``self.get_results``; the resulting
    table gets a leading ``Gene_set`` column and is accumulated into
    ``self.results``. ``self.res2d`` holds the table of the last library.

    When ``self._outdir`` is not None, a tab-separated report is written
    per library and, unless plotting is disabled, a bar plot next to it.
    When it is None, ``self._tmpdir`` is cleaned up at the end.

    Exits the process with status 1 when none of the requested names is a
    known Enrichr library.
    """
    # read input file
    genes_list = self.parse_genelists()
    gss = unique(self.parse_genesets())
    self._logger.info(
        "Connecting to Enrichr Server to get latest library names")
    # Keep only names the Enrichr server actually knows about.
    enrichr_library = get_library_name()
    gss = [g for g in gss if g in enrichr_library]
    self._logger.info("Libraries are used: %s" % ("',".join(gss)))
    if len(gss) < 1:
        sys.stderr.write("Not validated Enrichr library name provided\n")
        sys.stdout.write(
            "Hint: use get_library_name() to view full list of supported names"
        )
        sys.exit(1)

    self.results = pd.DataFrame()
    # Per-library frames; DataFrame.append no longer exists in pandas >= 2.0,
    # so the master table is rebuilt with pd.concat instead.
    collected = []
    for g in gss:
        self._gs = str(g)
        self._logger.debug("Start Enrichr using library: %s" % (self._gs))
        self._logger.info('Analysis name: %s, Enrichr Library: %s'
                          % (self.descriptions, self._gs))
        shortID, res = self.get_results(genes_list)

        # Remember gene set library used
        res.insert(0, "Gene_set", self._gs)
        # Append to master dataframe
        collected.append(res)
        self.results = pd.concat(collected, ignore_index=True)
        self.res2d = res
        if self._outdir is None:
            continue

        self._logger.info('Save file of enrichment results: Job Id:'
                          + str(shortID))
        outfile = "%s/%s.%s.%s.reports.txt" % (
            self.outdir, self._gs, self.descriptions, self.module)
        self.res2d.to_csv(outfile, index=False, encoding='utf-8', sep="\t")
        # plotting
        if not self.__no_plot:
            # Swap only the trailing "txt" extension; str.replace would also
            # rewrite any "txt" inside the directory, library or description.
            figure_name = outfile[:-len("txt")] + self.format
            msg = barplot(df=res,
                          cutoff=self.cutoff,
                          figsize=self.figsize,
                          top_term=self.__top_term,
                          color='salmon',
                          title=self._gs,
                          ofname=figure_name)
            if msg is not None:
                self._logger.warning(msg)
        self._logger.info('Done.\n')

    # clean up tmpdir
    if self._outdir is None:
        self._tmpdir.cleanup()

    return
# Example script: run an Enrichr KEGG enrichment on a gene list and plot it.
# NOTE(review): `pd`, `gp` and `requests` are used below but not imported in
# this snippet - presumably imported earlier in the file; confirm.

# `import sleep` is not an importable module; the sleep helper lives in `time`.
from time import sleep

import matplotlib.pyplot as plt
from gseapy.parser import Biomart
from gseapy.plot import barplot, dotplot

gene_list = pd.read_csv(
    "/Users/sunxueyan/Downloads/GSEApy-master/tests/data/gene_list.txt",
    header=None, sep="\t")
gene_list1 = pd.read_csv("/Users/sunxueyan/Downloads/non_geneID.csv")
gene_list1.head()
# Flatten the single-column table into a plain list of stripped strings.
glist = gene_list1.squeeze().str.strip().tolist()

names = gp.get_library_name()  # default: Human

s = requests.session()
s.keep_alive = False

enr = gp.enrichr(
    gene_list="/Users/sunxueyan/Downloads/GSEApy-master/tests/data/gene_list.txt",  # or gene_list=glist
    description='',
    gene_sets=['KEGG_2019_Human'],
    outdir='test/enrichr_kegg',
    cutoff=0.5  # test dataset, use lower value from range(0,1)
)

barplot(enr.res2d, title='KEGG_2019_Human',)
dotplot(enr.res2d, title='KEGG_2019_Human',)
def run_single(self):
    """Run enrichr for one sample.

    Submits the gene list to the Enrichr ``addList`` endpoint, confirms the
    submission through ``view``, triggers the enrichment through ``enrich``
    and downloads the exported report into
    ``<outdir>/<gene_set>.<description>.<module>.reports.txt``. The parsed
    report is stored in ``self.res2d``. Unless ``self._outdir`` is None or
    plotting is disabled, a bar plot is saved next to the report.

    Exits the process with status 1 when ``self._gs`` is not a known
    Enrichr library name.

    Raises
    ------
    Exception
        If any of the HTTP requests to Enrichr returns a non-OK status.
    """
    # read input file
    genes_str = self.parse_input()

    # name of analysis or list
    description = str(self.descriptions)
    gene_set = str(self._gs)

    self._logger.info("Connecting to Enrichr Server to get latest library names")
    # Only hit the server for the library list when the name is not one of
    # the bundled defaults.
    if gene_set not in DEFAULT_LIBRARY:
        enrichr_library = get_library_name()
        if gene_set not in enrichr_library:
            sys.stderr.write("%s is not a Enrichr library name\n" % gene_set)
            sys.stdout.write("Hint: use get_library_name() to view full list of supported names")
            sys.exit(1)

    self._logger.info('Analysis name: %s, Enrichr Library: %s' % (description, gene_set))

    # enrichr url
    ENRICHR_URL = 'http://amp.pharm.mssm.edu/Enrichr/addList'

    # payload
    payload = {
        'list': (None, genes_str),
        'description': (None, description)
    }
    # response
    response = requests.post(ENRICHR_URL, files=payload)
    if not response.ok:
        raise Exception('Error analyzing gene list')

    job_id = json.loads(response.text)
    self._logger.debug('Job ID:' + str(job_id))
    user_list_id = str(job_id['userListId'])

    ENRICHR_URL_A = 'http://amp.pharm.mssm.edu/Enrichr/view?userListId=%s'
    response_gene_list = requests.get(ENRICHR_URL_A % user_list_id, timeout=None)
    # wait for 1s
    sleep(1)
    if not response_gene_list.ok:
        raise Exception('Error getting gene list')

    self._logger.info('Submitted gene list:' + str(job_id))

    # Get enrichment results
    ENRICHR_URL = 'http://amp.pharm.mssm.edu/Enrichr/enrich'
    query_string = '?userListId=%s&backgroundType=%s'
    response = requests.get(ENRICHR_URL + query_string % (user_list_id, gene_set))
    if not response.ok:
        raise Exception('Error fetching enrichment results')
    self._logger.debug('Get enrichment results: Job Id:' + str(job_id))

    # Download file of enrichment results
    ENRICHR_URL = 'http://amp.pharm.mssm.edu/Enrichr/export'
    query_string = '?userListId=%s&filename=%s&backgroundType=%s'
    filename = "%s.%s.%s.reports" % (gene_set, description, self.module)
    url = ENRICHR_URL + query_string % (user_list_id, filename, gene_set)

    # set max retries num =5
    s = retry(num=5)
    response = s.get(url, stream=True, timeout=None)
    # Without this check an HTTP error page would be written to disk and
    # then parsed as if it were the enrichment table.
    if not response.ok:
        raise Exception('Error downloading enrichment results')

    self._logger.info('Downloading file of enrichment results: Job Id:' + str(job_id))
    outfile = "%s/%s.%s.%s.reports.txt" % (self.outdir, gene_set, description, self.module)

    with open(outfile, 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)

    self._logger.debug('Results written to: ' + outfile)
    # save results
    df = read_table(outfile)
    self.res2d = df
    if self._outdir is None:
        return

    # plotting
    if not self.__no_plot:
        fig = barplot(df=df, cutoff=self.cutoff,
                      figsize=self.figsize,
                      top_term=self.__top_term,
                      color='salmon',
                      title='')
        if fig is None:
            self._logger.warning("Warning: No enrich terms using library %s when cuttoff = %s" % (gene_set, self.cutoff))
        else:
            # Swap only the trailing "txt" extension; str.replace would also
            # rewrite any "txt" inside the directory, library or description.
            fig.savefig(outfile[:-len("txt")] + self.format,
                        bbox_inches='tight', dpi=300)
    self._logger.info('Done.\n')
    return
def run(self):
    """Run enrichr.

    Creates ``self.outdir``, submits the gene list to the Enrichr
    ``addList`` endpoint, confirms the submission through ``view``,
    triggers the enrichment through ``enrich`` and downloads the exported
    report into
    ``<outdir>/<gene_set>.<description>.<module>.reports.txt``. The parsed
    report is stored in ``self.res2d`` and, unless plotting is disabled, a
    bar plot is saved next to the report.

    Exits the process with status 1 when ``self.gene_sets`` is not a known
    Enrichr library name.

    Raises
    ------
    Exception
        If any of the HTTP requests to Enrichr returns a non-OK status.
    """
    mkdirs(self.outdir)

    # read input file
    genes_str = self.parse_input()

    # name of analysis or list
    description = str(self.descriptions)
    # library validity confirmation
    gene_set = str(self.gene_sets)

    # logging start
    logger = self._log_init(
        module=self.module,
        log_level=logging.INFO if self.verbose else logging.WARNING)

    logger.info("Connecting to Enrichr Server to get latest library names")
    # Only hit the server for the library list when the name is not one of
    # the bundled defaults.
    if gene_set not in DEFAULT_LIBRARY:
        enrichr_library = self.get_libraries()
        if gene_set not in enrichr_library:
            sys.stderr.write("%s is not a enrichr library name\n" % gene_set)
            sys.stdout.write(
                "Hint: use get_library_name() to veiw full list of supported names"
            )
            sys.exit(1)

    logger.info('Analysis name: %s, Enrichr Library: %s'
                % (description, gene_set))

    ## enrichr url
    ENRICHR_URL = 'http://amp.pharm.mssm.edu/Enrichr/addList'

    # payload
    payload = {
        'list': (None, genes_str),
        'description': (None, description)
    }
    # response
    response = requests.post(ENRICHR_URL, files=payload)
    if not response.ok:
        raise Exception('Error analyzing gene list')
    sleep(1)
    job_id = json.loads(response.text)
    logger.debug('Job ID:' + str(job_id))
    user_list_id = str(job_id['userListId'])

    ENRICHR_URL_A = 'http://amp.pharm.mssm.edu/Enrichr/view?userListId=%s'
    response_gene_list = requests.get(ENRICHR_URL_A % user_list_id,
                                      timeout=None)
    if not response_gene_list.ok:
        raise Exception('Error getting gene list')

    logger.info('Submitted gene list:' + str(job_id))

    # Get enrichment results
    ENRICHR_URL = 'http://amp.pharm.mssm.edu/Enrichr/enrich'
    query_string = '?userListId=%s&backgroundType=%s'
    response = requests.get(ENRICHR_URL + query_string
                            % (user_list_id, gene_set))
    if not response.ok:
        raise Exception('Error fetching enrichment results')
    logger.debug('Get enrichment results: Job Id:' + str(job_id))

    ## Download file of enrichment results
    ENRICHR_URL = 'http://amp.pharm.mssm.edu/Enrichr/export'
    query_string = '?userListId=%s&filename=%s&backgroundType=%s'
    filename = "%s.%s.%s.reports" % (gene_set, description, self.module)
    url = ENRICHR_URL + query_string % (user_list_id, filename, gene_set)

    # set max retries num =5
    s = requests.Session()
    retries = Retry(total=5,
                    backoff_factor=0.1,
                    status_forcelist=[500, 502, 503, 504])
    s.mount('http://', HTTPAdapter(max_retries=retries))
    response = s.get(url, stream=True, timeout=None)
    # Without this check an HTTP error page would be written to disk and
    # then parsed as if it were the enrichment table.
    if not response.ok:
        raise Exception('Error downloading enrichment results')

    logger.info('Downloading file of enrichment results: Job Id:'
                + str(job_id))
    outfile = "%s/%s.%s.%s.reports.txt" % (self.outdir, gene_set,
                                           description, self.module)

    with open(outfile, 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)

    logger.debug('Results written to: ' + outfile)
    # save results
    df = read_table(outfile)
    self.res2d = df

    # plotting
    if not self.__no_plot:
        fig = barplot(
            df=df,
            cutoff=self.cutoff,
            figsize=self.figsize,
            top_term=self.__top_term,
        )
        if fig is None:
            logger.warning(
                "Warning: No enrich terms using library %s when cuttoff = %s"
                % (gene_set, self.cutoff))
        else:
            # Swap only the trailing "txt" extension; str.replace would also
            # rewrite any "txt" inside the directory, library or description.
            fig.savefig(outfile[:-len("txt")] + self.format,
                        bbox_inches='tight',
                        dpi=300)
    logger.info('Done.\n')
    return