def run_ssgsea(filepath: str, output_dir: str):
    """Run ssGSEA with the KEGG gene set on one expression file.

    The expression table is loaded from a tab-separated file, filtered with
    ``filter_gene_exp_data`` and scored with ``gseapy.ssgsea``. The resulting
    ``res2d`` table is saved as ``results.tsv`` inside ``output_dir``.

    :param filepath: path to the tab-separated gene expression file
    :param output_dir: directory forwarded to gseapy as ``outdir`` and used
        for the ``results.tsv`` file
    """
    expression = pd.read_csv(filepath, sep='\t')

    # Only the first of the four values returned by check_gmt_files() is kept.
    kegg_gene_set, _, _, _ = check_gmt_files()

    # Restrict the expression data to genes that occur in the gene sets.
    # NOTE(review): ``merge_gene_set`` is not assigned anywhere in this
    # function, so it must be resolvable at module level -- confirm it is
    # defined there (or whether it was meant to come from check_gmt_files()).
    filtered_expression_data = filter_gene_exp_data(expression, merge_gene_set)

    logger.info(f'Running {filepath}')

    ssgsea_options = dict(
        data=filtered_expression_data,
        gene_sets=kegg_gene_set,
        outdir=output_dir,
        sample_norm_method='rank',  # 'custom' would use a user-supplied rank list
        permutation_num=0,  # no permutation procedure
        no_plot=True,  # plotting is skipped to save time
        format='png',
    )
    result = gseapy.ssgsea(**ssgsea_options)
    logger.info('Done with ssGSEA')

    results_path = os.path.join(output_dir, 'results.tsv')
    result.res2d.to_csv(results_path, sep='\t')
def do_ssgsea(
        filtered_expression_data: pd.DataFrame,
        gene_set: str,
        output_dir: str = None,
        processes: int = 96,
        max_size: int = 3000,
        min_size: int = 15,
) -> SingleSampleGSEA:
    """Score every sample of a filtered expression table with ssGSEA.

    :param filtered_expression_data: filtered gene expression values for samples
    :param gene_set: .gmt file containing gene sets
    :param output_dir: output directory, forwarded unchanged as ``outdir``
        (``None`` by default)
    :param processes: number of processes handed to ``ssgsea``
    :param max_size: upper bound forwarded as ``max_size`` (gene set / data
        set overlap)
    :param min_size: lower bound forwarded as ``min_size`` (gene set / data
        set overlap)
    :return: the object returned by ``ssgsea``
    """
    # Fixed settings: rank-based sample normalisation, no permutations and
    # no plots; only the sizes, process count and paths are configurable.
    return ssgsea(
        data=filtered_expression_data,
        gene_sets=gene_set,
        outdir=output_dir,
        min_size=min_size,
        max_size=max_size,
        processes=processes,
        sample_norm_method='rank',
        permutation_num=0,
        no_plot=True,
        format='png',
    )
def run_ssgsea(
        filtered_expression_data: pd.DataFrame,
        gene_set: str,
        output_dir: str = SSGSEA,
        processes: int = 1,
        max_size: int = 3000,
        min_size: int = 15,
) -> SingleSampleGSEA:
    """Run single sample GSEA (ssGSEA) on a filtered gene expression data set.

    :param filtered_expression_data: filtered gene expression values for samples
    :param gene_set: .gmt file containing gene sets
    :param output_dir: output directory, forwarded to gseapy as ``outdir``
    :param processes: number of processes forwarded to gseapy
    :param max_size: value forwarded to gseapy as ``max_size``
    :param min_size: value forwarded to gseapy as ``min_size``
    :return: the object returned by ``gseapy.ssgsea``
    """
    # Normalisation, permutation and plotting settings are fixed here;
    # everything else comes from the arguments.
    fixed_settings = {
        'sample_norm_method': 'rank',  # 'custom' would use a user-supplied rank list
        'permutation_num': 0,  # no permutation procedure
        'no_plot': True,  # plotting is skipped to save time
        'format': 'png',
    }
    result = gseapy.ssgsea(
        data=filtered_expression_data,
        gene_sets=gene_set,
        outdir=output_dir,
        max_size=max_size,
        min_size=min_size,
        processes=processes,
        **fixed_settings,
    )
    logger.info('Done with ssGSEA')
    return result
def pathway_enrichment(self, factor, views=None, genesets=None, nprocesses=4, permutation_num=0):
    """Run ssGSEA on one factor's weights for every (view, gene set file) pair.

    Args:
        factor: key used to select the weights, as ``self.weights[view][factor]``.
        views: views to analyse; defaults to methylation, transcriptomics
            and proteomics.
        genesets: GMT file names located under ``{DPATH}/pathways``; defaults
            to four v7.1 symbol collections (c6, c5, h, c2).
        nprocesses: forwarded to ``gseapy.ssgsea`` as ``processes``.
        permutation_num: forwarded to ``gseapy.ssgsea``.

    Returns:
        One data frame holding every ``res2d`` table, tagged with ``geneset``
        and ``view`` columns, with the ``sample1`` column renamed to ``nes``
        and rows sorted by ``nes``.
    """
    if views is None:
        views = ["methylation", "transcriptomics", "proteomics"]

    if genesets is None:
        genesets = [
            "c6.all.v7.1.symbols.gmt",
            "c5.all.v7.1.symbols.gmt",
            "h.all.v7.1.symbols.gmt",
            "c2.all.v7.1.symbols.gmt",
        ]

    tables = []
    for view in views:
        for gmt_name in genesets:
            factor_weights = self.weights[view][factor]
            # The GMT is parsed again for every call so each ssGSEA run gets
            # its own freshly-read gene set mapping.
            gmt = Enrichment.read_gmt(f"{DPATH}/pathways/{gmt_name}")
            enrichment = gseapy.ssgsea(
                factor_weights,
                processes=nprocesses,
                permutation_num=permutation_num,
                gene_sets=gmt,
                no_plot=True,
            )
            table = enrichment.res2d.assign(geneset=gmt_name)
            table = table.assign(view=view)
            tables.append(table.reset_index())

    combined = pd.concat(tables, ignore_index=True)
    return combined.rename(columns={"sample1": "nes"}).sort_values("nes")
def run_ssGSEA(self, ordered_df, db, method='rank', no_plot=True, processes=4):
    """Run ``gseapy.ssgsea`` on ``ordered_df`` and return its result object.

    Args:
        ordered_df: data passed as the first argument of ``gseapy.ssgsea``.
        db: gene sets, forwarded as ``gene_sets``.
        method: forwarded as ``sample_norm_method``.
        no_plot: forwarded as ``no_plot``.
        processes: forwarded as ``processes``.

    Returns:
        Whatever ``gseapy.ssgsea`` returns.
    """
    # The output directory is the instance's save name prefixed with 'ss_'.
    output_dir = 'ss_' + self.gsea_save
    return gseapy.ssgsea(
        ordered_df,
        gene_sets=db,
        outdir=output_dir,
        sample_norm_method=method,
        no_plot=no_plot,
        processes=processes,
    )
def main():
    """Compute per-sample ssGSEA scores and write them to ``es_test.csv``.

    Reads the raw data through ``data_reader.read_main``, transforms it with
    ``df_utils.transform_sequence_to_microarray``, runs ``gseapy.ssgsea``
    against ``Out/gmt_hepmark.gmt`` and stores ``resultsOnSamples`` as a CSV
    under ``<cwd>/Out/enrichment_scores/``.
    """
    # Import data (only the first of the five returned values is used)
    df, _, _, _, _ = data_reader.read_main(raw=True)

    # Log transform keeping all columns as they will be used in the gsea.
    df = df_utils.transform_sequence_to_microarray(df.T, all=True)

    # Handling for microarray set 0 as this does not require log
    # transformation (currently disabled):
    #   df1, _, _ = data_reader.read_number(1)
    #   df2, _, _ = data_reader.read_number(2)
    #   df_len = len(df)
    #   df = df_utils.merge_frames([df,df1,df2], drop=False)
    #   df = df.head(df_len)

    sample = df.T
    ss = gseapy.ssgsea(
        data=sample,
        gene_sets='Out/gmt_hepmark.gmt',
        no_plot=True,
        outdir='Out/gsea_hepmark',
        min_size=10,
    )
    # "When you run the gene set enrichment analysis, the GSEA software automatically normalizes
    # the enrichment scores for variation in gene set size, as described in GSEA Statistics.
    # Nevertheless, the normalization is not very accurate for extremely small or extremely
    # large gene sets. For example, for gene sets with fewer than 10 genes, just 2 or 3 genes
    # can generate significant results. Therefore, by default, GSEA ignores gene sets that
    # contain fewer than 25 genes or more than 500 genes; defaults that are appropriate for
    # datasets with 10,000 to 20,000 features. To change these default values, use the Max Size
    # and Min Size parameters on the Run GSEA Page; however, keep in mind the possibility of
    # inflated scorings for very small gene sets and inaccurate normalization for large ones."

    # NB : Do not use the res2d as this is the normalized score
    scores = ss.resultsOnSamples

    # One output row per sample: the sample key followed by its values.
    rows = []
    for sample_name in scores:
        rows.append([sample_name, *scores[sample_name]])

    # Column labels come from the first axis of the last sample visited by
    # the loop above, so ``sample_name`` is deliberately reused here.
    columns = ['index', *scores[sample_name].axes[0]]

    # Create es file
    df_out = pd.DataFrame(rows, columns=columns).set_index('index')
    out_dir = getcwd().replace('\\', '/') + "/Out/enrichment_scores/"
    df_out.to_csv(out_dir + "es_test.csv")
# (chunk starts inside a list literal opened above; each entry is a pair that
# the comprehension below unpacks as ``dtype, dtype_pc``)
    ("prot_culture_reps", "PC1"),
    ("prot_culture_reps", "PC2"),
    ("prot_broad_culture", "PC1"),
    ("prot_culture_reps_emt", "PC1"),
    ("prot_culture_reps_emt", "PC2"),
    ("prot_broad_culture_emt", "PC1"),
    ("prot_broad_culture_emt", "PC2"),
    ("prot_broad_culture_emt", "PC4"),
]

# Run ssGSEA once per (dtype, dtype_pc) pair and per GMT file in ``genesets``
# on the row ``dtype_pc`` of that data set's "loadings" table, tag every
# result table with its gene set file, data set and component, and stack them.
# NOTE(review): ``enr_pcs`` is iterated here and then rebound to the result;
# presumably it is the list of pairs closed just above -- confirm.
enr_pcs = pd.concat(
    [
        gseapy.ssgsea(
            dsets_dred[dtype]["loadings"].loc[dtype_pc],
            processes=4,
            gene_sets=Enrichment.read_gmt(f"{DPATH}/pathways/{g}"),
            no_plot=True,
        ).res2d.assign(geneset=g).assign(dtype=dtype).assign(
            dtype_pc=dtype_pc).reset_index()
        for dtype, dtype_pc in enr_pcs
        for g in genesets
    ],
    ignore_index=True,
)
# The "sample1" column is renamed to "nes" and rows are sorted by it.
enr_pcs = enr_pcs.rename(columns={"sample1": "nes"}).sort_values("nes")
# Persist the table as a gzip-compressed CSV without the index column.
enr_pcs.to_csv(f"{RPATH}/DimRed_pcs_enr.csv.gz", compression="gzip", index=False)

# Plot
enr_pcs_plt = [
    ("prot", "PC1", "prot_broad", "PC1", 0.5),
), time.ctime()

# gene sets for ssGSEA
# Build {pathway: [gene, ...]} for the tested pathway database: a UniProt ID
# contributes its mapped gene only if it has an entry in ``uniprot2gene`` and
# the gene is in ``geneList``. Pathways with no surviving gene get no entry.
gene_sets = {}
pw_list = []
for pw in pathwayDic[testing_pathway.lower()]:
    for uniprot in pathwayDic[testing_pathway.lower()][pw]:
        if uniprot in uniprot2gene:
            gene = uniprot2gene[uniprot]
            if gene in geneList:
                if not pw in gene_sets:
                    gene_sets[pw] = []
                gene_sets[pw].append(gene)

pw_list = gene_sets.keys()

# Write one tab-separated line per pathway: the pathway name followed by its
# genes. (Python 2 ``print >>`` syntax.)
# NOTE(review): the usual GMT layout is name<TAB>description<TAB>genes; no
# description column is written here, so a GMT reader that skips the second
# field would drop the first gene of every set -- confirm against the parser
# used by gp.ssgsea.
fo = open('%s/%s.gmt' % (fo_directory, testing_pathway.lower()), 'w')
for pw in pw_list:
    print >> fo, pw + '\t' + '\t'.join(gene_sets[pw])
fo.close()

# ssGSEA
# Inputs are read from ``expression.txt`` and the GMT file written above;
# results go to a per-database ``*_ssgsea_result`` directory.
ss = gp.ssgsea(data='%s/expression.txt' % (fo_directory),
               outdir='%s/%s_ssgsea_result' % (fo_directory, testing_pathway.lower()),
               gene_sets='%s/%s.gmt' % (fo_directory, testing_pathway.lower()),
               sample_norm_method='rank',
               permutation_num=0,
               no_plot=True,
               scale=True,
               min_size=2)
def ssgsea(self, name: str, df: pd.DataFrame, gene_sets: dict) -> pd.DataFrame:
    """
    Perform single sample GSEA, one gene set category at a time.

    Args:
        name: str, name of analysis, used to create directory and store results.
        df: pd.DataFrame, the data
        gene_sets: dict mapping a category name to its gene sets. A ``Path``
            value is loaded with ``read_gmt``; any other value is copied
            with ``.copy()`` and used as the gene set mapping directly.

    Returns:
        DataFrame with the ``resultsOnSamples`` values of all categories
        stacked row-wise and a ``gene_set`` column holding the category name.
    """
    step_size = 40  # number of gene sets sent to gp.ssgsea per call
    category_frames = []

    for name_gs, val_gs in gene_sets.items():
        # Kept inside the loop: the directory is only created once there is
        # at least one category, and the path is reused after the loop.
        outdir_gs = Enricher.ENRICHMENT_DIR_PATH / name
        if not outdir_gs.is_dir():
            outdir_gs.mkdir()

        geneset_dict = read_gmt(val_gs) if isinstance(val_gs, Path) else val_gs.copy()
        genesets = list(geneset_dict.keys())

        # Perform the enrichment in batches of size step_size; the slice
        # clips the final batch to the end of the list.
        batch_frames = []
        for start in range(0, len(genesets), step_size):
            stop = start + step_size
            genesets_batch_dict = {
                geneset: geneset_dict[geneset]
                for geneset in genesets[start:stop]
            }
            print(start, stop, len(genesets_batch_dict))

            ss = gp.ssgsea(
                data=df,
                gene_sets=genesets_batch_dict,
                outdir=str(outdir_gs / name_gs),
                sample_norm_method=self.sample_norm_method,
                min_size=self.min_size,
                max_size=self.max_size,
                permutation_num=self.permutation_num,
                scale=self.scale,
                processes=self.processes,
                no_plot=self.no_plot,
                verbose=self.verbose,
            )
            batch_frame = pd.DataFrame.from_dict(ss.resultsOnSamples, orient='index')
            batch_frames.append(batch_frame.transpose())

        # Aggregate batches of this category and store them next to the
        # per-category gseapy output.
        df_res = pd.concat(batch_frames, axis=0, sort=False)
        df_res['gene_set'] = name_gs
        df_res.to_csv(outdir_gs / (name_gs + ".csv"), header=True, index=True)
        category_frames.append(df_res)

    # Aggregate the different categories of gene sets (e.g. KEGG, GO, etc.)
    df_agg = pd.concat(category_frames, axis=0, sort=False)
    df_agg.to_csv(outdir_gs / "enrichment_results.csv", header=True, index=True)
    return df_agg