Example #1
0
def run_ssgsea(filepath: str, output_dir: str):
    """Run ssGSEA for one expression file and store the result table.

    :param filepath: tab-separated gene expression file to analyse
    :param output_dir: directory passed to gseapy as ``outdir``; the result
        table is additionally written there as ``results.tsv``
    """
    expression_table = pd.read_csv(filepath, sep='\t')

    # Only the first of the four values returned by check_gmt_files() is used.
    kegg_gene_set, _, _, _ = check_gmt_files()

    # Restrict the expression data to genes that occur in the gene sets.
    # NOTE(review): merge_gene_set is not defined inside this function;
    # presumably a module-level name -- confirm it is the intended gene set.
    expression_subset = filter_gene_exp_data(expression_table, merge_gene_set)

    logger.info(f'Running {filepath}')

    ssgsea_options = dict(
        data=expression_subset,
        gene_sets=kegg_gene_set,
        outdir=output_dir,
        sample_norm_method='rank',  # choose 'custom' for your own rank list
        permutation_num=0,  # no permutation procedure is requested
        no_plot=True,  # plotting is switched off
        format='png',
    )
    ssgsea_result = gseapy.ssgsea(**ssgsea_options)
    logger.info('Done with ssGSEA')

    results_path = os.path.join(output_dir, 'results.tsv')
    ssgsea_result.res2d.to_csv(results_path, sep='\t')
Example #2
0
def do_ssgsea(
        filtered_expression_data: pd.DataFrame,
        gene_set: str,
        output_dir: str = None,
        processes: int = 96,
        max_size: int = 3000,
        min_size: int = 15,
) -> SingleSampleGSEA:
    """Run single sample GSEA (ssGSEA) on a filtered gene expression data set.

    :param filtered_expression_data: filtered gene expression values for samples
    :param gene_set: .gmt file containing gene sets
    :param output_dir: output directory forwarded to ssgsea as ``outdir``
    :param processes: number of processes forwarded to ssgsea
    :param max_size: maximum allowed number of genes from a gene set that are also in the data set
    :param min_size: minimum allowed number of genes from a gene set that are also in the data set
    :return: the object returned by the ssgsea call
    """
    ssgsea_options = {
        'data': filtered_expression_data,
        'gene_sets': gene_set,
        'outdir': output_dir,
        'max_size': max_size,
        'min_size': min_size,
        'sample_norm_method': 'rank',  # choose 'custom' for your own rank list
        'permutation_num': 0,  # no permutation procedure is requested
        'no_plot': True,  # plotting is switched off
        'processes': processes,
        'format': 'png',
    }
    return ssgsea(**ssgsea_options)
Example #3
0
def run_ssgsea(
    filtered_expression_data: pd.DataFrame,
    gene_set: str,
    output_dir: str = SSGSEA,
    processes: int = 1,
    max_size: int = 3000,
    min_size: int = 15,
) -> SingleSampleGSEA:
    """Run single sample GSEA (ssGSEA) on a filtered gene expression data set.

    :param filtered_expression_data: filtered gene expression values for samples
    :param gene_set: .gmt file containing gene sets
    :param output_dir: output directory forwarded to gseapy as ``outdir``
    :param processes: number of processes forwarded to gseapy
    :param max_size: forwarded to gseapy as ``max_size``
    :param min_size: forwarded to gseapy as ``min_size``
    :return: the object returned by ``gseapy.ssgsea``
    """
    ssgsea_options = {
        'data': filtered_expression_data,
        'gene_sets': gene_set,
        'outdir': output_dir,
        'max_size': max_size,
        'min_size': min_size,
        'sample_norm_method': 'rank',  # choose 'custom' for your own rank list
        'permutation_num': 0,  # no permutation procedure is requested
        'no_plot': True,  # plotting is switched off
        'processes': processes,
        'format': 'png',
    }
    ssgsea_result = gseapy.ssgsea(**ssgsea_options)
    logger.info('Done with ssGSEA')
    return ssgsea_result
Example #4
0
    def pathway_enrichment(self,
                           factor,
                           views=None,
                           genesets=None,
                           nprocesses=4,
                           permutation_num=0):
        """Run ssGSEA on the weights of one factor for every view / gene-set file pair.

        For each view in ``views`` and each file name in ``genesets`` the values
        ``self.weights[view][factor]`` are passed to ``gseapy.ssgsea``; the
        resulting ``res2d`` tables are tagged with ``geneset`` and ``view``
        columns and stacked into one frame.

        :param factor: key used to select the weights within each view
        :param views: view names; defaults to methylation, transcriptomics and proteomics
        :param genesets: gene-set file names under ``{DPATH}/pathways``; defaults to four v7.1 collections
        :param nprocesses: forwarded to gseapy as ``processes``
        :param permutation_num: forwarded to gseapy as ``permutation_num``
        :return: stacked results with column ``sample1`` renamed to ``nes``, sorted by ``nes``
        """
        if views is None:
            views = ["methylation", "transcriptomics", "proteomics"]

        if genesets is None:
            genesets = [
                "c6.all.v7.1.symbols.gmt",
                "c5.all.v7.1.symbols.gmt",
                "h.all.v7.1.symbols.gmt",
                "c2.all.v7.1.symbols.gmt",
            ]

        frames = []
        for view in views:
            for gmt_name in genesets:
                enrichment = gseapy.ssgsea(
                    self.weights[view][factor],
                    processes=nprocesses,
                    permutation_num=permutation_num,
                    gene_sets=Enrichment.read_gmt(f"{DPATH}/pathways/{gmt_name}"),
                    no_plot=True,
                )
                tagged = enrichment.res2d.assign(geneset=gmt_name)
                tagged = tagged.assign(view=view)
                frames.append(tagged.reset_index())

        stacked = pd.concat(frames, ignore_index=True)
        renamed = stacked.rename(columns={"sample1": "nes"})
        return renamed.sort_values("nes")
Example #5
0
 def run_ssGSEA(self, ordered_df, db,
                method='rank', no_plot=True,
                processes=4):
     """Run gseapy's ssGSEA on ``ordered_df`` against the gene sets ``db``.

     Results go to a directory named ``'ss_'`` followed by ``self.gsea_save``.
     ``method`` is forwarded as ``sample_norm_method``; ``no_plot`` and
     ``processes`` are forwarded unchanged. Returns the object produced by
     ``gseapy.ssgsea``.
     """
     result_dir = 'ss_' + self.gsea_save
     return gseapy.ssgsea(
         ordered_df,
         gene_sets=db,
         outdir=result_dir,
         sample_norm_method=method,
         no_plot=no_plot,
         processes=processes,
     )
def main():
    """Compute ssGSEA enrichment scores for the main data set and save them as CSV.

    Reads the raw main data set, transforms it with
    ``df_utils.transform_sequence_to_microarray``, runs ``gseapy.ssgsea``
    against ``Out/gmt_hepmark.gmt`` and writes the per-sample values from
    ``resultsOnSamples`` to ``<cwd>/Out/enrichment_scores/es_test.csv``
    (one row per sample, one column per gene set).
    """
    # Local import: only this function needs it.
    from os import makedirs

    # Import data (only the expression frame is used below).
    df, _tar, _grp, _lengths, _ = data_reader.read_main(raw=True)

    # Log transform keeping all columns as they will be used in the gsea.
    df = df_utils.transform_sequence_to_microarray(df.T, all=True)

    # Handling for microarray set 0 as this does not require log transformation
    # (currently disabled):
    #   df1, _, _ = data_reader.read_number(1)
    #   df2, _, _ = data_reader.read_number(2)
    #   df_len = len(df)
    #   df = df_utils.merge_frames([df,df1,df2], drop=False)
    #   df = df.head(df_len)

    sample = df.T

    ss = gseapy.ssgsea(
        data=sample,
        gene_sets='Out/gmt_hepmark.gmt',
        no_plot=True,
        outdir='Out/gsea_hepmark',
        min_size=10,
    )
    # "When you run the gene set enrichment analysis, the GSEA software automatically normalizes
    # the enrichment scores for variation in gene set size, as described in GSEA Statistics.
    # Nevertheless, the normalization is not very accurate for extremely small or extremely
    # large gene sets. For example, for gene sets with fewer than 10 genes, just 2 or 3 genes
    # can generate significant results. Therefore, by default, GSEA ignores gene sets that
    # contain fewer than 25 genes or more than 500 genes; defaults that are appropriate for
    # datasets with 10,000 to 20,000 features. To change these default values, use the Max Size
    # and Min Size parameters on the Run GSEA Page; however, keep in mind the possibility of
    # inflated scorings for very small gene sets and inaccurate normalization for large ones."

    # Build one row per sample: [sample name, score, score, ...].
    # NB : Do not use the res2d as this is the normalized score
    rows = []
    gene_set_labels = []
    for sample_name, scores in ss.resultsOnSamples.items():
        rows.append([sample_name] + list(scores))
        # Column labels come from the last sample's index, as before, but are
        # tracked inside the loop so that an empty result no longer raises
        # NameError on a leaked loop variable.
        gene_set_labels = list(scores.axes[0])
    columns = ['index'] + gene_set_labels

    # Create es file
    df_out = pd.DataFrame(rows, columns=columns).set_index('index')

    out_dir = getcwd().replace('\\', '/') + "/Out/enrichment_scores/"
    # Make sure the target directory exists; to_csv does not create it.
    makedirs(out_dir, exist_ok=True)
    df_out.to_csv(out_dir + "es_test.csv")
Example #7
0
        ("prot_culture_reps", "PC1"),
        ("prot_culture_reps", "PC2"),
        ("prot_broad_culture", "PC1"),
        ("prot_culture_reps_emt", "PC1"),
        ("prot_culture_reps_emt", "PC2"),
        ("prot_broad_culture_emt", "PC1"),
        ("prot_broad_culture_emt", "PC2"),
        ("prot_broad_culture_emt", "PC4"),
    ]

    # Run ssGSEA once per (dtype, dtype_pc) pair and per gene-set file g.
    # The input of each run is the loadings row selected by dtype_pc from
    # dsets_dred[dtype]["loadings"]. Every res2d result table is tagged with
    # geneset / dtype / dtype_pc columns before all tables are stacked.
    # Note that enr_pcs is rebound here: the comprehension iterates the
    # existing enr_pcs (pairs of dtype and dtype_pc) and the name then refers
    # to the concatenated DataFrame.
    enr_pcs = pd.concat(
        [
            gseapy.ssgsea(
                dsets_dred[dtype]["loadings"].loc[dtype_pc],
                processes=4,
                gene_sets=Enrichment.read_gmt(f"{DPATH}/pathways/{g}"),
                no_plot=True,
            ).res2d.assign(geneset=g).assign(dtype=dtype).assign(
                dtype_pc=dtype_pc).reset_index() for dtype, dtype_pc in enr_pcs
            for g in genesets
        ],
        ignore_index=True,
    )
    # Rename the score column "sample1" to "nes" and sort ascending by it.
    # NOTE(review): presumably "sample1" is the label gseapy assigns to a
    # single unnamed sample -- confirm against the gseapy version in use.
    enr_pcs = enr_pcs.rename(columns={"sample1": "nes"}).sort_values("nes")
    # Persist the table as a gzip-compressed CSV without the row index.
    enr_pcs.to_csv(f"{RPATH}/DimRed_pcs_enr.csv.gz",
                   compression="gzip",
                   index=False)

    # Plot
    enr_pcs_plt = [
        ("prot", "PC1", "prot_broad", "PC1", 0.5),
Example #8
0
        ), time.ctime()
        # gene sets for ssGSEA
        # Build {pathway: [gene, ...]} for the pathways of testing_pathway:
        # each UniProt id is mapped to a gene through uniprot2gene and kept
        # only when that gene is present in geneList.
        gene_sets = {}
        # NOTE(review): this initial value is never read; pw_list is
        # reassigned from gene_sets.keys() after the loop.
        pw_list = []
        for pw in pathwayDic[testing_pathway.lower()]:
            for uniprot in pathwayDic[testing_pathway.lower()][pw]:
                if uniprot in uniprot2gene:
                    gene = uniprot2gene[uniprot]
                    if gene in geneList:
                        if not pw in gene_sets:
                            gene_sets[pw] = []
                        gene_sets[pw].append(gene)
        pw_list = gene_sets.keys()

        # Write the gene sets to <fo_directory>/<testing_pathway>.gmt, one
        # tab-separated line per pathway: the pathway name followed by its genes.
        # NOTE(review): the usual GMT layout has a description field between
        # the set name and the genes; here the first gene directly follows the
        # name -- confirm the GMT reader used by gp.ssgsea does not drop it.
        # NOTE(review): the file is closed manually, so an exception raised
        # while writing would leave it open.
        fo = open('%s/%s.gmt' % (fo_directory, testing_pathway.lower()), 'w')
        for pw in pw_list:
            # Python 2 "print chevron" form: prints the line to the file fo.
            print >> fo, pw + '\t' + '\t'.join(gene_sets[pw])
        fo.close()

        # ssGSEA
        # Input is <fo_directory>/expression.txt, gene sets come from the .gmt
        # file written above, and results go to
        # <fo_directory>/<testing_pathway>_ssgsea_result.
        ss = gp.ssgsea(data='%s/expression.txt' % (fo_directory),
                       outdir='%s/%s_ssgsea_result' %
                       (fo_directory, testing_pathway.lower()),
                       gene_sets='%s/%s.gmt' %
                       (fo_directory, testing_pathway.lower()),
                       sample_norm_method='rank',
                       permutation_num=0,
                       no_plot=True,
                       scale=True,
                       min_size=2)
    def ssgsea(self, name: str, df: pd.DataFrame,
               gene_sets: dict) -> pd.DataFrame:
        """
        Perform single sample GSEA.

        Each gene-set collection is processed in batches of 40 gene sets; the
        per-batch results are stacked, written to
        ``<ENRICHMENT_DIR_PATH>/<name>/<collection>.csv`` and finally combined
        across collections into ``enrichment_results.csv`` in the same directory.

        Args:
            name: str, name of analysis, used to create directory and store results.
            df: pd.DataFrame, the data
            gene_sets: dict, a dict of gene sets for which to perform the analysis.
                       Structure is (gene set name) -> Path(...) of a .gmt file,
                       or (gene set name) -> an already loaded gene-set mapping.
        Returns:
            DataFrame with the per-sample scores taken from ``resultsOnSamples``
            (one row per gene set, one column per sample) and a ``gene_set``
            column indicating the gene set name.
        Raises:
            ValueError: raised by ``pd.concat`` when ``gene_sets`` is empty or a
                collection contains no gene sets.
        """
        # The output directory depends only on `name`, so resolve and create it
        # once instead of on every loop iteration; parents are created as well.
        outdir = Enricher.ENRICHMENT_DIR_PATH / name
        outdir.mkdir(parents=True, exist_ok=True)

        step_size = 40  # number of gene sets handed to gseapy per call
        dfs = []
        for name_gs, val_gs in gene_sets.items():
            if isinstance(val_gs, Path):
                geneset_dict = read_gmt(val_gs)
            else:
                geneset_dict = val_gs.copy()

            # Perform the enrichment in batches of size step_size
            genesets = list(geneset_dict.keys())
            dfs_geneset = []
            for start in range(0, len(genesets), step_size):
                stop = start + step_size
                # Slicing clips at the end of the list, so the last batch may
                # be shorter than step_size.
                genesets_batch_dict = {
                    geneset: geneset_dict[geneset]
                    for geneset in genesets[start:stop]
                }

                print(start, stop, len(genesets_batch_dict))

                ss = gp.ssgsea(data=df,
                               gene_sets=genesets_batch_dict,
                               outdir=str(outdir / name_gs),
                               sample_norm_method=self.sample_norm_method,
                               min_size=self.min_size,
                               max_size=self.max_size,
                               permutation_num=self.permutation_num,
                               scale=self.scale,
                               processes=self.processes,
                               no_plot=self.no_plot,
                               verbose=self.verbose)

                # resultsOnSamples maps sample -> scores; transpose so that
                # gene sets become rows and samples become columns.
                df_res_batch = pd.DataFrame.from_dict(
                    ss.resultsOnSamples, orient='index').transpose()
                dfs_geneset.append(df_res_batch)

            # Aggregate batches
            df_res = pd.concat(dfs_geneset, axis=0, sort=False)
            df_res['gene_set'] = name_gs

            df_res.to_csv(outdir / (name_gs + ".csv"),
                          header=True,
                          index=True)
            dfs.append(df_res)

        # Aggregate the different categories of gene sets (e.g. KEGG, GO, etc.)
        df_agg = pd.concat(dfs, axis=0, sort=False)
        df_agg.to_csv(outdir / "enrichment_results.csv",
                      header=True,
                      index=True)
        return df_agg