Beispiel #1
0
def checkOrg(org):
    """Download the KEGG pathway list of ``org`` unless it is already on disk.

    The listing is stored in ``<org>_pathway.txt`` in the current working
    directory. When that file already exists nothing is downloaded and the
    file is left untouched.

    :param org: KEGG organism code used both for the query and the file name.
    """
    # Guard clause: an existing file means there is nothing to fetch.
    if os.path.exists('./%s_pathway.txt' % org):
        print('File already existed')
        return

    listing = KEGG().list('pathway', org)
    with open('%s_pathway.txt' % org, 'a', encoding='latin-1') as handle:
        handle.write(listing)
    print('Finish writing org file')
Beispiel #2
0
    def extract_all(self):
        """Return a dictionary mapping each listed pathway identifier to its name.

        The listing is requested from KEGG with the ``pathway/hsa`` query; each
        non-empty line is split on tabs, the first field being used as key and
        the second one as value.
        """
        from bioservices import KEGG

        listing = KEGG().list("pathway/hsa")

        pathway_dict = {}
        for line in listing.split("\n"):
            # empty lines (e.g. after a trailing newline) carry no pathway
            if not line:
                continue
            fields = line.split("\t")
            pathway_dict[fields[0]] = fields[1]

        return pathway_dict
Beispiel #3
0
class KeggPathwayEnrichment():
    """KEGG pathway enrichment of an RNA-diff analysis (DRAFT IN PROGRESS)

    Current input is the output of the rnadiff analysis
    ::

        pe = KeggPathwayEnrichment("rnadiff", "eco")
        pe.barplot(pe.enrichment['down'])

        # Save all deregulated pathways found by the enrichment. One summary
        # dataframe is returned per pathway (dictionary keyed by pathway ID):
        up = pe.save_significant_pathways("up")
        down = pe.save_significant_pathways("down")
        for pathway_id, summary in up.items():
            summary.to_csv("kegg_{}_up_regulated.csv".format(pathway_id))
        for pathway_id, summary in down.items():
            summary.to_csv("kegg_{}_down_regulated.csv".format(pathway_id))

    The analysis is transparent for ecoli: set the organism to eco.
    For mus musculus, you will need to convert the gene input IDs into
    KEGG identifiers so as to compare them. A mapper can be built with
    BioMart::

        from bioservices import BioMart
        b = BioMart()
        datasets = b.get_datasets("ENSEMBL_MART_ENSEMBL")
        [x for x in datasets if 'mus' in x]
        # -> one of interest is obviously mmusculus_gene_ensembl
        attributes = b.attributes(dataset='mmusculus_gene_ensembl')
        filters = b.filters(dataset='mmusculus_gene_ensembl')

        b.new_query()
        b.add_dataset_to_xml('mmusculus_gene_ensembl')
        b.add_attribute_to_xml('ensembl_gene_id')
        b.add_attribute_to_xml('go_id')
        b.add_attribute_to_xml('entrezgene_id')
        b.add_attribute_to_xml('mgi_id')
        b.add_attribute_to_xml('external_gene_name')
        xml = b.get_xml()
        res = b.query(xml)
        # NOTE(review): the draft never showed how *res* becomes *df*;
        # presumably the tab-separated text is loaded with pandas, e.g.
        # pd.read_csv(io.StringIO(res), sep=<tab>) -- TODO confirm
        import pandas as pd
        df.columns = ['ensembl', 'go', 'entrez', 'mgi', 'name']
        df = df.set_index('ensembl')
        # name should be the name used by kegg

    The resulting dataframe can be given as the *mapper* argument: it must be
    indexed by the identifiers found in the rnadiff results and own a
    ``name`` column.
    """

    # Colours ("background,foreground") sent to the KEGG pathway viewer,
    # ordered from the least to the most significant class.
    _DOWN_COLORS = ("#FF8C00,black", "#FF0000,black", "#B22222,black")
    _UP_COLORS = ("#9ACD32,black", "#008000,black", "#006400,#000000")
    _UNCHANGED_COLOR = "grey,black"

    def __init__(self,
                 folder,
                 organism,
                 alpha=0.05,
                 log2_fc=0,
                 progress=True,
                 mapper=None,
                 background=None):
        """Load the rnadiff results, the KEGG pathways and run the enrichment

        :param folder: location given to :class:`RNADiffResults`
        :param organism: KEGG organism code (e.g. eco)
        :param alpha: forwarded to :class:`RNADiffResults`
        :param log2_fc: forwarded to :class:`RNADiffResults`
        :param progress: show a progress bar while pathways are downloaded
        :param mapper: optional dataframe with a ``name`` column used to
            translate the rnadiff identifiers into KEGG gene names
        :param background: number of genes used as background. If not
            provided, the number of lines of the KEGG gene list of the
            organism is used.
        """
        print("DRAFT in progress")
        from bioservices import KEGG
        self.kegg = KEGG(cache=True)
        self.kegg.organism = organism

        self.rnadiff = RNADiffResults(folder, alpha=alpha, log2_fc=log2_fc)

        # some clean up: drop the "gene:" prefix from all identifiers
        prefix = "gene:"
        if "ID" in self.rnadiff.df.columns:
            self.rnadiff.df['ID'] = [
                x.replace(prefix, "") for x in self.rnadiff.df['ID']
            ]
        self.rnadiff.df.index = [
            x.replace(prefix, "") for x in self.rnadiff.df.index
        ]
        for key, values in self.rnadiff.gene_lists.items():
            self.rnadiff.gene_lists[key] = [
                x.replace(prefix, "") for x in values
            ]

        if background:
            self.background = background
        else:
            # strip() so that a trailing newline is not counted as a gene
            # (same parsing as in find_pathways_by_gene)
            listing = self.kegg.list(self.kegg.organism)
            self.background = len(listing.strip().split("\n"))
        logger.info("Set number of genes to {}".format(self.background))

        self._load_pathways(progress=progress)

        self.mapper = mapper

        # Best effort: the object remains usable (plots of gene sets, pathway
        # lookup) even if the enrichment fails, but the cause is reported.
        try:
            self.compute_enrichment()
        except Exception as err:
            logger.critical(
                "An error occured while computing enrichments: {}".format(err))

    @staticmethod
    def _gene_label(gene_id, description):
        """Return the gene name of a KEGG ``GENE`` entry

        Some pathways report genes as id:'gene name; description' (e.g. eco)
        while others report them as id:'description'. In the latter case the
        identifier itself is used as the name.
        """
        if ";" in description:
            return description.split(';')[0]
        return gene_id

    def _load_pathways(self, progress=True):
        """Download and parse all pathways of the organism

        Populates :attr:`pathways` (parsed KEGG entries keyed by pathway ID
        without the ``path:`` prefix), :attr:`gene_sets` (pathway ID -> list
        of gene names) and :attr:`df_pathways` (one row per pathway).

        :param progress: animate a progress bar during the download
        """
        # This is just loading all pathways once for all
        logger.info(
            "loading all pathways from KEGG. may take time the first time")
        self.pathways = {}
        from easydev import Progress
        pathway_ids = self.kegg.pathwayIds
        pb = Progress(len(pathway_ids)) if progress else None
        for i, ID in enumerate(pathway_ids):
            self.pathways[ID.replace("path:",
                                     "")] = self.kegg.parse(self.kegg.get(ID))
            if pb is not None:
                pb.animate(i + 1)

        # Some cleanup: keep the text found before the first " - " of the
        # first NAME entry
        for res in self.pathways.values():
            res['NAME'] = res['NAME'][0].split(" - ", 1)[0]

        # save gene sets
        self.gene_sets = {}
        for ID, res in self.pathways.items():
            if "GENE" in res.keys():
                self.gene_sets[ID] = [
                    self._gene_label(gene_id, description)
                    for gene_id, description in res['GENE'].items()
                ]
            else:
                print("SKIPPED (no genes) {}: {}".format(ID, res['NAME']))

        # save all pathways info
        self.df_pathways = pd.DataFrame(self.pathways).T
        if "DBLINKS" in self.df_pathways.columns:
            go = [
                x['GO'] if isinstance(x, dict) and 'GO' in x.keys() else None
                for x in self.df_pathways.DBLINKS
            ]
        else:
            go = [None] * len(self.df_pathways)
        self.df_pathways['GO'] = go
        # errors="ignore": a field that no pathway provides has no column
        self.df_pathways = self.df_pathways.drop(
            columns=["ENTRY", "REFERENCE", "DBLINKS"], errors="ignore")

    def plot_genesets_hist(self, bins=20):
        """Plot the histogram of the gene set sizes

        :param bins: number of bins of the histogram
        """
        N = len(self.gene_sets)
        pylab.clf()
        pylab.hist([len(genes) for genes in self.gene_sets.values()],
                   bins=bins,
                   lw=1,
                   ec="k")
        pylab.title("{} gene sets".format(N))
        pylab.xlabel("Gene set sizes")
        pylab.grid(True)
        _, xmax = pylab.xlim()
        pylab.xlim([0, xmax])

    def compute_enrichment(self, background=None):
        """Compute the enrichment of the up, down and all gene lists

        Results are stored in the :attr:`enrichment` dictionary (keys: up,
        down, all).

        :param background: number of genes used as background (defaults to
            :attr:`background`)
        """
        if background is None:
            background = self.background
        # filled one category at a time so that results computed before a
        # failure remain available
        self.enrichment = {}
        for category in ("up", "down", "all"):
            self.enrichment[category] = self._enrichr(category,
                                                      background=background)

    def _enrichr(self, category, background=None, verbose=True):
        """Run gseapy.enrichr on a gene list against the KEGG gene sets

        :param category: one of up, down, all (a gene list of the rnadiff
            results) or an explicit list of genes
        :param background: number of genes used as background (defaults to
            :attr:`background`)
        :param verbose: forwarded to gseapy
        :return: the object returned by :func:`gseapy.enrichr`
        """
        if background is None:
            background = self.background

        if isinstance(category, list):
            gene_list = category
        else:
            assert category in ['up', 'down', 'all'], \
                "category must be up, down, all or a list of genes"
            gene_list = list(self.rnadiff.gene_lists[category])

        if self.mapper is not None:
            logger.info("Input gene list of {} ids".format(len(gene_list)))
            identifiers = self.mapper.loc[gene_list]['name'].drop_duplicates(
            ).values
            logger.info("Mapped gene list of {} ids".format(len(identifiers)))
            gene_list = list(identifiers)

        enr = gseapy.enrichr(gene_list=gene_list,
                             gene_sets=self.gene_sets,
                             verbose=verbose,
                             background=background,
                             outdir="test",
                             no_plot=True)

        return enr

    def _get_final_df(self, df, cutoff=0.05, nmax=10):
        """Select the most significant pathways of an enrichment result

        A copy of *df* is populated with the name and size of the pathways;
        rows with an adjusted p-value above *cutoff* are dropped and at most
        *nmax* rows (the most significant ones) are kept.

        :param df: results of an enrichment (columns Term, Genes and
            Adjusted P-value are used)
        :param cutoff: maximum adjusted p-value
        :param nmax: maximum number of pathways
        :return: dataframe sorted by decreasing adjusted p-value (convenient
            for horizontal plots: the best pathway ends up at the top)
        """
        df = df.copy()
        df['name'] = [self.pathways[x]['NAME'] for x in df.Term]
        df['size'] = [len(x.split(";")) for x in df.Genes]
        df = df.sort_values("Adjusted P-value")
        df.reset_index(drop=True, inplace=True)
        df = df[df["Adjusted P-value"] <= cutoff]

        # slicing copes with nmax being larger than the number of rows
        df = df.iloc[:nmax]
        df = df.sort_values("Adjusted P-value", ascending=False)
        return df

    def barplot(self, enrich, cutoff=0.05, nmax=10):
        """Horizontal bar plot of the most significant pathways

        :param enrich: one of the values of :attr:`enrichment`
        :param cutoff: maximum adjusted p-value
        :param nmax: maximum number of pathways
        :return: the dataframe that was plotted
        """
        df = self._get_final_df(enrich.results, cutoff=cutoff, nmax=nmax)

        pylab.clf()
        pylab.barh(range(len(df)), -pylab.log10(df['Adjusted P-value']))
        pylab.yticks(range(len(df)), df.name)
        # 1.3 is about -log10(0.05)
        pylab.axvline(1.3, lw=2, ls="--", color="r")
        pylab.grid(True)
        pylab.xlabel("Adjusted p-value (-log10)")
        pylab.ylabel("Gene sets")
        _, xmax = pylab.xlim()
        pylab.xlim([0, xmax])
        pylab.tight_layout()
        return df

    def scatterplot(self, enrich, cutoff=0.05, nmax=10, gene_set_size=None):
        """Scatter plot of the most significant pathways

        The marker size and colour depend on the number of genes found in
        each pathway.

        :param enrich: one of the values of :attr:`enrichment`
        :param cutoff: maximum adjusted p-value
        :param nmax: maximum number of pathways
        :param gene_set_size: currently unused
        :return: the dataframe that was plotted
        """
        df = self._get_final_df(enrich.results, cutoff=cutoff, nmax=nmax)

        pylab.clf()
        if len(df) == 0:
            # max() below cannot be computed on an empty selection
            logger.warning("No pathway to plot (cutoff={})".format(cutoff))
            return df

        pylab.scatter(-pylab.log10(df['Adjusted P-value']),
                      range(len(df)),
                      s=10 * df['size'],
                      c=df['size'])

        pylab.xlabel("Adjusted p-value (-log10)")
        pylab.ylabel("Gene sets")
        pylab.yticks(range(len(df)), df.name)
        _, xmax = pylab.xlim()
        pylab.xlim([0, xmax])
        pylab.grid(True)
        ax = pylab.gca()

        M = max(df['size'])
        if M > 100:
            l1, l2, l3 = "10", "100", str(M)
        else:
            l1, l2, l3 = str(round(M / 3)), str(round(M * 2 / 3)), str(M)

        handles = [
            pylab.Line2D([0], [0], marker="o", markersize=size, label=label,
                         ls="")
            for size, label in ((5, l1), (10, l2), (15, l3))
        ]
        ax.legend(handles=handles, loc="upper left", title="gene-set size")

        # 1.3 is about -log10(0.05)
        pylab.axvline(1.3, lw=2, ls="--", color="r")
        pylab.tight_layout()
        pylab.colorbar(pylab.gci())
        return df

    def _mapped_name(self, index):
        """Return the name stored in :attr:`mapper` for one identifier"""
        names = self.mapper.loc[index]['name']
        # an identifier found several times in the mapper gives a Series:
        # keep its first entry. A unique identifier gives the name itself.
        if isinstance(names, pd.Series):
            return names.iloc[0]
        return names

    def _ensure_name_column(self, df):
        """Add a ``Name`` column to *df* (in place) if it is missing

        Names come from :attr:`mapper` if provided. Otherwise the ``ID``
        column, or the index as a last resort, is used.
        """
        if 'Name' in df.columns:
            return
        if self.mapper is not None:
            df['Name'] = [self._mapped_name(index) for index in df.index]
        elif 'ID' in df.columns:
            df['Name'] = df['ID']
        else:
            df['Name'] = list(df.index)

    @staticmethod
    def _first_hit_by_name(df):
        """Map each lower-cased ``Name`` of *df* to its first (pvalue, fc)"""
        hits = {}
        for name, pvalue, fc in zip(df['Name'], df['pvalue'],
                                    df['log2FoldChange']):
            hits.setdefault(str(name).lower(), (pvalue, fc))
        return hits

    def _get_summary_pathway(self, pathway_ID):
        """Summarise the regulation of the genes of one pathway

        :param pathway_ID: a key of :attr:`pathways`
        :return: dataframe with one row per gene of the pathway and the
            columns type (``+`` up, ``-`` down, ``=`` not significant), name,
            pvalue (-log10 of the p-value), fc (log2 fold change), keggid and
            description.

        The attributes :attr:`genes`, :attr:`df_down` and :attr:`df_up` are
        updated as a side effect.
        """
        genes = self.df_pathways.loc[pathway_ID]['GENE']
        if not hasattr(genes, "items"):
            # pathways without any gene have no GENE dictionary
            genes = {}

        # NOTE: the 0.05 threshold is hard-coded and independent of the alpha
        # given to the constructor
        df_down = self.rnadiff.df.query(
            "padj<=0.05 and log2FoldChange<0").copy()
        df_up = self.rnadiff.df.query("padj<=0.05 and log2FoldChange>0").copy()

        logger.info("Total down-regulated: {}".format(len(df_down)))
        logger.info("Total up-regulated: {}".format(len(df_up)))

        # gene name -> kegg ID, names being built like in the gene sets
        name_to_keggid = {
            self._gene_label(kegg_id, description): kegg_id
            for kegg_id, description in genes.items()
        }

        self.genes = genes
        self.df_down = df_down
        self.df_up = df_up

        self._ensure_name_column(df_down)
        self._ensure_name_column(df_up)

        # Names are compared case-insensitively. The lookups are built once
        # and also provide the values, so that a name differing only by its
        # case is found as well.
        down_hits = self._first_hit_by_name(df_down)
        up_hits = self._first_hit_by_name(df_up)

        summary_names = []
        summary_keggids = []
        summary_types = []
        summary_pvalues = []
        summary_fcs = []

        for name, kegg_id in name_to_keggid.items():
            summary_names.append(name)
            summary_keggids.append(kegg_id)

            key = name.lower()
            if key in down_hits:
                pvalue, fc = down_hits[key]
                summary_pvalues.append(-pylab.log10(pvalue))
                summary_fcs.append(fc)
                summary_types.append("-")
            elif key in up_hits:
                pvalue, fc = up_hits[key]
                summary_pvalues.append(-pylab.log10(pvalue))
                summary_fcs.append(fc)
                summary_types.append("+")
            else:
                summary_pvalues.append(None)
                summary_fcs.append(None)
                summary_types.append("=")

        summary = pd.DataFrame({
            "type": summary_types,
            "name": summary_names,
            "pvalue": summary_pvalues,
            "fc": summary_fcs,
            "keggid": summary_keggids
        })
        summary['description'] = [genes[x] for x in summary.keggid]
        return summary

    def _get_colors(self, summary):
        """Return the colour of each gene of a pathway summary

        :param summary: dataframe with the columns type, pvalue and keggid
            (see :meth:`_get_summary_pathway`)
        :return: dictionary kegg ID -> "background,foreground" colours.
            Down-regulated genes are orange to dark red, up-regulated genes
            are light to dark green (pvalue below 5, below 10, others), and
            the other genes are grey.
        """
        colors = {}
        for _, row in summary.iterrows():
            type_ = row['type']
            kegg_id = row['keggid']
            if type_ == "-":
                palette = self._DOWN_COLORS
            elif type_ == "+":
                palette = self._UP_COLORS
            else:
                colors[kegg_id] = self._UNCHANGED_COLOR
                continue

            pvalue = row['pvalue']
            if pvalue > 0 and pvalue < 5:
                level = 0
            elif pvalue < 10:
                level = 1
            else:
                level = 2
            # The separator is always a plain comma: the POST request of
            # save_pathway encodes the data itself, so a pre-encoded "%2C"
            # would be sent literally.
            colors[kegg_id] = palette[level]
        return colors

    def save_pathway(self, pathway_ID, scale=None, show=False, filename=None):
        """Save the KEGG image of a pathway coloured by gene regulation

        :param pathway_ID: a key of :attr:`pathways`
        :param scale: currently unused
        :param show: currently unused
        :param filename: output PNG file (defaults to <pathway_ID>.png)
        :return: the summary dataframe of the pathway
        :raises IndexError: if no PNG image is found in the page returned by
            KEGG
        """
        summary = self._get_summary_pathway(pathway_ID)
        colors = self._get_colors(summary)

        logger.info("pathway {} total genes: {}".format(
            pathway_ID, len(summary)))
        count_up = len(summary.query("type == '+'"))
        count_down = len(summary.query("type == '-'"))
        logger.info("this pathway down-regulared genes: {}".format(count_down))
        logger.info("this pathway up-regulated genes: {}".format(count_up))

        url = "https://www.kegg.jp/kegg-bin/show_pathway"
        #dcolor = "white"  --> does not work with the post requests unlike get
        # requests
        params = {
            "map":
            pathway_ID,
            "multi_query":
            "\r\n".join(["{} {}".format(k, v) for k, v in colors.items()])
        }

        # kept for debugging purposes
        self.params = params
        import requests
        html_page = requests.post(url, data=params)

        self.tmp = html_page
        html_page = html_page.content.decode()

        # the image is referenced by a src="....png" attribute
        links_to_png = [
            x for x in html_page.split() if "png" in x and x.startswith("src")
        ]
        if not links_to_png:
            raise IndexError(
                "No PNG image found in the KEGG page of pathway {}".format(
                    pathway_ID))
        link_to_png = links_to_png[0].replace("src=", "").replace('"', '')
        r = requests.get("https://www.kegg.jp/{}".format(link_to_png))

        if filename is None:
            filename = "{}.png".format(pathway_ID)

        with open(filename, "wb") as fout:
            fout.write(r.content)

        return summary

    def save_all_pathways(self):  #pragma: no cover
        """Save the images of all pathways (no enrichment involved)"""
        # This does not do any enrichment. Just save all pathways once for all
        # with useful information
        for ID in self.kegg.pathwayIds:
            # pathways are stored without the "path:" prefix
            self.save_pathway(ID.replace("path:", ""))

    def save_significant_pathways(self,
                                  mode,
                                  cutoff=0.05,
                                  nmax=20,
                                  background=None):  #pragma: no cover
        """Save the images of the most significant pathways

        :param mode: should be up, down or all
        :param cutoff: maximum adjusted p-value
        :param nmax: maximum number of pathways to save
        :param background: number of genes used as background (defaults to
            :attr:`background`)
        :return: dictionary pathway ID -> summary dataframe. Images are
            saved in <pathway ID>_<mode>.png files.
        """

        if background is None:
            background = self.background

        # select the relevant pathways
        df = self._enrichr(mode, background).results
        df = self._get_final_df(df, cutoff=cutoff, nmax=nmax)
        logger.warning("Found {} pathways to save".format(len(df)))
        if len(df) == nmax:
            logger.warning("Restricted pathways to {}".format(nmax))

        logger.info("saving {} deregulated pathways".format(len(df)))

        summaries = {}
        # save them
        for ID in df['Term']:
            summary = self.save_pathway(ID,
                                        filename="{}_{}.png".format(ID, mode))
            summaries[ID] = summary
        return summaries

    def find_pathways_by_gene(self, gene_name, match="exact"):
        """Returns pathways that contain the gene name

        ::

            ke.find_pathways_by_gene("ysgA")

        :param gene_name: name of the gene to search for
        :param match: if set to exact, the first gene whose name is
            *gene_name* is used. Otherwise, all genes whose name contains
            *gene_name* are used.
        :return: list of pathway identifiers (empty if the gene is unknown
            or does not belong to any pathway)
        """

        # First let us find the kegg ID
        # NOTE(review): assumes each line is "<org>:<id><tab><name>; ..."
        genes = self.kegg.list(self.kegg.organism).strip().split("\n")

        keggid = [x.split("\t")[0].strip() for x in genes]
        gene_names = [x.split("\t")[1].split(";")[0].strip() for x in genes]

        self.keggid = keggid
        self.gene_names = gene_names

        if match == "exact":
            # only the first exact match is considered
            candidates = [
                x for x, y in zip(keggid, gene_names) if gene_name == y
            ][:1]
        else:
            candidates = [
                x for x, y in zip(keggid, gene_names) if gene_name in y
            ]
        # drop the organism prefix
        candidates = [x.split(":")[1] for x in candidates]

        if match != "exact":
            logger.info("Found {} candidate(s): {}".format(
                len(candidates), candidates))
        elif candidates:
            logger.info("Found {} in {}".format(gene_name, candidates[0]))
        else:
            logger.warning("{} not found".format(gene_name))

        if not candidates:
            return []

        return [
            key for key, pathway in self.pathways.items()
            if "GENE" in pathway and any(candidate in pathway['GENE']
                                         for candidate in candidates)
        ]