Beispiel #1
0
def getPathway(org, compare):
    """Group the compounds of a comparison file by the KEGG pathways they share.

    :param org: prefix of the ``<org>_pathway.txt`` file; every 5-digit number
        found in that file is treated as a pathway number of interest.
    :param compare: prefix of the ``<compare>.txt`` file read with ``read_doc``.
        The result is consumed pairwise (presumably identifier, color,
        identifier, color, ... -- confirm against ``read_doc``).
    :return: tuple ``(kegg_color, final_com)`` where ``kegg_color`` maps each
        identifier to ``"<color>,<color>"`` and ``final_com`` maps a pathway
        number (pathway key without its 3-character prefix) to the list of
        identifiers found in that pathway. Only pathways hit by more than one
        identifier and listed in the organism file are kept.
    """
    # Close the file deterministically; a set gives O(1) membership tests.
    with open('%s_pathway.txt' % org, "r") as handle:
        organism_pathways = set(re.findall(r'\d{5}', handle.read()))
    text = read_doc('%s.txt' % compare)
    s = KEGG()

    kegg_color = {}
    for i in range(0, len(text) - 1, 2):
        kegg_color[text[i]] = text[i + 1] + ',' + text[i + 1]

    compound_pathways = {}
    pathway_hits = []
    for compound_id in kegg_color:
        entry = s.get(compound_id)
        parsed = s.parse(entry)
        try:
            if 'PATHWAY' in parsed:
                compound_pathways[compound_id] = list(parsed['PATHWAY'].keys())
                pathway_hits.extend(compound_pathways[compound_id])
        except TypeError:
            # ``entry`` may not be a string here (e.g. a numeric status), so
            # convert it explicitly instead of failing inside the handler.
            print('Error:' + str(entry))

    final_com = {}
    for pathway_id, hits in Counter(pathway_hits).items():
        pathway_number = pathway_id[3:]
        if hits > 1 and pathway_number in organism_pathways:
            final_com[pathway_number] = [
                compound_id
                for compound_id, pathways in compound_pathways.items()
                if pathway_id in pathways
            ]
    return kegg_color, final_com
Beispiel #2
0
def get_pathway(pathway):
    """Fetch a KEGG pathway entry and return its name and gene table.

    :param pathway: KEGG pathway identifier passed to ``KEGG.get``.
    :return: ``(NAME, GENE)`` taken from the parsed entry, or the integer
        returned by ``KEGG.get`` unchanged when the request does not yield
        an entry (presumably an HTTP status code -- confirm with bioservices).
    :raises KeyError: if the parsed entry has no ``NAME`` or ``GENE`` field.
    """
    s = KEGG()
    data = s.get(pathway)
    if isinstance(data, int):
        return data
    dict_data = s.parse(data)
    return (dict_data['NAME'], dict_data['GENE'])
def teste4():
    """Return the PATHWAY table of the first module listed for organism "hsa".

    The COMPOUND and NAME fields are looked up as well (and discarded), so a
    module entry lacking either of them raises ``KeyError``.
    """
    kegg = KEGG()
    kegg.organism = "hsa"  # Homo sapiens (human)
    module_ids = kegg.moduleIds  # pathway modules
    entry = kegg.parse(kegg.get(module_ids[0]))
    # e.g. {'C00074': 'Phosphoenolpyruvate', ...}
    _compounds = entry["COMPOUND"]
    # e.g. {'map00010': 'Glycolysis / Gluconeogenesis', ...}
    pathways = entry["PATHWAY"]
    # e.g. ['Glycolysis (Embden-Meyerhof pathway), glucose => pyruvate']
    _module_name = entry["NAME"]
    return pathways
def Get_Drug_IDs(Brite_ID):
    """Return the unique KEGG drug identifiers referenced by a BRITE entry.

    :param Brite_ID: KEGG BRITE identifier passed to ``KEGG.get``.
    :return: sorted numpy array of the distinct ``D`` + 5 digit identifiers
        found in the ``<a>`` elements of the retrieved page.
    """
    # ``verbose`` must be the boolean False: the string "False" is truthy.
    k = KEGG(verbose=False)
    brite_page = k.get(Brite_ID)
    e = easyXML(brite_page, 'utf-8')
    # Drug identifiers appear inside anchor tags.
    anchors = e.soup.findChildren("a")
    all_drug_ids = re.findall(r"(D\d{5})", str(anchors))
    # np.unique drops the repeated identifiers and sorts the remainder.
    return np.unique(np.array(all_drug_ids))
def teste5():
    """Print the name of module M00627 if it mentions the pentose phosphate cycle.

    Prints "haha" otherwise. The module list and the REACTION field are
    accessed but not used.
    """
    kegg = KEGG()
    kegg.organism = "hsa"  # Homo sapiens (human)
    _module_ids = kegg.moduleIds  # pathway modules
    entry = kegg.parse(kegg.get("M00627"))
    module_name = entry["NAME"][0]
    _reactions = entry["REACTION"]
    message = module_name if "Pentose phosphate cycle" in module_name else "haha"
    print(message)
Beispiel #6
0
    def build_csv(self, filename=None, Nmax=None):
        """Rebuild the table of KEGG organisms and store it as ``self.df``.

        One KEGG request is made per organism, so a full rebuild is slow
        (about one hour according to the original note).

        :param filename: if given, the resulting table is also written to this
            CSV file.
        :param Nmax: for testing; stop after this many organisms.
        """
        logger.info("Retrieving the kegg organisms and their definitions")
        from bioservices import KEGG
        import pandas as pd
        k = KEGG()
        results = []
        definition = []
        for i, item in enumerate(k.organismIds):
            # Download and parse each genome entry once; both fields come
            # from the same record.
            entry = k.parse(k.get(f"gn:{item}"))
            results.append(entry['NAME'])
            definition.append(entry['DEFINITION'])
            print(i, Nmax)
            if Nmax and i + 1 >= Nmax:
                break

        # NAME is a list; its first item is "ID, shortname, taxon" where the
        # short name is only present when there are exactly three fields.
        fields = [x[0].split(",") for x in results]
        IDs = [parts[0] for parts in fields]
        taxon = [parts[-1] for parts in fields]
        names = [parts[1].strip() if len(parts) == 3 else None for parts in fields]

        df = pd.DataFrame({
            'ID': IDs,
            'taxon': taxon,
            'shortname': names,
            'definition': definition
        })
        df = df.fillna("")
        df['definition'] = [x.lower() for x in df.definition]
        df['shortname'] = [x.lower() for x in df.shortname]

        self.df = df
        if filename:
            df.to_csv(filename)
def target_paths(target_dict):
    """Map each target gene name to the KEGG pathway ids of its gene entry.

    :param target_dict: mapping of gene name to KEGG gene identifier; values
        of length 0 or 1 are treated as "no identifier available".
    :return: dict of gene name to a list of pathway ids, or to the
        placeholder string ``" "`` when the identifier is missing or the gene
        entry has no PATHWAY field.
    """
    k = KEGG(verbose=False)
    gene_path = {}

    for gene_name, hsa_id in target_dict.items():
        pathway_ids = " "  # placeholder used when nothing can be retrieved
        if len(hsa_id) > 1:
            entry = k.parse(k.get(hsa_id.lower()))
            if "PATHWAY" in entry.keys():
                pathway_ids = list(entry["PATHWAY"].keys())
        gene_path[gene_name] = pathway_ids

    return gene_path
def teste2():
    """Return the reactions of the fourth "hsa" module, split into tokens.

    The module identifier is printed first. The result maps each reaction id
    to the list obtained by splitting its description on single spaces.
    """
    kegg = KEGG()
    kegg.organism = "hsa"
    module_id = kegg.moduleIds[3]
    print(module_id)
    reactions = kegg.parse(kegg.get(module_id))["REACTION"]
    return {reac: reactions[reac].split(" ") for reac in reactions}
def teste6():
    """Collect the tokenised reactions of four fixed KEGG modules.

    Returns a dict mapping each reaction id to its description split on
    single spaces; a reaction present in several modules keeps the value
    from the last module that lists it.
    """
    kegg = KEGG()
    kegg.organism = "hsa"
    collected = {}
    for module_id in ("M00001", "M00002", "M00013", "M00034"):
        reactions = kegg.parse(kegg.get(module_id))["REACTION"]
        collected.update(
            {reac: reactions[reac].split(" ") for reac in reactions})
    return collected
Beispiel #10
0
def kegg_get(*args):
    """Return ``KEGG().get(*args)``, memoised in memory and on disk.

    The cache lives on the function object (``kegg_get.cache``) and is
    persisted to the file ``kegg_get.cache`` in the working directory after
    every new request. Cached ``None`` values are treated as misses and
    fetched again.

    :param args: positional arguments forwarded to ``KEGG.get``; the argument
        tuple is the cache key.
    :return: the cached or freshly retrieved result.
    """
    if not hasattr(kegg_get, "cache"):
        if os.path.isfile("kegg_get.cache"):
            # NOTE(security): pickle executes arbitrary code on load; only
            # use a cache file that this program wrote itself.
            with open("kegg_get.cache", "rb") as f:
                kegg_get.cache = pickle.load(f)
        else:
            kegg_get.cache = {}

    cached = kegg_get.cache.get(args)
    if cached is not None:
        return cached

    k = KEGG()
    result = k.get(*args)
    kegg_get.cache[args] = result
    # Write to a temporary file first so an interrupted dump cannot corrupt
    # the existing cache; os.replace also overwrites on Windows.
    with open("kegg_get.cache~", "wb") as f:
        pickle.dump(kegg_get.cache, f)
    os.replace("kegg_get.cache~", "kegg_get.cache")
    return result
def drug_dict(disease):
    """Return the DRUG field of a KEGG disease entry.

    :param disease: KEGG disease identifier passed to ``KEGG.get``.
    :return: the value stored under ``"DRUG"`` in the parsed entry.
    :raises KeyError: if the parsed entry has no ``"DRUG"`` field.
    """
    kegg = KEGG(verbose=False)
    # k.parse turns the flat-file text returned by k.get into a dictionary.
    parsed_entry = kegg.parse(kegg.get(disease))
    return parsed_entry["DRUG"]
Beispiel #12
0
class KeggInfo:
    """Collect KEGG gene definitions for the targets of an AnnotationTable.

    Attributes set by the constructor:
      * ``k``: the KEGG client
      * ``org``: KEGG organism code used to build gene identifiers ("lac")
      * ``a``: the AnnotationTable instance
      * ``targetdict``: result of ``a.analyze_sequences()``, listed as
        sequence: list of genes
      * ``genelist``: copy of ``a.genes``
      * ``genedict``: gene -> definition, filled by :meth:`get_info`
    """

    def __init__(self):
        self.k = KEGG()
        self.org = "lac"
        self.genelist = []
        self.genedict = {}
        self.a = AnnotationTable()
        self.targetdict = self.a.analyze_sequences()
        self.genelist.extend(self.a.genes)

    def get_info(self, gene):
        """Fetch the KEGG entry of ``gene`` and store its definition.

        The definition is stored in ``self.genedict[gene]`` without its first
        nine characters (presumably a fixed prefix -- TODO confirm).
        Orthology and Pfam motif information is extracted too but not stored
        anywhere yet.

        :param gene: gene identifier, without the organism prefix.
        :raises KeyError: if the entry has no ``DEFINITION`` field.
        """
        entry_id = self.org + ":" + gene
        d = self.k.parse(self.k.get(entry_id))
        ortho = "unknown"
        pfam = "unknown"
        definition = str(d['DEFINITION'])[9:]
        # ``in`` replaces dict.has_key(), which no longer exists in Python 3.
        if 'ORTHOLOGY' in d:
            ortho = str(d['ORTHOLOGY'])
        if 'MOTIF' in d:
            motif = d['MOTIF']
            if 'Pfam' in motif:
                pfam = str(motif['Pfam'])
        self.genedict[gene] = definition
        print(gene + " info obtained")

    def make_file(self):
        """Write one ``sequence;count;gene;gene...`` line per target sequence.

        Each gene name is written without its last two characters. The output
        path is hard-coded.
        """
        with open("/Users/brianmendoza/Desktop/CRISPRs/lac_multi_data.txt", 'w') as f:
            for sequence in self.targetdict:
                genes = self.targetdict[sequence]
                fields = [sequence, str(len(genes))]
                fields.extend(gene[0:-2] for gene in genes)
                f.write(";".join(fields) + "\n")
def drug_targets(drug_dic):
    """Collect the target genes of the drugs referenced in ``drug_dic``.

    :param drug_dic: mapping whose values each mention a KEGG drug id
        (``D`` followed by five digits); the first id of every value is used.
        Values without any drug id are skipped.
    :return: tuple ``(target_gene_dic, gene_drug)`` where ``target_gene_dic``
        maps a gene name to the text found in the first ``[...]`` group after
        it (empty string when there is none) and ``gene_drug`` maps a gene
        name to the list of drug ids targeting it.
    """
    k = KEGG(verbose=False)

    # gene name -> bracketed identifier text
    target_gene_dic = {}
    # gene name -> therapeutic drug ids
    gene_drug = {}

    # Locate the first drug id of each value; ignore values without one
    # instead of failing with IndexError.
    id_list = []
    for value in drug_dic.values():
        found = re.findall(r"(D\d{5})", str(value))
        if found:
            id_list.append(found[0])

    for drug_ID in id_list:
        d = k.parse(k.get(drug_ID))

        # Drugs without target information contribute nothing.
        if "TARGET" not in d.keys():
            continue
        targ = d["TARGET"]

        # Drop the trailing pathway section, then split into one line per gene.
        genes_section = targ.split("  PATHWAY")[0]
        for line in genes_section.split("\n            "):
            # "NAME extra [HSA:...]" -> name before the first space, id text
            # inside the first bracket group.
            bracket_parts = line.split(" [")
            gene_name = bracket_parts[0].split(" ")[0]
            if len(bracket_parts) > 1:
                target_gene_dic[gene_name] = bracket_parts[1].strip("]")
            else:
                # Gene listed without an identifier: record an empty value.
                target_gene_dic[gene_name] = ""
            gene_drug.setdefault(gene_name, []).append(drug_ID)

    return target_gene_dic, gene_drug
    # M: total number of objects,
    # n: total number of type I objects
    # N: total number of type I objects drawn without replacement
    kegg_pathways_hyper = {p: (hypergeom.sf(
        len(subnetwork_proteins.intersection(kegg_pathways_proteins[p])),
        len(network_proteins),
        len(network_proteins.intersection(kegg_pathways_proteins[p])),
        len(subnetwork_proteins)
    ), len(subnetwork_proteins.intersection(kegg_pathways_proteins[p]))) for p in kegg_pathways_proteins if len(subnetwork_proteins.intersection(kegg_pathways_proteins[p])) > 0}
    print '[INFO] KEGG pathways enrichment done'

    kegg_pathways_hyper = DataFrame(kegg_pathways_hyper, index=['pvalue', 'intersection']).T
    kegg_pathways_hyper['adj.pvalue'] = multipletests(kegg_pathways_hyper['pvalue'], method='fdr_bh')[1]
    kegg_pathways_hyper = kegg_pathways_hyper.sort('adj.pvalue', ascending=False)
    kegg_pathways_hyper = kegg_pathways_hyper[kegg_pathways_hyper['adj.pvalue'] < 0.05]
    kegg_pathways_hyper['name'] = [re.findall('NAME\s*(.*) - Mus musculus\n?', kegg.get(p))[0] for p in kegg_pathways_hyper.index]
    kegg_pathways_hyper = kegg_pathways_hyper[kegg_pathways_hyper['adj.pvalue'] != 0.0]

    # Plot PPI network of the Kegg pathways
    for set_id, set_name in kegg_pathways_hyper['name'].tail(10).to_dict().items():
        subnetwork_i.vs['label'] = [n.split('_')[0] if n in kegg_pathways_proteins[set_id] else '' for n in subnetwork_i.vs['name']]
        subnetwork_i.vs['shape'] = ['circle' for n in subnetwork_i.vs['name']]
        subnetwork_i.vs['color'] = [palette[hypothesis][1] if n in kegg_pathways_proteins[set_id] else palette[hypothesis][0] for n in subnetwork_i.vs['name']]
        subnetwork_i.vs['size'] = [17 if n in kegg_pathways_proteins[set_id] else 7 for n in subnetwork_i.vs['name']]

        igraph.plot(
            subnetwork_i,
            layout=layout,
            vertex_label_size=5,
            vertex_label_color='white',
            bbox=(0, 0, 360, 360),
Beispiel #15
0
def test_KEGGParser():
    """Exercise KEGG.parse on one entry of each database and on known issues."""
    s = KEGG()

    # One representative identifier per KEGG database: parsing must not fail.
    smoke_ids = (
        "cpd:C00001", "ds:H00001", "dr:D00001", "ev:E00001", "ec:1.1.1.1",
        "hsa:1525", "genome:T00001", "gl:G00001", "md:hsa_M00554",
        "ko:K00001", "path:hsa04914", "rc:RC00001", "rn:R00001", "rp:RP00001",
    )
    for smoke_id in smoke_ids:
        d = s.parse(s.get(smoke_id))

    first_sequence = s.parse(s.get('C15682'))['SEQUENCE'][0]
    assert first_sequence['TYPE'] == 'PK'
    assert first_sequence['GENE'] == "0-2 mycAI [UP:Q83WF0]; 3 mycAII [UP:Q83WE9]; 4-5 mycAIII [UP:Q83WE8]; 6 mycAIV [UP:Q83WE7]; 7 mycAV [UP:Q83WE6]"
    assert first_sequence['ORGANISM'] == "Micromonospora griseorubida"

    # issue #79
    first_sequence = s.parse(s.get("C00395"))["SEQUENCE"][0]
    assert first_sequence["GENE"] == '[1] 0-2 pcbAB [UP:P19787] [2] 0-2 pcbAB [UP:P27742]'
    assert first_sequence["ORGANISM"] == '[1] Penicillium chrysogenum [2] Emericella nidulans (Aspergillus nidulans [GN:ani] )'
    assert first_sequence['SEQUENCE'] == '0 Aad  1 Cys  2 Val'
    assert first_sequence['TYPE'] == "NRP"

    # issue #85: parsing these entries must return something other than
    # the raw input object.
    bad_ids = [
        'D00136', 'D05177', 'D10223', 'H00434', 'H01656', 'H01663', 'T40092',
        'T40093', 'T40094', 'T40098', 'T40100', 'T40103', 'T40107', 'T40123',
        'T40129', 'T40136', 'T40139', 'T40145', 'T40147', 'T40148', 'T40149',
        'T40161', 'T40162', 'T40180', 'T40182', 'T40183', 'T40195', 'T40196',
        'T40197', 'T40208', 'T40209', 'T40210', 'T40213', 'T40215', 'T40219',
        'T40224', 'T40226', 'T40233', 'T40236', 'T40238', 'T40242', 'T40248',
        'T40249', 'T40255', 'T40256', 'T40258', 'T40284', 'T40285', 'T40286',
        'T40287', 'T40288', 'T40289', 'T40295', 'T40303', 'T40307', 'T40314',
        'T40318', 'T40326', 'K20201'
    ]
    for bad_id in bad_ids:
        raw_entry = s.get(bad_id)
        assert s.parse(raw_entry) is not raw_entry
Beispiel #16
0
class KeggPathwayEnrichment:
    """KEGG pathway enrichment of differential-expression results.

    DRAFT IN PROGRESS

    Current input is the output of the rnadiff analysis
    ::

        pe = KeggPathwayEnrichment("rnadiff", "eco")
        pe.barplot(pe.enrichment['down'])

        # Save all deregulated pathways found by the enrichment:
        up = pe.save_significant_pathways("up")
        down = pe.save_significant_pathways("down")

    ``save_significant_pathways`` writes one PNG per pathway and returns a
    dictionary mapping each pathway ID to its summary DataFrame.

    This is transparent for ecoli: set the organism to eco. For mus musculus,
    you will need to convert the input gene IDs into KEGG identifiers so as
    to compare them, and provide the conversion table as *mapper*
    ::

        from bioservices import BioMart
        b = BioMart()
        datasets = b.get_datasets("ENSEMBL_MART_ENSEMBL")
        [x for x in datasets if 'mus' in x]
        # -> one of interest is obviously mmusculus_gene_ensembl
        attributes = b.attributes(dataset='mmusculus_gene_ensembl')
        filters = b.filters(dataset='mmusculus_gene_ensembl')

        b.new_query()
        b.add_dataset_to_xml('mmusculus_gene_ensembl')
        b.add_attribute_to_xml('ensembl_gene_id')
        b.add_attribute_to_xml('go_id')
        b.add_attribute_to_xml('entrezgene_id')
        b.add_attribute_to_xml('mgi_id')
        b.add_attribute_to_xml('external_gene_name')
        xml = b.get_xml()
        res = b.query(xml)
        import pandas as pd
        # load ``res`` into a DataFrame named ``df`` first, then:
        df.columns=['ensembl','go', 'entrez', 'mgi', 'name']
        df = df.set_index('ensembl')
        # name should be the name used by kegg

    """
    def __init__(self,
                 folder,
                 organism,
                 alpha=0.05,
                 log2_fc=0,
                 progress=True,
                 mapper=None,
                 background=None):
        """Load the rnadiff results and all KEGG pathways, then run enrichment.

        :param folder: passed to ``RNADiffResults``.
        :param organism: KEGG organism code (e.g. "eco").
        :param alpha: passed to ``RNADiffResults``.
        :param log2_fc: passed to ``RNADiffResults``.
        :param progress: show a progress bar while loading pathways.
        :param mapper: optional DataFrame indexed by the input gene IDs with
            a ``name`` column holding the names used by KEGG.
        :param background: number of genes used as background; defaults to
            the number of genes KEGG lists for the organism.
        """
        print("DRAFT in progress")
        from bioservices import KEGG
        self.kegg = KEGG(cache=True)
        self.kegg.organism = organism

        self.rnadiff = RNADiffResults(folder, alpha=alpha, log2_fc=log2_fc)

        # Some clean up: drop the "gene:" prefix from every identifier.
        if "ID" in self.rnadiff.df.columns:
            self.rnadiff.df['ID'] = [
                x.replace("gene:", "") for x in self.rnadiff.df['ID']
            ]
        self.rnadiff.df.index = [
            x.replace("gene:", "") for x in self.rnadiff.df.index
        ]
        for key, values in self.rnadiff.gene_lists.items():
            self.rnadiff.gene_lists[key] = [
                x.replace("gene:", "") for x in values
            ]

        if background:
            self.background = background
        else:
            # strip() so that a trailing newline does not count as a gene
            # (same handling as in find_pathways_by_gene).
            self.background = len(
                self.kegg.list(self.kegg.organism).strip().split("\n"))
        logger.info("Set number of genes to {}".format(self.background))

        self._load_pathways(progress=progress)

        self.mapper = mapper

        # Best effort: the object stays usable (pathways are loaded) even if
        # the enrichment fails, but the reason is reported.
        try:
            self.compute_enrichment()
        except Exception as err:
            logger.critical(
                "An error occured while computing enrichments: {}".format(err))

    def _load_pathways(self, progress=True):
        """Download and parse every pathway of the organism.

        Fills ``self.pathways`` (ID -> parsed entry, NAME reduced to the text
        before " - "), ``self.gene_sets`` (ID -> gene names) and
        ``self.df_pathways`` (one row per pathway).
        """
        # This is just loading all pathways once for all
        logger.info(
            "loading all pathways from KEGG. may take time the first time")
        self.pathways = {}
        from easydev import Progress
        pathway_ids = self.kegg.pathwayIds
        pb = Progress(len(pathway_ids))
        for i, ID in enumerate(pathway_ids):
            self.pathways[ID.replace("path:",
                                     "")] = self.kegg.parse(self.kegg.get(ID))
            if progress:
                pb.animate(i + 1)

        # Some cleanup: keep only the part of the name before " - ".
        for entry in self.pathways.values():
            entry['NAME'] = entry['NAME'][0].split(" - ", 1)[0]

        # save gene sets
        self.gene_sets = {}
        for ID, res in self.pathways.items():
            if "GENE" in res.keys():
                results = []
                # some pathways report genes as a dictionary
                # id: 'gene name; description' (e.g. eco), others report
                # genes as a dictionary id: 'description'
                for geneID, description in res['GENE'].items():
                    if ";" in description:
                        name = description.split(';')[0]
                    else:
                        name = geneID
                    results.append(name)

                self.gene_sets[ID] = results
            else:
                print("SKIPPED (no genes) {}: {}".format(ID, res['NAME']))

        # save all pathways info
        self.df_pathways = pd.DataFrame(self.pathways).T
        del self.df_pathways["ENTRY"]
        del self.df_pathways["REFERENCE"]
        go = [
            x['GO'] if isinstance(x, dict) and 'GO' in x.keys() else None
            for x in self.df_pathways.DBLINKS
        ]
        self.df_pathways['GO'] = go
        del self.df_pathways["DBLINKS"]

    def plot_genesets_hist(self, bins=20):
        """Plot the histogram of gene-set sizes."""
        N = len(self.gene_sets.keys())
        pylab.clf()
        pylab.hist([len(v) for v in self.gene_sets.values()],
                   bins=bins,
                   lw=1,
                   ec="k")
        pylab.title("{} gene sets".format(N))
        pylab.xlabel("Gene set sizes")
        pylab.grid(True)
        _, upper = pylab.xlim()
        pylab.xlim([0, upper])

    def compute_enrichment(self, background=None):
        """Run the enrichment for the 'up', 'down' and 'all' gene lists.

        Results are stored in ``self.enrichment`` under those three keys.
        """
        if background is None:
            background = self.background
        self.enrichment = {}
        for category in ("up", "down", "all"):
            self.enrichment[category] = self._enrichr(category,
                                                      background=background)

    def _enrichr(self, category, background=None, verbose=True):
        """Call ``gseapy.enrichr`` on a gene list against the KEGG gene sets.

        :param category: 'up', 'down' or 'all' (looked up in the rnadiff gene
            lists), or an explicit list of genes.
        :param background: background size; defaults to ``self.background``.
        :return: the object returned by ``gseapy.enrichr``.
        """
        if background is None:
            background = self.background

        if isinstance(category, list):
            gene_list = category
        else:
            assert category in ['up', 'down', 'all']
            gene_list = list(self.rnadiff.gene_lists[category])

        if self.mapper is not None:
            # Translate the input identifiers into the names used by KEGG.
            logger.info("Input gene list of {} ids".format(len(gene_list)))
            identifiers = self.mapper.loc[gene_list]['name'].drop_duplicates(
            ).values
            logger.info("Mapped gene list of {} ids".format(len(identifiers)))
            gene_list = list(identifiers)

        enr = gseapy.enrichr(gene_list=gene_list,
                             gene_sets=self.gene_sets,
                             verbose=verbose,
                             background=background,
                             outdir="test",
                             no_plot=True)

        return enr

    def _get_final_df(self, df, cutoff=0.05, nmax=10):
        """Return the *nmax* most significant rows of an enrichment table.

        A ``name`` column (pathway name) and a ``size`` column (number of
        genes in ``Genes``) are added. Only rows whose adjusted p-value is
        <= *cutoff* are kept; the result is sorted by decreasing adjusted
        p-value so that the most significant row comes last.
        """
        df = df.copy()
        df['name'] = [self.pathways[x]['NAME'] for x in df.Term]
        df['size'] = [len(x.split(";")) for x in df.Genes]
        df = df.sort_values("Adjusted P-value")
        df.reset_index(drop=True, inplace=True)
        df = df[df["Adjusted P-value"] <= cutoff]

        if len(df) < nmax:
            nmax = len(df)
        df = df.iloc[0:nmax]
        df = df.sort_values("Adjusted P-value", ascending=False)
        return df

    def barplot(self, enrich, cutoff=0.05, nmax=10):
        """Horizontal bar plot of -log10(adjusted p-value) per pathway.

        :param enrich: an enrichment result exposing a ``results`` DataFrame.
        :return: the DataFrame that was plotted.
        """
        df = self._get_final_df(enrich.results, cutoff=cutoff, nmax=nmax)

        pylab.clf()
        pylab.barh(range(len(df)), -pylab.log10(df['Adjusted P-value']))
        pylab.yticks(range(len(df)), df.name)
        # 1.3 is about -log10(0.05)
        pylab.axvline(1.3, lw=2, ls="--", color="r")
        pylab.grid(True)
        pylab.xlabel("Adjusted p-value (log10)")
        pylab.ylabel("Gene sets")
        _, upper = pylab.xlim()
        pylab.xlim([0, upper])
        pylab.tight_layout()
        return df

    def scatterplot(self, enrich, cutoff=0.05, nmax=10, gene_set_size=None):
        """Scatter plot of -log10(adjusted p-value) per pathway.

        Marker size and color encode the number of genes found in each set.

        :param enrich: an enrichment result exposing a ``results`` DataFrame.
        :param gene_set_size: currently unused.
        :return: the DataFrame that was plotted (possibly empty).
        """
        df = self._get_final_df(enrich.results, cutoff=cutoff, nmax=nmax)

        pylab.clf()
        if len(df) == 0:
            # Nothing passes the cutoff: max() below would fail on an empty
            # column, so leave the cleared figure and stop here.
            logger.warning("No enriched gene set below the cutoff; nothing to plot")
            return df

        pylab.scatter(-pylab.log10(df['Adjusted P-value']),
                      range(len(df)),
                      s=10 * df['size'],
                      c=df['size'])

        # The x axis shows the same quantity as in barplot().
        pylab.xlabel("Adjusted p-value (log10)")
        pylab.ylabel("Gene sets")
        pylab.yticks(range(len(df)), df.name)
        _, upper = pylab.xlim()
        pylab.xlim([0, upper])
        pylab.grid(True)
        ax = pylab.gca()

        M = max(df['size'])
        if M > 100:
            l1, l2, l3 = "10", "100", str(M)
        else:
            l1, l2, l3 = str(round(M / 3)), str(round(M * 2 / 3)), str(M)

        handles = [
            pylab.Line2D([0], [0], marker="o", markersize=5, label=l1, ls=""),
            pylab.Line2D([0], [0], marker="o", markersize=10, label=l2, ls=""),
            pylab.Line2D([0], [0], marker="o", markersize=15, label=l3, ls="")
        ]
        ax.legend(handles=handles, loc="upper left", title="gene-set size")

        pylab.axvline(1.3, lw=2, ls="--", color="r")
        pylab.tight_layout()
        pylab.colorbar(pylab.gci())
        return df

    def _get_summary_pathway(self, pathway_ID):
        """Summarise the regulation status of every gene of a pathway.

        :return: DataFrame with one row per gene of the pathway and columns
            ``type`` ("-" down, "+" up, "=" otherwise), ``name``, ``pvalue``
            (-log10 of the p-value), ``fc`` (log2 fold change), ``keggid``
            and ``description``.
        """
        genes = self.df_pathways.loc[pathway_ID]['GENE']
        df_down = self.rnadiff.df.query(
            "padj<=0.05 and log2FoldChange<0").copy()
        df_up = self.rnadiff.df.query("padj<=0.05 and log2FoldChange>0").copy()

        logger.info("Total down-regulated: {}".format(len(df_down)))
        logger.info("Total up-regulated: {}".format(len(df_up)))

        # gene name (text before ';' in the description) -> KEGG gene id
        name_to_keggid = {}
        for k, v in genes.items():
            name_to_keggid[v.split(";")[0]] = k
        self.genes = genes
        self.df_down = df_down
        self.df_up = df_up

        if self.mapper is not None:
            for table in (df_down, df_up):
                if 'Name' not in table.columns:
                    table['Name'] = table['ID']
                    # NOTE(review): the trailing [0] takes the first element
                    # of whatever mapper.loc[index]['name'] returns; if that
                    # is a plain string this is its first character. Confirm
                    # the expected shape of the mapper.
                    table['Name'] = [
                        self.mapper.loc[index]['name'][0]
                        for index in table.index
                    ]

        # Lower-cased names are computed once instead of once per gene.
        # NOTE(review): membership is case-insensitive but the row lookup
        # below is case-sensitive; names differing only by case would fail
        # at ``.values[0]``.
        down_names = {x.lower() for x in df_down.Name}
        up_names = {x.lower() for x in df_up.Name}

        summary_names = []
        summary_keggids = []
        summary_types = []
        summary_pvalues = []
        summary_fcs = []
        for name, kegg_id in name_to_keggid.items():
            summary_names.append(name)
            summary_keggids.append(kegg_id)

            if name.lower() in down_names:
                row = df_down.query("Name==@name")
                summary_pvalues.append(-pylab.log10(row.pvalue.values[0]))
                summary_fcs.append(row.log2FoldChange.values[0])
                summary_types.append("-")
            elif name.lower() in up_names:
                row = df_up.query("Name==@name")
                summary_pvalues.append(-pylab.log10(row.pvalue.values[0]))
                summary_fcs.append(row.log2FoldChange.values[0])
                summary_types.append("+")
            else:
                summary_pvalues.append(None)
                summary_fcs.append(None)
                summary_types.append("=")

        summary = pd.DataFrame({
            "type": summary_types,
            "name": summary_names,
            "pvalue": summary_pvalues,
            "fc": summary_fcs,
            "keggid": summary_keggids
        })
        summary['description'] = [
            self.pathways[pathway_ID]['GENE'][x] for x in summary.keggid
        ]
        return summary

    def _get_colors(self, summary):
        """Map each KEGG gene id to a "background,foreground" color pair.

        Down-regulated genes get orange/red shades and up-regulated genes get
        green shades, darker as the ``pvalue`` column (-log10 scale) grows;
        all other genes are grey.
        """
        colors = {}
        for _, row in summary.iterrows():
            pvalue = row['pvalue']
            type_ = row['type']
            kegg_id = row['keggid']
            if type_ == "-":
                if pvalue > 0 and pvalue < 5:
                    colors[kegg_id] = "#FF8C00,black"
                elif pvalue < 10:
                    colors[kegg_id] = "#FF0000,black"
                else:
                    # A plain comma like every other entry: the values are
                    # form-encoded when posted, so a pre-encoded "%2C" would
                    # be encoded a second time.
                    colors[kegg_id] = "#B22222,black"
            elif type_ == "+":
                if pvalue > 0 and pvalue < 5:
                    colors[kegg_id] = "#9ACD32,black"
                elif pvalue < 10:
                    colors[kegg_id] = "#008000,black"
                else:
                    colors[kegg_id] = "#006400,#000000"
            else:
                colors[kegg_id] = "grey,black"
        return colors

    def save_pathway(self, pathway_ID, scale=None, show=False, filename=None):
        """Download the KEGG map of a pathway colored by regulation status.

        :param pathway_ID: pathway identifier without the "path:" prefix.
        :param scale: currently unused.
        :param show: currently unused.
        :param filename: output PNG file; defaults to ``<pathway_ID>.png``.
        :return: the summary DataFrame of the pathway.
        """
        summary = self._get_summary_pathway(pathway_ID)
        colors = self._get_colors(summary)

        logger.info("pathway {} total genes: {}".format(
            pathway_ID, len(summary)))
        count_up = len(summary.query("type == '+'"))
        count_down = len(summary.query("type == '-'"))
        logger.info("this pathway down-regulared genes: {}".format(count_down))
        logger.info("this pathway up-regulated genes: {}".format(count_up))

        url = "https://www.kegg.jp/kegg-bin/show_pathway"
        # dcolor = "white" does not work with post requests, unlike get
        # requests
        params = {
            "map":
            pathway_ID,
            "multi_query":
            "\r\n".join(["{} {}".format(k, v) for k, v in colors.items()])
        }

        self.params = params
        import requests
        html_page = requests.post(url, data=params)

        self.tmp = html_page
        html_page = html_page.content.decode()

        # The returned page references the colored image in a src="...png"
        # attribute.
        links_to_png = [
            x for x in html_page.split() if "png" in x and x.startswith("src")
        ]
        link_to_png = links_to_png[0].replace("src=", "").replace('"', '')
        r = requests.get("https://www.kegg.jp/{}".format(link_to_png))

        if filename is None:
            filename = "{}.png".format(pathway_ID)

        with open(filename, "wb") as fout:
            fout.write(r.content)

        return summary

    def save_all_pathways(self):  #pragma: no cover
        """Save the colored map of every pathway (no enrichment involved)."""
        for ID in self.kegg.pathwayIds:
            # self.pathways / self.df_pathways are keyed without the "path:"
            # prefix, so strip it here as _load_pathways does.
            self.save_pathway(ID.replace("path:", ""))

    def save_significant_pathways(self,
                                  mode,
                                  cutoff=0.05,
                                  nmax=20,
                                  background=None):  #pragma: no cover
        """Save the maps of the significantly enriched pathways.

        :param mode: should be up, down or all.
        :return: dict mapping each saved pathway ID to its summary DataFrame.
        """
        if background is None:
            background = self.background

        # select the relevant pathways
        df = self._enrichr(mode, background).results
        df = self._get_final_df(df, cutoff=cutoff, nmax=nmax)
        logger.warning("Found {} pathways to save".format(len(df)))
        if len(df) == nmax:
            logger.warning("Restricted pathways to {}".format(nmax))

        logger.info("saving {} deregulated pathways".format(len(df)))

        summaries = {}
        # save them
        for ID in df['Term']:
            summaries[ID] = self.save_pathway(
                ID, filename="{}_{}.png".format(ID, mode))
        return summaries

    def find_pathways_by_gene(self, gene_name, match="exact"):
        """Returns pathways that contain the gene name

        ke.find_pathways_by_gene("ysgA")

        :param gene_name: name to search for in the organism gene list.
        :param match: "exact" uses the first gene whose name equals
            *gene_name*; any other value uses every gene whose name contains
            *gene_name*.
        :return: list of pathway IDs (unordered); empty if nothing matches.
        """
        # First let us find the kegg ID
        genes = self.kegg.list(self.kegg.organism).strip().split("\n")

        keggid = [x.split("\t")[0].strip() for x in genes]
        gene_names = [x.split("\t")[1].split(";")[0].strip() for x in genes]

        self.keggid = keggid
        self.gene_names = gene_names

        if match == "exact":
            candidate = None
            for x, y in zip(keggid, gene_names):
                if gene_name == y:
                    candidate = x.split(":")[1]
                    break
            if candidate is None:
                # Previously an empty list was used as a dictionary key here,
                # which raised TypeError.
                logger.warning("Gene {} not found in organism {}".format(
                    gene_name, self.kegg.organism))
                return []
            logger.info("Found {} in {}".format(gene_name, candidate))
            candidates = [candidate]
        else:
            candidates = [
                x.split(":")[1] for x, y in zip(keggid, gene_names)
                if gene_name in y
            ]
            logger.info("Found {} candidate(s): {}".format(
                len(candidates), candidates))

        paths = set()
        for key, pathway in self.pathways.items():
            if "GENE" in pathway:
                gene_ids = pathway['GENE'].keys()
                if any(candidate in gene_ids for candidate in candidates):
                    paths.add(key)
        return list(paths)
Beispiel #17
0
    sita = ab_dict[gene][-1]
    sita_q = 'ncbi-proteinid:{g}'.format(g=sita[:-2])
    if sita_q in convDb:
        ab_dict[gene].append(convDb[sita_q])
        annotated.append(gene)
    else:
        no_joy_for_sita.append(sita)
counter = 0

# Get the KEGG orthology (KO) of every annotated gene and append it to the
# gene's record in ab_dict.

for gene in annotated:
    counter += 1
    # Only records holding exactly four fields (the last one being the KEGG
    # gene id) are looked up.
    if len(ab_dict[gene]) == 4:
        call, conf, sita, kSita = ab_dict[gene]
        keggObj = s.get(kSita)
        keggParse = s.parse(keggObj)
        if 'ORTHOLOGY' in keggParse.keys():
            ko = list(keggParse['ORTHOLOGY'].keys())
        else:
            ko = ['None']
        # The previous message template was malformed and raised ValueError
        # instead of reporting the offending gene.
        assert len(ko) == 1, 'ko:{k}\ngene:{g}'.format(k=ko, g=gene)
        ab_dict[gene].append(ko[0])
    # progress report every 1000 genes
    if (counter % 1000) == 0:
        print('Finshed:{c}'.format(c=counter))

# Persist the enriched dictionary; the context manager closes the file even
# if pickling fails.
with open('/home/ndh0004/Documents/keggPthCor/gene_dictv2.pckl', 'wb') as f:
    pickle.dump(ab_dict, f)
"""
Created on Thu May 01 20:57:44 2018

@author: Tina Lai
"""
#TARGETS Quiz Instructions
#Based on the KEGG id ("br:br08329"), generate a list of target genes and their pathways.

from bioservices import KEGG
from bioservices import easyXML
import re
import numpy as np
import os

# ``verbose`` must be the boolean False: the string "False" is non-empty and
# therefore truthy, so it does not switch verbosity off.
k = KEGG(verbose=False)
k_id = k.get("br:br08329")  # Fetch the KEGG BRITE entry br08329.

# Wrap the response in easyXML so that its tags can be searched.
# ('utf=8' was a typo for the 'utf-8' encoding name.)
e = easyXML(k_id, 'utf-8')
# The drug IDs sit inside <a> tags, so collect every <a> element.
results = e.soup.findChildren("a")

# Extract every drug ID (a "D" followed by five digits) from those elements.
# An ID is returned once per occurrence, so the list contains duplicates.
all_drug_ids = re.findall(r"(D\d{5})", str(results))

# Deduplicate with numpy: convert the list to an array first ...
array = np.array(all_drug_ids)
# ... then keep each drug ID once (np.unique returns the sorted unique values).
unique_drug_ids = np.unique(array)
class MetabolicNetwork(MyGraph):
    """Build graphs from the REACTION sections of KEGG modules.

    Module entries are fetched and parsed through a ``KEGG`` service object
    (``self.s``). Every graph-building method adds its edges to the same
    ``MyGraph`` instance, ``self.gr``, so edges accumulate across calls.
    """

    def __init__(self, modules, organism="hsa"):
        """Store the module selection and set up the KEGG service.

        Parameters
        ----------
        modules : list or any other value
            KEGG module identifiers to use. Any value that is not a list is
            replaced by ``self.s.moduleIds`` the first time modules are
            needed.
        organism : str
            KEGG organism code assigned to the service ("hsa", Homo sapiens,
            by default).
        """
        MyGraph.__init__(self, {})
        self.gr = MyGraph()
        self.modules = modules
        self.s = KEGG()
        self.s.organism = organism  # Homo sapiens as default

    def __module_ids(self):
        """Return the module list, falling back to every module of the service.

        ``isinstance`` is used so that list subclasses are kept as given;
        anything that is not a list is replaced by ``self.s.moduleIds`` and
        stored back on ``self.modules``.
        """
        if not isinstance(self.modules, list):
            self.modules = self.s.moduleIds
        return self.modules

    def __kegg_dic(self):
        """Map each reaction key to its space-split reaction string.

        Modules whose parsed entry has no 'REACTION' key are skipped.

        Returns
        -------
        dict
            Reaction key -> list of tokens, e.g.
            'R01015': ['C00111', '->', 'C00118']
            'R01070': ['C05378', '->', 'C00111', '+', 'C00118']
        """
        dic_reac = {}
        for mod in self.__module_ids():
            try:
                dic = self.s.parse(self.s.get(mod))
                reactions = dic['REACTION']
                for reac in reactions:
                    dic_reac[reac] = reactions[reac].split(" ")
            except KeyError:
                # Best effort: a module without reactions contributes nothing.
                pass
        return dic_reac

    def __feeds(self, v, m):
        """Tell whether the tail of reaction ``v`` matches the head of ``m``.

        If the second-to-last token of ``v`` is "->", ``v`` ends in a single
        compound, which is compared with the first token of ``m``. Otherwise
        the third-to-last and last tokens of ``v`` are joined with "+" and
        compared with tokens 0 and 2 of ``m`` joined the same way; an ``m``
        too short to have a token 2 never matches.

        NOTE(review): token 1 of ``m`` is not checked to be "+", so a
        two-compound tail can also match an ``m`` of the form "A -> B".
        """
        # Indices are computed from the length (not written as negative
        # literals) so that very short token lists behave as they always did.
        last = len(v) - 1
        if v[last - 1] == "->":
            return v[last] == m[0]
        produced = "+".join([str(v[last - 2]), str(v[last])])
        try:
            consumed = "+".join([str(m[0]), str(m[2])])
        except IndexError:
            return False
        return produced == consumed

    def c_c_graph(self):  ### comp-comp
        """Add one compound -> compound edge per reaction and print the graph.

        The left and right sides of each reaction become one node each; a
        side with two compounds is named "<first>+<second>". Only the token
        layouts "A -> B", "A -> B + C", "A + B -> C" and "A + B -> C + D"
        are interpreted; the layout is decided from the second token and the
        number of tokens. Reactions whose second token is neither "+" nor
        "->" are ignored.

        Returns whatever ``self.gr.printGraph()`` returns.
        """
        gr = self.gr
        for comp in self.__kegg_dic().values():
            if comp[1] == "+":
                substrates = "+".join([str(comp[0]), str(comp[2])])
                if len(comp) >= 7:
                    # Two compounds on the right-hand side as well.
                    products = "+".join([str(comp[4]), str(comp[6])])
                else:
                    products = comp[4]
                gr.addEdge(substrates, products)
            elif comp[1] == "->":
                if len(comp) >= 5:
                    products = "+".join([str(comp[2]), str(comp[4])])
                    gr.addEdge(comp[0], products)
                else:
                    gr.addEdge(comp[0], comp[2])
        return gr.printGraph()

    def r_r_graph(self):  ### reac-reac
        """Add reaction -> reaction edges and print the graph.

        Every ordered pair of reactions (including a reaction paired with
        itself) is tested; an edge ``k -> r`` is added when the tail of
        reaction ``k`` matches the head of reaction ``r``.

        Returns whatever ``self.gr.printGraph()`` returns.
        """
        dic_reac = self.__kegg_dic()
        gr = self.gr
        for k, v in dic_reac.items():
            for r, m in dic_reac.items():
                if self.__feeds(v, m):
                    gr.addEdge(k, r)
        return gr.printGraph()

    def r_c_graph(self):  ### reac-comp
        """Add reaction/compound edges and print the graph.

        For every ordered pair of reactions ``(k, r)`` where the tail of
        ``k`` matches the head of ``r``, three edges are added:
        ``k -> <tokens of k concatenated>``, ``<tokens of k concatenated>
        -> r`` and ``r -> <tokens of r concatenated>``.

        Returns whatever ``self.gr.printGraph()`` returns.
        """
        dic_reac = self.__kegg_dic()
        gr = self.gr
        for k, v in dic_reac.items():
            for r, m in dic_reac.items():
                if self.__feeds(v, m):
                    sv = "".join(v)
                    sm = "".join(m)
                    gr.addEdge(k, sv)
                    gr.addEdge(sv, r)
                    gr.addEdge(r, sm)
        return gr.printGraph()

    def modules_name(self):
        """Print "<module id>-<name>" for every module."""
        for i in self.__module_ids():
            dic = self.s.parse(self.s.get(i))
            # e.g. ['Glycolysis (Embden-Meyerhof pathway), glucose => pyruvate']
            name = dic["NAME"][0]
            print("-".join([i, name]))

    def compounds_name(self):
        """Print each module id followed by its "<compound id>-<name>" lines."""
        for i in self.__module_ids():
            print(i)
            dic = self.s.parse(self.s.get(i))
            # Dictionary of compound names, e.g.
            # {'C00074': 'Phosphoenolpyruvate', ...}
            comps = dic["COMPOUND"]
            for key in comps.keys():
                print("-".join([key, comps[key]]))

    def pathway_name(self):
        """Print "<pathway id>-<name>" for every pathway of every module."""
        for i in self.__module_ids():
            dic = self.s.parse(self.s.get(i))
            # e.g. {'map00010': 'Glycolysis / Gluconeogenesis', ...}
            pathway = dic["PATHWAY"]
            for key in pathway.keys():
                print("-".join([key, pathway[key]]))

    def nodes_degree(self):
        """Return ``self.gr.allDegrees()``."""
        return self.gr.allDegrees()

    def clustering(self):
        """Return ``self.gr.allClusteringCoefs()``."""
        return self.gr.allClusteringCoefs()

    def connections(self, n1, n2):
        """Return ``self.gr.distance(n1, n2)`` for the two given nodes."""
        return self.gr.distance(n1, n2)
Beispiel #20
0
def test_KEGGParser():
    """Parse one entry from each KEGG database, then check the C15682 fields."""
    kegg = KEGG()

    # Smoke test: fetching and parsing each entry must simply not raise.
    entry_ids = (
        "cpd:C00001",
        "ds:H00001",
        "dr:D00001",
        "ev:E00001",
        "ec:1.1.1.1",
        "hsa:1525",
        "genome:T00001",
        "gl:G00001",
        "md:hsa_M00554",
        "ko:K00001",
        "path:hsa04914",
        "rc:RC00001",
        "rn:R00001",
        "rp:RP00001",
    )
    for entry_id in entry_ids:
        kegg.parse(kegg.get(entry_id))

    # Content check on the first SEQUENCE record of compound C15682.
    parsed = kegg.parse(kegg.get('C15682'))
    first_sequence = parsed['SEQUENCE'][0]
    assert first_sequence['TYPE'] == 'PK'
    assert first_sequence['GENE'] =="0-2 mycAI [UP:Q83WF0]; 3 mycAII [UP:Q83WE9]; 4-5 mycAIII[UP:Q83WE8]; 6 mycAIV [UP:Q83WE7]; 7 mycAV [UP:Q83WE6]"
    assert first_sequence['ORGANISM'] == "Micromonospora griseorubida"
Beispiel #21
0
def test_KEGGParser():
    s = KEGG()
    d = s.parse(s.get("cpd:C00001"))
    d = s.parse(s.get("ds:H00001"))
    d = s.parse(s.get("dr:D00001"))
    d = s.parse(s.get("ev:E00001"))
    d = s.parse(s.get("ec:1.1.1.1"))
    d = s.parse(s.get("hsa:1525"))
    d = s.parse(s.get("genome:T00001"))
    d = s.parse(s.get("gl:G00001"))
    d = s.parse(s.get("md:hsa_M00554"))
    d = s.parse(s.get("ko:K00001"))
    d = s.parse(s.get("path:hsa04914"))
    d = s.parse(s.get("rc:RC00001"))
    d = s.parse(s.get("rn:R00001"))
    d = s.parse(s.get("rp:RP00001"))