Beispiel #1
0
    def test_extract_protein_interactions_kgml(self, kgml_file,
                                               expected_no_rel):
        """Check how many relations are extracted from a KGML fixture.

        ``kgml_file`` is resolved relative to this test module and
        ``expected_no_rel`` is the expected length of the extractor result.
        The KEGG and UniProt lookups are replaced by canned return values.
        """
        # Arrange
        sut = KeggProteinInteractionsExtractor()
        fixture_dir = os.path.dirname(os.path.realpath(__file__))
        with open(os.path.join(fixture_dir, kgml_file), 'r') as handle:
            kgml_string = handle.read()

        # Replace the KEGG operations with canned answers
        kegg_stub = KEGG()
        sut.kegg = kegg_stub

        # Whatever the input, link() answers with ko -> hsa pairs
        kegg_stub.link = MagicMock(
            return_value="ko:K00922	hsa:5293\n"
                         "ko:K00922	hsa:5291\n"
                         "ko:K02649	hsa:5295")

        # Whatever the input, conv() answers with an hsa -> uniprot mapping
        kegg_stub.conv = MagicMock(return_value={"hsa:5293": "up:B0LPE5"})

        # Replace the UniProt mapping call as well
        uniprot_stub = UniProt()
        sut.uniprot = uniprot_stub
        uniprot_stub.mapping = MagicMock(
            return_value={"B0LPE5": ["gene1", "gene2"]})

        # Act
        relations = sut.extract_protein_interactions_kgml(kgml_string)

        # Assert
        self.assertEqual(expected_no_rel, len(relations))
Beispiel #2
0
def downloadPathway(kegg_color, final_com, downloaded, pair):
    """Download coloured KEGG pathway images that are not yet on disk.

    For every pathway id in ``final_com`` that is not in ``downloaded``,
    the pathway page is requested with its compounds coloured according to
    ``kegg_color``, the PNG location is scraped from the returned HTML and
    the image is written to ``./<client>/<pair>/<org><path>.png``.

    Relies on the module-level names ``org`` and ``client``.

    Parameters
    ----------
    kegg_color : dict
        Maps each compound id to the colour specification passed to KEGG.
    final_com : dict
        Maps a pathway id to the compound ids to highlight in it.
    downloaded : container
        Pathway ids that must be skipped.
    pair : str
        Sub-directory name used in the output path.
    """
    i = 1
    s = KEGG()
    for path in final_com:
        if path in downloaded:
            continue
        print('loading...%d' % i)
        i += 1
        keggid = {key: kegg_color[key] for key in final_com[path]}
        image_url = s.show_pathway("path:%s%s" % (org, path),
                                   dcolor="white",
                                   keggid=keggid)
        with urllib.request.urlopen(image_url) as response:
            page = response.read().decode()
        listurl = re.findall(r'tmp.*png', page)
        if path == '01100' or path == '01110':
            # These two pathways get a second, narrower match that ends
            # five characters after the organism code; '.png' is re-added.
            listurl = re.findall(r'tmp.*%s.{5}' % org, listurl[0])
            url = 'http://www.kegg.jp/' + listurl[0] + '.png'
        else:
            url = 'http://www.kegg.jp/' + listurl[0]
        # Fetch the image first so a failed download leaves no empty file.
        with urllib.request.urlopen(url) as response:
            buf = response.read()
        with open('./%s/%s/%s%s.png' % (client, pair, org, path),
                  "wb") as f:
            f.write(buf)
Beispiel #3
0
def getPathway(org, compare):
    """Group the ids of a comparison by the KEGG pathways they share.

    Reads all five-digit numbers from ``<org>_pathway.txt`` and the entries
    returned by ``read_doc('<compare>.txt')`` (even positions are KEGG ids,
    each followed by the value stored for it, presumably a colour). Every
    id is looked up in KEGG; a pathway is kept when more than one id refers
    to it and its number occurs in the organism file.

    Returns
    -------
    tuple
        ``(kegg_color, final_com)``. ``kegg_color`` maps each id to
        ``"<value>,<value>"``. ``final_com`` maps each retained pathway
        number (pathway key without its first three characters) to the
        list of ids that reference it.
    """
    with open('%s_pathway.txt' % org, "r") as handle:
        org_pathways = re.findall(r'\d{5}', handle.read())
    text = read_doc('%s.txt' % compare)
    kegg_color = {}
    entry_paths = {}
    path_hits = []
    s = KEGG()
    final_com = {}
    for i in range(0, len(text) - 1, 2):
        newid = text[i]  # 'cpd:' + text[i]
        kegg_color[newid] = text[i + 1] + ',' + text[i + 1]
    for entry_id in kegg_color:
        a = s.get(entry_id)
        dic = s.parse(a)
        try:
            if 'PATHWAY' in dic:
                entry_paths[entry_id] = list(dic['PATHWAY'].keys())
                path_hits.extend(entry_paths[entry_id])
        except TypeError:
            # ``a`` need not be a string here; str() keeps the report from
            # raising a second TypeError.
            print('Error:' + str(a))
    shared = [
        name for name, count in Counter(path_hits).items()
        if count > 1 and name[3:] in org_pathways
    ]
    for name in shared:
        newpath = name[3:]
        final_com[newpath] = []
        for entry_id in entry_paths:
            if name in entry_paths[entry_id]:
                final_com[newpath].append(entry_id)
    return kegg_color, final_com
Beispiel #4
0
def get_pdb_id_by_name_gene(gene_name):
    """Return ``(gene_ids, pdb_ids)`` for a human gene name.

    The gene name is searched in the ``hsa`` database via ``kegg_find``;
    the first tab-separated field of every non-empty result line is taken
    as a gene id. Each gene id is then fetched with ``kegg_get`` and the
    whitespace-separated entries of its ``STRUCTURE``/``PDB`` field are
    collected. ``([], [])`` is returned when the search yields 400/404 or
    more than 100 gene ids.
    """
    k = KEGG()

    # Resolve the gene name to KEGG gene ids
    gen = kegg_find("hsa", gene_name)
    if gen in [400, 404]:
        return [], []
    gene_ids = [row.split("\t")[0] for row in gen.split("\n") if len(row) > 0]

    if len(gene_ids) > 100:
        return [], []

    # Collect the PDB ids of every gene id found
    pdb_ids = []
    for gene_id in gene_ids:
        e = kegg_get(gene_id)
        if e in [400, 404]:
            continue
        d = k.parse(e)
        if "STRUCTURE" in d:
            pdb_ids.extend(d["STRUCTURE"]["PDB"].split())
    return gene_ids, pdb_ids
Beispiel #5
0
def kegg_to_uniprot(fr='hsa', cache=False):
    """Download a mapping between a `KEGG` database and `UniProt`.

    The result of ``KEGG.conv(fr, 'uniprot')`` is regrouped: every value of
    that mapping becomes a key, and the part after the first ``':'`` of
    each corresponding original key is appended to its list.

    Parameters
    ----------
    fr : str, optional, default: 'hsa'
        KEGG database identifier to convert.

    cache : bool, optional, default: False
        Passed to the `bioservices` ``KEGG`` constructor. Cached results
        save time but may miss newer database releases.

    Returns
    -------
    `dict`
        Mapping from identifier to a list of accessions with the prefix
        (e.g. ``'up:'``) removed.
    """
    kegg = KEGG(cache=cache)

    parsed_mapping = {}
    for upid, org in kegg.conv(fr, 'uniprot').items():
        accession = upid.split(':')[1]  # drop the 'up:' prefix
        parsed_mapping.setdefault(org, []).append(accession)
    return parsed_mapping
Beispiel #6
0
def get_pathway(pathway):
    """Fetch a KEGG pathway entry and return ``(NAME, GENE)``.

    When ``KEGG.get`` returns an ``int`` instead of entry text, that
    integer is returned unchanged.
    """
    s = KEGG()
    data = s.get(pathway)
    if type(data) != int:
        entry = s.parse(data)
        return (entry['NAME'], entry['GENE'])
    return data
def Get_Drug_IDs(Brite_ID):
    """Return the unique KEGG drug ids referenced by a BRITE entry.

    The entry is fetched with ``KEGG.get``, wrapped in ``easyXML``, and
    every ``D`` + five-digit token found in the string form of its ``<a>``
    children is collected.

    Returns
    -------
    numpy.ndarray
        The distinct drug ids as produced by ``numpy.unique``.
    """
    # The boolean False is required: the string "False" is truthy.
    k = KEGG(verbose=False)
    k_id = k.get(Brite_ID)
    e = easyXML(k_id, 'utf-8')
    results = e.soup.findChildren("a")
    all_drug_ids = re.findall(r"(D\d{5})", str(results))
    return np.unique(np.array(all_drug_ids))
def teste4():
    """Return the ``PATHWAY`` field of the first KEGG module for ``hsa``.

    The ``COMPOUND`` and ``NAME`` fields are looked up as well but their
    values are not used.
    """
    s = KEGG()
    s.organism = "hsa"  # Homo sapiens (human)
    first_module = s.moduleIds[0]  # pathway modules
    entry = s.parse(s.get(first_module))
    # e.g. {'C00074': 'Phosphoenolpyruvate', ...}
    _compounds = entry["COMPOUND"]
    # e.g. {'map00010': 'Glycolysis / Gluconeogenesis', ...}
    pathway = entry["PATHWAY"]
    # e.g. ['Glycolysis (Embden-Meyerhof pathway), glucose => pyruvate']
    _module_name = entry["NAME"]
    return pathway
Beispiel #9
0
def checkOrg(org):
    """Create ``<org>_pathway.txt`` with KEGG's pathway list if it is missing.

    Prints a message saying whether the file was written or already there.
    """
    if os.path.exists('./%s_pathway.txt' % org):
        print('File already existed')
        return
    listing = KEGG().list('pathway', org)
    with open('%s_pathway.txt' % org, 'a', encoding='latin-1') as f:
        f.write(listing)
    print('Finish writing org file')
Beispiel #10
0
 def __init__(self):
     """Create the KEGG client and load genes from an ``AnnotationTable``."""
     self.k = KEGG()
     self.org = "lac"
     self.genelist = []
     self.genedict = {}
     self.a = AnnotationTable()
     # listed as sequence: list of genes
     self.targetdict = self.a.analyze_sequences()
     self.genelist.extend(self.a.genes)
Beispiel #11
0
    def __init__(self, gene_lists, taxon, dataframe,
                 kegg_organism=None,
                 enrichment_params=None,
                 go_only=False,
                 kegg_only=False,
                 command=""
                 ):
        """.. rubric:: constructor

        :param gene_lists: stored as ``self.gene_lists``.
        :param taxon: taxonomy id; 10090 selects KEGG organism ``mmu`` and
            9606 selects ``hsa``. Any other value requires *kegg_organism*.
        :param dataframe: stored as ``self.data``.
        :param kegg_organism: KEGG organism code used when *taxon* is
            neither 10090 nor 9606; it is validated by assigning it to a
            ``bioservices.KEGG`` instance.
        :param enrichment_params: dictionary of options. When omitted, a
            fresh dictionary of defaults is built for this instance. Only
            ``preload_directory`` is read in this constructor.
        :param go_only: forwarded to ``create_report_content``.
        :param kegg_only: forwarded to ``create_report_content``.
        :param command: stored as ``self.command``.
        :raises NotImplementedError: if *taxon* is not 10090/9606 and
            *kegg_organism* is ``None``.
        """
        super().__init__()
        self.title = "Enrichment"

        # Build the defaults per call: a dict literal in the signature would
        # be one object shared (and mutated) across all instances.
        if enrichment_params is None:
            enrichment_params = {
                "padj": 0.05,
                "log2_fc": 3,
                "max_entries": 3000,
                "kegg_background": None,
                "mapper": None,
                "preload_directory": None,
                'plot_compute_levels': False,
                'plot_logx': True
            }

        self.command = command
        self.gene_lists = gene_lists
        self.enrichment_params = enrichment_params
        self.data = dataframe
        self.taxon = taxon
        if taxon == 10090:
            self.organism = "mmu"
        elif taxon == 9606:
            self.organism = "hsa"
        else:
            if kegg_organism is None:
                logger.error("You must specify the kegg organism name if not human or mouse: eg., eco for ecoli")
                # figure out the organism from taxon
                raise NotImplementedError
            else:
                from bioservices import KEGG
                k = KEGG()
                k.organism = kegg_organism  # validates the organism name
                self.organism = kegg_organism

        if self.enrichment_params['preload_directory']:
            pathname = self.enrichment_params['preload_directory']
            if not os.path.exists(pathname):
                logger.error(f"{pathname} does not exist")
                sys.exit(1)

        self.rnadiff = {}

        self.create_report_content(go_only=go_only, kegg_only=kegg_only)
        self.create_html("enrichment.html")
def teste5():
    """Print the name of module M00627 if it mentions the pentose phosphate cycle.

    Otherwise ``"haha"`` is printed. The module list and the ``REACTION``
    field are looked up but not used.
    """
    s = KEGG()
    s.organism = "hsa"  # Homo sapiens (human)
    _modules = s.moduleIds  # pathway modules
    entry = s.parse(s.get("M00627"))
    module_name = entry["NAME"][0]
    _reactions = entry["REACTION"]
    print(module_name if "Pentose phosphate cycle" in module_name else "haha")
def target_paths(target_dict):
    """Map every target name to the KEGG pathway ids of its gene entry.

    ``target_dict`` maps a target name to a KEGG gene id string. Values of
    length 0 or 1, and entries without a ``PATHWAY`` field, are mapped to
    the placeholder ``" "``; all others are mapped to the list of keys of
    the entry's ``PATHWAY`` field.
    """
    k = KEGG(verbose=False)
    gene_path = {}
    target_names = list(target_dict.keys())

    # Names and gene ids are walked in lockstep, in dictionary order
    for name, hsa_id in zip(target_names, target_dict.values()):
        # Placeholder where no usable id is available
        if len(hsa_id) <= 1:
            gene_path[name] = " "
            continue

        # Fetch and parse the gene's KEGG page
        entry = k.parse(k.get(hsa_id.lower()))

        if "PATHWAY" in entry.keys():
            gene_path[name] = list(entry["PATHWAY"].keys())
        else:
            gene_path[name] = " "

    return gene_path
def teste6():
    """Collect the reactions of four fixed KEGG modules.

    Returns a dictionary whose keys are the keys of each module's
    ``REACTION`` field and whose values are the corresponding strings
    split on single spaces.
    """
    s = KEGG()
    s.organism = "hsa"
    dic_reac = {}
    for mod in ["M00001", "M00002", "M00013", "M00034"]:
        reactions = s.parse(s.get(mod))["REACTION"]
        for reac in reactions:
            dic_reac[reac] = reactions[reac].split(" ")
    return dic_reac
Beispiel #15
0
    def extract_all(self):
        """Return a dict built from KEGG's ``pathway/hsa`` listing.

        Each non-empty line of the listing is split on tabs; the first
        field becomes the key and the second field the value.
        """
        from bioservices import KEGG
        listing = KEGG().list("pathway/hsa")

        pathway_dict = {}
        for row in listing.split("\n"):
            if not row:
                continue
            fields = row.split("\t")
            pathway_dict[fields[0]] = fields[1]

        return pathway_dict
def teste2():
    """Print the fourth ``hsa`` module id and return its reactions.

    The returned dictionary has the keys of the module's ``REACTION``
    field; each value is the corresponding string split on single spaces.
    """
    s = KEGG()
    s.organism = "hsa"
    fourth_module = s.moduleIds[3]
    print(fourth_module)
    reactions = s.parse(s.get(fourth_module))["REACTION"]
    dic_reac = {}
    for reac in reactions:
        dic_reac[reac] = reactions[reac].split(" ")
    return dic_reac
Beispiel #17
0
    def __init__(self,
                 folder,
                 organism,
                 alpha=0.05,
                 log2_fc=0,
                 progress=True,
                 mapper=None,
                 background=None):
        """Load differential-expression results and compute KEGG enrichment.

        :param folder: passed to ``RNADiffResults``.
        :param organism: KEGG organism code assigned to the KEGG client.
        :param alpha: passed to ``RNADiffResults``.
        :param log2_fc: passed to ``RNADiffResults``.
        :param progress: passed to ``_load_pathways``.
        :param mapper: stored as ``self.mapper``.
        :param background: number of genes used as background. When falsy,
            the number of newline-separated items returned by
            ``kegg.list(organism)`` is used instead.

        Errors raised by ``compute_enrichment`` are logged and swallowed.
        """
        print("DRAFT in progress")
        from bioservices import KEGG
        self.kegg = KEGG(cache=True)
        self.kegg.organism = organism

        self.rnadiff = RNADiffResults(folder, alpha=alpha, log2_fc=log2_fc)

        # Clean-up: strip the "gene:" prefix from the ID column, the index
        # and every gene list.
        if "ID" in self.rnadiff.df.columns:
            self.rnadiff.df['ID'] = [
                x.replace("gene:", "") for x in self.rnadiff.df['ID']
            ]
        self.rnadiff.df.index = [
            x.replace("gene:", "") for x in self.rnadiff.df.index
        ]
        for key, values in self.rnadiff.gene_lists.items():
            self.rnadiff.gene_lists[key] = [
                x.replace("gene:", "") for x in values
            ]

        if background:
            self.background = background
        else:
            self.background = len(
                self.kegg.list(self.kegg.organism).split("\n"))
        logger.info("Set number of genes to {}".format(self.background))

        self._load_pathways(progress=progress)

        self.mapper = mapper

        # Best effort: a failing enrichment must not prevent construction.
        try:
            self.compute_enrichment()
        except Exception:
            logger.critical("An error occured while computing enrichments")
def drug_dict(disease):
    """Return the ``DRUG`` field of a KEGG disease entry.

    The entry is fetched with ``KEGG.get`` and turned into a dictionary
    with ``KEGG.parse``.
    """
    k = KEGG(verbose=False)
    entry = k.parse(k.get(disease))
    return entry["DRUG"]
Beispiel #19
0
def kegg_find(*args):
    """Return ``KEGG().find(*args)``, cached in memory and on disk.

    The cache is a dictionary keyed by the argument tuple. It is kept on
    the function object (``kegg_find.cache``) and persisted to the file
    ``kegg_find.cache`` in the working directory after every new lookup.
    Entries whose stored value is ``None`` are looked up again.
    """
    if not hasattr(kegg_find, "cache"):
        if os.path.isfile("kegg_find.cache"):
            # NOTE: unpickling executes code embedded in the file; this is
            # only safe while the cache file is written by this function.
            with open("kegg_find.cache", "rb") as f:
                kegg_find.cache = pickle.load(f)
        else:
            kegg_find.cache = {}

    cached = kegg_find.cache.get(args)
    if cached is not None:
        return cached

    result = KEGG().find(*args)
    kegg_find.cache[args] = result
    # Write to a temporary name first so an interrupted dump cannot
    # corrupt the existing cache file.
    with open("kegg_find.cache~", "wb") as f:
        pickle.dump(kegg_find.cache, f)
    os.rename("kegg_find.cache~", "kegg_find.cache")
    return result
def tcell_read_metabolomics_data():
    """Download the metabolomics Excel sheet and return it as a dataframe.

    The workbook is fetched through ``cache.UrlFileCache``. Columns whose
    name carries a ``.1`` suffix are merged into their base column with a
    geometric mean, all values are passed through ``np.exp2``, and the rows
    are regrouped by ``KEGG_ID`` with ``one_row_per_compound_convert``
    using the KEGG ``conv("chebi", "compound")`` mapping. The result is
    indexed by ``MetaboliteID``.
    """
    xlsx_path = os.path.join(cache.get_cache_path(),
                             metabolite_expression_name + ".xlsx")
    tcell_metabol_xls = cache.UrlFileCache(xlsx_path, metabolomics_data_url)
    metabolomics_df = pd.read_excel(tcell_metabol_xls.get_file_name(),
                                    sheet_name="normalized by sample mean",
                                    index_col=0,
                                    usecols="A,C:HN",
                                    skiprows=[0])
    for col in metabolomics_df.columns:
        # Average all technical replicates (named by a trailing ".1")
        name_parts = col.split('.')
        if len(name_parts) > 1 and name_parts[1] == "1":
            remcol = name_parts[0]
            metabolomics_df[remcol] = scipy.stats.gmean(
                metabolomics_df[[remcol, col]], axis=1)
            metabolomics_df.drop(col, axis=1, inplace=True)
    metabolomics_df.index.name = "KEGG_ID"
    # The Excel data is in log2 space; bring it back to linear space
    metabolomics_df = metabolomics_df.apply(np.exp2)
    k = KEGG(verbose=False)
    map_kegg_chebi = k.conv("chebi", "compound")
    grouped = metabolomics_df.groupby("KEGG_ID", group_keys=False)
    metabolomics_df = grouped.apply(
        lambda x: one_row_per_compound_convert(x, map_kegg_chebi)
    ).reset_index(drop=True)
    metabolomics_df.set_index("MetaboliteID", inplace=True)
    return metabolomics_df
Beispiel #21
0
class KeggInfo:
    """Collect KEGG definitions for the genes of an ``AnnotationTable``.

    Written so that it runs unchanged on Python 2 and Python 3.
    """

    def __init__(self):
        """Create the KEGG client and load sequences and genes."""
        self.k = KEGG()
        self.org = "lac"
        self.genelist = []
        self.genedict = {}
        self.a = AnnotationTable()
        # listed as sequence: list of genes
        self.targetdict = self.a.analyze_sequences()
        for gene in self.a.genes:
            self.genelist.append(gene)

    def get_info(self, gene):
        """Fetch the KEGG entry ``<org>:<gene>`` and store its definition.

        The string form of the ``DEFINITION`` field, without its first nine
        characters, is stored in ``self.genedict[gene]``.
        """
        gene_id = self.org + ":" + gene
        res = self.k.get(gene_id)
        d = self.k.parse(res)
        # ortho/pfam are extracted but currently unused; they were only
        # needed by a debugging printout.
        ortho = "unknown"
        motif = "unknown"
        pfam = "unknown"
        definition = str(d['DEFINITION'])
        definition = definition[9:]
        if 'ORTHOLOGY' in d:
            ortho = str(d['ORTHOLOGY'])
        if 'MOTIF' in d:
            motif = d['MOTIF']
            if 'Pfam' in motif:
                pfam = str(motif['Pfam'])
            else:
                pfam = "unknown"
        self.genedict[gene] = definition
        print(gene + " info obtained")

    def make_file(self):
        """Write one ``sequence;count;gene;...`` line per target sequence.

        Every gene is written without its last two characters.
        """
        with open("/Users/brianmendoza/Desktop/CRISPRs/lac_multi_data.txt", 'w') as f:
            for sequence in self.targetdict:
                sequenceLine = sequence + ";" + str(len(self.targetdict[sequence]))
                for gene in self.targetdict[sequence]:
                    sequenceLine += ";" + gene[0:-2]  # self.genedict[gene]
                f.write(sequenceLine + "\n")
Beispiel #22
0
    def __init__(self, info_path):
        """Build the "New Genome" window from ``NewGenome.ui`` and wire its widgets.

        :param info_path: stored as ``self.info_path``.

        The order of the statements matters: the endonuclease combo box is
        connected to ``endo_settings`` before ``fillEndo()`` runs.
        """
        super(NewGenome, self).__init__()
        uic.loadUi('NewGenome.ui', self)
        self.setWindowTitle('New Genome')
        self.k = KEGG()
        self.info_path = info_path

        # --- Button and widget wiring ---
        self.setWindowIcon(Qt.QIcon("cas9image.png"))
        self.whatsthisButton.clicked.connect(self.whatsthisclicked)
        self.KeggSearchButton.clicked.connect(self.updatekegglist)
        self.resetButton.clicked.connect(self.reset)
        self.submitButton.clicked.connect(self.submit)
        self.browseForFile.clicked.connect(self.selectFasta)
        self.NCBI_File_Search.clicked.connect(self.prep_ncbi_search)
        self.JobsQueueBox.setReadOnly(True)
        self.output_browser.setText("Waiting for program initiation...")
        self.CompletedJobs.setText(" ")
        self.contButton.clicked.connect(self.continue_to_main)

        self.comboBoxEndo.currentIndexChanged.connect(self.endo_settings)

        self.runButton.clicked.connect(self.run_jobs)
        self.clearButton.clicked.connect(self.clear_job_queue)

        # Statistics button starts out disabled
        self.viewStatButton.setEnabled(False)

        self.JobsQueue = []  # holds Job classes.
        self.Endos = dict()
        self.file = ""

        # External process with stdout and stderr merged into one channel
        self.process = QtCore.QProcess()
        self.process.setProcessChannelMode(QtCore.QProcess.MergedChannels)
        self.process.finished.connect(self.upon_process_finishing)
        self.seqTrans = SeqTranslate()

        self.first = False
        # Populate the endonuclease widgets; the window is not shown here
        self.fillEndo()
        #self.show()

        self.num_chromo_next = False
Beispiel #23
0
def download_pathway_ids(organism, cache=False):
    """
    Ask KEGG for the current pathway identifiers of an organism.

    Parameters
    ----------
    organism: str
        A KEGG organism code, for example 'hsa'.

    cache : bool, optional, default: False
        Passed to the `bioservices` ``KEGG`` constructor. Cached results
        save time but may miss newer database releases.

    Returns
    -------
    `list`
        The value of ``KEGG.pathwayIds`` for the organism.
    """
    service = KEGG(cache=cache)
    service.organism = organism
    return service.pathwayIds
Beispiel #24
0
def search_kegg(accessions):
    """Count the KEGG pathways associated with a series of gene accessions.

    Every non-null accession is split on ``':'`` and its second and first
    parts are passed, in that order, to ``KEGG.get_pathway_by_gene``.
    The digits of every returned pathway key are turned into a ``map<id>``
    accession.

    Parameters
    ----------
    accessions : pandas.Series
        Accession strings; null values are ignored.

    Returns
    -------
    pandas.DataFrame
        Columns ``accession``, ``description`` and ``count``, one row per
        distinct pathway, sorted by descending count. Empty when no
        pathway was found.
    """
    # Local import: ``pandas.compat.StringIO`` was removed from pandas.
    import io

    start_time = datetime.datetime.now()

    with yaspin(text="Retrieving KEGG annotations...", color="cyan") as sp:
        path = KEGG()  # one client for all accessions
        rows = []
        for accession in accessions.dropna():
            res = accession.split(":")
            try:
                for k, val in path.get_pathway_by_gene(res[1], res[0]).items():
                    _id = re.search(r"\d+", k).group(0)
                    rows.append(f"map{_id}\t\"{val}\"\n")
            except AttributeError:
                # Raised when the lookup result has no ``items`` or a key
                # contains no digits; such accessions are skipped.
                pass
        raw_data = "".join(rows)
        try:
            kegg = pandas.read_csv(io.StringIO(raw_data),
                                   sep="\t",
                                   header=None)
            kegg.columns = ["accession", "description"]

            # Add column of counts.
            kegg["count"] = kegg.groupby("accession")["accession"].transform(
                "count")
            kegg = (kegg.drop_duplicates(subset="accession").sort_values(
                by="count", ascending=False).reset_index(drop=True))
            mssg = f"* Found {sum(kegg['count'])} KEGG pathways from which {len(kegg)} were unique."
        except pandas.errors.EmptyDataError:
            kegg = pandas.DataFrame()
            mssg = "* Found 0 KEGG Pathways."

        time_diff = (datetime.datetime.now() - start_time).total_seconds()

        sp.text = f"Retrieving KEGG annotations => Task done in {time_diff} seconds."
        sp.ok("✔")
        print(mssg)
        return kegg
Beispiel #25
0
    def build_csv(self, filename=None, Nmax=None):
        """Rebuild the entire dataframe (1 hour) and store it as ``self.df``.

        For every KEGG organism id the ``gn:<id>`` entry is fetched once and
        its ``NAME`` and ``DEFINITION`` fields are kept. The first item of
        ``NAME`` is split on commas into ID, optional short name (only when
        there are exactly three parts) and taxon.

        :param filename: if given, the dataframe is also saved as CSV.
        :param Nmax: for testing; stop after this many organisms.
        """
        logger.info("Retrieving the kegg organisms and their definitions")
        from bioservices import KEGG
        import pandas as pd
        k = KEGG()
        results = []
        definition = []
        for i, item in enumerate(k.organismIds):
            # One request per organism; both fields come from the same entry.
            entry = k.parse(k.get(f"gn:{item}"))
            results.append(entry['NAME'])
            definition.append(entry['DEFINITION'])
            print(i, Nmax)
            if Nmax and i + 1 >= Nmax:
                break

        fields = [x[0].split(",") for x in results]
        IDs = [parts[0] for parts in fields]
        taxon = [parts[-1] for parts in fields]
        names = [
            parts[1].strip() if len(parts) == 3 else None
            for parts in fields
        ]

        df = pd.DataFrame({
            'ID': IDs,
            'taxon': taxon,
            'name': names,
            'def': definition
        })
        df = df.fillna("")
        df.columns = ['ID', 'taxon', 'shortname', 'definition']
        df['definition'] = [x.lower() for x in df.definition]
        df['shortname'] = [x.lower() for x in df.shortname]

        self.df = df
        if filename:
            df.to_csv(filename)
Beispiel #26
0
from bioservices import ChEMBL, QuickGO, Reactome, KEGG
from py2neo import Graph

from model.core import *
from ncbi import fetch_publication_list
from quickgo import fetch_quick_go_data
from uniprot import *

# Neo4j connection; host and password come from the environment.
graph = Graph(
    host=os.environ.get("DB", "localhost"),
    bolt=True,
    password=os.environ.get("NEO4J_PASSWORD", ""),
)

# Web-service clients, all created with verbose output switched off.
chembl = ChEMBL(verbose=False)
quick_go = QuickGO(verbose=False)
reactome = Reactome(verbose=False)
kegg = KEGG(verbose=False)

# watch("neo4j.bolt")

# Module-level lookup tables, all empty at import time.
gene_dict = {}
transcript_dict = {}
pseudogene_dict = {}
cds_dict = {}
exon_dict = {}
rrna_dict = {}
trna_dict = {}
ncrna_dict = {}
location_dict = {}
go_term_set = set()

target_protein_ids_csv = "data/drugbank/all_target_polypeptide_ids.csv"
Beispiel #27
0
def search_organism(organism):
    """Return the result of ``KEGG.lookfor_organism`` for ``organism``."""
    return KEGG().lookfor_organism(organism)
Beispiel #28
0
def test_KEGGParser():
    """Parse one entry of each KEGG database, then check a polyketide entry."""
    s = KEGG()
    sample_entries = (
        "cpd:C00001",
        "ds:H00001",
        "dr:D00001",
        "ev:E00001",
        "ec:1.1.1.1",
        "hsa:1525",
        "genome:T00001",
        "gl:G00001",
        "md:hsa_M00554",
        "ko:K00001",
        "path:hsa04914",
        "rc:RC00001",
        "rn:R00001",
        "rp:RP00001",
    )
    # Parsing must simply not raise for any of these
    for entry in sample_entries:
        s.parse(s.get(entry))

    first_sequence = s.parse(s.get('C15682'))['SEQUENCE'][0]
    assert first_sequence['TYPE'] == 'PK'
    assert first_sequence['GENE'] =="0-2 mycAI [UP:Q83WF0]; 3 mycAII [UP:Q83WE9]; 4-5 mycAIII[UP:Q83WE8]; 6 mycAIV [UP:Q83WE7]; 7 mycAV [UP:Q83WE6]"
    assert first_sequence['ORGANISM'] == "Micromonospora griseorubida"
Beispiel #29
0
def test_KEGGParser():
    """Fetch and parse one entry of each KEGG database; none may raise."""
    s = KEGG()
    sample_entries = (
        "cpd:C00001",
        "ds:H00001",
        "dr:D00001",
        "ev:E00001",
        "ec:1.1.1.1",
        "hsa:1525",
        "genome:T00001",
        "gl:G00001",
        "md:hsa_M00554",
        "ko:K00001",
        "path:hsa04914",
        "rc:RC00001",
        "rn:R00001",
        "rp:RP00001",
    )
    for entry in sample_entries:
        s.parse(s.get(entry))
Beispiel #30
0
def kegg():
    """Return a KEGG client with ``organism`` set to ``"hsa"``.

    ``organismIds`` is accessed before the organism is assigned.
    """
    service = KEGG()
    service.organismIds
    service.organism = "hsa"
    return service
Beispiel #31
0
"""
import logging
import collections

import matplotlib.pyplot as plt
from matplotlib import gridspec
import numpy as np
from scipy.stats import linregress
from scipy.sparse import issparse
from bioservices import KEGG

# Logging setup: configure the root logger at INFO level and create the
# module-level logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Shared KEGG client for this module, created at import time.
k = KEGG()
k.settings.TIMEOUT = 1000  # raise the request timeout (presumably seconds; confirm)

COMPLETE_BACT_MEDIUM = [  #letort c et al, 2001 : https://mediadb.systemsbiology.net/defined_media/media/322/
    "C00568",  #4-Aminobenzoate
    "C00147",  #Adenine
    "C00041",  #Alanine
    "C01342",
    "C00158",  #Ammonium citrate
    "C00062",  #Arginine
    "C00072",  #Ascorbate
    "C00152",  #Asparagine
    "C00049",  #Aspartate
    "C00120",  #Biotin
    "C08130",
    "C00076",
Beispiel #32
0
    at any time and I'll get back to you with instructions on how to use
    it.
'''

import os
import click
import json
import requests
import time
import xmltodict
import bioservices
from bioservices import KEGG, ChEBI
from zeep import Client
from tqdm import tqdm

# Service clients created at import time. The KEGG "chebi"/"compound"
# conversion table is downloaded once here and kept for the whole module.
k = KEGG(verbose=False)
map_kegg_chebi = k.conv("chebi", "compound")
c = ChEBI(verbose=False)

# SOAP clients built from the ChEBI and ChemSpider WSDL descriptions.
chebi_client = Client(
    "https://www.ebi.ac.uk/webservices/chebi/2.0/webservice?wsdl")
chemspider_client = Client("https://www.chemspider.com/InChI.asmx?WSDL")

# Compounds that could not be found at all.
not_founds = []

# A global dictionary is needed for these annotations, so that the freely
# provided web services are not queried more often than necessary.
# NOTE(review): a `global` statement at module level has no effect and does
# not create the name; CONVERTED_COMPOUNDS must be assigned elsewhere.

global CONVERTED_COMPOUNDS
Beispiel #33
0
import io
import logging
import os.path as op
from collections import defaultdict

from bioservices import KEGG
from slugify import Slugify

import ssbio.utils
from ssbio.protein.sequence.seqprop import SeqProp

# Module-level logger.
log = logging.getLogger(__name__)
# Slugifier that treats '-' and '_' as safe characters.
custom_slugify = Slugify(safe_chars='-_')
# Shared bioservices KEGG client, created once at import time.
bs_kegg = KEGG()


class KEGGProp(SeqProp):
    def __init__(self, seq, id, fasta_path=None, txt_path=None, gff_path=None):
        SeqProp.__init__(self,
                         seq=seq,
                         id=id,
                         sequence_path=fasta_path,
                         metadata_path=txt_path,
                         feature_path=gff_path)
        self.kegg = id

    @SeqProp.metadata_path.setter
    def metadata_path(self, m_path):
        """Provide pointers to the paths of the metadata file

        Args:
def drug_targets(drug_dic):
    """Collect the target genes of the drugs listed in ``drug_dic``.

    The first ``D`` + five-digit id found in the string form of each value
    of ``drug_dic`` is fetched from KEGG. For entries with a ``TARGET``
    field, the text before ``"  PATHWAY"`` is split into gene lines.

    Returns
    -------
    tuple
        ``(target_gene_dic, gene_drug)``. ``target_gene_dic`` maps a gene
        name to the text that follows ``" ["`` on its line (trailing ``]``
        stripped), or maps the whole target text to ``""`` when the
        ``TARGET`` field contains no space. ``gene_drug`` maps a gene name
        to the list of drug ids that target it.
    """
    k = KEGG(verbose=False)

    target_gene_dic = {}  # gene name -> text following " ["
    gene_drug = {}        # gene name -> therapeutic drug ids

    # First drug id mentioned in each value
    id_list = [
        re.findall(r"(D\d{5})", str(value))[0] for value in drug_dic.values()
    ]

    for drug_ID in id_list:
        entry = k.parse(k.get(drug_ID))

        # Drugs without target information contribute nothing
        if "TARGET" not in entry.keys():
            continue

        targ = entry["TARGET"]
        # Everything before the pathway section
        head = targ.split("  PATHWAY")[0]
        has_spaces = targ.count(" ") > 0
        gene_lines = head.split("\n            ")

        # Without any space there is no id to record for the target
        if not has_spaces:
            target_gene_dic[head] = ""

        for line in gene_lines:
            # Separate the gene name from what follows " ["
            parts = line.split(" [")
            gene_name = parts[0].split(" ")[0]

            if has_spaces:
                target_gene_dic[gene_name] = parts[1].strip("]")

            gene_drug.setdefault(gene_name, []).append(drug_ID)

    return target_gene_dic, gene_drug
 def __init__(self, modules, organism="hsa"):
     """Set up an empty graph, the module list and a KEGG client.

     ``organism`` is assigned to the KEGG client; it defaults to "hsa"
     (Homo sapiens).
     """
     MyGraph.__init__(self, {})
     self.gr = MyGraph()
     self.modules = modules
     self.s = KEGG()
     # Homo sapiens unless the caller says otherwise
     self.s.organism = organism
from pandas import DataFrame, read_csv

sns.set(style='ticks', palette='pastel', color_codes=True)

# ---- Import network
network = read_csv('%s/files/string_mouse_network_filtered_800.txt' % wd, sep='\t')
network_proteins = set(network['protein1']).intersection(network['protein2'])

# ---- Set-up UniProt
uniprot = UniProt(cache=True)

# ---- Set-up QuickGO bioservice
quickgo = QuickGO(cache=True)

# ---- Set-up KEGG bioservice
kegg, kegg_parser = KEGG(cache=True), KEGGParser()

kegg.organism = 'mmu'
# Single-argument print(...) gives the same output on Python 2 and 3.
print('[INFO] KEGG service configured')

kegg_pathways = {p: kegg.parse_kgml_pathway(p) for p in kegg.pathwayIds}
# The two spaces reproduce the output of the former
# ``print '...: ', len(...)`` statement.
print('[INFO] KEGG pathways extracted:  %d' % len(kegg_pathways))

# Convert KEGG pathways Gene Name to UniProt
k2u = kegg.conv('uniprot', 'mmu')

# For each pathway: the set of converted ids (prefix before ':' removed) of
# all names listed in its 'gene' entries that are present in k2u.
kegg_pathways_proteins = {
    p: {
        k2u[x].split(':')[1]
        for i in kegg_pathways[p]['entries'] if i['type'] == 'gene'
        for x in i['name'].split(' ') if x in k2u
    }
    for p in kegg_pathways
}

# First the set of all ids across pathways, then a dict from each id to the
# third '|'-separated field of the first token of its FASTA header.
kegg_uniprot_acc_map = {x for p in kegg_pathways_proteins for x in kegg_pathways_proteins[p]}
kegg_uniprot_acc_map = {p: uniprot.get_fasta(str(p)).split(' ')[0].split('|')[2] for p in kegg_uniprot_acc_map}
Beispiel #37
0
class Kegg:
    """Helpers for locating genes via KEGG and working with their sequences."""

    # KEGG client shared by all instances; created when the class body runs.
    k = KEGG()
    # Base URL that added_nts() extends with its query parameters.
    location = "http://www.genome.jp/dbget-bin/www_bget?"

    def gene_locator(self, gene_id):
        """Return ``(chromosome, start, end)`` for a KEGG gene entry.

        The entry is fetched and parsed with the shared client and its
        ``POSITION`` field is interpreted as ``[chromosome:]location`` where
        ``location`` holds one or more ``start..end`` ranges, optionally
        wrapped in ``complement(...)`` and/or ``join(...)``.

        Args:
            gene_id: identifier passed unchanged to ``self.k.get``.

        Returns:
            A tuple ``(chromosome, start, end)``. ``chromosome`` is the
            result of ``translate_chromosome`` or the integer ``1`` when
            ``POSITION`` has no ``chromosome:`` prefix. ``start`` is the start
            of the first range and ``end`` the end of the last range, as ints.

        Raises:
            KeyError: if the parsed entry has no ``POSITION`` field.
            ValueError: if no ``start..end`` range can be found.
        """
        import re  # function-scope import keeps the block self-contained

        entry = self.k.parse(self.k.get(gene_id))
        position = entry['POSITION']

        chrom, colon, span = position.partition(':')
        if colon:
            chromosome = self.translate_chromosome(chrom)
        else:
            # No "chromosome:" prefix; the whole field is the location.
            chromosome = 1
            span = position

        # Collecting every start..end pair handles plain, complement(...),
        # join(...) and the nested complement(join(...)) form alike. The
        # previous slicing logic cut at the first "(" and therefore failed on
        # the nested form.
        ranges = re.findall(r'(\d+)\.\.(\d+)', span)
        if not ranges:
            raise ValueError("no start..end range in POSITION %r" % (position,))
        return (chromosome, int(ranges[0][0]), int(ranges[-1][1]))

    def translate_chromosome(self, chr):
        """Map a chromosome label to its decimal-number string.

        ``chr`` is looked up, in this order, among the decimal strings
        ``'1'``..``'16'``, the letters ``'A'``..``'H'`` and the Roman numerals
        ``'I'``..``'XVI'``; the decimal string at the matching index is
        returned.

        Returns:
            A string such as ``'4'`` on a match, otherwise the integer ``1``
            (the mixed return type is kept for backward compatibility).
        """
        numbers = ('1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11',
                   '12', '13', '14', '15', '16')
        letters = ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H')
        roman = ('I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X',
                 'XI', 'XII', 'XIII', 'XIV', 'XV', 'XVI')
        for names in (numbers, letters, roman):
            if chr in names:
                return numbers[names.index(chr)]
        return 1

    def revcom(self, sequence):
        """Return the reverse complement of an upper-case A/C/G/T sequence.

        Raises:
            KeyError: if ``sequence`` contains any other character.
        """
        change = {'A': 'T',
                  'T': 'A',
                  'G': 'C',
                  'C': 'G'}
        # Complement in input order (so the first invalid character is the one
        # reported), then reverse; join avoids quadratic string prepending.
        complemented = [change[nt] for nt in sequence]
        complemented.reverse()
        return "".join(complemented)

    def added_nts(self, seqstart, seqend, vector, orgcode, chromosome):
        """Locate the ``<font>``-marked regions inside a fetched sequence page.

        Builds a URL from ``self.location`` and the FROM/TO/VECTOR/ORG/CHR
        query parameters, downloads it with ``requests`` and parses it with
        BeautifulSoup. The text of every ``<font>`` element is collected, the
        markup is stripped from the first ``<pre>`` element, and each
        collected text is searched for in the stripped sequence.

        Args:
            seqstart, seqend, vector, orgcode, chromosome: strings that are
                concatenated into the query string as-is (no URL escaping).

        Returns:
            A list of ``(start, end)`` offsets, one per ``<font>`` element, in
            document order. A region that is not found yields
            ``(-1, len(region) - 1)`` because ``str.find`` returns ``-1``.
        """
        url = self.location + "FROM=" + seqstart + "&TO=" + seqend + "&VECTOR="\
              + vector + "&ORG=" + orgcode + "&CHR=" + chromosome
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, "html.parser")
        x = soup.find('pre')

        exons = []
        for region in soup.findAll('font'):
            seq = str(region)
            st = seq.find('>') + 1
            # NOTE(review): the "- 2" presumably also drops one character in
            # front of the "<" of "</font>" (e.g. a newline) -- confirm
            # against a real response.
            z = seq.find('/font') - 2
            exons.append(seq[st:z])

        # get the sequence:
        sx = str(x)
        # Start right after the first newline found at or after offset 10.
        start = sx.find("\n", 10) + 1
        end = sx.find("/pre") - 2
        unfontseq = sx[start:end]

        # Keep only the characters that are outside of <...> tags.
        kept = []
        ingene = True
        for nt in unfontseq:
            if nt == "<":
                ingene = False
            elif nt == ">":
                ingene = True
            elif ingene:
                kept.append(nt)
        trueseq = "".join(kept)

        exon_position_tuples = []
        for exon in exons:
            spos = trueseq.find(exon)
            exon_position_tuples.append((spos, spos + len(exon)))
        return exon_position_tuples
Beispiel #38
0
def search_pathway(gene, organism):
    """Query a new KEGG client for the pathways of ``gene`` in ``organism``.

    Returns whatever ``KEGG.get_pathway_by_gene`` returns for the two
    arguments, unchanged.
    """
    client = KEGG()
    pathways = client.get_pathway_by_gene(gene, organism)
    return pathways
Beispiel #39
0
def kegg():
    """Create a KEGG client whose organism is set to ``"hsa"`` and return it."""
    client = KEGG()
    # Bare attribute read kept on purpose; presumably it makes the client load
    # its organism list before the assignment below -- TODO confirm.
    client.organismIds
    client.organism = "hsa"
    return client
class MetabolicNetwork(MyGraph):
    """Build compound/reaction graphs from the REACTION data of KEGG modules.

    Every graph builder adds its edges to the same ``self.gr`` graph, so
    calling several builders on one instance accumulates edges.
    """

    def __init__(self, modules, organism="hsa"):
        """Store ``modules`` and create the graph and the KEGG client.

        Args:
            modules: list of KEGG module ids. Any non-list value makes the
                methods fall back to ``self.s.moduleIds`` on first use.
            organism: organism code assigned to the KEGG client; defaults to
                ``"hsa"`` (Homo sapiens).
        """
        MyGraph.__init__(self, {})
        self.gr = MyGraph()
        self.modules = modules
        self.s = KEGG()
        self.s.organism = organism  # Homo sapiens as default

    def __resolve_modules(self):
        """Return the module ids to use, defaulting to all known modules.

        When ``self.modules`` is not a list it is replaced (and stored) with
        ``self.s.moduleIds``.
        """
        if not isinstance(self.modules, list):
            self.modules = self.s.moduleIds
        return self.modules

    def __kegg_dic(self):
        """Collect the reactions of all selected modules.

        Returns:
            A dict with reaction ids as keys and the reaction equation split
            on single spaces as values, e.g.
            ``'R01015': ['C00111', '->', 'C00118']`` or
            ``'R01070': ['C05378', '->', 'C00111', '+', 'C00118']``.
            Modules whose parsed entry raises ``KeyError`` (for example no
            ``REACTION`` field) are skipped.
        """
        dic_reac = {}
        for mod in self.__resolve_modules():
            try:
                reactions = self.s.parse(self.s.get(mod))['REACTION']
            except KeyError:
                continue
            for reac in reactions:
                dic_reac[reac] = reactions[reac].split(" ")
        return dic_reac

    @staticmethod
    def __feeds_into(v, m):
        """Tell whether reaction tokens ``v`` lead into reaction tokens ``m``.

        * If the second-to-last token of ``v`` is ``"->"`` (single product),
          the product must equal the first token of ``m``.
        * Otherwise the third-to-last and last tokens of ``v`` joined by
          ``"+"`` must equal tokens 0 and 2 of ``m`` joined by ``"+"``; an
          ``m`` with fewer than three tokens never matches.

        The ``len(v) - k`` indexing is kept as originally written so that very
        short token lists behave exactly as before.
        """
        if v[len(v) - 2] == "->":
            return v[len(v) - 1] == m[0]
        produced = "+".join([str(v[len(v) - 3]), str(v[len(v) - 1])])
        try:
            consumed = "+".join([str(m[0]), str(m[2])])
        except IndexError:
            return False
        return produced == consumed

    def c_c_graph(self):  ### comp-comp
        """Add one substrate-to-product edge per reaction and print the graph.

        The tokens of each reaction are split at the ``"->"`` token; the
        compounds on each side are joined with ``"+"`` to form the two node
        names. Unlike the earlier fixed-index version, this checks where the
        arrow actually is, so equations with three or more compounds on one
        side are no longer mis-split or truncated. Reactions without ``"->"``
        or with an empty side are skipped.

        Returns:
            The result of ``self.gr.printGraph()``.
        """
        gr = self.gr
        for tokens in self.__kegg_dic().values():
            if "->" not in tokens:
                continue
            arrow = tokens.index("->")
            substrates = [t for t in tokens[:arrow] if t not in ("+", "")]
            products = [t for t in tokens[arrow + 1:] if t not in ("+", "")]
            if substrates and products:
                gr.addEdge("+".join(substrates), "+".join(products))
        return gr.printGraph()

    def r_r_graph(self):  ### reac-reac
        """Add an edge ``k -> r`` for every reaction pair where k leads into r.

        All ordered pairs of reactions (including a reaction with itself) are
        tested with the matching rule of ``__feeds_into``.

        Returns:
            The result of ``self.gr.printGraph()``.
        """
        dic_reac = self.__kegg_dic()
        gr = self.gr
        for k, v in dic_reac.items():
            for r, m in dic_reac.items():
                if self.__feeds_into(v, m):
                    gr.addEdge(k, r)
        return gr.printGraph()

    def r_c_graph(self):  ### reac-comp
        """Add reaction/equation edges for every matching reaction pair.

        For each ordered pair ``(k, v)``, ``(r, m)`` accepted by
        ``__feeds_into`` three edges are added: ``k -> "".join(v)``,
        ``"".join(v) -> r`` and ``r -> "".join(m)``.

        Returns:
            The result of ``self.gr.printGraph()``.
        """
        dic_reac = self.__kegg_dic()
        gr = self.gr
        for k, v in dic_reac.items():
            for r, m in dic_reac.items():
                if self.__feeds_into(v, m):
                    sv = "".join(v)
                    sm = "".join(m)
                    gr.addEdge(k, sv)
                    gr.addEdge(sv, r)
                    gr.addEdge(r, sm)
        return gr.printGraph()

    def modules_name(self):
        """Print ``<module id>-<name>`` for every selected module.

        The name is the first element of the parsed ``NAME`` field, e.g.
        ``'Glycolysis (Embden-Meyerhof pathway), glucose => pyruvate'``.
        """
        for i in self.__resolve_modules():
            dic = self.s.parse(self.s.get(i))
            name = dic["NAME"][0]
            print("-".join([i, name]))

    def compounds_name(self):
        """Print each module id followed by its ``<compound id>-<name>`` lines.

        The pairs come from the parsed ``COMPOUND`` field, a dict such as
        ``{'C00074': 'Phosphoenolpyruvate', ...}``.
        """
        for i in self.__resolve_modules():
            print(i)
            dic = self.s.parse(self.s.get(i))
            comps = dic["COMPOUND"]
            for key in comps:
                print("-".join([key, comps[key]]))

    def pathway_name(self):
        """Print ``<pathway id>-<name>`` for the pathways of every module.

        The pairs come from the parsed ``PATHWAY`` field, a dict such as
        ``{'map00010': 'Glycolysis / Gluconeogenesis', ...}``.
        """
        for i in self.__resolve_modules():
            dic = self.s.parse(self.s.get(i))
            pathway = dic["PATHWAY"]
            for key in pathway:
                print("-".join([key, pathway[key]]))

    def nodes_degree(self):
        """Return ``self.gr.allDegrees()``."""
        return self.gr.allDegrees()

    def clustering(self):
        """Return ``self.gr.allClusteringCoefs()``."""
        return self.gr.allClusteringCoefs()

    def connections(self, n1, n2):
        """Return ``self.gr.distance(n1, n2)``."""
        return self.gr.distance(n1, n2)