Esempio n. 1
0
    def test_parser_roundtrip(self):
        """Fetch ko00680 from KEGG, re-serialise it as KGML and re-parse it.

        The pathway name and the number of relations must be the same
        before and after the round trip.
        """
        with kegg_get("ko00680", "kgml") as downloaded:
            original = KGML_parser.read(downloaded)

        serialised = original.get_KGML()
        with io.StringIO(serialised) as buffer:
            reparsed = KGML_parser.read(buffer)

        self.assertEqual(original.name, reparsed.name)
        self.assertEqual(
            len(original.relations),
            len(reparsed.relations),
        )
Esempio n. 2
0
def main():
    """Search KEGG PATHWAY for ``sys.argv[1]`` and print the pathway's genes.

    When the search returns several hits the user is asked to pick one by
    index. The "map" prefix of the chosen identifier is replaced with "hsa"
    before the KGML is downloaded and parsed.
    """
    search_term = sys.argv[1].replace(" ", "+")
    hits = KEGG_REST.kegg_find('PATHWAY', search_term).read().split('\n')
    if len(hits) == 1:
        print("Search found no results")
        return

    if len(hits) > 2:
        print("More than 1 result:")
        for position, hit in enumerate(hits):
            fields = hit.split("\t")
            # Only well-formed "<id>\t<description>" rows are listed.
            if len(fields) == 2:
                print("\t".join((str(position), fields[1])))
        selected = int(input("Which one? "))
    else:
        selected = 0

    pathway_id = hits[selected].split("\t")[0].strip()
    pathway_id = pathway_id.replace("map", "hsa")

    pathway = KEGG_KGML_PARSER.read(KEGG_REST.kegg_get(pathway_id, "kgml"))

    for gene in pathway.genes:
        print(gene.name)
Esempio n. 3
0
    def plot_pathway(self,
                     enriched_genes,
                     pathway_id='hsa05322',
                     figurename=None):
        """Draw a KEGG pathway to a PDF, recolouring enriched entries.

        :param enriched_genes: passed to ``gene_is_enriched`` together with
            the name of each entry's first graphic.
        :param pathway_id: KEGG pathway identifier to download as KGML.
        :param figurename: output file name; defaults to
            ``"<pathway_id>.pdf"`` and must end with ``.pdf``.
        :return: the parsed (and recoloured) pathway object.
        """
        # Fall back to a name derived from the pathway id.
        figurename = figurename or '%s.pdf' % pathway_id
        assert figurename.endswith('.pdf')

        # Download and parse the pathway description.
        pathway = KGML_parser.read(kegg_get(pathway_id, "kgml"))

        # Recolour the first graphic of every entry.
        for entry in pathway.entries.values():
            graphic = entry.graphics[0]
            hit = gene_is_enriched(enriched_genes, graphic.name)
            if not hit:
                graphic.bgcolor = self.non_enriched_box_color
                graphic.fgcolor = self.non_enriched_text_color
                # Keep only the first of the comma-separated names.
                graphic.name = graphic.name.split(',')[0]
                continue
            graphic.bgcolor = self.enriched_box_color  # box colour
            graphic.fgcolor = self.enriched_text_color  # text colour
            graphic.name = hit

        canvas = KGMLCanvas(
            pathway,
            import_imagemap=True,
            fontsize=self.fontsize,
        )
        canvas.draw(figurename)
        print('Drawn: ', figurename)
        return pathway
Esempio n. 4
0
 def test_parse_remote_pathway(self):
     """Download the KEGG ko03070 pathway and check its parsed name."""
     # Download the KEGG ko03070 pathway as a filehandle
     h = kegg_get("ko03070", "kgml")
     try:
         pathway = KGML_parser.read(h)
         self.assertEqual(pathway.name, "path:ko03070")
     finally:
         # Release the handle even when parsing or the assertion fails.
         h.close()
Esempio n. 5
0
 def test_parse_remote_pathway(self):
     """Download the KEGG ko03070 pathway and check its parsed name."""
     # Download the KEGG ko03070 pathway as a filehandle
     h = kegg_get("ko03070", "kgml")
     try:
         pathway = KGML_parser.read(h)
         self.assertEqual(pathway.name, "path:ko03070")
     finally:
         # Release the handle even when parsing or the assertion fails.
         h.close()
Esempio n. 6
0
def get_pathway_kgml(pathway_id='hsa04151'):
    """Load a pathway's KGML (downloading it once) and prune it.

    The KGML is cached as ``DATA_ROOT/<pathway_id>.kgml``. If that file is
    missing it is fetched from ``HOST``; a non-200 response terminates the
    process with exit status -1.

    :param pathway_id: KEGG pathway identifier.
    :return: the ``(entries, relations)`` pair produced by ``prune_kgml``.
    """
    print('Reading pathway:', pathway_id)
    pw_url = f'{HOST}/get/{pathway_id}/kgml'
    local_file = DATA_ROOT / f'{pathway_id}.kgml'

    # Download the pathway only when there is no cached copy.
    if not local_file.exists():
        print('Could not find hsa file getting from kegg:', pw_url)
        response = requests.get(pw_url)
        if response.status_code != 200:
            print('Failed to get pathway:', local_file, response.text)
            sys.exit(-1)
        with open(local_file, 'w') as out:
            out.write(response.text)
            print('Saved KGML file at:', local_file)

    # Parse the cached copy.
    with open(local_file, 'r') as src:
        pathway = KGML_parser.read(src)
    print('\n', pathway, sep='')
    print('  entry:', len(pathway.entries))
    print('  reaction:', len(pathway.reactions))
    print('  relation:', len(pathway.relations))

    entries, relations = prune_kgml(pathway)
    print('Finished reading:', pathway_id)
    # Relations carrying a ``_pamogk`` attribute are reported as new.
    new_relation_count = sum(
        1 for relation in relations if hasattr(relation, '_pamogk'))
    print(' entry:', len(entries.keys()), 'relation:', len(relations),
          'new_relation:', new_relation_count)
    return entries, relations
def draw_kegg_map(map_id):
    """Render the KEGG map with the given ID to ``<map_id>.pdf``.

    The KGML is downloaded with ``kegg_get`` and drawn with the map image
    imported as background.
    """
    kgml_handle = kegg_get(map_id, "kgml")
    canvas = KGMLCanvas(KGML_parser.read(kgml_handle), import_imagemap=True)
    canvas.draw("%s.pdf" % map_id)
Esempio n. 8
0
 def load_pathway(self, pathway_ID, organism_ID = ""):
     '''
     Download a pathway's KGML from KEGG and parse it.

     :param pathway_ID: (str) - suffix of the pathway ID (the numeric part)
     :param organism_ID: (str) - prefix of the pathway ID (the organism
         code); when empty, the "ko" reference pathway is fetched
     :return: (object) - the parsed KGML pathway, or None when the
         organism-specific pathway cannot be downloaded or parsed
     '''
     if not organism_ID:
         print(pathway_ID)
         pathway = kegg_get("ko" + pathway_ID, "kgml")
         return KGML_parser.read(pathway)
     try:
         pathway = kegg_get(organism_ID + pathway_ID, "kgml")
         return KGML_parser.read(pathway)
     except Exception:
         # Best effort: report the failure and hand back None. Unlike a
         # bare ``except``, this lets KeyboardInterrupt/SystemExit through.
         print("Invalid IDs")
         return None
def colorCompounds(pathname, cpdlist, size=20):
    """Draw a KEGG pathway PDF with selected compounds enlarged in red.

    Every compound graphic whose name is in ``cpdlist`` gets a ``#ff0000``
    background and a width/height of ``size``. The result is written to
    ``<pathname>.pdf``.
    """
    pathway = KGML_parser.read(kegg_get(pathname, "kgml"))

    selected_graphics = (
        shape
        for compound in pathway.compounds
        for shape in compound.graphics
        if shape.name in cpdlist
    )
    for shape in selected_graphics:
        shape.bgcolor = '#ff0000'
        shape.width = size
        shape.height = size

    KGMLCanvas(pathway, import_imagemap=True).draw("%s.pdf" % pathname)
Esempio n. 10
0
def get_pathway_ko_association_table():
    """Create and fill ``enzyme.pathway2ortholog_associations``.

    For every pathway listed in ``enzyme.kegg_pathway`` the "ko" version of
    its KGML is downloaded from the KEGG REST service. One row
    ``(pathway_id, node_id, ko_id)`` is inserted for each KO identifier
    found in the name of each ortholog entry. Pathways whose KGML cannot be
    downloaded are skipped.

    The MySQL password is read from the ``SQLPSW`` environment variable
    (``KeyError`` if it is not set).
    """
    import os
    import re
    # ``import urllib`` alone does not guarantee that the ``request``
    # submodule is loaded, so import the submodules explicitly.
    import urllib.error
    import urllib.request

    import MySQLdb
    from Bio.KEGG.KGML import KGML_parser
    from chlamdb.biosqldb import manipulate_biosqldb

    sqlpsw = os.environ['SQLPSW']

    conn = MySQLdb.connect(
        host="localhost",  # database host
        user="******",  # database user
        passwd=sqlpsw,  # database password
        db="enzyme")  # database name
    try:
        cursor = conn.cursor()

        sql = 'create table enzyme.pathway2ortholog_associations (pathway_id INT, node_id INT, ko_id varchar(200), ' \
              ' index pathway_id(pathway_id), index node_id(node_id), index ko_id(ko_id));'
        cursor.execute(sql)
        conn.commit()

        cursor.execute('select pathway_name,pathway_id from enzyme.kegg_pathway')
        pathway2pathway_id = manipulate_biosqldb.to_dict(cursor.fetchall())

        # Values are bound by the driver rather than formatted into the SQL.
        insert_sql = ('insert into enzyme.pathway2ortholog_associations '
                      'values(%s, %s, %s)')

        for pathway in pathway2pathway_id:
            print(pathway)

            url = 'http://rest.kegg.jp/get/%s/kgml' % re.sub(
                'map', 'ko', pathway)
            print(url)
            try:
                response = urllib.request.urlopen(url)
            except urllib.error.URLError:
                # No KGML available for this pathway (or the request
                # failed): skip it and carry on with the next one.
                continue
            with response:
                kgml_text = response.read().decode("UTF-8")

            pathway_KGML = KGML_parser.read(kgml_text)

            # An ortholog name is a run of "ko:Kxxxxx" tokens. Splitting on
            # "ko:" yields the bare identifiers plus one empty string.
            for o in pathway_KGML.orthologs:
                ko_ids = {part.rstrip() for part in o.name.split('ko:')}
                ko_ids.discard('')
                for ko in ko_ids:
                    cursor.execute(
                        insert_sql,
                        (pathway2pathway_id[pathway], o.id, ko))
            conn.commit()
    finally:
        conn.close()
Esempio n. 11
0
def get_network(org, opt='ec'):
    """Build a ``KeggParser`` graph from all KEGG pathways of an organism.

    For each pathway listed for ``org`` the organism-specific KGML and the
    matching reference KGML (``"path:" + opt + <last five characters>``)
    are parsed and handed to the parser, which then extracts relations and
    reactions. Pathways that fail to load are skipped.

    :param org: KEGG organism code passed to ``kegg_list``.
    :param opt: prefix of the reference pathway; also stored as ``ref``.
    :return: the populated parser after ``building_graph(2)``.
    """
    parser = k.KeggParser()
    # Identifiers of pathways that could not be retrieved or parsed.
    failed = []
    listing = r.kegg_list('pathway', org).read().split('\n')
    listing.remove('')
    print('Retrieving data from KEGG PATHWAY database. ' + str(time.ctime()))

    for row in listing:
        fields = row.split('\t')
        try:
            parser.genes = parse.read(r.kegg_get(fields[0], 'kgml'))
            reference_id = "path:" + opt + fields[0][-5:]
            parser.genes_default = parse.read(
                r.kegg_get(reference_id, 'kgml'))
            parser.path = fields
        except Exception:
            failed.append(fields[0])
            continue
        parser.get_relations()
        parser.get_reactions()

    parser.ref = opt
    # Tally of whitespace-separated gene ids per EC target (not returned).
    gene_total = 0
    for _, targets in parser.ec_org_target.items():
        gene_total += len(targets.split())

    parser.building_graph(2)
    return parser
Esempio n. 12
0
def pathway_genes(pathway: str) -> set:
    """Return the set of gene labels for a KEGG pathway.

    The pathway's KGML is fetched with ``_get``. Each whitespace-separated
    token of every gene entry name is passed to ``kegg_label_gene_names``,
    and the results are merged into one set.
    """
    response = _get(pathway, form='kgml')

    # The KGML parser reads from a handle, so wrap the text in one.
    parsed = KGML_parser.read(io.StringIO(response.text))

    collected = set()
    for entry in parsed.genes:
        for kegg_id in entry.name.split():
            collected.update(kegg_label_gene_names(kegg_id))
    return collected
Esempio n. 13
0
 def xml_to_txt(self):
     """
     Convert ``<self.name>.xml`` (KGML) into a plain-text reaction list.

     One line is written to ``<self.name>.txt`` per reaction name, in the
     form ``name: s1 + s2 <=> p1 + p2`` for reversible reactions and
     ``name: s1 + s2 => p1 + p2`` otherwise. Every reaction, substrate and
     product name is also registered through ``self.dictionary``.

     Both files are closed even if an error occurs, and the parsed
     pathway's reaction name lists are no longer emptied as a side effect.
     """
     with open(self.name + '.xml', 'r') as xml_handle:
         pathway = KGML_parser.read(xml_handle)
     entries = pathway.entries
     with open(self.name + '.txt', 'w+') as out:
         for reaction in pathway._reactions.values():
             # One reaction id can carry several reaction names. Walk them
             # from last to first (the order the original pop() produced)
             # on a copy, so the reaction itself is left untouched.
             for name in reversed(list(reaction._names)):
                 # Register the reaction name in the dictionary.
                 self.dictionary(name)
                 line = name + ': '

                 substrates = [
                     entries[entry_id]._names[0]
                     for entry_id in reversed(list(reaction._substrates))
                 ]
                 for substrate in substrates:
                     self.dictionary(substrate)
                 if substrates:
                     line += ' + '.join(str(s) for s in substrates) + ' '

                 # Arrow symbol for reversible / irreversible reactions.
                 if reaction.type == "reversible":
                     line += '<=> '
                 else:
                     line += '=> '

                 products = [
                     entries[entry_id]._names[0]
                     for entry_id in reversed(list(reaction._products))
                 ]
                 for product in products:
                     self.dictionary(product)
                 # As before, a reaction without products gets no
                 # trailing newline.
                 if products:
                     line += ' + '.join(str(p) for p in products) + '\n'

                 out.write(line)
Esempio n. 14
0
 def getPic(self, pidpath, pathwaydir):
     """
     Draw highlighted pathway PDFs for the pathways listed in ``pidpath``.

     Each line of ``pidpath`` is split on tabs: the first field is used as
     the pathway id and the second is split on ';' into KO patterns. The
     pathway's KGML and PNG are looked up in ``self.png_coll`` and read
     from GridFS, then written to ``pathway.kgml`` / ``pathway.png``.
     Entries whose name matches a KO pattern get the foreground colour
     '#CC0000', and the result is drawn to ``<pathwaydir>/<pid>.pdf``.

     :param pidpath: path of the tab-separated input file.
     :param pathwaydir: output directory; created if it does not exist.
     """
     fs = gridfs.GridFS(self.mongodb)
     f = open(pidpath)
     if not os.path.exists(pathwaydir):
         os.makedirs(pathwaydir)
     for i in f:
         # NOTE(review): lines read from a file keep their '\n', so this
         # check is true for blank lines as well.
         if i:
             i = i.strip('\n').split('\t')
             pid = i[0]
             koid = i[1].split(';')
             # Ids of matching entries; accumulates over all KOs of this line.
             l = []
             kgml_path = os.path.join(os.getcwd(), "pathway.kgml")
             png_path = os.path.join(os.getcwd(), "pathway.png")
             # Remove the scratch files left by the previous line, if both exist.
             if os.path.exists(kgml_path) and os.path.exists(png_path):
                 os.remove(kgml_path)
                 os.remove(png_path)
             with open("pathway.kgml",
                       "w+") as k, open("pathway.png", "w+") as p:
                 result = self.png_coll.find_one({"pathway_id": pid})
                 # When no record is found both scratch files stay empty.
                 if result:
                     kgml_id = result['pathway_ko_kgml']
                     png_id = result['pathway_ko_png']
                     k.write(fs.get(kgml_id).read())
                     p.write(fs.get(png_id).read())
             p_kgml = KGML_parser.read(open("pathway.kgml"))
             p_kgml.image = png_path
             for ko in koid:
                 # ``ko`` is used as a regular expression against the entry name.
                 for degree in p_kgml.entries.values():
                     if re.search(ko, degree.name):
                         l.append(degree.id)
                 for n in l:
                     for graphic in p_kgml.entries[n].graphics:
                         graphic.fgcolor = '#CC0000'
                 # The PDF is redrawn (same file name) after every KO.
                 canvas = KGMLCanvas(p_kgml, import_imagemap=True)
                 canvas.draw(pathwaydir + '/' + pid + '.pdf')
     print "getPic finished!!!"
Esempio n. 15
0
def parseKGML():
    """Build node and edge fragments from the KGML files in ``KGMLs``.

    Each file ``kgml_files/<name>`` becomes one node carrying the pathway
    name, its title, and the first name token of every compound
    (containing "cpd:") and gene (containing "mmu:"). Each map entry whose
    name differs from the pathway name and contains "mmu" becomes an edge
    from the pathway to that map, numbered consecutively across all files.

    :return: ``(nodestring, edgestring)``, the ``"nodes":[...],`` and
        ``"edges":[...]`` text fragments. Every element is followed by a
        comma, including the last one.
    """
    node_parts = ['"nodes":[\n']
    edge_parts = ['"edges":[\n']
    assignId = 0

    for files in KGMLs:
        path = "kgml_files/" + files
        # Close the file as soon as it has been parsed.
        with open(path, 'r') as handle:
            Maps = KGML_parser.read(handle)
        name = Maps.name
        description = Maps.title

        # Compounds in this pathway.
        KEGGcompounds = []
        for compound in Maps.compounds:
            first_token = str(compound.name).split()[0]
            if "cpd:" in first_token:
                KEGGcompounds.append(first_token)
        strKEGGcompounds = str(KEGGcompounds).replace("'", '"')

        # Gene products in this pathway.
        KEGGgeneproduct = []
        for gene in Maps.genes:
            first_token = str(gene.name).split()[0]
            if "mmu:" in first_token:
                KEGGgeneproduct.append(first_token)
        strKEGGgeneproduct = str(KEGGgeneproduct).replace("'", '"')

        node_parts.append(
            '{"data":{ "id":"' + name + '", "name":"' + description
            + '","compounds":' + strKEGGcompounds
            + ',"gene products":' + strKEGGgeneproduct + '}},')

        # Link this pathway to every other "mmu" map it references.
        for linked_map in Maps.maps:
            entry = str(linked_map.name)
            if entry != name and "mmu" in entry:
                edge_parts.append(
                    '{"data":{ "id":"' + str(assignId)
                    + '", "source":"' + name
                    + '","target":"' + entry + '"}},')
                assignId = assignId + 1

    node_parts.append("],")
    edge_parts.append("]")
    return "".join(node_parts), "".join(edge_parts)
Esempio n. 16
0
File: KEGG.py Progetto: CSB-IG/pxat
def readKGML(kgml):
    """Return the first pathway that ``KGML_parser.parse`` yields for ``kgml``.

    Uses the ``next()`` builtin instead of the Python-2-only ``.next()``
    method, so it works on Python 2.6+ and Python 3. Raises
    ``StopIteration`` when the parser yields nothing.
    """
    return next(KGML_parser.parse(kgml))
Esempio n. 17
0
def kgml_file_to_digraph(kgml_file):
    """Parse a KEGG KGML file and convert to a NetworkX directed graph.

    :param kgml_file: path of the KGML file to read.
    :return: the graph built by ``pw2graph`` from the parsed pathway.
    """
    # The context manager closes the file even if parsing fails.
    with open(kgml_file, 'r') as fh:
        pw = KGML_parser.read(fh)
    return pw2graph(pw)
Esempio n. 18
0
def main():
    """Load one KEGG pathway into the Neo4j database from ``server_config``.

    ``sys.argv[1]`` is the pathway name to search for. When the search
    returns several hits the user picks one by index. The chosen KGML is
    parsed, the database is wiped, a single CREATE query is built from the
    pathway's genes, compounds, reactions, maps and relations, and nodes
    sharing a name are merged afterwards.
    """
    # Look the pathway up on KEGG.
    search_term = sys.argv[1].replace(" ", "+")
    hits = KEGG_REST.kegg_find('PATHWAY', search_term).read().split('\n')
    if len(hits) == 1:
        print("Search found no results")
        return

    if len(hits) > 2:
        print("More than 1 result:")
        for position, hit in enumerate(hits):
            fields = hit.split("\t")
            if len(fields) == 2:
                print("\t".join((str(position), fields[1])))
        selected = int(input("Which one? "))
    else:
        selected = 0

    pathway_id = hits[selected].split("\t")[0].strip()
    pathway_id = pathway_id.replace("map", ORGANISM)

    # Download and parse the KGML.
    pathway = KEGG_KGML_PARSER.read(KEGG_REST.kegg_get(pathway_id, "kgml"))

    # Read the server credentials.
    config = configparser.ConfigParser()
    config.read("server_config")
    if "KGML2NEO4J" not in config:
        print("Server config not found!")
        return

    section = config["KGML2NEO4J"]
    username = section['username']
    password = section['password']
    server_uri = section['uri']

    db = database(server_uri, username, password)

    # Start from an empty graph.
    db.run_query("MATCH (n) DETACH DELETE n")

    fragments = [
        db.make_gene_query(pathway.genes),
        db.make_compound_query(pathway.compounds),
        db.make_reaction_query(pathway.reaction_entries),
        db.make_map_query(pathway.maps),
        db.make_relations_query(pathway.relations),
    ]
    # Each non-empty fragment is followed by a comma; the final character
    # of the assembled query is then dropped.
    create_query = "CREATE " + "".join(
        fragment + "," for fragment in fragments if len(fragment) > 0)
    create_query = create_query[:-1]

    db.run_query(create_query)

    # Merge matching nodes
    merge_query = """MATCH (n1),(n2)
                    WHERE ANY (x IN n1.name WHERE x IN n2.name) and id(n1) < id(n2)
                    WITH [n1,n2] as ns
                    CALL apoc.refactor.mergeNodes(ns) YIELD node
                    RETURN node"""

    db.run_query(merge_query)
Esempio n. 19
0
    # TODO: integrate relations involving compounds?
    if False and entry.type == "compound":
        return entry.id


# Output file for pathway members; opened for writing in ``outdir``.
# NOTE(review): it is not closed anywhere in the visible part of the script.
f_members = open(os.path.join(outdir, 'members.txt'), 'w')
# Process every "hsa*.xml" KGML file found in ``kgmldir``.
for filename in os.listdir(kgmldir):
    nodes = {}
    pathway_nodes = set()
    if not filename.endswith(".xml") or not filename.startswith("hsa"):
        continue
    # File name without its 4-character ".xml" extension.
    kgid = filename[:-4]
    kgml_file = os.path.join(kgmldir, filename)
    f = open(kgml_file)
    print(filename)
    parsed = KGML_parser.read(f)
    f.close()

    pathway_name = parsed.title

    # Find and map components
    components = set()
    # Enter the parsed pathway info and extract components
    for k, entry in parsed.entries.items():
        # Only gene entries are considered.
        if entry.type != "gene":
            continue

        # An entry name is a space-separated list of ids; each one is
        # passed through the converter's ``to_uniprot`` lookup.
        rnames = entry.name.split(" ")
        names = [
            converter.handler.to_uniprot(converter.KEGG_IDX, u) for u in rnames
        ]
# Maps from the extracted node id (e.g. EGFR) to the Node Object
# NOTE(review): despite the comment above this is a set; in the visible
# code it is only used for membership tests.
nodes = set()
# Map to prevent multiple creation of edges between two nodes uses concatenated node ids as keys (e.g. EGFR_C00010)
# edgeDict = dict()

# batch = WriteBatch(graph)

# Import every KGML file listed in ``files`` from ``args.data_dir``.
for filename in files:
    # Plain string concatenation: ``args.data_dir`` has to end with a path
    # separator for this to form a valid path.
    url = args.data_dir + filename
    print ('Loading file ' + url)

    currentNodes = set()
    currentEdges = dict()

    with open(url) as f:
        # NOTE(review): 'r' is passed as a second positional argument to
        # KGML_parser.read - confirm the installed Biopython accepts it.
        pathway = KGML_parser.read(f, 'r')
        # Maps from the reaction id (e.g. rn:R05134) to the Node Objects that are part of this reaction
        reactionToNode = dict()
        # Maps from the internal pathway id (e.g. 23) to the compound id (e.g. C00010)
        compoundDict = dict()

        for gene in pathway.genes:
            # A gene entry name is a space-separated list of ids.
            for geneName in gene.name.split(' '):
                # Drop the first four characters (presumably the "hsa:" prefix).
                gene_id = geneName[4:]
                if gene_id not in nodes:
                    # Use the symbol from ``geneNames`` when there is one,
                    # otherwise fall back to the bare id.
                    gName = gene_id
                    if gene_id in geneNames:
                        gName = geneNames[gene_id]
                    importer.add_node(['_Network_Node', 'Gene'], gene_id,
                                      {'name': gName, 'idType': 'ENTREZ',
                                       'url': 'http://www.kegg.jp/dbget-bin/www_bget?hsa:' + gene_id})
Esempio n. 21
0
                kegg[keggID[0]].extend(
                    [x for x in keggID[1:] if x not in kegg[keggID[0]]])
                org = proteinMapping[currentGene]
                for ec in keggID[1:]:
                    if org not in ecToGeneOrg:
                        ecToGeneOrg[org] = defaultdict(list)
                    if currentGene not in ecToGeneOrg[org][ec]:
                        ecToGeneOrg[org][ec].append(currentGene)

# process all found kegg pathways
for k in kegg:
    print("Processing: {}".format(k))
    # Per-pathway counters, defaulting to 0.
    stats[k] = defaultdict(int)
    processedIDs = set()
    # load current pathway
    pathway = KGML_parser.read(kegg_get("ko{}".format(k), "kgml"))

    # get information on EC numbers in kegg pathway
    for ec in kegg[k]:
        print(" EC: {}".format(ec))
        if True:
            # True while the lines being read belong to an ORTHOLOGY field.
            foundOrtho = False
            # query KEGG
            for ecInfo in kegg_get("ec:{}".format(ec)):
                # The first 12 characters are treated as the field label.
                ecInfoLabel = ecInfo[:12]
                if "ORTHOLOGY" in ecInfoLabel:
                    foundOrtho = True
                    # Characters 12-17 are used as the KO key for this EC.
                    KOToEC[ecInfo[12:18]].append(ec)
#                    KOToGene[ecInfo[12:18]].extend(ecToGene[ec])
                else:
                    # Stay inside the field only while the label is blank
                    # (presumably continuation lines - TODO confirm).
                    foundOrtho = foundOrtho and len(ecInfoLabel.strip()) == 0
Esempio n. 22
0
def parser_xml(pathway_nodes_df, save=True):
    """Download and parse the KGML of every pathway in ``pathway_nodes_df``.

    For each pathway id in column ``0`` the KGML is fetched from KEGG and
    three things are collected: the gene-to-gene relations, the gene
    entries, and the other listed pathways that it references as maps.

    :param pathway_nodes_df: DataFrame whose column ``0`` holds pathway ids.
    :param save: when true, pickle the per-pathway dictionary to
        ``data/pathway2enzyme.pickle.txt`` and write the pathway pairs to
        ``data/pathway2pathway.csv``.
    :return: dict with ``'pathway2enzyme'`` (pathway id mapped to entry
        nodes, entry-entry edges and entry-to-gene pairs) and
        ``'pathway2pathway'`` (DataFrame of de-duplicated, unordered
        pathway pairs).
    """
    print('Executing function parser_xml:')
    pathwaylist = list(pathway_nodes_df[0])  # pathway ids to process
    known_pathways = set(pathwaylist)  # O(1) membership tests
    enzymeDict = {}  # per-pathway results
    path2path = []  # [pathway, referenced pathway] pairs
    allnum = len(pathwaylist)
    # Report progress roughly five times. Never let the step be 0, which
    # raised ZeroDivisionError for fewer than five pathways.
    step = max(1, allnum // 5)

    for procress, pathwayname in enumerate(pathwaylist):
        if procress % step == 0:
            print(
                f"--- Parsing xml {procress}/{allnum} pathwayname: {pathwayname}"
            )
        # Fetch and parse the KGML of this pathway online.
        pathway2xml = KGML_parser.read(kegg_get(pathwayname, "kgml"))
        # Every map entry that is itself a listed pathway is taken to
        # interact with this pathway.
        path2path.extend([pathwayname, maps.name]
                         for maps in pathway2xml.maps
                         if maps.name in known_pathways)
        # Entry-to-entry relations of this pathway.
        relation2entry = pd.DataFrame(columns=('id1', 'id2'))
        genelist = pathway2xml.genes
        for i, relation in enumerate(pathway2xml.relations):
            # Keep only relations whose two ends are both gene entries.
            if relation.entry1 in genelist and relation.entry2 in genelist:
                relation2entry.loc[i, :] = [
                    pathwayname + '_' + str(relation.entry1.id),
                    pathwayname + '_' + str(relation.entry2.id)
                ]

        entry_nodes = list(
            set(relation2entry['id1'].tolist() +
                relation2entry['id2'].tolist()))
        id2gene = [(pathwayname + '_' + str(gene.id), gene.name)
                   for gene in genelist]
        enzymeDict[pathwayname] = {
            'entry_nodes': entry_nodes,
            'entry_entry_edges': relation2entry,
            'entry2gene': id2gene
        }
    print('--- Finish parsing xml')
    print('--- Processing data...')
    # De-duplicate the pathway pairs.
    path2path = pd.DataFrame(path2path, columns=('path1', 'path2'))
    # Drop self-references (path1 == path2).
    path2path2 = path2path[path2path['path1'] != path2path['path2']]
    # Drop exact duplicate rows.
    path2path2 = path2path2.drop_duplicates(['path1', 'path2'],
                                            keep='first')
    # Drop mirrored pairs (A, B) / (B, A) by comparing each row's sorted
    # contents.
    a = path2path2.apply(lambda x: str(sorted(x.tolist())), axis=1)
    pos = pd.DataFrame(a).duplicated()
    # ``~`` is the documented way to invert a boolean mask.
    path2path_drop = path2path2.loc[~pos, :].reset_index()
    # Save the results.
    if save:
        print('--- Saving data pathway2enzyme.pickle.txt')
        with open('data/pathway2enzyme.pickle.txt', 'wb') as file:
            pickle.dump(enzymeDict, file)
        print('--- Saving data pathway2pathway.csv')
        path2path_drop[['path1', 'path2']].to_csv('data/pathway2pathway.csv',
                                                  header=0,
                                                  index=0)
    return {'pathway2enzyme': enzymeDict, 'pathway2pathway': path2path_drop}
def gatherDetails(makeNclusters,trimPath,forRelatedness,folderName,CO_fromMATLAB,KO_Norm2Mean,Insitu_TPM_DIA,Insitu_TPM_DIN,Insitu_TPM_Oth):
    colLabel = ['nCpds','nGenes'] #starting with this is easiest - makes one list, no need to flatten

    for item in range(makeNclusters):
        colLabel.append('Km' + str(item) + '_cpd')
        colLabel.append('Km' + str(item) + '_gene')

    gatherCounts = pd.DataFrame(0, index = trimPath, columns = colLabel)

    #setup the strings to match first
    rnString = re.compile('(?:[rn:R])(\d+)$') #will return R00190
    cpdString = re.compile('(?:[cpd:C])(\d+)$') #will return C00190

    size = 20 #turns out I can increase the size of the compounds in the plots

    for kn in range(makeNclusters):
        fullSet = set(forRelatedness.KEGG)
        oneK = forRelatedness[forRelatedness.kmeans == kn] #get gene & transcript information for one Kmeans group
        getKm = 'Km' + str(kn)

        #check if the directories exist, one for pathway files
        directoryPDF = folderName + str(kn) + '/pathway_files'
        if not os.path.exists(directoryPDF):
            os.makedirs(directoryPDF)
        else:
            raise ValueError('Krista - be careful, this folder already exists')

        #check if the directories exist, one for reaction files
        directoryPNG = folderName + str(kn) + '/reaction_files'
        if not os.path.exists(directoryPNG):
            os.makedirs(directoryPNG) 
        else:
            raise ValueError('Krista - be careful, this folder already exists')
                       
        #check if the directories exist, one for species 
        directorySpecies = folderName + str(kn) + '/species_files'
        if not os.path.exists(directorySpecies):
            os.makedirs(directorySpecies) 
        else:
            raise ValueError('Krista - be careful, this folder already exists')
                    
        for item in trimPath: #searching within one pathway at a time
            plotPathway = [] #gather up yes/no and will only plot if have linked genes/mtabs    
            genes = getKfrom_ko(item)
            compounds = getCfrom_ko(item)
            gatherCounts.loc[item,'nCpds'] = len(compounds)
            gatherCounts.loc[item,'nGenes'] = len(genes)     
            #have to track genes and compounds differently for the biopython plotting later on 
            setG = set(genes)
            setC = set(compounds)
            setB = set(oneK.KEGG)
            intGenes = setG.intersection(setB)
            intCompounds = setC.intersection(setB)
            gatherCounts.loc[item,(getKm + '_gene')] = len(intGenes)
            gatherCounts.loc[item,(getKm + '_cpd')] = len(intCompounds)
            for gen in intGenes: #go through each gene...one at a time
                rnList = kegg_link('reaction',gen).read() #get the list of reactions for that gene
                #can have cases where there is a gene and no reaction (K02906 for example). This returns rnList = '\n'
                #since this is not actually empty...need a few way to filter those out
                test = '\n'
                if test != rnList:
                    for line in rnList.rstrip().split('\n'):
                        countCpd = []
                        countGene = []
                        m = rnString.search(line) #get the reaction number
                        cpdList = kegg_link('cpd',m.group(0)).read() #now go get the compounds for that reaction
                        #can have no compounds in a reaction (only glycans, begin with G, nothing I have matched)
                        if len(cpdList) > 1: #will be true if cpdList includes compounds
                            for line2 in cpdList.rstrip().split('\n'):
                                m2 = cpdString.search(line2).group(0)
                                #now that I have a compound, check if it is in intCompounds
                                if m2 in intCompounds:
                                    countCpd.append(m2) 
                                    countGene.append(gen)
                                    plotPathway.append('yes')
                        ##Now, plot the PNG files (one for each reaction within a pathway)
                        if len(countCpd) > 0:
                            dayList = ['S1','S2','S3','S4','S5']
                            kData = pd.DataFrame(columns = dayList)
                            for k in set(countGene):
                                kData = kData.append(oneK.ix[k,dayList])
                            cData = pd.DataFrame(columns = dayList)
                            for co in set(countCpd):
                                #convert CO to RI, can have multiple options
                                j = findRInumber(oneK,co)
                                cData = cData.append(oneK.loc[j,dayList])
                            fig,ax = plt.subplots(1)
                            cData.T.plot(color = 'k',ax=ax)
                            kData.T.plot(color = 'r',ax=ax)
                            handles, labels = ax.get_legend_handles_labels()
                            #convert the RI numbers to COnumbers for the figure
                            for ia, a in enumerate(labels):
                                #add compound/gene name to the legend
                                if a[0]== 'R':
                                    tLabel = convertRItoCO(CO_fromMATLAB,a)
                                    fn = kegg_list(tLabel).read()                          
                                    labels[ia] = fn
                                elif a[0] == 'K':
                                    fn = kegg_list(a).read()
                                    labels[ia] = fn
                            ax.legend(handles, labels, bbox_to_anchor = ([-1, 0.5]))
                            fig.suptitle('pathway ' + item + ', Kmeans grp ' + str(kn))
                            pngName = 'pathway' + item + '_' + m.group(0) + '.png'
                            fig.savefig(directoryPNG + '/' + pngName, bbox_inches = 'tight')
                            pngName = None #empty it in case that is where I am having issues
                            plt.close()
            if len(plotPathway)>0:
                ## plot the pathway map for this pathway, get details from KEGG for plotting
                useColors = pal.colorbrewer.qualitative.Set1_4.hex_colors
                useColors.insert(0,'#f7f7f7') ## insert white at beginning
                # order of colors: white, red, blue,green,purple
                sd = 0 #not in dataset
                sk = 1 #in K means group and pathway
                sa = 2 #in pathway, in any K means (for genes, bc overlap in numbers)
                sn = 3 #in pathway, not in K means group (compounds only)               
                su = 4 #unconnected gene or compound
                line1 = useColors[sd] + ', not in dataset' + '\n'
                line2 = useColors[sk] + ', in K means group and pathway' + '\n'
                line3 = useColors[sa] + ', #in pathway, in any K means (for genes, bc overlap in numbers)' +'\n'
                line4 = useColors[sn] +  ', #in pathway, not in K means group (compounds only)' + '\n'               
                line5 = useColors[su] + ', #unconnected gene or compound' + '\n'
                file = open("readme_colorsInPathways.txt", "w")
                file.write(line1 + line2 + line3 + line4 + line5)
                file.close()
                
                pathway = KGML_parser.read(kegg_get(item, "kgml"))
                for element in pathway.orthologs:
                    #print element.name
                    for graphic in element.graphics:
                        tg = element.name[3:9] #skip over the 'ko:'
                        if (tg in intGenes):
                            #in the pathway AND in the set for this particular K means group
                            graphic.bgcolor = useColors[sk] #
                            
                            #if this is something in the pathway, plot up the species for the K number
                            if tg in Insitu_TPM_DIA.index.tolist():
                                Dk=Insitu_TPM_DIA.loc[tg]
                            else: 
                                Dk = 0/Insitu_TPM_DIA.iloc[0] #make an empty frame
                            if tg in Insitu_TPM_DIN.index.tolist():
                                Nk=Insitu_TPM_DIN.loc[tg]
                            else:
                                Nk = 0/Insitu_TPM_DIN.iloc[0]
                            if tg in Insitu_TPM_Oth.index.tolist():
                                Ok=Insitu_TPM_Oth.loc[tg]
                            else:
                                Ok = 0/Insitu_TPM_Oth.iloc[0]
                            fig,ax=plt.subplots(1)
                            ax.stackplot(range(5), Dk, Nk, Ok, colors=pal.colorbrewer.qualitative.Set3_6_r.hex_colors, lw=0)
                            ax.set_xticks(range(5))
                            ax.set_xticklabels([1,2,3,4,5])
                            ax.set_ylabel('In situ TPM')
                            plt.title(tg + ', lt orange=diatoms, blue=dinos, dk orange=other')
                            fig.savefig(directorySpecies + '/' + tg + '_species.png',bbox_inches='tight')
                            plt.close()
                        elif (tg in fullSet) and (tg in genes) and (tg not in intGenes):
                            #in the pathway AND in the set of genes from RI, allow any Kmeans group for genes
                            graphic.bgcolor = useColors[sa] #
                        elif (tg not in fullSet) and (tg in genes) and (tg not in KO_Norm2Mean.index.tolist()):
                            #in the pathway, but *not* in anything from the RI samples
                            graphic.bgcolor = useColors[sd] #
                        elif (tg not in fullSet) and (tg in genes) and (tg in KO_Norm2Mean.index.tolist()): 
                            #an unconnected gene in the RI data
                            graphic.bgcolor = useColors[su] #
                # Change the colours of compounds (mostly same as genes
                for element in pathway.compounds:
                    for graphic in element.graphics:
                        tc = element.name[4:10] #skip over the 'cpd:'
                        if (tc in intCompounds):
                            #in the pathway AND in the set for this particular K means group
                            graphic.bgcolor = useColors[sk] #
                            graphic.width = size
                            graphic.height = size
                        elif (tc in fullSet) and (tc in compounds) and (tc not in intCompounds):
                            #in the pathway AND in the set of compounds from RI, but *not* in this Kmeans group
                            graphic.bgcolor = useColors[sn] #
                            graphic.width = size
                            graphic.height = size
                        elif (tc not in fullSet) and (tc in compounds) and (tc not in CO_fromMATLAB.cNumber.values):
                            #in the pathway, but *not* in anything from the RI samples
                            graphic.bgcolor = useColors[sd] #  
                        elif (tc not in fullSet) and (tc in compounds) and (tc in CO_fromMATLAB.cNumber.values): #seems like a hack
                            #unconnected compound in the RI data
                            graphic.bgcolor = useColors[su] #
                            graphic.width = size
                            graphic.height = size
                canvas = KGMLCanvas(pathway, import_imagemap=True)
                pdfName = 'mapWithColors_' + str(item) + '.pdf'
                canvas.draw(directoryPDF + '/' + pdfName)
                pdfName = None #empty it in case that is where I am having issues
    #stick the pathway information into gatherCounts before I export...
    #want to export gatherCounts, with the added pathway name as a new column
    gatherCounts['pathwayInfo'] = ''
    gatherCounts['pathwayGroup_A'] = ''
    gatherCounts['pathwayGroup_B'] = ''
    gatherCounts['pathwayGroup_C'] = ''
    #go read in the file from KEGG
    D = glob.glob('br08901.keg') #from http://www.genome.jp/kegg-bin/get_htext?br08901.keg; 3/15/2016
    allBRITE=[]
    for idx,nof in enumerate(D):
        allBRITE = ReadBRITEfile(nof) 

    #put the pathway name and group into the data frame before exporting it
    for item in gatherCounts.index:
        #if this error appears: IndexError: index 0 is out of bounds for axis 0 with size 0
        #KEGG has updated a pathway, but not the BRITE file (see below for work around)
        pathstr = kegg_list(item).read()
        #this next line splits the string at the '\t', then keeps the piece at index = 1, and strips off the '\n'
        gatherCounts.loc[item,('pathwayInfo')] = pathstr.split('\t')[1].rstrip()
        t = allBRITE.loc[allBRITE['map']==item[2:]]  
        #put in a check to see if t.empty ...will be empty if KEGG updated pathway and not BRITE file
        if t.empty is False: 
            gatherCounts.set_value(item,'pathwayGroup_A',t['A'].values[0])
            gatherCounts.set_value(item,'pathwayGroup_B',t['B'].values[0])
            gatherCounts.set_value(item,'pathwayGroup_C',t['C'].values[0])
    
    return gatherCounts
def _drawCompoundMap(usePathway, folderName, intCompounds, pickColor, size):
    """Fetch the KGML for one pathway, recolor selected compounds, write a PDF.

    usePathway   -- KEGG pathway identifier passed to kegg_get(..., "kgml")
    folderName   -- directory that receives 'mapWithColors_<usePathway>.pdf'
    intCompounds -- collection of compound ids (the 6 characters after 'cpd:')
                    whose graphics are recolored and resized
    pickColor    -- callable mapping a compound id to the bgcolor to apply
    size         -- width and height applied to every recolored graphic
    """
    pathway = KGML_parser.read(kegg_get(usePathway, "kgml")) #no choice in gene color: green

    for element in pathway.compounds:
        for graphic in element.graphics:
            tc = element.name[4:10] #skip over the 'cpd:'
            if tc in intCompounds:
                graphic.bgcolor = pickColor(tc)
                graphic.width = size
                graphic.height = size

    canvas = KGMLCanvas(pathway, import_imagemap=True)
    canvas.draw(folderName + '/' + 'mapWithColors_' + str(usePathway) + '.pdf')


def gatherDetails(enterPathway,folderName,useCO,CO_values):
    """Draw one KEGG pathway map with compounds colored by their values.

    enterPathway -- KEGG pathway identifier; if it cannot be fetched, the
                    generic 'ko' + enterPathway[3:8] map is tried instead
    folderName   -- output directory (created when missing)
    useCO        -- iterable of compound ids that have data
    CO_values    -- values indexed by compound id and looked up with .loc
                    (assumes a pandas Series -- TODO confirm against callers)

    Nothing is returned. When the values of the compounds shared between the
    pathway and useCO sum to zero (NaN ignored), no map is drawn. When they
    hold a single distinct value, compounds are colored yes/no (NaN vs. not
    NaN). Otherwise the values are binned and the bin number selects the color.
    """
    if not os.path.exists(folderName):
        os.makedirs(folderName)

    #only one pathway at a time
    usePathway = enterPathway
    try:
        kegg_get(enterPathway).read()
    except Exception:
        #use the ko map if there is nothing species specific...this can also fail...
        usePathway = 'ko' + enterPathway[3:8]
        try:
            kegg_get(usePathway).read()
        except Exception:
            #best effort, as before: carry on and let the lookups below fail if they must
            pass

    #get the compounds and genes for this pathway
    #(the gene list is not used below; the call is kept so any failure surfaces as before)
    genes = getKfrom_ko(usePathway)
    compounds = getCfrom_ko(usePathway)

    #figure out which ones I have data for...
    intCompounds = set(compounds).intersection(set(useCO))

    ## plot the pathway map for this pathway, get details from KEGG for plotting (%must be at least 4 colors)
    useColors = pal.colorbrewer.diverging.PuOr_4.hex_colors

    #set the color of the mtab based on its value, only scale the values from this particular pathway
    #.loc needs a list here: pandas >= 2.0 refuses a set as an indexer
    useCOsubset = CO_values.loc[list(intCompounds)]
    cmin = useCOsubset.min() #find min and max...ignore NaN and inf for the moment
    cmax = useCOsubset.replace([np.inf],np.nan).dropna(how = 'all').max()

    size = 20 #increase the size of the compounds in the plots

    #can have all zeros...
    if sum(useCOsubset.dropna())==0:
        return

    if len(useCOsubset.value_counts())==1:
        #only two color options: yes/no (NaN -> 0, anything else -> 1)
        useCOsubset = useCOsubset.notnull().astype(int)
        _drawCompoundMap(usePathway, folderName, intCompounds,
                         lambda tc: useColors[int(useCOsubset.loc[tc])],
                         size)
        return

    useCOsubset[useCOsubset.isnull()] = 0
    useCOsubset[np.isinf(useCOsubset)] = 10*cmax #make inf 10x the biggest value

    #now, find cmax again...use that downstream
    cmax = useCOsubset.replace([np.inf],np.nan).dropna(how = 'all').max()

    #use histogram to make the bins (complete hack); only the edges are needed
    _,bin_edges = np.histogram(useCOsubset,bins = len(useColors)-3,range = (cmin,cmax))
    #now...put zero at beginning and the max at the end
    #BUT - can actually have cases with values for all metabolites (novel concept)
    if (useCOsubset == 0).any():
        bin_edges = np.insert(bin_edges,0,0)
    if (useCOsubset == cmax).any():
        bin_edges = np.append(bin_edges,cmax)

    #then find the index for each number...this will be the index into useColors
    useIdx = np.digitize(useCOsubset,bin_edges)
    color_df = pd.DataFrame({'mtab': useCOsubset,'idx':useIdx})

    _drawCompoundMap(usePathway, folderName, intCompounds,
                     lambda tc: useColors[int(color_df.loc[tc,'idx'])-1],
                     size)
Esempio n. 25
0
def map2highlighted_map(map_id,
                        ko_list,
                        ko2freq,
                        biodb,
                        outpath='test.pdf',
                        taxon_id=False,
                        n_species=60):
    """Render a KEGG map as PDF and SVG with orthologs shaded by frequency.

    map_id    -- KEGG map identifier; every 'map' in it is replaced by 'ko'
                 to build the KGML download URL
    ko_list   -- container of KO ids; an ortholog containing any of them is
                 shaded with the Greens colormap, the others with OrRd
    ko2freq   -- mapping KO id -> frequency (converted with int()); only
                 orthologs with at least one KO in this mapping are recolored
    biodb     -- passed through unchanged to edit_svg_map
    outpath   -- path of the PDF to write; the SVG is written next to it with
                 the extension replaced by '.svg'
    taxon_id  -- passed through unchanged to edit_svg_map
    n_species -- upper bound of the color normalization (lower bound is 0)

    Nothing is returned; the PDF and the edited SVG are written to disk.
    """
    import os
    import re
    import urllib.request
    from chlamdb.biosqldb import shell_command
    from Bio.Graphics.KGML_vis import KGMLCanvas
    from Bio.KEGG.KGML import KGML_parser
    import matplotlib as mpl
    import matplotlib.cm as cm
    from matplotlib.colors import rgb2hex

    norm = mpl.colors.Normalize(vmin=0, vmax=n_species)
    m = cm.ScalarMappable(norm=norm, cmap=cm.OrRd)
    m2 = cm.ScalarMappable(norm=norm, cmap=cm.Greens)

    url_template = 'http://rest.kegg.jp/get/%s/kgml' % re.sub(
        'map', 'ko', map_id)
    print(url_template)
    # Close the HTTP response as soon as the document has been read.
    with urllib.request.urlopen(url_template) as f:
        pathway = KGML_parser.read(f.read().decode('UTF-8'))

    kgml_map = KGMLCanvas(pathway, show_maps=True)

    # Loop over the orthologs in the pathway, and change the
    # background colour
    for o in list(pathway.orthologs):
        if 'K00163' in o.name:
            print('##################################')
        ko_temp_list = set([i.rstrip() for i in o.name.split('ko:')])
        ko_keep = [ko for ko in ko_temp_list if ko in ko2freq]
        if not ko_keep:
            # none of this ortholog's KOs has a frequency: leave it untouched
            continue

        match = any(ko in ko_list for ko in ko_temp_list)
        o.name = 'ko:' + ' ko:'.join(ko_keep)
        total = sum(int(ko2freq[ko]) for ko in ko_keep)

        shade = m2 if match else m
        for g in o.graphics:
            g.bgcolor = rgb2hex(shade.to_rgba(float(total)))
        # NOTE(review): the name assigned above starts with 'ko:', so the
        # piece before the first 'ko:' looks like it is always empty and the
        # label reduces to the total -- confirm this is intended.
        o.name = "%s (%s)" % (o.name.split('ko:')[0], total)

    # Default settings are for the KGML elements only

    # We need to use the image map, and turn off the KGML elements, to see
    # only the .png base map. We could have set these values on canvas
    # instantiation
    kgml_map.import_imagemap = True
    kgml_map.show_maps = True
    kgml_map.show_orthologs = True
    kgml_map.draw_relations = False
    kgml_map.show_compounds = False
    kgml_map.show_genes = False
    kgml_map.draw(outpath)

    # Swap only the file extension; substituting every 'pdf' in the path
    # would also rewrite directory names that happen to contain it.
    svg_path = os.path.splitext(outpath)[0] + '.svg'

    # NOTE(review): the paths are interpolated into the command string
    # unquoted; how shell_command tokenizes it is not visible here.
    shell_command.shell_command(
        'inkscape %s --export-plain-svg=%s' %
        (outpath, svg_path))  # 'pdf2svg %s %s all'
    t = edit_svg_map(svg_path,
                     ko2freq.keys(),
                     biodb,
                     map_id,
                     taxon_id=taxon_id)
    t.write(svg_path)
Esempio n. 26
0
def kegg_get_pathway(identifier):
    """Fetch and parse the KGML of a pathway.

    A "path:" prefix is dropped from the identifier and every "map" in it
    is replaced by "hsa" before the KGML is requested from KEGG.
    """
    pathway_id = identifier.replace("path:", "").replace("map", "hsa")
    return KEGG_KGML_PARSER.read(KEGG_REST.kegg_get(pathway_id, "kgml"))
   file=open(os.path.join(KEGG_data_folder,identifier+'.kgml'),'w')
   file.write(KGML_handle.read())
   file.close()

#%%
# Switch for the parsing stage below: set to 0 to skip it.
parse_pathways=1
if parse_pathways:
    
    pathways=[]
    # Node count of every parsed pathway graph, kept regardless of the size filter below.
    lengths=[]
    for filename in os.listdir(KEGG_data_folder):
        # Only files whose name ends in 'kgml' are parsed.
        if not filename.endswith('kgml'): continue
        #pathways.append({})    
        # NOTE(review): this handle is not closed anywhere in the visible code.
        KGML_handle=open(os.path.join(KEGG_data_folder,filename))#open('/home/grosstor/Desktop/steady_ready_projects/response_logic_synthetic_benchmarks/KEGG/hsa00515.xml')
        #KGML_handle=open(identifier+'.kgml')
        pathway=KGML_parser.read(KGML_handle)
        # Directed graph with one edge entry1 -> entry2 per KGML relation.
        net=nx.DiGraph()
        for relation in pathway.relations:
            #print(relation.entry1.id,'<->',relation.entry2.id)
            net.add_edge(relation.entry1.id,relation.entry2.id)
        lengths.append(len(net))
        # Keep graphs with 6 to 99 nodes; '&' is a bitwise AND of the two booleans.
        if (len(net)>5) & (len(net)<100):
    #        print(len(net))
            #generate some extra information from net#
            
            # Dense adjacency matrix of the graph, transposed.
            gold_Jac=nx.adj_matrix(net).toarray().T
            cycle_list=[set(cycle) for cycle in nx.simple_cycles(net) ] #maybe needed later to correlate results with number of nodes that are in loops
            #find distinct aggregated cycles
            aggregated_cycles=[]
            # Consume the cycle sets one at a time (the rest of this loop lies beyond this excerpt).
            while cycle_list:        
                aggr_cycle=cycle_list.pop()
0

# Counters handed to commit(): statements since the last reset, the
# threshold, and the running total over all files.
max_statements = 200
num_statements = 0
total_num_statements = 0

# Open the first transaction on the graph database.
tx = graph.cypher.begin()

for filename in files:
  url = datapath + filename
  print ("Loading file " + url)

  currentNodes = set()

  with open(url) as f:
    # NOTE(review): a second positional argument is passed to read() here;
    # confirm the installed KGML_parser.read accepts it.
    pathway = KGML_parser.read(f, 'r')
    # Maps from the reaction id (e.g. rn:R05134) to the Node Objects that are part of this reaction
    reactionToNode = dict()
    # Maps from the internal pathway id (e.g. 23) to the compound id (e.g. C00010)
    compoundDict = dict()

    for gene in pathway.genes:
      # A gene entry name may hold several space-separated names.
      for geneName in gene.name.split(" "):
        # Drop the first four characters (presumably an organism prefix such as 'hsa:' -- TODO confirm).
        gene_id = geneName[4:]
        if gene_id not in nodes:
          add_node(tx, ["_Network_Node", "Gene"], gene_id, {"name": gene_id, "idType": "ENTREZ"})
          num_statements += 1
          total_num_statements += 1
          # When commit() returns a truthy value, reset the counter and open a new transaction.
          if commit(tx, num_statements, max_statements, total_num_statements):
            num_statements = 0
            tx = graph.cypher.begin()
Esempio n. 29
0
#open("ec_5.4.2.2.txt",'w').write(request.read())
#records = Enzyme.parse(open("ec_5.4.2.2.txt"))
#record = list(records)[0]
#print(record.classname)
#print(record.entry)
# Download the KEGG organism table and keep the second tab-separated
# field of every row (presumably the organism code).
organisms = REST.kegg_list("organism").read()
organismlist = [row.split("\t")[1] for row in organisms.rstrip().split("\n")]

# Download and parse the KGML of pathway hsa01100.
human_map = KGML_parser.read(REST.kegg_get("hsa01100",option="kgml"))

# Print each compound name followed by the x coordinate of its graphics.
cpds = human_map.compounds
for cpd in cpds:
    print(cpd.name)
    for cpd_graphic in cpd.graphics:
        print(cpd_graphic.x)

# Same listing for the reaction entries.
rxns = human_map.reaction_entries
for rxn in rxns:
    print(rxn.name)
    for rxn_graphic in rxn.graphics:
        print(rxn_graphic.x)
Esempio n. 30
0
 def test_parse_remote_pathway(self):
     """Fetch pathway ko03070 from KEGG as KGML and check the parsed name."""
     with kegg_get("ko03070", "kgml") as remote_handle:
         parsed = KGML_parser.read(remote_handle)
     expected_name = "path:ko03070"
     self.assertEqual(parsed.name, expected_name)
Esempio n. 31
0
File: KEGG.py Progetto: CSB-IG/pxat
def readKGML(kgml):
    """Return the first pathway that KGML_parser.parse yields for *kgml*.

    The builtin next() is used instead of the iterator's .next() method,
    which only exists on Python 2. StopIteration propagates when the
    parser yields nothing.
    """
    return next(KGML_parser.parse(kgml))