def ShowNodeClasses(doc, verbose=0):
    '''Summarize the node classes present in a GraphML document.

    Every element returned by doc.findall('/graphml/graph/nnode') is
    inspected.  Its class URI is its "id" attribute with the final path
    segment removed; its class name is the text of a <data key="class">
    child.  A table of class names and URIs is written to stderr.

    doc     - parsed GraphML document providing findall(); the returned nodes
              must expose DOM-style .attributes and .childNodes.
    verbose - accepted for interface compatibility; not used here.

    Returns a dict mapping class name to class URI.  The URI is None when the
    node carrying the class has no "id" attribute.
    '''
    nodes = doc.findall('/graphml/graph/nnode')
    node_classes = {}
    for node in nodes:
        # Reset for every node: otherwise a node lacking an "id" would reuse
        # the URI of the previous node (or hit an unbound name on the first).
        node_class_uri = None
        if node.attributes:
            for att_name, att_value in node.attributes.items():
                if att_name == 'id':
                    node_class_uri = re.sub(r'/[^/]+$', '', att_value)
        for cnode in node.childNodes:
            if cnode.nodeName != 'data' or not cnode.attributes:
                continue
            for att_name, att_value in cnode.attributes.items():
                if att_name == 'key' and att_value == 'class':
                    node_class = xml_utils.DOM_NodeText(cnode)
                    node_classes[node_class] = node_class_uri

    # sys.stderr.write() behaves the same on Python 2 and 3, unlike "print >>".
    sys.stderr.write('Node classes (%d):\n' % len(node_classes))
    for node_class in sorted(node_classes):
        sys.stderr.write('%18s:\t[%s]\n'
                         % (node_class, node_classes[node_class]))
    return node_classes
def DTPathGraphml2TIDs(doc, fout, verbose):
    '''Write the label-derived ID of each gene node in the GraphML to fout.

    For every node returned by the gene-class query, the text of each
    <data key="label"> child is read, any parenthesized part is stripped, and
    the result is written to fout on its own line.  Nodes without a usable
    label are skipped.

    doc     - parsed GraphML document providing findall(); the returned nodes
              must expose DOM-style .childNodes, each with .attributes.
    fout    - writable text stream.
    verbose - accepted for interface compatibility; not used here.

    Returns the number of IDs written.
    '''
    # NOTE(review): this query looks like full XPath ("and", text(),
    # parent::node()); confirm that doc.findall() supports it.
    tgt_nodes = doc.findall(
        '/graphml/graph/nnode/data[@key="class" and text()="gene"]/parent::node()'
    )
    n_tid = 0
    for node in tgt_nodes:
        tid = None
        for cnode in node.childNodes:
            if not cnode.attributes:
                continue
            for att_name, att_value in cnode.attributes.items():
                if att_name == 'key' and att_value == 'label':
                    txt = xml_utils.DOM_NodeText(cnode)
                    try:
                        # Drop a parenthesized suffix such as "(...)".
                        tid = re.sub(r'\(.*\)', '', txt)
                    except TypeError as e:
                        # Raised when the label text is not a string.
                        sys.stderr.write('Error (Exception): %s\n' % e)
        if tid is None:
            continue
        fout.write('%s\n' % tid)
        n_tid += 1
    return n_tid
def ShowEdgeClasses(doc, verbose=0):
    '''Summarize the edge classes present in a GraphML document.

    Every element returned by doc.findall('/graphml/graph/edge') is
    inspected.  The class of an edge is the text of its <data key="uri">
    child.  The endpoint class URIs are its "source" and "target" attributes
    with the final path segment removed.  For each edge class the distinct
    endpoint pairs are collected, each stored as a sorted 2-tuple, so the
    direction of the edge is not preserved.  A report is written to stderr.

    An edge that lacks a "source" or "target" attribute still registers its
    class but contributes no endpoint pair.

    doc     - parsed GraphML document providing findall(); the returned edges
              must expose DOM-style .attributes and .childNodes.
    verbose - accepted for interface compatibility; not used here.

    Returns a dict mapping edge class URI to a set of 2-tuples of endpoint
    class URIs.
    '''
    edges = doc.findall('/graphml/graph/edge')
    edge_classes = {}
    for edge in edges:
        # Reset for every edge so a missing attribute cannot silently reuse
        # the value left over from the previous edge.
        source_class_uri = None
        target_class_uri = None
        if edge.attributes:
            for att_name, att_value in edge.attributes.items():
                if att_name == 'source':
                    source_class_uri = re.sub(r'/[^/]+$', '', att_value)
                elif att_name == 'target':
                    target_class_uri = re.sub(r'/[^/]+$', '', att_value)
        for cnode in edge.childNodes:
            if cnode.nodeName != 'data' or not cnode.attributes:
                continue
            for att_name, att_value in cnode.attributes.items():
                if att_name == 'key' and att_value == 'uri':
                    edge_class = xml_utils.DOM_NodeText(cnode)
                    pairs = edge_classes.setdefault(edge_class, set())
                    if (source_class_uri is not None
                            and target_class_uri is not None):
                        pairs.add(tuple(sorted(
                            [source_class_uri, target_class_uri])))

    # sys.stderr.write() behaves the same on Python 2 and 3, unlike "print >>".
    sys.stderr.write('Edge classes (%d):\n' % len(edge_classes))
    for edge_class in sorted(edge_classes):
        edge_class_name = re.sub(r'^.*/', '', edge_class)
        sys.stderr.write('%18s:\t[%s]\n' % (edge_class_name, edge_class))
        # Sorted so the report is stable from run to run.
        for uri_a, uri_b in sorted(edge_classes[edge_class]):
            name_a = re.sub(r'^.*/', '', uri_a)
            name_b = re.sub(r'^.*/', '', uri_b)
            sys.stderr.write('\t\t%s : %s\n' % (name_a, name_b))
    return edge_classes
def AnnotateDTPathGraphmlCompounds(doc, fin_csv, verbose=0):
    '''Annotate graphml compound nodes with synonyms from a CSV file.

    The CSV must have a header row containing the columns "cid" (integer
    compound ID) and "synonyms" (semicolon-separated names).  For each node
    returned by the pubchem_compound query whose label yields a CID present
    in the CSV, a <data key="synonyms"> child is appended and the node label
    is replaced by the first synonym via ReplaceNodeText (defined outside
    this function).

    doc     - parsed GraphML document providing findall(), createElement()
              and createTextNode(); the returned nodes must expose
              .childNodes, .appendChild() and .find().
    fin_csv - open text stream of CSV data.
    verbose - accepted for interface compatibility; not used here.

    Returns None if doc is falsy; otherwise returns doc.  When the CSV is
    unreadable or lacks a required column, doc is returned unmodified.
    '''
    if not doc:
        return None

    ### Read CSV file containing compound data:
    csvReader = csv.DictReader(fin_csv,
                               fieldnames=None,
                               restkey=None,
                               restval=None,
                               dialect='excel',
                               delimiter=',',
                               quotechar='"')
    # Accessing .fieldnames reads the header row on demand, so no data row
    # has to be consumed (and lost) just to learn the column names.
    try:
        fieldnames = csvReader.fieldnames
    except (csv.Error, IOError, ValueError) as e:
        sys.stderr.write('ERROR: %s\n' % e)
        fieldnames = None
    if not fieldnames:
        sys.stderr.write('ERROR: bad ifile: %s\n'
                         % getattr(fin_csv, 'name', '<stream>'))
        return doc
    sys.stderr.write('DEBUG: fieldnames= %s\n' % (fieldnames,))

    cid_tag = 'cid'
    synonyms_tag = 'synonyms'
    for tag in (cid_tag, synonyms_tag):
        if tag not in fieldnames:
            sys.stderr.write('ERROR: %s not in fieldnames\n' % tag)
            return doc

    cid2synonyms = {}
    n_synonyms_total = 0
    try:
        for csvrow in csvReader:
            try:
                cid = int(csvrow[cid_tag])
                synonyms = csvrow[synonyms_tag].split(';')
            except (TypeError, ValueError, AttributeError) as e:
                # A malformed row is reported and skipped; the remaining
                # rows are still read.
                sys.stderr.write('ERROR: skipping bad CSV row (line %d): %s\n'
                                 % (csvReader.line_num, e))
                continue
            cid2synonyms[cid] = synonyms
            n_synonyms_total += len(synonyms)
    except csv.Error as e:
        # Best effort: keep whatever was read before the parse error.
        sys.stderr.write('ERROR: CSV parse error (line %d): %s\n'
                         % (csvReader.line_num, e))
    sys.stderr.write('DEBUG: cpd ids read: %d ; total synonyms: %d\n'
                     % (len(cid2synonyms), n_synonyms_total))

    #Find cpd nodes in Graphml:
    # NOTE(review): this query looks like full XPath ("and", text(),
    # parent::node()); confirm that doc.findall() supports it.
    cpd_nodes = doc.findall(
        '/graphml/graph/nnode/data[@key="class" and text()="pubchem_compound"]/parent::node()'
    )
    for node in cpd_nodes:
        cid = None
        for cnode in node.childNodes:
            if not cnode.attributes:
                continue
            for att_name, att_value in cnode.attributes.items():
                if att_name == 'key' and att_value == 'label':
                    txt = xml_utils.DOM_NodeText(cnode)
                    try:
                        # Label is the CID, optionally followed by "(...)".
                        cid = int(re.sub(r'\(.*\)', '', txt))
                    except (TypeError, ValueError) as e:
                        sys.stderr.write('Error (Exception): %s\n' % e)
        if cid is None:
            continue
        if cid not in cid2synonyms:
            sys.stderr.write('DEBUG: cid %d not in csv\n' % cid)
            continue

        synonyms = cid2synonyms[cid]
        namenode = doc.createElement('data')
        namenode.setAttribute('key', 'synonyms')
        namenode.appendChild(doc.createTextNode('; '.join(synonyms)))
        node.appendChild(namenode)

        #Also revise cpd label:
        cpd_label_node = node.find('data[@key="label"]')
        ReplaceNodeText(cpd_label_node, synonyms[0])

    # Return the document on the success path too, matching the early exits.
    return doc
def AnnotateDTPathGraphmlTargets(doc, fin_csv, verbose=0):
    '''Annotate graphml gene nodes with target names from a CSV file.

    The CSV must have a header row containing the columns
    "protein_accession", "gene_symbol" and "pref_name".  For each node
    returned by the gene-class query, the label text is looked up as a gene
    symbol; when it maps to a protein accession that has a name, a
    <data key="name"> child holding that name is appended to the node.

    doc     - parsed GraphML document providing findall(), createElement()
              and createTextNode(); the returned nodes must expose
              .childNodes and .appendChild().
    fin_csv - open text stream of CSV data.
    verbose - accepted for interface compatibility; not used here.

    Returns None if doc is falsy; otherwise returns doc.  When the CSV is
    unreadable or lacks a required column, doc is returned unmodified.
    '''
    if not doc:
        return None

    ### Read CSV file containing target data:
    csvReader = csv.DictReader(fin_csv,
                               fieldnames=None,
                               restkey=None,
                               restval=None,
                               dialect='excel',
                               delimiter=',',
                               quotechar='"')
    # Accessing .fieldnames reads the header row on demand, so no data row
    # has to be consumed (and lost) just to learn the column names.
    try:
        fieldnames = csvReader.fieldnames
    except (csv.Error, IOError, ValueError) as e:
        sys.stderr.write('ERROR: %s\n' % e)
        fieldnames = None
    if not fieldnames:
        sys.stderr.write('ERROR: bad ifile: %s\n'
                         % getattr(fin_csv, 'name', '<stream>'))
        return doc
    sys.stderr.write('DEBUG: fieldnames= %s\n' % (fieldnames,))

    tgt_id_tag = 'protein_accession'
    tgt_gene_tag = 'gene_symbol'
    tgt_name_tag = 'pref_name'
    for tag in (tgt_id_tag, tgt_name_tag, tgt_gene_tag):
        if tag not in fieldnames:
            sys.stderr.write('ERROR: %s not in fieldnames\n' % tag)
            return doc

    tgt_id2name = {}
    tgt_gene2id = {}
    try:
        for csvrow in csvReader:
            tgt_id = csvrow[tgt_id_tag]
            tgt_gene2id[csvrow[tgt_gene_tag]] = tgt_id
            tgt_id2name[tgt_id] = csvrow[tgt_name_tag]
    except csv.Error as e:
        # Best effort: keep whatever was read before the parse error.
        sys.stderr.write('ERROR: CSV parse error (line %d): %s\n'
                         % (csvReader.line_num, e))
    sys.stderr.write('DEBUG: protein ids/names read: %d\n' % len(tgt_id2name))

    # NOTE(review): this query looks like full XPath ("and", text(),
    # parent::node()); confirm that doc.findall() supports it.
    gene_nodes = doc.findall(
        '/graphml/graph/nnode/data[@key="class" and text()="gene"]/parent::node()'
    )
    sys.stderr.write('DEBUG: gene count: %d\n' % len(gene_nodes))
    for node in gene_nodes:
        tgt_id = None
        for cnode in node.childNodes:
            if not cnode.attributes:
                continue
            for att_name, att_value in cnode.attributes.items():
                if att_name == 'key' and att_value == 'label':
                    tgt_gene = xml_utils.DOM_NodeText(cnode)
                    if tgt_gene in tgt_gene2id:
                        tgt_id = tgt_gene2id[tgt_gene]
                    sys.stderr.write('DEBUG: gene = "%s" ; tgt = "%s"\n'
                                     % (tgt_gene, tgt_id))
        if tgt_id and tgt_id in tgt_id2name:
            namenode = doc.createElement('data')
            namenode.setAttribute('key', 'name')
            namenode.appendChild(doc.createTextNode(tgt_id2name[tgt_id]))
            node.appendChild(namenode)
    return doc