Exemple #1
0
def ShowNodeClasses(doc, verbose=0):
    '''Report the node classes found in a GraphML document.

    Every node matched by ``/graphml/graph/nnode`` is scanned for a ``data``
    child whose ``key`` attribute equals ``class``.  The text of that child
    is the class name; it is mapped to the node's ``id`` attribute with its
    last ``/``-separated segment removed.  A summary is written to stderr.

    Args:
        doc: Parsed GraphML document.  It must provide ``findall()``, and the
            nodes it returns must expose ``attributes`` (with ``items()``),
            ``childNodes`` and ``nodeName``.
        verbose: Not used by this function.

    Returns:
        dict mapping class name -> class URI.  The URI is ``None`` when the
        node has no ``id`` attribute.
    '''
    #nodes=xpath.find('/graphml/graph/nnode',doc)
    nodes = doc.findall('/graphml/graph/nnode')
    node_classes = {}
    for node in nodes:
        # Reset for every node: otherwise a node without an 'id' attribute
        # would either raise NameError (first node) or silently inherit the
        # URI computed for the previous node.
        node_class_uri = None
        if node.attributes:
            node_id = dict(node.attributes.items()).get('id')
            if node_id is not None:
                # Drop the last path segment: the remainder is the class URI.
                node_class_uri = re.sub(r'/[^/]+$', '', node_id)

        for cnode in node.childNodes:
            if cnode.nodeName != 'data' or not cnode.attributes:
                continue
            if dict(cnode.attributes.items()).get('key') == 'class':
                node_class = xml_utils.DOM_NodeText(cnode)
                node_classes[node_class] = node_class_uri

    sys.stderr.write('Node classes (%d):\n' % len(node_classes))
    for node_class in sorted(node_classes):
        sys.stderr.write('%18s:\t[%s]\n' % (node_class,
                                           node_classes[node_class]))
    return node_classes
Exemple #2
0
def DTPathGraphml2TIDs(doc, fout, verbose):
    '''Write the target id of every gene node to ``fout``, one per line.

    Gene nodes are selected with the path expression below.  For each one,
    the text of the child whose ``key`` attribute is ``label`` is taken and
    any parenthesised part is stripped; the remainder is the target id.
    Nodes without such a child are skipped.

    Args:
        doc: Parsed GraphML document.  It must provide ``findall()``, and the
            nodes it returns must expose ``childNodes`` whose members have
            ``attributes`` (with ``items()``).
        fout: Writable text file object that receives the ids.
        verbose: Not used by this function.

    Returns:
        Number of ids written (previously the count was computed and then
        discarded).
    '''
    #tgt_nodes=xpath.find('/graphml/graph/nnode/data[@key="class" and text()="gene"]/parent::node()',doc)
    tgt_nodes = doc.findall(
        '/graphml/graph/nnode/data[@key="class" and text()="gene"]/parent::node()'
    )
    n_tid = 0
    for node in tgt_nodes:
        tid = None
        for cnode in node.childNodes:
            if not cnode.attributes:
                continue
            if dict(cnode.attributes.items()).get('key') != 'label':
                continue
            txt = xml_utils.DOM_NodeText(cnode)
            try:
                # Strip a parenthesised suffix/infix from the label text.
                tid = re.sub(r'\(.*\)', '', txt)
            except TypeError as e:
                # re.sub() raises TypeError when the label text is not a
                # string (e.g. None); report it and leave tid untouched.
                sys.stderr.write('Error (Exception): %s\n' % e)
        if tid is None:
            continue
        fout.write('%s\n' % tid)
        n_tid += 1
    return n_tid
Exemple #3
0
def ShowEdgeClasses(doc, verbose=0):
    '''Report the edge classes found in a GraphML document.

    Every edge matched by ``/graphml/graph/edge`` is scanned for a ``data``
    child whose ``key`` attribute equals ``uri``; the text of that child is
    the edge class.  For each class the distinct pairs of endpoint class
    URIs are collected, where an endpoint class URI is the edge's ``source``
    or ``target`` attribute with its last ``/``-separated segment removed.
    Each pair is stored sorted, so direction is ignored.  A summary is
    written to stderr.

    Args:
        doc: Parsed GraphML document.  It must provide ``findall()``, and the
            edges it returns must expose ``attributes`` (with ``items()``)
            and ``childNodes`` whose members have ``nodeName``.
        verbose: Not used by this function.

    Returns:
        dict mapping edge class URI -> set of ``(class_uri, class_uri)``
        tuples.
    '''
    #edges=xpath.find('/graphml/graph/edge',doc)
    edges = doc.findall('/graphml/graph/edge')
    edge_classes = {}
    for edge in edges:
        # Reset for every edge: otherwise an edge lacking 'source' or
        # 'target' would raise NameError or reuse the previous edge's URIs.
        source_class_uri = None
        target_class_uri = None
        if edge.attributes:
            edge_atts = dict(edge.attributes.items())
            if 'source' in edge_atts:
                source_class_uri = re.sub(r'/[^/]+$', '', edge_atts['source'])
            if 'target' in edge_atts:
                target_class_uri = re.sub(r'/[^/]+$', '', edge_atts['target'])

        for cnode in edge.childNodes:
            if cnode.nodeName != 'data' or not cnode.attributes:
                continue
            if dict(cnode.attributes.items()).get('key') != 'uri':
                continue
            edge_class = xml_utils.DOM_NodeText(cnode)
            # Register the class even if this edge's endpoints are unusable,
            # so that it still shows up in the report.
            class_pairs = edge_classes.setdefault(edge_class, set())
            if source_class_uri is None or target_class_uri is None:
                sys.stderr.write(
                    'WARNING: edge of class %s lacks source or target\n' %
                    edge_class)
                continue
            class_pairs.add(
                tuple(sorted([source_class_uri, target_class_uri])))

    sys.stderr.write('Edge classes (%d):\n' % len(edge_classes))
    for edge_class in sorted(edge_classes):
        edge_class_name = re.sub(r'^.*/', '', edge_class)
        sys.stderr.write('%18s:\t[%s]\n' % (edge_class_name, edge_class))
        # Sorted so that the report is reproducible between runs.
        for source, target in sorted(edge_classes[edge_class]):
            source_name = re.sub(r'^.*/', '', source)
            target_name = re.sub(r'^.*/', '', target)
            sys.stderr.write('\t\t%s : %s\n' % (source_name, target_name))
    return edge_classes
Exemple #4
0
def AnnotateDTPathGraphmlCompounds(doc, fin_csv, verbose=0):
    '''Annotate graphml with additional data from CSV file.

    The CSV must have a header containing the columns ``cid`` (integer
    compound id) and ``synonyms`` (``;``-separated names).  Compound nodes
    are selected with the path expression below; the compound id of a node
    is the integer left in the text of its ``label`` child after any
    parenthesised part is removed.  For every node whose id appears in the
    CSV, a ``<data key="synonyms">`` child holding the synonyms joined by
    ``'; '`` is appended and the label text is replaced by the first synonym.

    Args:
        doc: Parsed GraphML document.  It must provide ``findall()`` and
            ``createElement()`` / ``createTextNode()``; the nodes returned by
            ``findall()`` must expose ``childNodes``, ``appendChild()`` and
            ``find()``.
        fin_csv: Open text file object (or any iterable of lines) with the
            CSV data.
        verbose: Not used by this function.

    Returns:
        ``None`` if ``doc`` is falsy, otherwise ``doc`` (modified in place).
        ``doc`` is returned unmodified when the CSV is empty, unreadable or
        lacks a required column; the reason is written to stderr.
    '''
    if not doc:
        return None

    ### Read CSV file containing compound data:
    csvReader = csv.DictReader(fin_csv,
                               dialect='excel',
                               delimiter=',',
                               quotechar='"')
    # Accessing ``fieldnames`` parses the header line only.  Calling next()
    # here instead would consume, and lose, the first data row.
    try:
        fieldnames = csvReader.fieldnames
    except (csv.Error, EnvironmentError, ValueError):
        fieldnames = None
    if not fieldnames:
        sys.stderr.write('ERROR: bad ifile: %s\n' %
                         getattr(fin_csv, 'name', '<unnamed>'))
        return doc
    sys.stderr.write('DEBUG: fieldnames= %s\n' % (fieldnames, ))

    cid_tag = 'cid'
    synonyms_tag = 'synonyms'
    for tag in (cid_tag, synonyms_tag):
        if tag not in fieldnames:
            sys.stderr.write('ERROR: %s not in fieldnames\n' % tag)
            return doc

    cid2synonyms = {}
    n_synonyms_total = 0
    try:
        for csvrow in csvReader:
            try:
                cid = int(csvrow[cid_tag])
                synonyms = csvrow[synonyms_tag].split(';')
            except (TypeError, ValueError, AttributeError):
                # Non-integer cid or a short row (missing value is None):
                # skip this row only, instead of abandoning the whole file.
                sys.stderr.write(
                    'WARNING: skipping malformed CSV row at line %d\n' %
                    csvReader.line_num)
                continue
            cid2synonyms[cid] = synonyms
            n_synonyms_total += len(synonyms)
    except csv.Error as e:
        # Best effort: keep whatever was read before the parse failure.
        sys.stderr.write('ERROR: CSV parse failure at line %d: %s\n' %
                         (csvReader.line_num, e))
    sys.stderr.write('DEBUG: cpd ids read: %d ; total synonyms: %d\n' %
                     (len(cid2synonyms), n_synonyms_total))

    #Find cpd nodes in Graphml:
    #cpd_nodes=xpath.find('/graphml/graph/nnode/data[@key="class" and text()="pubchem_compound"]/parent::node()',doc)
    cpd_nodes = doc.findall(
        '/graphml/graph/nnode/data[@key="class" and text()="pubchem_compound"]/parent::node()'
    )
    for node in cpd_nodes:
        cid = None
        for cnode in node.childNodes:
            if not cnode.attributes:
                continue
            if dict(cnode.attributes.items()).get('key') != 'label':
                continue
            txt = xml_utils.DOM_NodeText(cnode)
            try:
                cid = int(re.sub(r'\(.*\)', '', txt))
            except (TypeError, ValueError) as e:
                sys.stderr.write('Error (Exception): %s\n' % e)
        if cid is None:
            continue

        if cid not in cid2synonyms:
            sys.stderr.write('DEBUG: cid %d not in csv\n' % cid)
            continue

        synonyms = cid2synonyms[cid]
        namenode = doc.createElement('data')
        namenode.setAttribute('key', 'synonyms')
        namenode.appendChild(doc.createTextNode('; '.join(synonyms)))
        node.appendChild(namenode)

        #Also revise cpd label:
        #cpd_label_node=xpath.find('data[@key="label"]',node)[0]
        cpd_label_node = node.find('data[@key="label"]')
        if cpd_label_node is not None:
            ReplaceNodeText(cpd_label_node, synonyms[0])

    return doc
Exemple #5
0
def AnnotateDTPathGraphmlTargets(doc, fin_csv, verbose=0):
    '''Annotate graphml with additional data from CSV file.

    The CSV must have a header containing the columns ``protein_accession``,
    ``gene_symbol`` and ``pref_name``.  Gene nodes are selected with the path
    expression below; the text of a node's ``label`` child is looked up as a
    gene symbol.  For every node whose gene symbol appears in the CSV, a
    ``<data key="name">`` child holding the preferred name of the matching
    protein accession is appended.

    Args:
        doc: Parsed GraphML document.  It must provide ``findall()`` and
            ``createElement()`` / ``createTextNode()``; the nodes returned by
            ``findall()`` must expose ``childNodes`` and ``appendChild()``.
        fin_csv: Open text file object (or any iterable of lines) with the
            CSV data.
        verbose: Not used by this function.

    Returns:
        ``None`` if ``doc`` is falsy, otherwise ``doc`` (modified in place).
        ``doc`` is returned unmodified when the CSV is empty, unreadable or
        lacks a required column; the reason is written to stderr.
    '''
    if not doc:
        return None

    ### Read CSV file containing target data:
    csvReader = csv.DictReader(fin_csv,
                               dialect='excel',
                               delimiter=',',
                               quotechar='"')
    # Accessing ``fieldnames`` parses the header line only.  Calling next()
    # here instead would consume, and lose, the first data row.
    try:
        fieldnames = csvReader.fieldnames
    except (csv.Error, EnvironmentError, ValueError):
        fieldnames = None
    if not fieldnames:
        sys.stderr.write('ERROR: bad ifile: %s\n' %
                         getattr(fin_csv, 'name', '<unnamed>'))
        return doc
    sys.stderr.write('DEBUG: fieldnames= %s\n' % (fieldnames, ))

    tgt_id_tag = 'protein_accession'
    tgt_gene_tag = 'gene_symbol'
    tgt_name_tag = 'pref_name'
    for tag in (tgt_id_tag, tgt_name_tag, tgt_gene_tag):
        if tag not in fieldnames:
            sys.stderr.write('ERROR: %s not in fieldnames\n' % tag)
            return doc

    tgt_id2name = {}
    tgt_gene2id = {}
    try:
        for csvrow in csvReader:
            tgt_id = csvrow[tgt_id_tag]
            if not tgt_id:
                # Rows without an accession can never be used for annotation.
                continue
            # A short row yields None for the missing columns.
            tgt_id2name[tgt_id] = csvrow[tgt_name_tag] or ''
            tgt_gene = csvrow[tgt_gene_tag]
            if tgt_gene:
                tgt_gene2id[tgt_gene] = tgt_id
    except csv.Error as e:
        # Best effort: keep whatever was read before the parse failure.
        sys.stderr.write('ERROR: CSV parse failure at line %d: %s\n' %
                         (csvReader.line_num, e))
    sys.stderr.write('DEBUG: protein ids/names read: %d\n' % len(tgt_id2name))

    #gene_nodes = xpath.find('/graphml/graph/nnode/data[@key="class" and text()="gene"]/parent::node()', doc)
    gene_nodes = doc.findall(
        '/graphml/graph/nnode/data[@key="class" and text()="gene"]/parent::node()'
    )
    sys.stderr.write('DEBUG: gene count: %d\n' % len(gene_nodes))

    for node in gene_nodes:
        tgt_id = None
        for cnode in node.childNodes:
            if not cnode.attributes:
                continue
            if dict(cnode.attributes.items()).get('key') != 'label':
                continue
            tgt_gene = xml_utils.DOM_NodeText(cnode)
            if tgt_gene in tgt_gene2id:
                tgt_id = tgt_gene2id[tgt_gene]
            sys.stderr.write('DEBUG: gene = "%s" ; tgt = "%s"\n' %
                             (tgt_gene, tgt_id))

        if tgt_id and tgt_id in tgt_id2name:
            namenode = doc.createElement('data')
            namenode.setAttribute('key', 'name')
            namenode.appendChild(doc.createTextNode(tgt_id2name[tgt_id]))
            node.appendChild(namenode)

    return doc