Esempio n. 1
0
def _srEntity(event, key, entityType, isName, usedOffsets, entitiesByOffset,
              entityList, entityCount):
    """Get or create the <entity> element for event[key].

    event[key] is an "offset<TAB>text" string. If an entity already exists
    at that offset it is reused (entities and named entities share one
    offset map). Returns (entity, nextEntityCount).
    """
    offset, entityText = event[key].replace("\t", " ").split(" ", 1)
    if offset in usedOffsets:
        return entitiesByOffset[offset], entityCount
    entity = ET.Element("entity")
    entity.set("charOffset", offset)
    entity.set("text", entityText)
    entity.set("type", entityType)
    entity.set("isName", isName)
    entity.set("id", "e" + str(entityCount))
    entity.set("srId", event["id"])
    usedOffsets.add(offset)
    entitiesByOffset[offset] = entity
    entityList.append(entity)
    return entity, entityCount + 1


def eventsToXML(events, xmlEvents, dataSets, srTexts):
    """Convert static-relation events into Interaction XML elements.

    For each sentence key of *events*, fills xmlEvents[key] with
    {"entities": [...], "interactions": [...]} ET elements, records the
    sentence text in *srTexts* and the (single, asserted-consistent)
    data set name in *dataSets*. All three output dicts are mutated
    in place.
    """
    for norText in events.keys():
        usedOffsets = set()
        entitiesByOffset = {}
        entityCount = 0
        interactionCount = 0
        xmlEvents[norText] = {"entities": [], "interactions": []}
        dataSet = None
        for event in events[norText]:
            srTexts[norText] = event["text"]
            if dataSet is None:
                dataSet = event["dataSet"]
            else:
                # every event of a sentence must come from the same set
                assert dataSet == event["dataSet"]
            # Add entity and named entity (deduplicated by offset)
            entity, entityCount = _srEntity(
                event, "entity", "Entity", "False", usedOffsets,
                entitiesByOffset, xmlEvents[norText]["entities"], entityCount)
            namedEntity, entityCount = _srEntity(
                event, "namedEntity", "Protein", "True", usedOffsets,
                entitiesByOffset, xmlEvents[norText]["entities"], entityCount)
            # Add interactions (only for positive examples)
            if event["interaction"] == "Yes":
                interaction = ET.Element("interaction")
                interaction.set("type", "SR-" + event["eventType"])
                interaction.set("directed", "False")
                interaction.set("e1", namedEntity.get("id"))
                interaction.set("e2", entity.get("id"))
                interaction.set("id", "i" + str(interactionCount))
                interaction.set("srId", event["id"])
                xmlEvents[norText]["interactions"].append(interaction)
                interactionCount += 1
            else:
                assert event["interaction"] == "No", event["interaction"]
        dataSets[norText] = dataSet
Esempio n. 2
0
def saveKeys():
    """Write the currently loaded key names to _KEYFILE as a <Keys> XML file.

    Raises Exception if no key file has been configured (loadKeys sets
    _KEYFILE).
    NOTE(review): this writes each key *name* as the element text and no
    "name" attribute, while loadKeys() expects <key name="...">value</key>;
    the round trip looks lossy -- confirm the intended file format.
    """
    if not _KEYFILE:
        # Fixed: "raise Exception, 'msg'" is Python-2-only syntax.
        raise Exception('No _KEYFILE')

    tree = ElementTree.ElementTree(ElementTree.Element('Keys'))
    for x in _KEYS:  # _KEYS is a dict; iterating yields the key names
        e = ElementTree.Element('key')
        e.text = x
        tree.getroot().append(e)
    tree.write(_KEYFILE)
Esempio n. 3
0
def make_keyfiles():
    """Write two fixture key files (tests/xml/keys and tests/xml/keys3)."""
    x = ElementTree.Element('Keys')
    y = ElementTree.Element('key')
    # BUG FIX: ET elements do not support item assignment with a string key
    # (y["name"] = ... raises TypeError); attributes are set with .set(),
    # which is also what loadKeys() reads back via .get("name").
    y.set("name", KEY[0])
    y.text = KEY[1]
    x.append(y)
    # Second key entry: no name attribute, placeholder text.
    y = ElementTree.Element('key')
    y.text = 'foo'
    x.append(y)
    b = ElementTree.ElementTree(x)
    b.write('tests/xml/keys')
    b.write('tests/xml/keys3')
Esempio n. 4
0
    def arcSVG(self):
        """Render this arc as two SVG <path> elements (left and right half)."""
        leftSpec = "M%(frox)d,%(y)d L%(linebx)d,%(lineby)d C%(c1bx)d,%(c1by)d %(c1ex)d,%(c1ey)d %(recx)d,%(midy)d" % self.param
        rightSpec = "M%(recxe)d,%(midy)d C%(c2bx)d,%(c2by)d %(c2ex)d,%(c2ey)d %(lineex)d,%(lineey)d L%(tox)d,%(y)d" % self.param
        paths = []
        for spec in (leftSpec, rightSpec):
            pathNode = ET.Element("path")
            pathNode.set("d", spec)
            pathNode.set("style",
                         ";".join("%s:%s" % (var, val)
                                  for var, val in self.arcStyle().items()))
            paths.append(pathNode)
        return paths
Esempio n. 5
0
def generateSVG(tokens,dependencies):
    """Lay out *tokens* and *dependencies* and render them into an <svg> tree.

    Returns the root <svg> ET element.  Width/height attributes are only
    emitted when SVGOptions.whAttributes is set.
    NOTE(review): sort(cmp=drawOrder) uses the Python-2-only 'cmp' argument;
    under Python 3 this would need key=functools.cmp_to_key(drawOrder).
    """
    # Compute token/arc coordinates first (mutates tokens/dependencies).
    layout(tokens,dependencies)
    tree=ET.Element("svg")
    tree.set("xmlns","http://www.w3.org/2000/svg")
    tree.set("xmlns:xlink","http://www.w3.org/1999/xlink")
    tree.set("version","1.1")
    tree.set("baseProfile","full")
    allNodes=[]
    totalWidth=0
    # Height derives from the lowest token row plus a small margin.
    totalHeight=tokens[0].y+10
    if SVGOptions.whAttributes:
        tree.set("height",strint(totalHeight))
    for t in tokens:
        allNodes.extend(t.toSVG())
        # Track the rightmost token edge to size the canvas.
        tokX=t.x+t.width()
        if tokX>totalWidth:
            totalWidth=tokX
    if SVGOptions.whAttributes:
        tree.set("width",strint(totalWidth))
    for d in dependencies:
        allNodes.extend(d.arcSVG())
        allNodes.extend(d.labelSVG())
    # Draw order matters in SVG: later elements paint on top.
    allNodes.sort(cmp=drawOrder)
    for n in allNodes:
        tree.append(n)
    return tree
Esempio n. 6
0
def processLines(lines,
                 setName,
                 usedIds,
                 directed=True,
                 negatives="INCLUDE",
                 tree=None,
                 corpusId="SE10T8"):
    """Parse SemEval-2010 Task 8 style text lines into an Interaction XML tree.

    Each example is "<id>\\t<quoted text>" followed by a relation line, an
    optional "Comment:" line and a terminating blank line. Appends one
    <document> per example to *tree* (created when None) and returns the tree.
    *setName* tags the documents; *directed*/*negatives* are forwarded to
    Sentence.process().
    """
    if tree is None:
        corpus = ET.Element("corpus", {"source": corpusId})
        tree = ET.ElementTree(corpus)
    else:
        corpus = tree.getroot()
    sentence = None
    for line in lines:
        line = line.strip()
        if sentence is None:
            # A new example must start with its numeric id.
            assert line[0].isdigit(), line
            origId, line = line.split("\t")
            sentence = Sentence(origId,
                                line.strip().strip("\""), corpusId, usedIds,
                                setName)
        else:
            if line.startswith("Comment:"):
                sentence.comment = line.split(":", 1)[-1].strip()
            elif line != "":
                sentence.relation = line
            else:
                # Blank line terminates the current example.
                assert sentence is not None
                corpus.append(
                    sentence.process(directed=directed, negatives=negatives))
                sentence = None
    return tree
Esempio n. 7
0
 def _node2xmlfields(self,noderecord):
     '''Convert one node record (flat dict) into an ET 'record' element.

     'BOTSID' names the record tag and 'BOTSCONTENT' (if present) its text.
     Keys containing the configured attribute marker become XML attributes
     of the record or of the matching field element. Consumed keys are
     deleted from *noderecord*. Fields are emitted in dict order (this
     variant does not sort by grammar despite the original comment).
     '''
     if 'BOTSID' not in noderecord:
         raise botslib.OutMessageError(_(u'No field "BOTSID" in xml-output in: "$record"'),record=noderecord)
     #first generate the xml-'record'
     attributedict = {}
     recordtag = noderecord['BOTSID']
     # attributemarker separates "<tag><marker><attrname>" keys from plain fields
     attributemarker = recordtag + self.ta_info['attributemarker']
     for key,value in noderecord.items():    #find the attributes for the xml-record, put these in attributedict
         if key.startswith(attributemarker):
             attributedict[key[len(attributemarker):]] = value
     xmlrecord = ET.Element(recordtag,attributedict) #make the xml ET node
     if 'BOTSCONTENT' in noderecord:
         # BOTSCONTENT holds the text value of the record element itself
         xmlrecord.text = noderecord['BOTSCONTENT']
         del noderecord['BOTSCONTENT']
     for key in attributedict.keys():  #remove used fields
         del noderecord[attributemarker+key]
     del noderecord['BOTSID']    #remove 'record' tag
     #generate xml-'fields' in xml-'record'; not sorted
     # iterate over a copy because entries are deleted from noderecord below
     noderecordcopy = noderecord.copy()
     for key,value in noderecordcopy.items():
         if key not in noderecord or self.ta_info['attributemarker'] in key: #if field not in outmessage: skip
             continue
         attributedict = {}
         attributemarker = key + self.ta_info['attributemarker']
         for key2,value2 in noderecord.items():
             if key2.startswith(attributemarker):
                 attributedict[key2[len(attributemarker):]] = value2
         ET.SubElement(xmlrecord, key,attributedict).text=value    #add xml element to xml record
         for key2 in attributedict.keys():  #remove used fields
             del noderecord[attributemarker+key2]
         del noderecord[key]    #remove xml entity tag
     return xmlrecord
Esempio n. 8
0
def buildElements(filePath, sourceType):
    f = codecs.open(filePath, "rt", "utf-8")
    lines = f.readlines()
    f.close()
    spans = []
    for line in lines:
        if line.startswith("#"):
            continue
        splits = line.strip("\n").split("\t")
        identifier = ""
        category = ""
        source = sourceType
        if sourceType == "sner":
            text, offset, eType = splits
        elif sourceType == "spec":
            text, offset, identifier, category, eType = splits
        elif sourceType == "tags":
            identifier, documentId, start, end, text, comment = splits
            eType = identifier.split(":")[0]
            offset = start + " " + end
            source = "linnaeus"
        span = ET.Element('span')
        span.set("text", text)
        span.set("offset", [int(x) for x in offset.split()])
        #span.set("charOffset", "-".join(span.offset))
        span.set("identifier", identifier)
        span.set("category", category)
        span.set("type", eType)
        span.set("source", source)
        spans.append(span)
    return spans
Esempio n. 9
0
 def test_cet():
     """cElementTree"""
     root = cet.Element('table')
     for record in table:
         row_node = cet.SubElement(root, 'tr')
         for cell in record.values():
             cell_node = cet.SubElement(row_node, 'td')
             cell_node.text = str(cell)
     cet.tostring(root)
Esempio n. 10
0
 def test_cet():
     """cElementTree"""
     root = cet.Element("table")
     for record in table:
         row_node = cet.SubElement(root, "tr")
         for cell in record.values():
             cell_node = cet.SubElement(row_node, "td")
             cell_node.text = str(cell)
     cet.tostring(root)
Esempio n. 11
0
    def add_jms(self, message, queue_name, now):
        """Build the <jms> folder element for an outgoing WMQ/JMS message.

        Dst (destination queue), Tms (timestamp) and Dlv (delivery mode)
        are always present; Exp, Pri and Cid are added only when the
        corresponding attribute of *message* is truthy. Stores the result
        in self.folders["jms"].  (Python 2 code: uses unicode().)
        """

        jms = etree.Element("jms")
        dst = etree.Element("Dst")
        tms = etree.Element("Tms")
        dlv = etree.Element("Dlv")

        jms.append(dst)
        jms.append(tms)
        jms.append(dlv)

        tms.text = unicode(now)
        dst.text = u"queue:///" + queue_name
        dlv.text = unicode(message.jms_delivery_mode)

        # Optional folders below; each is logged at TRACE1 for debugging.
        if message.jms_expiration:
            exp = etree.Element("Exp")
            # Expiration is expressed as an absolute time (now + relative).
            exp.text = unicode(now + message.jms_expiration)
            self.logger.log(TRACE1, "jms.Exp [%r]" % exp.text)
            jms.append(exp)

        if message.jms_priority:
            pri = etree.Element("Pri")
            pri.text = unicode(message.jms_priority)
            self.logger.log(TRACE1, "jms.Pri [%r]" % pri.text)
            jms.append(pri)

        if message.jms_correlation_id:
            cid = etree.Element("Cid")
            cid.text = unicode(message.jms_correlation_id)
            self.logger.log(TRACE1, "jms.Cid [%r]" % cid.text)
            jms.append(cid)

        self.folders["jms"] = jms
Esempio n. 12
0
 def test_kid_et():
     """Kid template + cElementTree"""
     root = cet.Element('table')
     for record in table:
         row_node = cet.SubElement(root, 'tr')
         for cell in record.values():
             cell_node = cet.SubElement(row_node, 'td')
             cell_node.text = str(cell)
     kid_tmpl2.table = root
     kid_tmpl2.serialize(output='html')
Esempio n. 13
0
 def test_kid_et():
     """Kid template + cElementTree"""
     root = cet.Element("table")
     for record in table:
         row_node = cet.SubElement(root, "tr")
         for cell in record.values():
             cell_node = cet.SubElement(row_node, "td")
             cell_node.text = str(cell)
     kid_tmpl2.table = root
     kid_tmpl2.serialize(output="html")
Esempio n. 14
0
    def labelSVG(self):
        """Return [background <rect>, label <text>] SVG nodes for this arc's label.

        The white rectangle is drawn first so the label text stays readable
        over the arc path.
        """
        recNode = ET.Element("rect")
        recNode.set("x", strint(self.param["recx"]))
        recNode.set("y", strint(self.param["recy"]))
        recNode.set("width", strint(self.param["recw"]))
        recNode.set("height", strint(self.param["rech"]))
        recNode.set("style", "fill:white;")  # stroke:black intentionally disabled

        labNode = ET.Element("text")
        # BUG FIX: SVG's conditional-processing attribute is camelCase
        # "systemLanguage" (as toSVG already uses); the all-lowercase form
        # is not recognized by SVG renderers.
        labNode.set("systemLanguage", "en")
        labNode.set("x", strint(self.param['txtX']))
        labNode.set("y", strint(self.param['txtY']))
        labNode.set("txt", self.type)  # non-standard attribute; kept for compatibility
        labNode.text = self.type
        styleStr = ";".join("%s:%s" % (var, val)
                            for var, val in self.labelStyle().items())
        labNode.set("style", styleStr)
        return [recNode, labNode]
Esempio n. 15
0
    def _map_custom_class(self, obj, mappings, ns):
        """ Fill in the missing attributes of Python objects and make it look
        to the rest of XMLConfig as if they already were in the XML config file.
        """
        for class_name in mappings:
            tag_no_ns = obj.tag.replace(ns, "")
            if class_name == tag_no_ns:

                obj.set("class", mappings[class_name])
                constructor_arg = etree.Element("%s%s" % (ns, "constructor-arg"))
                value = etree.Element("%s%s" % (ns, "value"))
                value.text = obj.text
                obj.append(constructor_arg)
                constructor_arg.append(value)
                obj.text = ""

                break

        else:
            self.logger.warning("No matching type found for object %s" % obj)
Esempio n. 16
0
    def add_usr(self, message):
        """Collect non-reserved attributes of *message* into a <usr> folder.

        Any attribute not in the module-level reserved_attributes set becomes
        a child element named after the attribute. Stores the result in
        self.folders["usr"] (only when at least one such attribute exists).
        (Python 2 code: uses basestring/unicode.)
        """
        user_attrs = set(dir(message)) - reserved_attributes
        self.logger.log(TRACE1, "user_attrs [%s]" % user_attrs)

        if user_attrs:
            usr = etree.Element("usr")

            for user_attr in user_attrs:

                user_attr_value = getattr(message, user_attr)

                # Some values are integers, e.g. delivery_mode
                if isinstance(user_attr_value, basestring):
                    user_attr_value = escape(user_attr_value)

                # Create a JMS attribute and set its value.
                # (rebinds the loop variable to the new element)
                user_attr = etree.Element(unicode(user_attr))
                user_attr.text = unicode(user_attr_value)
                usr.append(user_attr)

            self.folders["usr"] = usr
Esempio n. 17
0
def generate_xml(troves, label):
    """Build a <Packages label=...> XML document describing *troves*.

    Each trove becomes a <Package> with <name>, <version> and one element
    per metadata entry; the "categories" entry expands into one <category>
    element per value. Metadata values of None or "None" are skipped.
    """
    document = cElementTree.Element("Packages", label=label)
    for trove in troves:
        name = trove.getName()
        version = trove.getVersion().trailingRevision().asString()
        meta = trove.getMetadata()

        package = cElementTree.Element("Package")

        node_name = cElementTree.Element("name")
        node_name.text = name
        node_version = cElementTree.Element("version")
        node_version.text = version

        for i in [node_name, node_version]:
            package.append(i)

        for key, value in meta.items():
            if value is not None and value != "None":
                if key == "categories":
                    for cat in value:
                        cat_node = cElementTree.Element("category", lang="en")
                        cat_node.text = cat
                        # BUG FIX: previously appended the unrelated name
                        # 'node' (stale or undefined) instead of 'cat_node',
                        # dropping/duplicating category elements.
                        package.append(cat_node)
                else:
                    node = cElementTree.Element(key, lang="en")
                    node.text = value
                    package.append(node)

        document.append(package)
    return document
Esempio n. 18
0
 def makenode(tag, content):
     """Recursively build an ET element named *tag* from a Python value.

     Strings become element text; lists pluralize the tag ("item" -> "items")
     and produce one child per entry; dicts produce one child per key;
     anything else is stored via repr(). Falsy content yields an empty
     element.  (Python 2 code: uses basestring.)
     """
     node = ET.Element(tag)
     if not content:
         pass  #empty element
     elif isinstance(content, basestring):
         node.text = content
     elif isinstance(content, list):
         node.tag = tag + 's'  #change node tag
         for element in content:
             # children keep the singular tag
             node.append(makenode(tag, element))
     elif isinstance(content, dict):
         for key, value in content.items():
             node.append(makenode(key, value))
     else:
         # fallback for numbers, booleans, custom objects, ...
         node.text = repr(content)
     return node
Esempio n. 19
0
 def toSVG(self):
     """Render this token's text lines as SVG <text> elements, bottom-up."""
     allLines = [self.txt] + self.otherLines
     currentY = self.y
     svgNodes = []
     # Walk the lines last-to-first, stacking them upwards from self.y.
     for content in reversed(allLines):
         textNode = ET.Element("text")
         textNode.set("systemLanguage", "en")
         textNode.set("x", strint(self.x))
         textNode.set("y", strint(currentY))
         textNode.set("style",
                      ";".join("%s:%s" % (var, val)
                               for var, val in self.style().items()))
         textNode.text = content
         svgNodes.append(textNode)
         currentY -= SVGOptions.fontSize + SVGOptions.lineSep
     return svgNodes
Esempio n. 20
0
 def _getEntity(self, line, tag):
     """Locate the <tag>...</tag> span in *line* and return an <entity> element.

     The character offset is computed from the text lengths around the
     markup, so it refers to the line with the tags stripped. Raises
     ValueError (after printing context) if the tag pair is missing or
     occurs more than once.  (Python 2 code: print statement.)
     """
     try:
         # Exactly one open and one close tag expected -> three parts.
         before, entityText, after = re.split(
             r'<' + tag + '>|</' + tag + '>', line)
     except ValueError as e:
         print "ValueError in line '" + line + "' for tag", tag
         raise e
     begin = len(before)
     end = len(before) + len(entityText)
     return ET.Element(
         "entity", {
             "text": entityText,
             "type": "entity",
             "given": "True",
             "charOffset": str(begin) + "-" + str(end),
             "id": self.id + "." + tag
         })
Esempio n. 21
0
def loadKeys(fname=None, force_=False):
    """Load ISBNdb API keys from *fname* (default ~/.isbndbkeys) into _KEYS.

    Creates an empty <Keys> file if none exists, then parses every <key>
    element into a Key object keyed by its "name" attribute. Entries whose
    Key() construction fails are retried with force_=True when *force_* is
    set, otherwise silently skipped.
    """
    global _KEYS, _KEYFILE
    if not fname:
        fname = os.path.expanduser('~/.isbndbkeys')
    if not os.path.exists(fname):
        # Bootstrap an empty key file so parse() below succeeds.
        a = ElementTree.Element('Keys')
        ElementTree.ElementTree(a).write(fname)
    _KEYFILE = fname
    _KEYS = dict()
    tree = ElementTree.parse(fname)
    for x in tree.findall('key'):
        try:
            _KEYS[x.get("name")] = Key(x.text, x.get("name"))
        except Exception:
            # Fixed: was a bare "except:", which also swallowed
            # SystemExit/KeyboardInterrupt.
            if force_:
                _KEYS[x.get("name")] = Key(x.text, x.get("name"), force_=True)
            else:
                pass  # best-effort: skip invalid key entries
Esempio n. 22
0
 def _node2xmlfields(self, noderecord):
     '''Convert one node record (flat dict) into an ET 'record' element,
     emitting fields in the order given by the message grammar.

     'BOTSID' names the record tag and 'BOTSCONTENT' (if present) its text.
     Keys containing the configured attribute marker become XML attributes
     of the record or of the matching field element. Consumed keys are
     deleted from *noderecord*.  (Python 2 code: print statement.)
     '''
     #first generate the xml-'record'
     #~ print 'record',noderecord['BOTSID']
     attributedict = {}
     recordtag = noderecord['BOTSID']
     attributemarker = recordtag + self.ta_info[
         'attributemarker']  #attributemarker is a marker in the fieldname used to find out if field is an attribute of either xml-'record' or xml-element
     #~ print '    rec_att_mark',attributemarker
     for key, value in noderecord.items(
     ):  #find attributes belonging to xml-'record' and store in attributedict
         if key.startswith(attributemarker):
             #~ print '    record attribute',key,value
             attributedict[key[len(attributemarker):]] = value
     xmlrecord = ET.Element(recordtag, attributedict)  #make the xml ET node
     if 'BOTSCONTENT' in noderecord:  #BOTSCONTENT is used to store the value/text of the xml-record itself.
         xmlrecord.text = noderecord['BOTSCONTENT']
         del noderecord['BOTSCONTENT']
     for key in attributedict.keys():  #remove used fields
         del noderecord[attributemarker + key]
     del noderecord['BOTSID']  #remove 'record' tag
     #generate xml-'fields' in xml-'record'; sort these by looping over records definition
     for field_def in self.defmessage.recorddefs[
             recordtag]:  #loop over fields in 'record'
         if field_def[
                 ID] not in noderecord:  #if field not in outmessage: skip
             continue
         #~ print '    field',field_def
         attributedict = {}
         attributemarker = field_def[ID] + self.ta_info['attributemarker']
         #~ print '    field_att_mark',attributemarker
         for key, value in noderecord.items():
             if key.startswith(attributemarker):
                 print '        field attribute', key, value
                 attributedict[key[len(attributemarker):]] = value
         ET.SubElement(xmlrecord, field_def[ID],
                       attributedict).text = noderecord[
                           field_def[ID]]  #add xml element to xml record
         for key in attributedict.keys():  #remove used fields
             del noderecord[attributemarker + key]
         del noderecord[field_def[ID]]  #remove xml entity tag
     return xmlrecord
Esempio n. 23
0
 def _getInteraction(self,
                     relType,
                     e1,
                     e2,
                     directed,
                     count,
                     eMap,
                     relFrom=None,
                     relTo=None):
     if relFrom == None: relFrom = e1
     if relTo == None: relFrom = e2
     attrs = {
         "id": self.id + ".i" + str(count),
         "type": relType,
         "directed": str(directed),
         "e1": eMap[e1].get("id"),
         "e2": eMap[e2].get("id")
     }
     if relFrom != "": attrs["from"] = relFrom
     if relTo != "": attrs["to"] = relTo
     return ET.Element("interaction", attrs)
Esempio n. 24
0
def duplicateFlat(sourceEnt, targetEnt, entitiesById, sentencesById,
                  interactionsByEntity):
    """Copy every interaction pointing at *sourceEnt* onto *targetEnt*.

    For each interaction whose e2 is the source entity, a new interaction
    element (tagged "Equiv" in notes) targeting *targetEnt* is inserted
    into the owning sentence via insertInteraction(). Does nothing when
    the source entity has no interactions.
    """
    sourceEntId = sourceEnt.get("id")
    targetEntId = targetEnt.get("id")
    # Fixed: dict.has_key() was removed in Python 3; 'in' is equivalent.
    if sourceEntId in interactionsByEntity:
        for interaction in interactionsByEntity[sourceEntId]:
            e1 = interaction.get("e1")
            e2 = interaction.get("e2")
            assert e2 == sourceEntId, (sourceEntId, targetEntId
                                       )  # only named entities are duplicated
            # Interaction ids are "<sentenceId>.iN"; recover the sentence.
            sentenceId = interaction.get("id").rsplit(".", 1)[0]
            sentence = sentencesById[sentenceId]

            # Create new interaction (or pair) element
            newInteraction = ElementTree.Element(interaction.tag)
            newInteraction.set("e2", targetEntId)
            newInteraction.set("e1", e1)
            newInteraction.set("directed", "True")
            newInteraction.set("notes", "Equiv")
            newInteraction.set("type", interaction.get("type"))
            newInteraction.set("origId", interaction.get("origId"))
            insertInteraction(sentence, newInteraction)
Esempio n. 25
0
# From cmqc.h
_WMQ_MQFMT_RF_HEADER_2 = "MQHRF2  "

# MQRFH_NO_FLAGS_WIRE is in cmqc.h
_WMQ_MQRFH_NO_FLAGS_WIRE_FORMAT = "\x00\x00\x00\x00"

# Java documentation says "214748364.7 seconds".
_WMQ_MAX_EXPIRY_TIME = 214748364.7

_WMQ_ID_PREFIX = "ID:"

# In current implementation, an mcd JMS folder is constant for every message
# sent, so let's build it here.

_mcd = etree.Element("mcd")
_msd = etree.Element("Msd")
_mcd.append(_msd)

# For now, it's always a TextMessage
_msd.text = "jms_text"

_msgbody = etree.Element("msgbody")
_msgbody.set("xmlns:xsi", "dummy") # We're using a dummy namespace
_msgbody.set("xsi:nil", "true")
_mcd.append(_msgbody)

# Clean up namespace.
# Only the assembled _mcd tree is kept; the child references are dropped.
del(_msd, _msgbody)

Esempio n. 26
0
def convertDDI(outDir,
               downloadDir=None,
               redownload=False,
               makeIntermediateFiles=True,
               debug=False):
    """Download and convert the DDI'11 corpus into Interaction XML in *outDir*.

    Pipeline: download/extract the original corpus, split the training data
    into train/devel sets, rename overlapping train ids, merge in the test
    set, fix entities/interactions, attach MTMX data, insert pre-generated
    TEES parses (sentences, McCC, Stanford), split protein names, detect
    heads and finally divide the corpus into set files.
    (Python 2 code: print >> syntax, dict.iteritems, tuple-parameter lambda.)
    """
    cwd = os.getcwd()
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI11-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'11 corpus", "======================="
    corpusDir = outDir + "/DDI11-original"
    Utils.Download.downloadAndExtract(Settings.URL["DDI11_CORPUS"], corpusDir,
                                      downloadDir)

    bigfileName = os.path.join(outDir, "DDI11")
    #oldXML = ETUtils.ETFromObj(bigfileName+".xml")
    trainUnified = corpusDir + "/train"
    trainMTMX = corpusDir + "/train_MTMX"
    testUnified = corpusDir + "/test"
    testMTMX = corpusDir + "/test_MTMX"

    # Load main documents
    tempdir = tempfile.mkdtemp()
    print >> sys.stderr, "Temporary files directory at", tempdir
    documents, docById, docCounts = loadDocs(trainUnified)
    # Divide training data into a train and devel set
    # Documents sorted by their counts so the split is balanced.
    sortedDocCounts = sorted(docCounts.iteritems(),
                             key=lambda (k, v): (v, k),
                             reverse=True)
    datasetCounts = {"train": [0, 0], "devel": [0, 0], "test": [0, 0]}
    # Deal documents out in groups of four: 3 to train, 1 to devel.
    for i in range(0, len(sortedDocCounts) - 3, 4):
        for j in [0, 1]:
            docById[sortedDocCounts[i + j][0]].set("set", "train")
            datasetCounts["train"][0] += sortedDocCounts[i + j][1][0]
            datasetCounts["train"][1] += sortedDocCounts[i + j][1][1]
        docById[sortedDocCounts[i + 2][0]].set(
            "set",
            "train")  #docById[sortedDocCounts[i+2][0]].set("set", "devel")
        docById[sortedDocCounts[i + 3][0]].set(
            "set",
            "devel")  #docById[sortedDocCounts[i+3][0]].set("set", "test")
        datasetCounts["train"][0] += sortedDocCounts[i + 2][1][
            0]  #datasetCounts["devel"][0] += sortedDocCounts[i+2][1][0]
        datasetCounts["train"][1] += sortedDocCounts[i + 2][1][
            1]  #datasetCounts["devel"][1] += sortedDocCounts[i+2][1][1]
        datasetCounts["devel"][0] += sortedDocCounts[i + 3][1][
            0]  #datasetCounts["test"][0] += sortedDocCounts[i+3][1][0]
        datasetCounts["devel"][1] += sortedDocCounts[i + 3][1][
            1]  #datasetCounts["test"][1] += sortedDocCounts[i+3][1][1]
    for document in documents:  # leftover documents (count not divisible by 4) default to train
        if document.get("set") == None:
            document.set("set", "train")
    # Print division results
    print >> sys.stderr, datasetCounts
    for key in datasetCounts.keys():
        if datasetCounts[key][1] != 0:
            print key, datasetCounts[key][0] / float(datasetCounts[key][1])
        else:
            print key, datasetCounts[key][0], "/", float(datasetCounts[key][1])
    # Some of the train and test ids overlap. Let's change the train set ids, because test set ones are needed
    # for the final evaluation.
    changeIdCount = 1000
    for trainId in [
            'DrugDDI.d312', 'DrugDDI.d316', 'DrugDDI.d332', 'DrugDDI.d334',
            'DrugDDI.d337', 'DrugDDI.d342', 'DrugDDI.d349', 'DrugDDI.d354',
            'DrugDDI.d373', 'DrugDDI.d379', 'DrugDDI.d383', 'DrugDDI.d388',
            'DrugDDI.d392', 'DrugDDI.d396', 'DrugDDI.d398', 'DrugDDI.d409',
            'DrugDDI.d411', 'DrugDDI.d415', 'DrugDDI.d425', 'DrugDDI.d430',
            'DrugDDI.d433', 'DrugDDI.d448', 'DrugDDI.d450', 'DrugDDI.d452',
            'DrugDDI.d462', 'DrugDDI.d467', 'DrugDDI.d470', 'DrugDDI.d474',
            'DrugDDI.d480', 'DrugDDI.d482', 'DrugDDI.d485', 'DrugDDI.d492',
            'DrugDDI.d494', 'DrugDDI.d496', 'DrugDDI.d498', 'DrugDDI.d500',
            'DrugDDI.d503', 'DrugDDI.d506', 'DrugDDI.d518', 'DrugDDI.d523',
            'DrugDDI.d528', 'DrugDDI.d535', 'DrugDDI.d539', 'DrugDDI.d552',
            'DrugDDI.d554', 'DrugDDI.d558', 'DrugDDI.d561', 'DrugDDI.d570',
            'DrugDDI.d578'
    ]:
        newId = "DrugDDI.d" + str(changeIdCount)
        print >> sys.stderr, "Changing train/devel id", trainId, "to", newId
        # Rewrite the id in every attribute of every element of the document.
        for element in docById[trainId].getiterator():
            for attrName, attrValue in element.attrib.iteritems():
                if trainId in attrValue:
                    element.set(attrName, attrValue.replace(trainId, newId))
        docById[newId] = docById[trainId]
        del docById[trainId]
        changeIdCount += 1
    # If test set exists, load it, too
    if testUnified != None:
        testDocuments, testDocById, testDocCounts = loadDocs(testUnified)
        for document in testDocuments:
            document.set("set", "test")
        documents = documents + testDocuments
        overlappingIds = []
        for key in docById:
            if key in testDocById:
                overlappingIds.append(key)
        # After the id rewrite above there must be no remaining overlap.
        for key in docById:
            assert key not in testDocById, (key, docById[key].get("origId"),
                                            testDocById[key].get("origId"),
                                            sorted(docById.keys()),
                                            sorted(testDocById.keys()),
                                            sorted(overlappingIds))
        docById.update(testDocById)

    # Add all documents into one XML
    xmlTree = ET.ElementTree(ET.Element("corpus"))
    root = xmlTree.getroot()
    root.set("source", "DDI11")
    for document in documents:
        root.append(document)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents-notfixed.xml")
    xml = xmlTree
    print >> sys.stderr, "Fixing DDI XML"
    fixEntities(xml)
    convertToInteractions(xml)
    # Add MTMX
    if trainMTMX != None:
        inDir = Utils.Download.getTopDir(
            tempdir,
            Utils.Download.downloadAndExtract(trainMTMX, tempdir,
                                              outDir + "/DDI11-original"))
        DDITools.addMTMX(xml, inDir)
    if testMTMX != None:
        inDir = Utils.Download.getTopDir(
            tempdir,
            Utils.Download.downloadAndExtract(testMTMX, tempdir,
                                              outDir + "/DDI11-original"))
        DDITools.addMTMX(xml, inDir)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents.xml")

    print >> sys.stderr, "---------------", "Inserting TEES-generated analyses", "---------------"
    Utils.Download.downloadAndExtract(Settings.URL["TEES_PARSES"],
                                      os.path.join(Settings.DATAPATH,
                                                   "TEES-parses"),
                                      downloadDir,
                                      redownload=redownload)
    extractedFilename = os.path.join(Settings.DATAPATH,
                                     "TEES-parses") + "/DDI11"
    print >> sys.stderr, "Making sentences"
    Tools.SentenceSplitter.makeSentences(xml, extractedFilename, None)
    print >> sys.stderr, "Inserting McCC parses"
    Tools.BLLIPParser.insertParses(
        xml,
        extractedFilename,
        None,
        extraAttributes={"source": "TEES-preparsed"})
    print >> sys.stderr, "Inserting Stanford conversions"
    Tools.StanfordParser.insertParses(
        xml,
        extractedFilename,
        None,
        extraAttributes={"stanfordSource": "TEES-preparsed"})
    print >> sys.stderr, "Protein Name Splitting"
    splitTarget = "McCC"
    #ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-"+splitTarget, "split-"+splitTarget)
    ProteinNameSplitter.mainFunc(xml, None, splitTarget, removeOld=True)
    print >> sys.stderr, "Head Detection"
    #xml = FindHeads.findHeads(xml, "split-"+splitTarget, tokenization=None, output=None, removeExisting=True)
    xml = FindHeads.findHeads(xml,
                              splitTarget,
                              tokenization=None,
                              output=None,
                              removeExisting=True)

    print >> sys.stderr, "Dividing into sets"
    Utils.InteractionXML.DivideSets.processCorpus(xml, outDir, "DDI11", ".xml")

    Stream.closeLog(logFileName)
    if not debug:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)
Esempio n. 27
0
 def process(self, directed, negatives):
     """Convert this parsed sentence into a <document> Interaction XML element.

     Strips the inline <e1>/<e2> markup, validates entity offsets, and adds
     interaction elements according to the relation string. *negatives*
     controls how the non-annotated direction is represented ("INCLUDE"
     keeps "neg" interactions, "REVERSE_POS" merges reversed types).
     """
     # Build the entities
     for tag in ("e1", "e2"):
         self.entities.append(self._getEntity(self.text, tag))
         self.text = self.text.replace("<" + tag + ">",
                                       "").replace("</" + tag + ">", "")
     # Check entity offsets
     for entity in self.entities:
         begin, end = [int(x) for x in entity.get("charOffset").split("-")]
         assert entity.get("text") == self.text[begin:end], (
             entity.get("text"), self.text, self.text[begin:end],
             [begin, end])
     assert len(self.entities) == 2
     eMap = {"e1": self.entities[0], "e2": self.entities[1]}
     for key in eMap:  # Check that e1 == e1 and e2 == e2
         assert eMap[key].get("id").endswith("." + key)
     # Build the sentence
     docElem = ET.Element("document", {
         "id": self.corpusId + ".d" + self.origId,
         "set": self.setName
     })
     sentElem = ET.SubElement(
         docElem, "sentence", {
             "id": self.id,
             "charOffset": "0-" + str(len(self.text)),
             "text": self.text,
             "origId": self.origId
         })
     sentElem.set("relation", self.relation)
     if self.comment != None and self.comment != "":
         sentElem.set("comment", self.comment)
     for entity in self.entities:
         sentElem.append(entity)
     # Determine interaction types per direction
     relFrom, relTo = "", ""
     if self.relation == "Other":
         # "Other" has no direction; emit a single undirected-style interaction.
         sentElem.append(
             self._getInteraction(self.relation, "e1", "e2", directed, 0,
                                  eMap, relFrom, relTo))
     else:
         # Relation strings look like "Type(e1,e2)" or "Type(e2,e1)".
         relType, rest = self.relation.strip(")").split("(")
         relFrom, relTo = rest.split(",")
         reverse = (relFrom == "e2" and relTo == "e1")
         if not reverse:
             assert relFrom == "e1" and relTo == "e2"
             forwardType = self.mergeType(relType, relFrom, relTo) if (
                 negatives == "REVERSE_POS") else relType
             reverseType = self.mergeType(relType, relTo, relFrom) if (
                 negatives == "REVERSE_POS") else "neg"
         else:
             forwardType = self.mergeType(relType, relFrom, relTo) if (
                 negatives == "REVERSE_POS") else "neg"
             reverseType = self.mergeType(relType, relTo, relFrom) if (
                 negatives == "REVERSE_POS") else relType
         # Build the interactions
         if directed:
             if forwardType != "neg" or negatives == "INCLUDE":
                 sentElem.append(
                     self._getInteraction(forwardType, "e1", "e2", directed,
                                          0, eMap, "e1", "e2"))
             if reverseType != "neg" or negatives == "INCLUDE":
                 sentElem.append(
                     self._getInteraction(reverseType, "e2", "e1", directed,
                                          1, eMap, "e2", "e1"))
         else:
             sentElem.append(
                 self._getInteraction(self.relation, "e1", "e2", directed,
                                      0, eMap, relFrom, relTo))
     return docElem
Esempio n. 28
0
def eventsToNewXML(events):
    """Build an Interaction XML corpus from static-relation events.

    events is a dict mapping a sentence id to a list of event dicts with
    keys "dataSet", "text", "id", "eventType", "entity" and "namedEntity"
    (the latter two are tab-separated "offset<TAB>text" strings).
    Each sentence id becomes its own <document> containing one <sentence>;
    entities are de-duplicated per sentence by their offset string.
    Returns the root <corpus> element.
    """
    def getOrAddEntity(sentence, entities, entityByOffset, event, key, isName, entityType):
        # Fetch the entity for this offset if already created, otherwise build
        # it, verify it aligns with the sentence text, and register it.
        offsetKey = event[key].split("\t")[0]
        if offsetKey in entityByOffset:
            return entityByOffset[offsetKey]
        entity = ET.Element("entity")
        entity.set("text", event[key].split("\t")[1].strip())
        entity.set("id", sentence.get("id")+".e"+str(len(entities)))
        offset = getOffset(event[key].split("\t")[0])
        # Offsets are inclusive: the span is text[begin:end+1].
        assert sentence.get("text")[offset[0]:offset[1]+1] == entity.get("text"), (event, sentence.get("text"), entity.get("text"))
        entity.set("charOffset", str(offset[0]) + "-" + str(offset[1]))
        entity.set("isName", isName)
        entity.set("type", entityType)
        entities.append(entity)
        entityByOffset[offsetKey] = entity
        return entity

    xml = ET.Element("corpus")
    xml.set("source", "Static Relations")
    docCount = 0
    sentenceById = {}
    for sentenceId in sorted(events.keys()):
        entities = []
        interactions = []
        entityByOffset = {}
        for event in events[sentenceId]:
            if sentenceId not in sentenceById:
                # First event for this sentence: create the document/sentence pair.
                document = ET.SubElement(xml, "document")
                document.set("id", "SR.d"+str(docCount))
                document.set("origId", sentenceId)
                document.set("set", event["dataSet"])
                sentence = ET.SubElement(document, "sentence")
                # NOTE(review): the sentence index reuses docCount ("SR.dN.sN"),
                # i.e. every document holds exactly one sentence.
                sentence.set("id", "SR.d"+str(docCount)+".s"+str(docCount))
                sentence.set("origId", sentenceId)
                sentence.set("text", event["text"])
                # Inclusive character offset covering the whole sentence.
                sentence.set("charOffset", "0-"+str(len(event["text"])-1))
                docCount += 1
                sentenceById[sentenceId] = sentence
            else:
                sentence = sentenceById[sentenceId]
                # All events for one sentence must agree on the sentence text.
                assert sentence.get("text") == event["text"], (sentence.get("text"), event["text"])
            # Add (or reuse) the two entities: the generic entity and the protein.
            e1 = getOrAddEntity(sentence, entities, entityByOffset, event, "entity", "False", "Entity")
            e2 = getOrAddEntity(sentence, entities, entityByOffset, event, "namedEntity", "True", "Protein")
            # One interaction per event, pointing from e1 to e2.
            interaction = ET.Element("interaction")
            interaction.set("id", sentence.get("id")+".i"+str(len(interactions)))
            interaction.set("origId", event["id"])
            interaction.set("type", event["eventType"])
            interaction.set("e1", e1.get("id"))
            interaction.set("e2", e2.get("id"))
            interactions.append(interaction)
        # Attach entities before interactions so the XML order is stable.
        for entity in entities:
            sentence.append(entity)
        for interaction in interactions:
            sentence.append(interaction)
    return xml
Esempio n. 29
0
def convertChemProt(inDirs=None,
                    setNames=None,
                    outPath=None,
                    goldTestSet=True,
                    downloadDir=None,
                    extractDir=None,
                    redownload=False,
                    debug=False):
    """Convert the ChemProt (CP17) corpus TSV files into Interaction XML.

    inDirs: list of directories containing the "*_abstracts/_entities/_relations.tsv"
        files; when None the TRAIN/DEVEL/TEST sets are downloaded first
        (TEST_GOLD when goldTestSet is True).
    setNames: optional dict remapping dataset ids found in the filenames.
    outPath: when given, the resulting corpus is written there via ETUtils.
    downloadDir / extractDir / redownload: passed to the download step;
        a temporary extract dir is created (and later removed, unless debug)
        when extractDir is None.
    Returns an ElementTree whose root is the <corpus> element.
    """
    tempDir = None
    # --- Optional download step: fetch and extract the official distribution ---
    if inDirs == None:
        print >> sys.stderr, "---------------", "Downloading ChemProt files", "---------------"
        if extractDir == None:
            tempDir = tempfile.mkdtemp()
        inDirs = []
        for setName in ("TRAIN", "DEVEL", "TEST"):
            # Use the gold-annotated test set when requested.
            if goldTestSet and setName == "TEST":
                setName = "TEST_GOLD"
            # Settings.URL / downloadFile are project helpers; a None URL skips the set.
            if Settings.URL["CP17_" + setName] != None:
                currentExtractDir = extractDir if extractDir else tempDir
                currentExtractDir = os.path.join(currentExtractDir,
                                                 setName.lower())
                inDirs.append(
                    downloadFile(Settings.URL["CP17_" + setName], downloadDir,
                                 currentExtractDir, redownload))
    print >> sys.stderr, "Reading ChemProt corpus from input", inDirs, "using dataset mapping", setNames
    # --- Collect the per-dataset TSV file paths from the input directories ---
    dataSets = OrderedDict()
    for inDir in inDirs:
        print >> sys.stderr, "Reading input directory", inDir
        filenames = os.listdir(inDir)
        filetypes = ["_abstracts", "_entities", "_relations"]
        # Collect the file paths for the data types
        dirDataSets = set()
        for filename in filenames:
            # Only the three known TSV file types are relevant.
            if not (filename.endswith(".tsv")
                    and any([x in filename for x in filetypes])):
                continue
            # Filenames look like "<dataSetId>_<dataType>.tsv" (optionally "_gs").
            dataSetId, dataType = filename.replace("_gs", "").rsplit("_", 1)
            if setNames != None:
                dataSetId = setNames.get(dataSetId, dataSetId)
            dirDataSets.add(dataSetId)
            dataType = dataType.split(".")[0]
            if dataSetId not in dataSets:
                dataSets[dataSetId] = {}
            # Each dataset may provide each file type only once.
            assert dataType not in dataSets[dataSetId]
            dataSets[dataSetId][dataType] = os.path.join(inDir, filename)
        print >> sys.stderr, "Found ChemProt datasets", list(
            dirDataSets), "at", inDir
    print >> sys.stderr, "Read datasets:", dataSets.keys()
    # Build the Interaction XML
    print >> sys.stderr, "Converting to Interaction XML"
    corpusName = "CP17"
    corpus = ET.Element("corpus", {"source": corpusName})
    counts = defaultdict(int)
    docById = {}           # origId -> <document> element (shared across datasets)
    entityById = {}        # docId -> {entity origId -> <entity> element}
    entitiesByDoc = {}     # docId -> set of entity origIds (duplicate detection)
    docsWithErrors = set() # docIds that had at least one alignment/relation error
    for dataSetId in sorted(dataSets.keys()):
        # Snapshot counts so per-dataset deltas can be reported at the end.
        prevCounts = copy.copy(counts)
        print >> sys.stderr, "---", "Building elements for dataset", dataSetId, "---"
        dataSet = dataSets[dataSetId]
        counts["sets"] += 1
        # --- Documents: one per abstracts row, text = title + " " + abstract ---
        with open(dataSet["abstracts"], "rt") as f:
            print >> sys.stderr, "Adding document elements for dataset", dataSetId
            for row in UnicodeDictReader(
                    f,
                    delimiter="\t",
                    fieldnames=["id", "title", "abstract"],
                    quoting=csv.QUOTE_NONE):
                document = ET.Element(
                    "document", {
                        "id": corpusName + ".d" + str(counts["documents"]),
                        "origId": row["id"],
                        "set": dataSetId
                    })
                document.set("text", row["title"] + " " + row["abstract"])
                document.set("titleOffset",
                             Range.tuplesToCharOffset((0, len(row["title"]))))
                # A document seen in an earlier dataset must be textually identical;
                # it is then counted but not added again.
                if document.get("origId") in docById:
                    assert document.get("text") == docById[document.get(
                        "origId")].get("text")
                    assert document.get("titleOffset") == docById[document.get(
                        "origId")].get("titleOffset")
                    counts["duplicate-documents"] += 1
                else:
                    corpus.append(document)
                    docById[document.get("origId")] = document
                    counts["documents"] += 1
        # --- Entities: aligned against the document text before being added ---
        with open(dataSet["entities"], "rt") as f:
            print >> sys.stderr, "Adding entity elements for dataset", dataSetId
            for row in UnicodeDictReader(
                    f,
                    delimiter="\t",
                    fieldnames=["docId", "id", "type", "begin", "end", "text"],
                    quoting=csv.QUOTE_NONE):
                document = docById[row["docId"]]
                assert row["type"] in ("CHEMICAL", "GENE-Y", "GENE-N")
                # Check for duplicate entities
                if row["docId"] not in entitiesByDoc:
                    entitiesByDoc[row["docId"]] = set()
                assert row["id"] not in entitiesByDoc[row["docId"]]
                entitiesByDoc[row["docId"]].add(row["id"])
                # Determine the offset
                # begin/end are end-exclusive: the span is text[begin:end].
                offset = (int(row["begin"]), int(row["end"]))
                docSpan = document.get("text")[offset[0]:offset[1]]
                if docSpan == row["text"]:
                    entity = ET.SubElement(
                        document, "entity", {
                            "id":
                            document.get("id") + ".e" +
                            str(len([x for x in document.findall("entity")]))
                        })
                    entity.set("given", "True")
                    entity.set("origId", row["id"])
                    # "GENE-Y"/"GENE-N" collapse to type "GENE"; the -Y suffix
                    # marks a normalized mention.
                    entity.set("type", row["type"].split("-")[0])
                    entity.set(
                        "normalized",
                        "True" if row["type"].endswith("-Y") else "False")
                    entity.set(
                        "charOffset",
                        Range.tuplesToCharOffset((offset[0], offset[1])))
                    entity.set("text", row["text"])
                    if row["docId"] not in entityById:
                        entityById[row["docId"]] = {}
                    assert entity.get("origId") not in entityById[row["docId"]]
                    entityById[row["docId"]][entity.get("origId")] = entity
                    counts["entities"] += 1
                else:
                    # Text does not align with the stated offset: skip the
                    # entity but remember the document as erroneous.
                    print >> sys.stderr, "Alignment error in document", row[
                        "docId"], (offset, docSpan, row)
                    counts["entities-error"] += 1
                    docsWithErrors.add(row["docId"])
        # --- Relations (optional file): become <interaction> elements ---
        if "relations" in dataSet:
            print >> sys.stderr, "Adding relation elements for dataset", dataSetId
            with open(dataSet["relations"], "rt") as f:
                for row in UnicodeDictReader(f,
                                             delimiter="\t",
                                             fieldnames=[
                                                 "docId", "group", "groupEval",
                                                 "type", "arg1", "arg2"
                                             ],
                                             quoting=csv.QUOTE_NONE):
                    for argId in ("1", "2"):
                        # Arguments look like "Arg1:T12"; strip the 5-char prefix.
                        assert row["arg" + argId].startswith("Arg" + argId +
                                                             ":")
                        row["arg" + argId] = row["arg" + argId][5:]
                    document = docById[row["docId"]]
                    e1 = entityById[row["docId"]].get(row["arg1"])
                    e2 = entityById[row["docId"]].get(row["arg2"])
                    # Both endpoints must have been added (i.e. aligned) above.
                    if e1 != None and e2 != None:
                        interaction = ET.SubElement(
                            document, "interaction", {
                                "id":
                                document.get("id") + ".i" + str(
                                    len([
                                        x for x in document.findall(
                                            "interaction")
                                    ]))
                            })
                        interaction.set("directed", "True")
                        # "group" is the CPR group, "type" the fine-grained relation.
                        interaction.set("type", row["group"])
                        interaction.set("relType", row["type"])
                        row["groupEval"] = row["groupEval"].strip()
                        assert row["groupEval"] in ("Y", "N")
                        interaction.set(
                            "evaluated",
                            "True" if row["groupEval"] == "Y" else "False")
                        interaction.set("e1", e1.get("id"))
                        interaction.set("e2", e2.get("id"))
                        counts["interactions"] += 1
                    else:
                        counts["interaction-error"] += 1
                        docsWithErrors.add(row["docId"])
        else:
            print >> sys.stderr, "No relations for dataset", dataSetId
        # Report only the counters that changed for this dataset.
        print >> sys.stderr, "dataset", dataSetId, {
            x: counts[x] - prevCounts.get(x, 0)
            for x in counts if counts[x] - prevCounts.get(x, 0) > 0
        }
    if len(docsWithErrors) > 0:
        counts["documents-with-errors"] = len(docsWithErrors)
    print >> sys.stderr, "---", "All Datasets Done", "---"
    print >> sys.stderr, "ChemProt conversion:", dict(counts)
    # Clean up the temporary extract directory unless debugging.
    if tempDir != None and not debug:
        print >> sys.stderr, "Removing temporary directory", tempDir
        shutil.rmtree(tempDir)
    if outPath != None:
        ETUtils.write(corpus, outPath)
    return ET.ElementTree(corpus)
def setDefaultElement(parent, name):
    """Return the first child of *parent* with tag *name*.

    If no such child exists, a new empty element is created, appended to
    *parent* and returned.
    """
    element = parent.find(name)
    # Use identity comparison: Element equality with None is not reliable
    # across ElementTree versions (older ones warned on __eq__ with None).
    if element is None:
        element = ElementTree.Element(name)
        parent.append(element)
    return element