Esempio n. 1
0
def insertInteraction(sentence, interaction):
    """Give `interaction` the next free id and insert it into `sentence`.

    The id has the form "<sentence id>.i<number>", where the number comes
    from IDUtils.getNextFreeId over the existing "interaction" children.
    The element is placed immediately before the first "sentenceanalyses"
    child of `sentence`.

    Raises AssertionError if `sentence` has no "sentenceanalyses" child.
    """
    interactions = sentence.findall("interaction")
    newIdNumber = IDUtils.getNextFreeId(interactions)
    interaction.set("id", sentence.get("id") + ".i" + str(newIdNumber))

    # Insert into sentence. Stop at the first match: inserting shifts the
    # analyses element to index + 1, so continuing the scan would match it
    # again and insert the same interaction a second time.
    inserted = False
    for index, child in enumerate(sentence):
        if child.tag == "sentenceanalyses":
            sentence.insert(index, interaction)
            inserted = True
            break
    assert inserted, "sentence has no sentenceanalyses element"
Esempio n. 2
0
def insertInteraction(sentence, interaction):
    """Assign a fresh id to `interaction` and add it to `sentence`.

    The new id is sentence.get("id") + ".i" + the number returned by
    IDUtils.getNextFreeId for the sentence's current "interaction"
    children. The interaction is inserted directly in front of the first
    child whose tag is "sentenceanalyses".

    Raises AssertionError when no "sentenceanalyses" child exists.
    """
    existingInteractions = sentence.findall("interaction")
    nextNumber = IDUtils.getNextFreeId(existingInteractions)
    interaction.set("id", sentence.get("id") + ".i" + str(nextNumber))

    # Locate the insertion point first and insert exactly once. Inserting
    # while continuing to scan would find the (shifted) analyses element
    # again on the next index and duplicate the interaction.
    position = None
    for index, child in enumerate(sentence):
        if child.tag == "sentenceanalyses":
            position = index
            break
    assert position is not None, "sentence has no sentenceanalyses element"
    sentence.insert(position, interaction)
Esempio n. 3
0
def process(sentenceObject, examplesBySentence, classSet, classIds, predictionsByExample):
    """Rewrite the entities and interactions of one sentence from predictions.

    Entities whose "isName" attribute is "False" are removed from
    sentenceObject.sentence. If examplesBySentence has an entry for the
    sentence id, then:

    * predictions whose class name contains "---" are split: the original
      example keeps the first component and one ".dupl<n>" copy of the
      example is added for every further component;
    * examples predicted as class id 1 are dropped (the negative class,
      per the original "remove negatives" step);
    * entity and interaction elements are built with the module helpers
      and appended to the sentence element.

    examplesBySentence, predictionsByExample and the individual prediction
    lists are modified in place. Returns None.
    """
    sentenceElement = sentenceObject.sentence
    sentenceId = sentenceElement.get("id")

    # Remove non-name entities. findall() returns a list, so removing
    # children while looping over it is safe.
    for entityElement in sentenceElement.findall("entity"):
        if entityElement.get("isName") == "False":  # interaction word
            sentenceElement.remove(entityElement)

    # The remaining (name) entities determine the first free entity id.
    entityElements = sentenceElement.findall("entity")
    entityCount = IDUtils.getNextFreeId(entityElements)

    if sentenceId not in examplesBySentence:
        return

    # Split merged examples. Iterate over a copy, because the duplicates are
    # appended to the very list being processed.
    sentenceExamples = examplesBySentence[sentenceId]
    for example in sentenceExamples[:]:
        prediction = predictionsByExample[example[0]]
        className = classSet.getName(prediction[0])
        if "---" not in className:
            continue
        nameSplits = className.split("---")
        prediction[0] = classSet.getId(nameSplits[0], False)
        for count, nameSplit in enumerate(nameSplits[1:], 1):
            newExample = example[:]
            newExample[0] += ".dupl" + str(count)
            sentenceExamples.append(newExample)
            newPrediction = prediction[:]
            newPrediction[0] = classSet.getId(nameSplit, False)
            predictionsByExample[newExample[0]] = newPrediction

    # Remove negatives
    examplesToKeep = [example for example in sentenceExamples
                      if predictionsByExample[example[0]][0] != 1]
    examplesBySentence[sentenceId] = examplesToKeep

    # One (initially empty) slot per token id, filled in by the helpers below.
    tokenMap = {}
    for token in sentenceObject.tokens:
        tokenMap[token.get("id")] = {}
    addExistingEntities(tokenMap, entityElements, sentenceObject)
    addExamples(tokenMap, examplesToKeep)
    markFinal(tokenMap)
    entities = buildEntityNodes(tokenMap, sentenceObject, entityCount, classSet, classIds, predictionsByExample)
    interactions = buildInteractions(tokenMap, sentenceObject.sentence, predictionsByExample)
    for entity in entities:
        sentenceElement.append(entity)
    for interaction in interactions:
        sentenceElement.append(interaction)

#ENDIF
Esempio n. 4
0
    def writeXMLSentence(self,
                         examples,
                         predictionsByExample,
                         sentenceObject,
                         classSet,
                         classIds,
                         goldSentence=None):
        """Replace the predicted entities of one sentence element.

        Pairs, interactions and non-name entities are removed from
        sentenceObject.sentence, and one new "entity" element
        (isName="False") is appended per example. Its offsets and phrase
        type come from the example's feature dict (example[3]); its type is
        set by self.setElementType from the example's prediction. The
        "sentenceanalyses" child, if present, is detached during the
        rewrite and re-appended last.

        goldSentence is accepted for interface compatibility and not used.

        Raises AssertionError if an example's head token id ("t") does not
        match any token of the sentence.
        """
        self.assertSameSentence(examples)

        sentenceElement = sentenceObject.sentence
        sentenceId = sentenceElement.get("id")
        sentenceText = sentenceElement.get("text")
        # detach analyses-element
        sentenceAnalysesElement = sentenceElement.find("sentenceanalyses")
        if sentenceAnalysesElement is not None:
            sentenceElement.remove(sentenceAnalysesElement)
        # remove pairs and interactions
        self.removeChildren(sentenceElement, ["pair", "interaction"])
        # The next free id is computed while the non-name entities are still
        # attached, and only then are they removed (original order kept).
        newEntityIdCount = IDUtils.getNextFreeId(
            sentenceElement.findall("entity"))
        self.removeNonNameEntities(sentenceElement)

        def findTokenById(tokenId):
            # First token with the given id, or None.
            for token in sentenceObject.tokens:
                if token.get("id") == tokenId:
                    return token
            return None

        # add new entities
        for example in examples:
            prediction = predictionsByExample[example[0]]
            features = example[3]
            headToken = findTokenById(features["t"])
            # Fail with a clear message instead of an AttributeError on a
            # leftover token-id string.
            assert headToken is not None, ("Head token not found:",
                                           features["t"])
            charOffset = features["charOffset"]
            entOffset = Range.charOffsetToSingleTuple(charOffset)

            entityElement = ET.Element("entity")
            entityElement.set("isName", "False")
            entityElement.set("charOffset", charOffset)
            entityElement.set("headOffset", headToken.get("charOffset"))
            entityElement.set("phraseType", features["ptype"])
            # The end offset is inclusive, hence the + 1.
            entityElement.set("text",
                              sentenceText[entOffset[0]:entOffset[1] + 1])
            entityElement.set("id", sentenceId + ".e" + str(newEntityIdCount))
            self.setElementType(entityElement, prediction, classSet, classIds)
            newEntityIdCount += 1
            sentenceElement.append(entityElement)

        # re-attach the analyses-element
        if sentenceAnalysesElement is not None:
            sentenceElement.append(sentenceAnalysesElement)
    def writeXMLSentence(self, examples, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=None):
        """Write one predicted entity per example into the sentence element.

        Steps, all applied to sentenceObject.sentence:
        1. detach the "sentenceanalyses" child (if any);
        2. remove "pair" and "interaction" children;
        3. compute the next free entity id, then remove non-name entities;
        4. append a new "entity" element (isName="False") for each example,
           with charOffset/phraseType taken from example[3], headOffset from
           the head token, and the type set via self.setElementType;
        5. re-append the analyses element as the last child.

        goldSentence is part of the signature but is not used here.

        Raises AssertionError if the head token id of an example
        (example[3]["t"]) is not found among sentenceObject.tokens.
        """
        self.assertSameSentence(examples)

        sentenceElement = sentenceObject.sentence
        sentenceId = sentenceElement.get("id")
        sentenceText = sentenceElement.get("text")
        # detach analyses-element
        sentenceAnalysesElement = sentenceElement.find("sentenceanalyses")
        if sentenceAnalysesElement is not None:
            sentenceElement.remove(sentenceAnalysesElement)
        # remove pairs and interactions
        self.removeChildren(sentenceElement, ["pair", "interaction"])
        # remove entities (the id counter is taken before the removal)
        newEntityIdCount = IDUtils.getNextFreeId(sentenceElement.findall("entity"))
        self.removeNonNameEntities(sentenceElement)

        # add new entities
        for example in examples:
            prediction = predictionsByExample[example[0]]
            exampleInfo = example[3]

            # Resolve the head token id to the token element (first match).
            headToken = None
            for token in sentenceObject.tokens:
                if token.get("id") == exampleInfo["t"]:
                    headToken = token
                    break
            # Without this check a missing token would only surface as an
            # AttributeError on the id string.
            assert headToken is not None, ("Head token not found:", exampleInfo["t"])

            entityElement = ET.Element("entity")
            entityElement.set("isName", "False")
            entityElement.set("charOffset", exampleInfo["charOffset"])
            entityElement.set("headOffset", headToken.get("charOffset"))
            entityElement.set("phraseType", exampleInfo["ptype"])
            entOffset = Range.charOffsetToSingleTuple(exampleInfo["charOffset"])
            # Offsets are end-inclusive, so the slice end is shifted by one.
            entityElement.set("text", sentenceText[entOffset[0]:entOffset[1]+1])
            entityElement.set("id", sentenceId + ".e" + str(newEntityIdCount))
            self.setElementType(entityElement, prediction, classSet, classIds)
            newEntityIdCount += 1
            sentenceElement.append(entityElement)

        # re-attach the analyses-element
        if sentenceAnalysesElement is not None:
            sentenceElement.append(sentenceAnalysesElement)
   def writeXMLSentence(self, examples, predictionsByExample, sentenceObject, classSet, classIds):
       """Write predicted trigger entities and Theme/Cause interactions.

       Works on sentenceObject.sentence:
       1. The "sentenceanalyses" child is detached, and "pair",
          "interaction" and non-name entity children are removed.
       2. The remaining entities are indexed by head token id and type.
       3. Every example not predicted as class 1 and not of type "Cause"
          adds one new entity on its "t1" token, at most one per type per
          token.
       4. Every example not predicted as class 1 yields "interaction"
          elements from the entities on its "t1" token to those on its
          "t2" token: type "Theme" for non-Cause examples, type "Cause"
          otherwise.
       5. The analyses element is re-appended last.

       Raises AssertionError if a head token cannot be resolved.
       """
       self.assertSameSentence(examples)

       sentenceElement = sentenceObject.sentence
       sentenceId = sentenceElement.get("id")
       # detach analyses-element
       sentenceAnalysesElement = sentenceElement.find("sentenceanalyses")
       if sentenceAnalysesElement is not None:
           sentenceElement.remove(sentenceAnalysesElement)
       # remove pairs and interactions
       self.removeChildren(sentenceElement, ["pair", "interaction"])
       # remove entities
       self.removeNonNameEntities(sentenceElement)

       def findToken(attribute, value):
           # First token whose `attribute` equals `value`, or None.
           for token in sentenceObject.tokens:
               if token.get(attribute) == value:
                   return token
           return None

       def addInteraction(e1Id, e2Id, interactionType, number, example, prediction):
           # Build one interaction element and append it to the sentence.
           pairElement = ET.Element("interaction")
           pairElement.attrib["directed"] = "Unknown"
           pairElement.attrib["e1"] = e1Id
           pairElement.attrib["e2"] = e2Id
           pairElement.attrib["id"] = sentenceId + ".i" + str(number)
           pairElement.attrib["predictions"] = self.getEdgePredictionString(example, prediction, classSet, classIds)
           pairElement.set("type", interactionType)
           sentenceElement.append(pairElement)

       entityByTokenByType = {}
       # First add existing entities (names) (use sentenceElement, as sentenceObject still has all entities)
       for entity in sentenceElement.findall("entity"):
           headToken = findToken("charOffset", entity.get("headOffset"))
           assert headToken is not None
           headTokenId = headToken.get("id")
           entityByTokenByType.setdefault(headTokenId, {})[entity.get("type")] = entity

       # Then add entities defined by examples
       newEntityIdCount = IDUtils.getNextFreeId(sentenceElement.findall("entity"))
       for example in examples:
           prediction = predictionsByExample[example[0]]
           if prediction[0] == 1:
               continue

           headTokenId = example[3]["t1"]
           entitiesByType = entityByTokenByType.setdefault(headTokenId, {})
           e1Type = classSet.getName(prediction[0])
           if e1Type == "Cause":
               continue

           # Maximum of one entity per type per token
           if e1Type in entitiesByType:
               continue

           # Resolve the token before creating anything. The earlier code
           # silently reused the token left over from a previous iteration
           # when the id was not found.
           headToken = findToken("id", headTokenId)
           assert headToken is not None, ("Head token not found:", headTokenId)

           entityElement = ET.Element("entity")
           entitiesByType[e1Type] = entityElement
           entityElement.attrib["isName"] = "False"
           entityElement.attrib["charOffset"] = headToken.get("charOffset")
           entityElement.attrib["headOffset"] = headToken.get("charOffset")
           entityElement.attrib["text"] = headToken.get("text")
           entityElement.attrib["id"] = sentenceId + ".e" + str(newEntityIdCount)
           entityElement.set("type", e1Type)
           newEntityIdCount += 1
           sentenceElement.append(entityElement)

       pairCount = 0
       for example in examples:
           prediction = predictionsByExample[example[0]]
           if prediction[0] == 1:
               continue
           exampleType = classSet.getName(prediction[0])
           t1Id = example[3]["t1"]
           t2Id = example[3]["t2"]

           if exampleType != "Cause":
               if t2Id in entityByTokenByType:
                   e1Id = entityByTokenByType[t1Id][exampleType].get("id")
                   for e2Type in sorted(entityByTokenByType[t2Id].keys()):
                       # Only types containing "egulation" may take
                       # non-Protein arguments.
                       if exampleType.find("egulation") == -1 and e2Type != "Protein":
                           continue
                       e2Id = entityByTokenByType[t2Id][e2Type].get("id")
                       addInteraction(e1Id, e2Id, "Theme", pairCount, example, prediction)
                       pairCount += 1
           else:
               if t1Id in entityByTokenByType and t2Id in entityByTokenByType:
                   for e1Type in sorted(entityByTokenByType[t1Id].keys()):
                       # Cause edges start only from types containing "egulation".
                       if e1Type.find("egulation") == -1:
                           continue
                       for e2Type in sorted(entityByTokenByType[t2Id].keys()):
                           e1Id = entityByTokenByType[t1Id][e1Type].get("id")
                           e2Id = entityByTokenByType[t2Id][e2Type].get("id")
                           addInteraction(e1Id, e2Id, "Cause", pairCount, example, prediction)
                           pairCount += 1

       # re-attach the analyses-element
       if sentenceAnalysesElement is not None:
           sentenceElement.append(sentenceAnalysesElement)
Esempio n. 7
0
    def writeXMLSentence(self, examples, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=None):
        sentenceElement = sentenceObject.sentence
        self.sentenceId = sentenceElement.get("id")
        self.assertSameSentence(examples, self.sentenceId)
        # detach analyses-element
        sentenceAnalysesElement = None
        sentenceAnalysesElement = sentenceElement.find("sentenceanalyses")
        if sentenceAnalysesElement == None:
            sentenceAnalysesElement = sentenceElement.find("analyses")
        if sentenceAnalysesElement != None:
            sentenceElement.remove(sentenceAnalysesElement)

        # remove pairs and interactions
        interactions = self.removeChildren(sentenceElement, ["pair", "interaction"])
        # remove entities
        entities = self.removeNonNameEntities(sentenceElement)

        # filter interactions
        interactionsToKeep = []
        for interaction in interactions:
            if interaction.get("type") != "neg":
                interactionsToKeep.append(interaction)
        interactions = interactionsToKeep

        # early out
        cutoff = 100
        if len(interactions) == 0 or len(interactions) > cutoff:
            # re-attach the analyses-element
            if sentenceAnalysesElement != None:
                sentenceElement.append(sentenceAnalysesElement)
            if len(interactions) > cutoff:
                print >> sys.stderr, "Warning, sentence", sentenceObject.sentence.get(
                    "id"
                ), "has more than", cutoff, "interactions, removing all."
            return

        interactionsByEntity = {}
        interactionsById = {}
        for entity in entities:
            interactionsByEntity[entity.get("id")] = []
        for interaction in interactions:
            e1Id = interaction.get("e1")
            if not interactionsByEntity.has_key(e1Id):
                interactionsByEntity[e1Id] = []
            interactionsByEntity[e1Id].append(interaction)
            interactionsById[interaction.get("id")] = interaction

        # NOTE! Following won't work for pairs
        self.entityCount = IDUtils.getNextFreeId(sentenceElement.findall("entity"))
        self.interactionCount = IDUtils.getNextFreeId(sentenceElement.findall("interaction"))
        self.newEntities = []
        self.newInteractions = []

        # Mapping for connecting the events
        self.entitiesByHeadByType = {}
        # self.tokenByOffset = {}
        # for token in sentenceObject.tokens:
        #    self.tokenByOffset[token.get("charOffset")] = token
        #    self.entityByHeadByType[token.get("charOffset")] = {}
        for entity in sentenceObject.entities:
            # by offset
            offset = entity.get("headOffset")
            if not self.entitiesByHeadByType.has_key(offset):
                self.entitiesByHeadByType[offset] = {}
            # by type
            eType = entity.get("type")
            if entity.get("isName") != "True":
                self.entitiesByHeadByType[offset][eType] = []
            else:  # add names to structure
                if not self.entitiesByHeadByType[offset].has_key(eType):
                    self.entitiesByHeadByType[offset][eType] = []
                self.entitiesByHeadByType[offset][eType].append(entity)

        entityKeys = sentenceObject.entitiesById.keys()
        exampleByEntityId = {}
        for example in examples:
            # if predictionsByExample[example[0]][0] == 1: # negative
            #    continue
            eId = example[3]["e"]
            assert eId in entityKeys
            if not exampleByEntityId.has_key(eId):
                exampleByEntityId[eId] = []
            exampleByEntityId[eId].append(example)

        # This doesn't work, it was an attempt to include
        # only the positive example with the highest prediction strength
        #        for key in sorted(exampleByEntityId.keys()):
        #            eType = sentenceObject.entitiesById[key].get("type")
        #            eExamples = exampleByEntityId[key]
        #            if eType == "Binding" and len(eExamples) > 1:
        #                maxArgs = -1
        #                maxStr = -999999999
        #                for example in eExamples:
        #                    if predictionsByExample[example[0]][0] == 1:
        #                        continue
        #                    numArgs = example[3]["i"].count(",") + 1
        #                    if numArgs > maxArgs:
        #                        maxArgs = numArgs
        #                    predClass = predictionsByExample[example[0]][0]
        #                    predictionStrength = predictionsByExample[example[0]][predClass]
        #                    if predictionStrength > maxStr:
        #                        maxStr = predictionStrength
        #                #print maxArgs, len(eExamples)
        #                for example in eExamples:
        #                    if predictionsByExample[example[0]][0] == 1:
        #                        continue
        #                    predClass = predictionsByExample[example[0]][0]
        #                    predictionStrength = predictionsByExample[example[0]][predClass]
        #                    if predictionStrength != maxStr:
        #                        examples.remove(example)
        #                    #if example[3]["i"].count(",") + 1 < maxArgs:
        #                    #    examples.remove(example)

        # self.newEntitiesById = {}
        # self.outEdgesByEntity = {}

        # Gather arguments for the simple, one-argument events
        argumentsByExample = {}
        positiveExamples = []
        exampleIdCount = 0
        for entity in entities:
            # If no example, case is unambiguous
            if entity.get("id") not in exampleByEntityId:
                simpleEventInteractions = interactionsByEntity[entity.get("id")]
                numCauses = 0
                numThemes = 0
                for interaction in simpleEventInteractions[:]:
                    if self.isIntersentence(interaction):
                        print "Warning, intersentence interaction for", entity.get("id"), entity.get("type")
                        simpleEventInteractions.remove(interaction)
                        continue
                    if interaction.get("type") == "neg":
                        simpleEventInteractions.remove(interaction)
                        continue
                    iType = interaction.get("type")
                    if iType == "Cause":
                        numCauses += 1
                    elif iType == "Theme":
                        numThemes += 1
                eType = entity.get("type")
                assert (
                    numThemes == 0 or (numThemes != 0 and numCauses == 0) or (numThemes > 1 and eType != "Binding")
                ), (numThemes, numCauses, eType, entity.get("id"), [x[0] for x in examples], entityKeys)
                # assert numThemes == 0 or (numThemes != 0 and numCauses == 0) or (numThemes > 1 and eType == "Binding"), (numThemes,numCauses,eType,entity.get("id"))
                for interaction in simpleEventInteractions:
                    self.counts["simple-" + eType + "-" + interaction.get("type")] += 1
                    exampleId = "simple." + str(exampleIdCount)
                    exampleIdCount += 1
                    positiveExamples.append([exampleId, None, None, None])
                    argumentsByExample[exampleId] = [interaction]
                    # self.addEvent([interaction], sentenceObject, "simple")

        # Gather arguments for predicted, unmerged events
        for example in examples:
            # print predictionsByExample[example[0]]
            if predictionsByExample[example[0]][0] == 1:  # negative
                continue
            positiveExamples.append(example)
            arguments = []
            for iId in example[3]["i"].split(","):
                if iId == "":  # processes can have 0 arguments
                    assert "etype" in example[3], example[3]
                    assert example[3]["etype"] == "Process", example[3]
                    break
                arg = interactionsById[iId]
                if self.isIntersentence(arg):
                    continue
                assert arg.get("type") != "neg"
                arguments.append(arg)
            argumentsByExample[example[0]] = arguments

        # Loop until all positive examples are added. This process
        # assumes that the events (mostly) form a directed acyclic
        # graph, which can written by "growing" the structure from
        # the "leaf" events, and consecutively adding levels of
        # nesting events.
        examplesLeft = len(positiveExamples)
        exampleAdded = {}
        for example in positiveExamples:
            exampleAdded[example[0]] = False
        forceAdd = False
        forcedCount = 0
        while examplesLeft > 0:
            if len(self.newEntities) > 100:
                print >> sys.stderr, "Warning, sentence", sentenceObject.sentence.get(
                    "id"
                ), "has generated more than", cutoff, "events, skipping the rest."
                break
            examplesAddedThisRound = 0
            # For each round, loop through the potentially remaining examples
            for example in positiveExamples:
                if len(self.newEntities) > 100:
                    break
                if exampleAdded[example[0]]:  # This event has already been inserted
                    continue
                arguments = argumentsByExample[example[0]]
                # An event can be added if all of its argument events have already
                # been added. Addition is forced if lack of argument events blocks
                # the process.
                if forceAdd or self.argumentEntitiesExist(arguments, sentenceObject):
                    umType = "complex"  # mark the root entity in the output xml
                    predictionStrength = None
                    if example[0].find("simple") != -1:
                        umType = "simple"
                    else:
                        # Prediction strength is only available for classified argument groups
                        predictionStrength = self.getPredictionStrength(
                            example, predictionsByExample, classSet, classIds
                        )
                    # print example
                    if (
                        umType != "simple"
                        and "eType" in example[3]
                        and example[3]["etype"] == "Process"
                        and len(arguments) == 0
                    ):
                        origProcess = sentenceObject.entitiesById[example[3]["e"]]
                        # Put back the original entity
                        newProcess = self.addEntity(origProcess)
                        newProcess.set("umType", umType)
                        if predictionStrength != None:
                            newProcess.set("umStrength", str(predictionStrength))
                    else:  # example has arguments
                        self.addEvent(arguments, sentenceObject, umType, forceAdd, predictionStrength)
                    exampleAdded[example[0]] = True
                    examplesLeft -= 1
                    examplesAddedThisRound += 1
                    forceAdd = False
            if examplesLeft > 0 and examplesAddedThisRound == 0:
                # If there are examples left, but nothing was added, this
                # means that some nested events are missing. Theoretically
                # this could also be because two events are referring to
                # each other, preventing each other's insertion. In any
                # case this is solved by simply forcing the addition of
                # the first non-inserted event, by creating 0-argument
                # entities for its argument events.
                forcedCount += 1
                # print "Warning, forcing event addition"
                forceAdd = True

        # Attach the new elements
        for element in self.newEntities + self.newInteractions:
            sentenceElement.append(element)

        # re-attach the analyses-element
        if sentenceAnalysesElement != None:
            sentenceElement.append(sentenceAnalysesElement)
    def writeXMLSentence(self, examples, predictionsByExample, sentenceObject, classSet, classIds):
        """Write predicted interactions, creating entities for dummy nodes.

        Works on sentenceObject.sentence:
        1. The "sentenceanalyses" child is detached; "pair" and
           "interaction" children and entities of type "neg" are removed.
        2. For each example node ("1" and "2") flagged as a dummy
           (example[3]["d<node>"] == "T"), one new entity is created per
           (entity id, "l<node>" value) combination. It sits on the node's
           head token, copies the type of the original entity, and is
           registered in sentenceObject.entitiesById.
        3. The examples are filtered with self.getValidExamples, and one
           "interaction" element is appended per remaining example, with
           dummy endpoints pointing at the entities created in step 2.
        4. The analyses element is re-appended last.

        Raises AssertionError on an invalid "d1"/"d2" flag, an unresolvable
        head token, or a clash of a new entity id with an existing one.
        """
        self.assertSameSentence(examples)

        sentenceElement = sentenceObject.sentence
        sentenceId = sentenceElement.get("id")
        # detach analyses-element
        sentenceAnalysesElement = sentenceElement.find("sentenceanalyses")
        if sentenceAnalysesElement is not None:
            sentenceElement.remove(sentenceAnalysesElement)
        # remove pairs and interactions
        self.removeChildren(sentenceElement, ["pair", "interaction"])

        # remove negative predicted entities
        self.removeChildren(sentenceElement, ["entity"], {"type":"neg"})

        def findTokenById(tokenId):
            # First token with the given id, or None.
            for token in sentenceObject.tokens:
                if token.get("id") == tokenId:
                    return token
            return None

        # add required entities for dummy nodes with positive interactions
        dummies = {}  # original entity id -> {slot label -> created entity element}
        newEntityIdCount = IDUtils.getNextFreeId(sentenceElement.findall("entity"))
        for example in examples:
            prediction = predictionsByExample[example[0]]
            #if self.isNegative(prediction, classSet):
            #    continue
            features = example[3]
            assert features["d1"] in ["T","F"], ("Example d1 error:", example)
            assert features["d2"] in ["T","F"], ("Example d2 error:", example)
            for node in ["1","2"]:
                if features["d"+node] != "T": # Node is not a dummy node
                    continue
                originalEntityId = features["e"+node]
                slotLabel = features["l"+node]
                createdBySlot = dummies.setdefault(originalEntityId, {})
                if slotLabel in createdBySlot: # Slot already has a real node
                    continue
                # Create a real node for the empty slot
                headTokenId = features["t"+node]
                headToken = findTokenById(headTokenId)
                # Fail clearly instead of with an AttributeError on the id string.
                assert headToken is not None, ("Head token not found:", headTokenId)
                entityElement = ET.Element("entity")
                entityElement.attrib["isName"] = "False"
                entityElement.set("charOffset", headToken.get("charOffset"))
                entityElement.set("headOffset", headToken.get("charOffset"))
                entityElement.set("text", headToken.get("text"))
                entityElement.set("id", sentenceId + ".e" + str(newEntityIdCount))
                # The dummy inherits the type of the entity it stands in for.
                entityElement.set("type", sentenceObject.entitiesById[originalEntityId].get("type"))
                # Add element to sentence
                newEntityIdCount += 1
                sentenceElement.append(entityElement)
                newEntityId = entityElement.get("id")
                assert newEntityId not in sentenceObject.entitiesById
                sentenceObject.entitiesById[newEntityId] = entityElement
                # Keep track of created dummies
                createdBySlot[slotLabel] = entityElement

        # select examples for correct edge combinations
        examples = self.getValidExamples(examples, predictionsByExample, sentenceObject, dummies, classSet, classIds)

        # add interactions
        pairCount = 0
        for example in examples:
            prediction = predictionsByExample[example[0]]
            #if self.isNegative(prediction, classSet):
            #    continue
            features = example[3]
            pairElement = ET.Element("interaction")
            if features.get("discarded"):
                pairElement.attrib["discarded"] = "True"
            pairElement.attrib["directed"] = "Unknown"
            for node in ["1","2"]:
                if features["d"+node] == "F":
                    # Real node: refer to the original entity directly.
                    endpointId = features["e"+node]
                else:
                    # Dummy node: refer to the entity created for its slot.
                    endpointId = dummies[features["e"+node]][features["l"+node]].get("id")
                pairElement.attrib["e"+node] = endpointId
            pairElement.attrib["id"] = sentenceId + ".i" + str(pairCount)
            self.setElementType(pairElement, prediction, classSet, classIds)
            sentenceElement.append(pairElement)
            pairCount += 1

        # re-attach the analyses-element
        if sentenceAnalysesElement is not None:
            sentenceElement.append(sentenceAnalysesElement)
Esempio n. 9
0
def run(input, output, eventDir, parse="split-mccc-preparsed", verbose=False):
    """Add event trigger entities from external event files to a corpus.

    Each ``sentence`` of each ``document`` in the corpus is looked up, by its
    "text" attribute, in the dictionary returned by ``loadEventXML`` for that
    document. Every event stored for the sentence that passes ``keepEvent``
    is appended to the sentence as a new ``entity`` element with
    ``isName="False"``. The events are indexed as sequences: items 0 and 1
    form the "charOffset", item 2 is the "type" and item 3 the "text".

    After all documents are processed, ``FindHeads.findHeads`` and
    ``removeDuplicates`` are run on the corpus and the counters are printed.

    Args:
        input: Corpus handed to ``ETUtils.ETFromObj``.
        output: Destination handed to ``ETUtils.write``; nothing is written
            when it is None.
        eventDir: Directory holding the per-document ``.xml`` event files.
        parse: Parse name forwarded to ``FindHeads.findHeads``.
        verbose: When true, print per-sentence progress and the sentences
            missing from the event data; also forwarded to ``loadEventXML``.

    Returns:
        The corpus tree obtained from ``ETUtils.ETFromObj``, modified in place.
    """
    sys.stderr.write("Loading corpus %s\n" % (input,))
    corpusTree = ETUtils.ETFromObj(input)
    sys.stderr.write("Corpus file loaded\n")
    corpusRoot = corpusTree.getroot()

    counts = defaultdict(int)
    for document in corpusRoot.findall("document"):
        # The event dictionary is loaded once per document, when the first
        # sentence of the document is seen.
        sentDict = None
        pmid = document.get("pmid")
        isPMC = False
        for sentence in document.findall("sentence"):
            counts["sentences"] += 1
            origId = sentence.get("origId")
            sentenceId = str(sentence.get("id")) + "/" + str(origId)
            if verbose:
                print("Processing " + sentenceId)
            if sentDict is None:
                if origId is not None:
                    # The event file is named after the first part of origId;
                    # such documents must not carry a pmid.
                    assert pmid is None
                    eventFile = eventDir + "/" + origId.split(".")[0] + ".xml"
                    sentDict = loadEventXML(eventFile, verbose=verbose)
                else:
                    assert pmid is not None
                    if pmid.startswith("PMC"):
                        # No event file is loaded for PMC documents, so every
                        # sentence of the document is counted as missing.
                        isPMC = True
                        sentDict = {}
                    else:
                        assert pmid.startswith("PMID")
                        eventFile = (eventDir + "/" +
                                     pmid.split("-", 1)[-1] + ".xml")
                        sentDict = loadEventXML(eventFile, verbose=verbose)

            interactionXMLText = sentence.get("text")
            if interactionXMLText not in sentDict:
                counts["missing-sentences"] += 1
                if isPMC:
                    counts["missing-sentences-PMC"] += 1
                if verbose:
                    print("Missing sentence: %s %s" % (
                        pmid, (sentenceId, sentDict, interactionXMLText)))
                continue

            # Detach the analyses so that the triggers are added before them
            sentenceAnalyses = sentence.find("sentenceanalyses")
            if sentenceAnalyses is not None:
                sentence.remove(sentenceAnalyses)
            entityIdCount = IDUtils.getNextFreeId(sentence.findall("entity"))
            events = sentDict[interactionXMLText]
            events.sort()
            for event in events:
                if not keepEvent(event[2]):
                    counts["filtered-triggers"] += 1
                    continue
                trigger = ET.Element("entity")
                trigger.set("isName", "False")
                trigger.set("charOffset", str(event[0]) + "-" + str(event[1]))
                trigger.set("type", str(event[2]))
                trigger.set("text", str(event[3]))
                trigger.set("source", "GENIA_event_annotation_0.9")
                trigger.set("id",
                            sentence.get("id") + ".e" + str(entityIdCount))
                entityIdCount += 1
                counts["added-triggers"] += 1
                sentence.append(trigger)
            # Re-attach the analyses after the new triggers
            if sentenceAnalyses is not None:
                sentence.append(sentenceAnalyses)

    FindHeads.findHeads(corpusTree, parse, removeExisting=False)
    removeDuplicates(corpusRoot)
    print(counts)

    if output is not None:
        sys.stderr.write("Writing output to %s\n" % (output,))
        ETUtils.write(corpusRoot, output)
    return corpusTree
Esempio n. 10
0
    def writeXMLSentence(self, examples, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=None):
        """Write the predicted trigger entities of one sentence into its XML.

        Existing "pair"/"interaction" children and non-name entities are
        removed from the sentence element through ``removeChildren`` and
        ``removeNonNameEntities``. For every example, one new ``entity``
        element with ``isName="False"`` is built per type returned by
        ``getElementTypes``, using the offset and text of the example's head
        token.

        When ``self.insertWeights`` is set, the prediction string is also
        copied onto the gold entities that share the head offset, a new entity
        is appended only if its type is not "neg" and no gold entity has the
        same head offset, and the removed entities and interactions are
        re-attached at the end. Otherwise every built entity is appended.

        Args:
            examples: Examples of a single sentence. ``example[0]`` is the key
                into ``predictionsByExample`` and ``example[3]`` a dictionary
                holding the head token id under "t" and optionally the keys
                "unmergeneg", "trigex" and "goldIds".
            predictionsByExample: Mapping from example id to its prediction.
            sentenceObject: Object providing the ``sentence`` element and its
                ``tokens``.
            classSet: Forwarded to the prediction helper methods.
            classIds: Forwarded to the prediction helper methods.
            goldSentence: Optional object whose ``entities`` supply the
                "goldType" attribute by head offset.
        """
        self.assertSameSentence(examples)

        extensionRequested = False

        sentenceElement = sentenceObject.sentence
        sentenceId = sentenceElement.get("id")
        # detach analyses-element, so that new children end up before it
        sentenceAnalysesElement = sentenceElement.find("sentenceanalyses")
        if sentenceAnalysesElement is None:
            sentenceAnalysesElement = sentenceElement.find("analyses")
        if sentenceAnalysesElement is not None:
            sentenceElement.remove(sentenceAnalysesElement)
        # remove pairs and interactions
        interactions = self.removeChildren(sentenceElement, ["pair", "interaction"])
        # remove entities; the next free id is taken before the removal
        newEntityIdCount = IDUtils.getNextFreeId(sentenceElement.findall("entity"))
        nonNameEntities = self.removeNonNameEntities(sentenceElement)

        # gold sentence elements
        goldEntityTypeByHeadOffset = {}
        goldEntityByHeadOffset = {}
        if goldSentence is not None:
            for entity in goldSentence.entities:
                headOffset = entity.get("headOffset")
                if headOffset not in goldEntityTypeByHeadOffset:
                    goldEntityTypeByHeadOffset[headOffset] = []
                    goldEntityByHeadOffset[headOffset] = []
                goldEntityTypeByHeadOffset[headOffset].append(entity)
                goldEntityByHeadOffset[headOffset].append(entity)
            # collapse the entity lists into one merged type per head offset
            for key in goldEntityTypeByHeadOffset:
                goldEntityTypeByHeadOffset[key] = self.getMergedEntityType(goldEntityTypeByHeadOffset[key])
            # tokens without a gold entity get the type "neg"
            for token in sentenceObject.tokens:
                if token.get("charOffset") not in goldEntityTypeByHeadOffset:
                    goldEntityTypeByHeadOffset[token.get("charOffset")] = "neg"

        # add new entities
        for example in examples:
            # Resolve the head token first: the "unmergeneg" option below
            # needs the text of this example's token.
            headTokenId = example[3]["t"]
            headToken = None
            for token in sentenceObject.tokens:
                if token.get("id") == headTokenId:
                    headToken = token
                    break
            assert headToken is not None, example[3]
            headOffset = headToken.get("charOffset")

            unmergeEPINeg = None
            if "unmergeneg" in example[3] and example[3]["unmergeneg"] == "epi":
                unmergeEPINeg = headToken.get("text")
            if "trigex" in example[3] and example[3]["trigex"] == "bb":
                extensionRequested = True

            prediction = predictionsByExample[example[0]]
            predictionString = self.getPredictionStrengthString(prediction, classSet, classIds)
            for eType in self.getElementTypes(prediction, classSet, classIds, unmergeEPINegText=unmergeEPINeg): # split merged classes
                entityElement = ET.Element("entity")
                entityElement.set("isName", "False")
                entityElement.set("charOffset", headOffset)
                entityElement.set("headOffset", headOffset)
                entityElement.set("text", headToken.get("text"))
                entityElement.set("id", sentenceId + ".e" + str(newEntityIdCount))
                entityElement.set("type", eType)
                entityElement.set("predictions", predictionString)
                if self.insertWeights: # in other words, use gold types
                    if headOffset in goldEntityByHeadOffset:
                        for entity in goldEntityByHeadOffset[headOffset]:
                            entity.set("predictions", predictionString)
                if headOffset in goldEntityTypeByHeadOffset:
                    entityElement.set("goldType", goldEntityTypeByHeadOffset[headOffset])
                if "goldIds" in example[3]: # The entities for which this example was built
                    entityElement.set("goldIds", example[3]["goldIds"])
                # In weight-insertion mode only non-"neg" entities without a
                # gold entity at the same head are added.
                if not self.insertWeights or (eType != "neg" and headOffset not in goldEntityByHeadOffset):
                    newEntityIdCount += 1
                    sentenceElement.append(entityElement)

        # if only adding weights, re-attach interactions and gold entities
        if self.insertWeights:
            for entity in nonNameEntities:
                sentenceElement.append(entity)
            for interaction in interactions:
                sentenceElement.append(interaction)

        # re-attach the analyses-element
        if sentenceAnalysesElement is not None:
            sentenceElement.append(sentenceAnalysesElement)

        # Extend bacteria triggers
        if extensionRequested:
            InteractionXML.ExtendTriggers.extend(sentenceElement, entityTypes=["Bacterium"])
Esempio n. 11
0
    def writeXMLSentence(self,
                         examples,
                         predictionsByExample,
                         sentenceObject,
                         classSet,
                         classIds,
                         goldSentence=None):
        """Write the predicted trigger entities of one sentence into its XML.

        Existing "pair"/"interaction" children and non-name entities are
        removed from the sentence element through ``removeChildren`` and
        ``removeNonNameEntities``. For every example, one new ``entity``
        element with ``isName="False"`` is built per type returned by
        ``getElementTypes``, using the offset and text of the example's head
        token.

        When ``self.insertWeights`` is set, the prediction string is also
        copied onto the gold entities that share the head offset, a new entity
        is appended only if its type is not "neg" and no gold entity has the
        same head offset, and the removed entities and interactions are
        re-attached at the end. Otherwise every built entity is appended.

        Args:
            examples: Examples of a single sentence. ``example[0]`` is the key
                into ``predictionsByExample`` and ``example[3]`` a dictionary
                holding the head token id under "t" and optionally the keys
                "unmergeneg", "trigex" and "goldIds".
            predictionsByExample: Mapping from example id to its prediction.
            sentenceObject: Object providing the ``sentence`` element and its
                ``tokens``.
            classSet: Forwarded to the prediction helper methods.
            classIds: Forwarded to the prediction helper methods.
            goldSentence: Optional object whose ``entities`` supply the
                "goldType" attribute by head offset.
        """
        self.assertSameSentence(examples)

        extensionRequested = False

        sentenceElement = sentenceObject.sentence
        sentenceId = sentenceElement.get("id")
        # detach analyses-element, so that new children end up before it
        sentenceAnalysesElement = sentenceElement.find("sentenceanalyses")
        if sentenceAnalysesElement is None:
            sentenceAnalysesElement = sentenceElement.find("analyses")
        if sentenceAnalysesElement is not None:
            sentenceElement.remove(sentenceAnalysesElement)
        # remove pairs and interactions
        interactions = self.removeChildren(sentenceElement,
                                           ["pair", "interaction"])
        # remove entities; the next free id is taken before the removal
        newEntityIdCount = IDUtils.getNextFreeId(
            sentenceElement.findall("entity"))
        nonNameEntities = self.removeNonNameEntities(sentenceElement)

        # gold sentence elements
        goldEntityTypeByHeadOffset = {}
        goldEntityByHeadOffset = {}
        if goldSentence is not None:
            for entity in goldSentence.entities:
                headOffset = entity.get("headOffset")
                if headOffset not in goldEntityTypeByHeadOffset:
                    goldEntityTypeByHeadOffset[headOffset] = []
                    goldEntityByHeadOffset[headOffset] = []
                goldEntityTypeByHeadOffset[headOffset].append(entity)
                goldEntityByHeadOffset[headOffset].append(entity)
            # collapse the entity lists into one merged type per head offset
            for key in goldEntityTypeByHeadOffset:
                goldEntityTypeByHeadOffset[key] = self.getMergedEntityType(
                    goldEntityTypeByHeadOffset[key])
            # tokens without a gold entity get the type "neg"
            for token in sentenceObject.tokens:
                tokenOffset = token.get("charOffset")
                if tokenOffset not in goldEntityTypeByHeadOffset:
                    goldEntityTypeByHeadOffset[tokenOffset] = "neg"

        # add new entities
        for example in examples:
            # Resolve the head token first: the "unmergeneg" option below
            # needs the text of this example's token.
            headTokenId = example[3]["t"]
            headToken = None
            for token in sentenceObject.tokens:
                if token.get("id") == headTokenId:
                    headToken = token
                    break
            assert headToken is not None, example[3]
            headOffset = headToken.get("charOffset")

            unmergeEPINeg = None
            if "unmergeneg" in example[3] and example[3]["unmergeneg"] == "epi":
                unmergeEPINeg = headToken.get("text")
            if "trigex" in example[3] and example[3]["trigex"] == "bb":
                extensionRequested = True

            prediction = predictionsByExample[example[0]]
            predictionString = self.getPredictionStrengthString(
                prediction, classSet, classIds)
            for eType in self.getElementTypes(
                    prediction,
                    classSet,
                    classIds,
                    unmergeEPINegText=unmergeEPINeg):  # split merged classes
                entityElement = ET.Element("entity")
                entityElement.set("isName", "False")
                entityElement.set("charOffset", headOffset)
                entityElement.set("headOffset", headOffset)
                entityElement.set("text", headToken.get("text"))
                entityElement.set("id",
                                  sentenceId + ".e" + str(newEntityIdCount))
                entityElement.set("type", eType)
                entityElement.set("predictions", predictionString)
                if self.insertWeights:  # in other words, use gold types
                    if headOffset in goldEntityByHeadOffset:
                        for entity in goldEntityByHeadOffset[headOffset]:
                            entity.set("predictions", predictionString)
                if headOffset in goldEntityTypeByHeadOffset:
                    entityElement.set("goldType",
                                      goldEntityTypeByHeadOffset[headOffset])
                if "goldIds" in example[3]:
                    # The entities for which this example was built
                    entityElement.set("goldIds", example[3]["goldIds"])
                # In weight-insertion mode only non-"neg" entities without a
                # gold entity at the same head are added.
                if not self.insertWeights or (
                        eType != "neg"
                        and headOffset not in goldEntityByHeadOffset):
                    newEntityIdCount += 1
                    sentenceElement.append(entityElement)

        # if only adding weights, re-attach interactions and gold entities
        if self.insertWeights:
            for entity in nonNameEntities:
                sentenceElement.append(entity)
            for interaction in interactions:
                sentenceElement.append(interaction)

        # re-attach the analyses-element
        if sentenceAnalysesElement is not None:
            sentenceElement.append(sentenceAnalysesElement)

        # Extend bacteria triggers
        if extensionRequested:
            InteractionXML.ExtendTriggers.extend(sentenceElement,
                                                 entityTypes=["Bacterium"])
Esempio n. 12
0
    def writeXMLSentence(self,
                         examples,
                         predictionsByExample,
                         sentenceObject,
                         classSet,
                         classIds,
                         goldSentence=None):
        """Rebuild the events of one sentence from unmerging predictions.

        The "pair"/"interaction" children and the non-name entities are
        removed from the sentence element. Interactions of type "neg" are
        dropped. If no interactions remain, or more than ``cutoff`` (100)
        remain, only the analyses element is re-attached and the method
        returns, leaving the removed elements out of the sentence.

        Otherwise argument groups are collected from two sources: for
        entities without any example, each remaining interaction starting
        from the entity becomes a one-argument "simple" group; for every
        example whose predicted class is not 1, the interactions listed in
        ``example[3]["i"]`` form the group. The groups are then inserted with
        ``addEvent`` (or ``addEntity`` for a zero-argument "Process"), round
        by round, an example being inserted once ``argumentEntitiesExist``
        accepts its arguments. When a round adds nothing, the next insertion
        is forced. Finally the elements collected in ``self.newEntities`` and
        ``self.newInteractions`` and the analyses element are appended to the
        sentence.

        Sets ``self.sentenceId``, ``self.entityCount``,
        ``self.interactionCount``, ``self.newEntities``,
        ``self.newInteractions`` and ``self.entitiesByHeadByType``, and
        increments ``self.counts`` for the simple events.

        Args:
            examples: Examples of a single sentence. ``example[0]`` is the
                example id and ``example[3]`` a dictionary with the entity id
                under "e", comma-separated interaction ids under "i" and, for
                zero-argument examples, "etype".
            predictionsByExample: Mapping from example id to its prediction;
                item 0 of a prediction is the predicted class.
            sentenceObject: Object providing ``sentence``, ``entities`` and
                ``entitiesById``.
            classSet: Forwarded to ``getPredictionStrength``.
            classIds: Forwarded to ``getPredictionStrength``.
            goldSentence: Not used by this method.
        """
        sentenceElement = sentenceObject.sentence
        self.sentenceId = sentenceElement.get("id")
        self.assertSameSentence(examples, self.sentenceId)
        # detach analyses-element
        sentenceAnalysesElement = sentenceElement.find("sentenceanalyses")
        if sentenceAnalysesElement is None:
            sentenceAnalysesElement = sentenceElement.find("analyses")
        if sentenceAnalysesElement is not None:
            sentenceElement.remove(sentenceAnalysesElement)

        # remove pairs and interactions
        interactions = self.removeChildren(sentenceElement,
                                           ["pair", "interaction"])
        # remove entities
        entities = self.removeNonNameEntities(sentenceElement)

        # filter interactions
        interactions = [x for x in interactions if x.get("type") != "neg"]

        # early out
        cutoff = 100
        if len(interactions) == 0 or len(interactions) > cutoff:
            # re-attach the analyses-element
            if sentenceAnalysesElement is not None:
                sentenceElement.append(sentenceAnalysesElement)
            if len(interactions) > cutoff:
                sys.stderr.write(
                    "Warning, sentence %s has more than %s interactions, removing all.\n"
                    % (sentenceElement.get("id"), cutoff))
            return

        interactionsByEntity = {}
        interactionsById = {}
        for entity in entities:
            interactionsByEntity[entity.get("id")] = []
        for interaction in interactions:
            e1Id = interaction.get("e1")
            if e1Id not in interactionsByEntity:
                interactionsByEntity[e1Id] = []
            interactionsByEntity[e1Id].append(interaction)
            interactionsById[interaction.get("id")] = interaction

        # NOTE! Following won't work for pairs
        self.entityCount = IDUtils.getNextFreeId(
            sentenceElement.findall("entity"))
        self.interactionCount = IDUtils.getNextFreeId(
            sentenceElement.findall("interaction"))
        self.newEntities = []
        self.newInteractions = []

        # Mapping for connecting the events
        self.entitiesByHeadByType = {}
        for entity in sentenceObject.entities:
            # by offset
            offset = entity.get("headOffset")
            if offset not in self.entitiesByHeadByType:
                self.entitiesByHeadByType[offset] = {}
            # by type
            eType = entity.get("type")
            if entity.get("isName") != "True":
                # non-name entities only reserve an (emptied) slot
                self.entitiesByHeadByType[offset][eType] = []
            else:  # add names to structure
                if eType not in self.entitiesByHeadByType[offset]:
                    self.entitiesByHeadByType[offset][eType] = []
                self.entitiesByHeadByType[offset][eType].append(entity)

        entityKeys = sentenceObject.entitiesById.keys()
        exampleByEntityId = {}
        for example in examples:
            eId = example[3]["e"]
            assert eId in entityKeys
            if eId not in exampleByEntityId:
                exampleByEntityId[eId] = []
            exampleByEntityId[eId].append(example)

        # NOTE: an earlier attempt to keep, for "Binding" entities, only the
        # positive example with the highest prediction strength did not work
        # and is not applied here.

        # Gather arguments for the simple, one-argument events
        argumentsByExample = {}
        positiveExamples = []
        exampleIdCount = 0
        for entity in entities:
            # If no example, case is unambiguous
            if entity.get("id") in exampleByEntityId:
                continue
            simpleEventInteractions = interactionsByEntity[entity.get("id")]
            numCauses = 0
            numThemes = 0
            # iterate over a copy, as interactions are removed from the list
            for interaction in simpleEventInteractions[:]:
                if self.isIntersentence(interaction):
                    print("Warning, intersentence interaction for %s %s" %
                          (entity.get("id"), entity.get("type")))
                    simpleEventInteractions.remove(interaction)
                    continue
                if interaction.get("type") == "neg":
                    simpleEventInteractions.remove(interaction)
                    continue
                iType = interaction.get("type")
                if iType == "Cause":
                    numCauses += 1
                elif iType == "Theme":
                    numThemes += 1
            eType = entity.get("type")
            # NOTE(review): an earlier variant of this assert tested
            # eType == "Binding" in the last clause - confirm which is meant.
            assert (numThemes == 0
                    or (numThemes != 0 and numCauses == 0)
                    or (numThemes > 1 and eType != "Binding")), (
                        numThemes, numCauses, eType, entity.get("id"),
                        [x[0] for x in examples], entityKeys)
            for interaction in simpleEventInteractions:
                self.counts["simple-" + eType + "-" +
                            interaction.get("type")] += 1
                exampleId = "simple." + str(exampleIdCount)
                exampleIdCount += 1
                # simple events have no features, prediction or extra data
                positiveExamples.append([exampleId, None, None, None])
                argumentsByExample[exampleId] = [interaction]

        # Gather arguments for predicted, unmerged events
        for example in examples:
            if predictionsByExample[example[0]][0] == 1:  # negative
                continue
            positiveExamples.append(example)
            arguments = []
            for iId in example[3]["i"].split(","):
                if iId == "":  # processes can have 0 arguments
                    assert "etype" in example[3], example[3]
                    assert example[3]["etype"] == "Process", example[3]
                    break
                arg = interactionsById[iId]
                if self.isIntersentence(arg):
                    continue
                assert arg.get("type") != "neg"
                arguments.append(arg)
            argumentsByExample[example[0]] = arguments

        # Loop until all positive examples are added. This process
        # assumes that the events (mostly) form a directed acyclic
        # graph, which can be written by "growing" the structure from
        # the "leaf" events, and consecutively adding levels of
        # nesting events.
        examplesLeft = len(positiveExamples)
        exampleAdded = {}
        for example in positiveExamples:
            exampleAdded[example[0]] = False
        forceAdd = False
        while examplesLeft > 0:
            if len(self.newEntities) > cutoff:
                sys.stderr.write(
                    "Warning, sentence %s has generated more than %s events, skipping the rest.\n"
                    % (sentenceElement.get("id"), cutoff))
                break
            examplesAddedThisRound = 0
            # For each round, loop through the potentially remaining examples
            for example in positiveExamples:
                if len(self.newEntities) > cutoff:
                    break
                if exampleAdded[example[0]]:
                    # This event has already been inserted
                    continue
                arguments = argumentsByExample[example[0]]
                # An event can be added if all of its argument events have
                # already been added. Addition is forced if lack of argument
                # events blocks the process.
                if not (forceAdd or self.argumentEntitiesExist(
                        arguments, sentenceObject)):
                    continue
                umType = "complex"  # mark the root entity in the output xml
                predictionStrength = None
                if example[0].find("simple") != -1:
                    umType = "simple"
                else:
                    # Prediction strength is only available for classified
                    # argument groups
                    predictionStrength = self.getPredictionStrength(
                        example, predictionsByExample, classSet, classIds)
                # The key is "etype", as asserted when the arguments were
                # gathered; simple examples carry no dictionary, so umType
                # must be tested first.
                if (umType != "simple" and "etype" in example[3]
                        and example[3]["etype"] == "Process"
                        and len(arguments) == 0):
                    origProcess = sentenceObject.entitiesById[example[3]["e"]]
                    # Put back the original entity
                    newProcess = self.addEntity(origProcess)
                    newProcess.set("umType", umType)
                    if predictionStrength is not None:
                        newProcess.set("umStrength", str(predictionStrength))
                else:  # example has arguments
                    self.addEvent(arguments, sentenceObject, umType,
                                  forceAdd, predictionStrength)
                exampleAdded[example[0]] = True
                examplesLeft -= 1
                examplesAddedThisRound += 1
                forceAdd = False
            if examplesLeft > 0 and examplesAddedThisRound == 0:
                # If there are examples left, but nothing was added, this
                # means that some nested events are missing. Theoretically
                # this could also be because two events are referring to
                # each other, preventing each other's insertion. In any
                # case this is solved by simply forcing the addition of
                # the first non-inserted event, by creating 0-argument
                # entities for its argument events.
                forceAdd = True

        # Attach the new elements
        for element in self.newEntities + self.newInteractions:
            sentenceElement.append(element)

        # re-attach the analyses-element
        if sentenceAnalysesElement is not None:
            sentenceElement.append(sentenceAnalysesElement)
Esempio n. 13
0
def run(input, output, eventDir, parse="split-mccc-preparsed", verbose=False):
    """Add event trigger entities from external event files to a corpus.

    Each ``sentence`` of each ``document`` in the corpus is looked up, by its
    "text" attribute, in the dictionary returned by ``loadEventXML`` for that
    document. Every event stored for the sentence that passes ``keepEvent``
    is appended to the sentence as a new ``entity`` element with
    ``isName="False"``. The events are indexed as sequences: items 0 and 1
    form the "charOffset", item 2 is the "type" and item 3 the "text".

    After all documents are processed, ``FindHeads.findHeads`` and
    ``removeDuplicates`` are run on the corpus and the counters are printed.

    Args:
        input: Corpus handed to ``ETUtils.ETFromObj``.
        output: Destination handed to ``ETUtils.write``; nothing is written
            when it is None.
        eventDir: Directory holding the per-document ``.xml`` event files.
        parse: Parse name forwarded to ``FindHeads.findHeads``.
        verbose: When true, print per-sentence progress and the sentences
            missing from the event data; also forwarded to ``loadEventXML``.

    Returns:
        The corpus tree obtained from ``ETUtils.ETFromObj``, modified in place.
    """
    sys.stderr.write("Loading corpus %s\n" % (input,))
    corpusTree = ETUtils.ETFromObj(input)
    sys.stderr.write("Corpus file loaded\n")
    corpusRoot = corpusTree.getroot()

    counts = defaultdict(int)
    for document in corpusRoot.findall("document"):
        # The event dictionary is loaded once per document, when the first
        # sentence of the document is seen.
        sentDict = None
        pmid = document.get("pmid")
        isPMC = False
        for sentence in document.findall("sentence"):
            counts["sentences"] += 1
            origId = sentence.get("origId")
            sentenceId = str(sentence.get("id")) + "/" + str(origId)
            if verbose:
                print("Processing " + sentenceId)
            if sentDict is None:
                if origId is not None:
                    # The event file is named after the first part of origId;
                    # such documents must not carry a pmid.
                    assert pmid is None
                    sentDict = loadEventXML(eventDir + "/" + origId.split(".")[0] + ".xml", verbose=verbose)
                else:
                    assert pmid is not None
                    if pmid.startswith("PMC"):
                        # No event file is loaded for PMC documents, so every
                        # sentence of the document is counted as missing.
                        isPMC = True
                        sentDict = {}
                    else:
                        assert pmid.startswith("PMID")
                        sentDict = loadEventXML(eventDir + "/" + pmid.split("-", 1)[-1] + ".xml", verbose=verbose)

            interactionXMLText = sentence.get("text")
            if interactionXMLText not in sentDict:
                counts["missing-sentences"] += 1
                if isPMC:
                    counts["missing-sentences-PMC"] += 1
                if verbose:
                    print("Missing sentence: %s %s" % (pmid, (sentenceId, sentDict, interactionXMLText)))
                continue

            # Detach the analyses so that the triggers are added before them
            sentenceAnalyses = sentence.find("sentenceanalyses")
            if sentenceAnalyses is not None:
                sentence.remove(sentenceAnalyses)
            entityIdCount = IDUtils.getNextFreeId(sentence.findall("entity"))
            events = sentDict[interactionXMLText]
            events.sort()
            for event in events:
                if not keepEvent(event[2]):
                    counts["filtered-triggers"] += 1
                    continue
                trigger = ET.Element("entity")
                trigger.set("isName", "False")
                trigger.set("charOffset", str(event[0]) + "-" + str(event[1]))
                trigger.set("type", str(event[2]))
                trigger.set("text", str(event[3]))
                trigger.set("source", "GENIA_event_annotation_0.9")
                trigger.set("id", sentence.get("id") + ".e" + str(entityIdCount))
                entityIdCount += 1
                counts["added-triggers"] += 1
                sentence.append(trigger)
            # Re-attach the analyses after the new triggers
            if sentenceAnalyses is not None:
                sentence.append(sentenceAnalyses)

    FindHeads.findHeads(corpusTree, parse, removeExisting=False)
    removeDuplicates(corpusRoot)
    print(counts)

    if output is not None:
        sys.stderr.write("Writing output to %s\n" % (output,))
        ETUtils.write(corpusRoot, output)
    return corpusTree
Esempio n. 14
0
def _writeExamplesToInteractionXML(examples, predictionsByExample, sentenceObject, classSet, classIds, xType):
    """
    Rewrite one sentence element of the interaction XML from classified examples.

    The sentence's existing "pair" and "interaction" children are always
    removed. Depending on xType, new elements are then generated:

    - "token": entities with isName="False" are removed and one new entity is
      appended per example, located at the token named by example[3]["t"].
    - "edge": one "interaction" is appended per example, connecting
      example[3]["e1"] and example[3]["e2"].
    - "trigger-event": entities with isName="False" are removed; for every
      example whose predicted class id is not 1, an entity is appended at
      token example[3]["et"] with type example[3]["type"], followed by
      "Theme" / "Cause" interactions for the "t"/"tt" and "c"/"ct" keys.
    - "event": delegated to process().
    - None: nothing is generated.

    The "sentenceanalyses" child, if present, is detached first and
    re-appended last so that it stays the final child of the sentence.

    Parameters:
    examples -- sized sequence of examples for a single sentence; example[0]
        is an id of the form "<majorId>.x<minorId>" and example[3] is a dict
        of extra information (keys as listed above)
    predictionsByExample -- maps an example id to its prediction, a sequence
        whose first item is the predicted class id (or, for binary
        classification, a value whose sign is the decision) and whose
        remaining items are per-class weights
    sentenceObject -- object with a "sentence" attribute (the XML element)
        and a "tokens" attribute (iterable of token elements)
    classSet -- object providing getName(classId), or None for binary
        classification
    classIds -- class ids in the same order as the weights in a prediction
    xType -- one of "token", "edge", "trigger-event", "event" or None

    Exits the program through sys.exit() for any other xType. An
    AssertionError is raised if the examples do not share one major id.
    """
    # All examples must come from the same sentence, i.e. share the id prefix
    # in front of the last ".x". The unpacking also rejects ids without ".x".
    currentSetMajorId = None
    for example in examples:
        majorId, _ = example[0].rsplit(".x", 1)
        if currentSetMajorId is None:
            currentSetMajorId = majorId
        else:
            assert currentSetMajorId == majorId, str(currentSetMajorId) + "/" + str(majorId)

    sentenceElement = sentenceObject.sentence
    sentenceId = sentenceElement.get("id")
    # Dummy structure for backwards compatibility (process() expects it)
    examplesBySentence = {}
    if len(examples) > 0:
        examplesBySentence[sentenceId] = examples
    # Detach analyses. Compare against None explicitly instead of relying on
    # truthiness, so that a childless element is still detected.
    sentenceAnalysesElement = sentenceElement.find("sentenceanalyses")
    if sentenceAnalysesElement is not None:
        sentenceElement.remove(sentenceAnalysesElement)
    # remove pairs and interactions
    for tag in ("pair", "interaction"):
        for oldElement in list(sentenceElement.findall(tag)):
            sentenceElement.remove(oldElement)

    if xType == "token":
        _removeInteractionWordEntities(sentenceElement)
        newEntityIdCount = IDUtils.getNextFreeId(sentenceElement.findall("entity"))
        for example in examples:
            prediction = predictionsByExample[example[0]]
            entityElement = _newTriggerEntity(sentenceObject, example[3]["t"], sentenceId + ".e" + str(newEntityIdCount))
            newEntityIdCount += 1
            _setPredictedType(entityElement, prediction, classSet, classIds)
            sentenceElement.append(entityElement)
    elif xType == "edge":
        # All old interactions were removed above, so numbering restarts at 0
        for pairCount, example in enumerate(examples):
            prediction = predictionsByExample[example[0]]
            pairElement = _newInteraction(sentenceId + ".i" + str(pairCount), example[3]["e1"], example[3]["e2"])
            _setPredictedType(pairElement, prediction, classSet, classIds)
            sentenceElement.append(pairElement)
    elif xType == "trigger-event":
        existingEntities = _removeInteractionWordEntities(sentenceElement)
        newEntityIdCount = IDUtils.getNextFreeId(sentenceElement.findall("entity"))
        eventsByToken = {} # head token -> ids of the entities created on it
        eventIdByExample = {} # example id -> id of the entity created for it
        # First pass: one entity per example
        for example in examples:
            prediction = predictionsByExample[example[0]]
            if prediction[0] == 1: # presumably the negative class; TODO confirm
                continue
            features = example[3]
            entityElement = _newTriggerEntity(sentenceObject, features["et"], sentenceId + ".e" + str(newEntityIdCount))
            newEntityIdCount += 1
            eventIdByExample[example[0]] = entityElement.get("id")
            eventsByToken.setdefault(features["et"], []).append(entityElement.get("id"))
            entityElement.attrib["type"] = features["type"]
            entityElement.attrib["predictions"] = _classWeightString(prediction, classSet, classIds)
            sentenceElement.append(entityElement)
        # Second pass: argument edges. This runs after all entities exist so
        # that an edge can point at an entity created for a later example.
        pairCount = 0
        for example in examples:
            prediction = predictionsByExample[example[0]]
            if prediction[0] == 1:
                continue
            features = example[3]
            for argumentKey, tokenKey, edgeType in (("t", "tt", "Theme"), ("c", "ct", "Cause")):
                if argumentKey not in features:
                    continue
                # NOTE: as in the original code, the token key ("tt"/"ct") is
                # assumed to be present whenever the argument key is.
                if features[tokenKey] in eventsByToken:
                    # prefer the first new entity created on the argument's token
                    e2 = eventsByToken[features[tokenKey]][0]
                elif features[argumentKey] in existingEntities:
                    e2 = features[argumentKey]
                else:
                    e2 = None
                if e2 is None: # no valid target, the edge is dropped
                    continue
                pairElement = _newInteraction(sentenceId + ".i" + str(pairCount), eventIdByExample[example[0]], e2)
                pairElement.attrib["type"] = edgeType
                sentenceElement.append(pairElement)
                pairCount += 1
    elif xType == "event":
        process(sentenceObject, examplesBySentence, classSet, classIds, predictionsByExample)
    elif xType is None:
        pass
    else:
        sys.exit("Error, unknown xtype")
    # re-attach the analyses-element
    if sentenceAnalysesElement is not None:
        sentenceElement.append(sentenceAnalysesElement)


def _removeInteractionWordEntities(sentenceElement):
    """
    Remove the entities with isName="False" from the sentence and return the
    set of ids of the entities that were kept.
    """
    keptIds = set()
    for entityElement in list(sentenceElement.findall("entity")):
        if entityElement.get("isName") == "False": # interaction word
            sentenceElement.remove(entityElement)
        else:
            keptIds.add(entityElement.get("id"))
    return keptIds


def _newTriggerEntity(sentenceObject, headTokenId, entityId):
    """
    Build (without attaching) an isName="False" entity that copies its offset
    and text from the token of sentenceObject whose id is headTokenId.
    """
    # If no token has the given id, the value itself is used as the token,
    # exactly as the inlined lookup used to do.
    headToken = headTokenId
    for token in sentenceObject.tokens:
        if token.get("id") == headTokenId:
            headToken = token
            break
    entityElement = ET.Element("entity")
    entityElement.attrib["isName"] = "False"
    entityElement.attrib["charOffset"] = headToken.get("charOffset")
    entityElement.attrib["headOffset"] = headToken.get("charOffset")
    entityElement.attrib["text"] = headToken.get("text")
    entityElement.attrib["id"] = entityId
    return entityElement


def _newInteraction(interactionId, e1, e2):
    """Build (without attaching) an "interaction" element between e1 and e2."""
    pairElement = ET.Element("interaction")
    pairElement.attrib["directed"] = "Unknown"
    pairElement.attrib["e1"] = e1
    pairElement.attrib["e2"] = e2
    pairElement.attrib["id"] = interactionId
    return pairElement


def _classWeightString(prediction, classSet, classIds):
    """
    Format the weights of a prediction (everything after the predicted class
    id) as "name:weight,name:weight,...", naming the i-th weight by classIds[i].
    """
    return ",".join(classSet.getName(classIds[i]) + ":" + str(weight)
                    for i, weight in enumerate(prediction[1:]))


def _setPredictedType(element, prediction, classSet, classIds):
    """
    Set the "type" attribute of element from the prediction and, for
    multiclass predictions, also the "predictions" attribute.
    """
    if classSet is None: # binary classification
        if prediction[0] > 0:
            element.attrib["type"] = str(True)
        else:
            element.attrib["type"] = str(False)
    else:
        element.attrib["type"] = classSet.getName(prediction[0])
        element.attrib["predictions"] = _classWeightString(prediction, classSet, classIds)