def insertInteraction(sentence, interaction):
    """
    Give an interaction element a fresh id and insert it into a sentence.

    The id is built as "<sentence id>.i<number>", where the number comes from
    IDUtils.getNextFreeId applied to the sentence's existing "interaction"
    children. The element is placed directly before the sentence's
    "sentenceanalyses" child.

    sentence -- the sentence element that receives the interaction
    interaction -- the interaction element to insert (its "id" is overwritten)

    Raises AssertionError if the sentence has no "sentenceanalyses" child.
    """
    interactions = sentence.findall("interaction")
    newIdNumber = IDUtils.getNextFreeId(interactions)
    interaction.set("id", sentence.get("id") + ".i" + str(newIdNumber))
    # insert into sentence
    inserted = False
    for i, child in enumerate(sentence):
        if child.tag == "sentenceanalyses":
            sentence.insert(i, interaction)
            inserted = True
            # Stop here: after the insert the analyses element sits at i + 1,
            # so continuing the scan would match it again and insert the same
            # interaction a second (third, ...) time.
            break
    assert inserted
def process(sentenceObject, examplesBySentence, classSet, classIds, predictionsByExample):
    """
    Rebuild the entities and interactions of one sentence from its examples.

    Entities whose "isName" attribute is "False" are removed from the sentence
    element. If examplesBySentence contains the sentence id, then:
      * a prediction whose class name contains "---" is split: the original
        example keeps the first class and a ".duplN" copy of the example is
        added for each further class,
      * examples whose predicted class id is 1 are dropped,
      * the module-level helpers (addExistingEntities, addExamples, markFinal,
        buildEntityNodes, buildInteractions) produce the new elements, which
        are appended to the sentence element.

    examplesBySentence and predictionsByExample are modified in place.
    """
    sentenceNode = sentenceObject.sentence
    sentenceKey = sentenceNode.get("id")

    # remove non-name entities (interaction words)
    previousEntities = sentenceNode.findall("entity")
    if previousEntities is not None:
        for candidate in previousEntities:
            if candidate.get("isName") == "False":
                sentenceNode.remove(candidate)

    # add new pairs
    remainingEntities = sentenceNode.findall("entity")
    firstFreeEntityNumber = IDUtils.getNextFreeId(remainingEntities)
    if sentenceKey not in examplesBySentence:
        return

    # split merged examples; iterate over a copy because the list grows
    sentenceExamples = examplesBySentence[sentenceKey]
    for example in list(sentenceExamples):
        prediction = predictionsByExample[example[0]]
        mergedName = classSet.getName(prediction[0])
        if "---" not in mergedName:
            continue
        classNames = mergedName.split("---")
        prediction[0] = classSet.getId(classNames[0], False)
        for duplicateNumber, className in enumerate(classNames[1:], 1):
            duplicate = example[:]
            duplicate[0] += ".dupl" + str(duplicateNumber)
            sentenceExamples.append(duplicate)
            duplicatePrediction = prediction[:]
            duplicatePrediction[0] = classSet.getId(className, False)
            predictionsByExample[duplicate[0]] = duplicatePrediction

    # remove negatives
    examplesBySentence[sentenceKey] = [
        example for example in sentenceExamples
        if predictionsByExample[example[0]][0] != 1
    ]

    # one (initially empty) slot per token id
    tokenMap = dict((token.get("id"), {}) for token in sentenceObject.tokens)
    addExistingEntities(tokenMap, remainingEntities, sentenceObject)
    addExamples(tokenMap, examplesBySentence[sentenceKey])
    markFinal(tokenMap)
    newEntities = buildEntityNodes(tokenMap, sentenceObject, firstFreeEntityNumber, classSet, classIds, predictionsByExample)
    newInteractions = buildInteractions(tokenMap, sentenceObject.sentence, predictionsByExample)
    for newEntity in newEntities:
        sentenceNode.append(newEntity)
    for newInteraction in newInteractions:
        sentenceNode.append(newInteraction)
def writeXMLSentence(self, examples, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=None):
    """
    Write one predicted phrase entity per example into the sentence element.

    The "sentenceanalyses" child is detached while the sentence is edited and
    appended again as the last child. self.removeChildren is called for the
    "pair" and "interaction" tags and self.removeNonNameEntities for the
    entities; the new entity ids continue from the free id computed before
    that entity removal.

    Each new entity takes "charOffset" and "phraseType" from the example's
    extra features (example[3]), "headOffset" from the token whose id equals
    example[3]["t"], "text" from the matching slice of the sentence text, and
    its type from self.setElementType.

    goldSentence is accepted but not used by this method.
    """
    self.assertSameSentence(examples)
    sentenceNode = sentenceObject.sentence
    sentenceKey = sentenceNode.get("id")
    fullText = sentenceNode.get("text")

    # detach analyses-element
    analysesNode = sentenceNode.find("sentenceanalyses")
    if analysesNode is not None:
        sentenceNode.remove(analysesNode)

    # remove pairs and interactions (return value is not needed here)
    self.removeChildren(sentenceNode, ["pair", "interaction"])
    # remove entities; the id counter is read first, so it still accounts for
    # the entities that are about to be removed
    nextEntityNumber = IDUtils.getNextFreeId(sentenceNode.findall("entity"))
    self.removeNonNameEntities(sentenceNode)

    # add new pairs
    for example in examples:
        prediction = predictionsByExample[example[0]]
        features = example[3]
        newEntity = ET.Element("entity")
        newEntity.attrib["isName"] = "False"
        # Resolve the head token id to the token element; if no token matches,
        # the id string itself is kept, exactly as a plain search loop would.
        headId = features["t"]
        headToken = next(
            (token for token in sentenceObject.tokens if token.get("id") == headId),
            headId)
        newEntity.set("charOffset", features["charOffset"])
        newEntity.set("headOffset", headToken.get("charOffset"))
        newEntity.set("phraseType", features["ptype"])
        span = Range.charOffsetToSingleTuple(features["charOffset"])
        # the end offset is inclusive, hence the + 1
        newEntity.set("text", fullText[span[0]:span[1] + 1])
        newEntity.set("id", sentenceKey + ".e" + str(nextEntityNumber))
        self.setElementType(newEntity, prediction, classSet, classIds)
        nextEntityNumber += 1
        sentenceNode.append(newEntity)

    # re-attach the analyses-element
    if analysesNode is not None:
        sentenceNode.append(analysesNode)
def writeXMLSentence(self, examples, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=None):
    """
    Append a phrase entity element to the sentence for every example.

    Steps, in order:
      1. detach the "sentenceanalyses" child (if present),
      2. call self.removeChildren for "pair"/"interaction" elements,
      3. read the next free entity id, then call self.removeNonNameEntities,
      4. build one "entity" element per example from example[3] ("t",
         "charOffset", "ptype") and let self.setElementType set its type,
      5. append the analyses element back as the last child.

    goldSentence is part of the signature but is not read here.
    """
    self.assertSameSentence(examples)
    xmlSentence = sentenceObject.sentence
    idOfSentence = xmlSentence.get("id")
    textOfSentence = xmlSentence.get("text")

    # detach analyses-element
    detachedAnalyses = xmlSentence.find("sentenceanalyses")
    if detachedAnalyses is not None:
        xmlSentence.remove(detachedAnalyses)

    # remove pairs and interactions
    self.removeChildren(xmlSentence, ["pair", "interaction"])
    # remove entities (the counter is taken before the removal)
    entityNumber = IDUtils.getNextFreeId(xmlSentence.findall("entity"))
    self.removeNonNameEntities(xmlSentence)

    # add new pairs
    for example in examples:
        prediction = predictionsByExample[example[0]]
        extra = example[3]
        phrase = ET.Element("entity")
        phrase.attrib["isName"] = "False"

        # find the token element that carries the head token id
        head = extra["t"]
        for candidate in sentenceObject.tokens:
            if candidate.get("id") == head:
                head = candidate
                break

        phrase.set("charOffset", extra["charOffset"])
        phrase.set("headOffset", head.get("charOffset"))
        phrase.set("phraseType", extra["ptype"])
        begin, end = Range.charOffsetToSingleTuple(extra["charOffset"])[0:2]
        phrase.set("text", textOfSentence[begin:end + 1])
        phrase.set("id", idOfSentence + ".e" + str(entityNumber))
        self.setElementType(phrase, prediction, classSet, classIds)
        entityNumber += 1
        xmlSentence.append(phrase)

    # re-attach the analyses-element
    if detachedAnalyses is not None:
        xmlSentence.append(detachedAnalyses)
def writeXMLSentence(self, examples, predictionsByExample, sentenceObject, classSet, classIds):
    """
    Write trigger entities and Theme/Cause interactions for one sentence.

    Examples whose predicted class id is 1 are ignored throughout.

    Entities: the entities left in the sentence element after
    self.removeNonNameEntities are indexed by head token id and type. Every
    example with a predicted class other than "Cause" then adds one entity on
    its "t1" token, at most one per type per token.

    Interactions:
      * non-"Cause" class: a "Theme" interaction from the t1 entity of that
        class to the entities on t2. Unless the class name contains
        "egulation", only "Protein" targets are linked.
      * "Cause" class: a "Cause" interaction from every t1 entity whose type
        contains "egulation" to every entity on t2.

    The "sentenceanalyses" child is detached during the edit and re-attached
    as the last child.
    """
    self.assertSameSentence(examples)
    sentenceNode = sentenceObject.sentence
    sentenceKey = sentenceNode.get("id")
    # detach analyses-element
    analysesNode = sentenceNode.find("sentenceanalyses")
    if analysesNode is not None:
        sentenceNode.remove(analysesNode)
    # remove pairs and interactions
    self.removeChildren(sentenceNode, ["pair", "interaction"])
    # remove entities
    self.removeNonNameEntities(sentenceNode)

    # head token id -> entity type -> entity element
    typedEntitiesByToken = {}

    # First add existing entities (names) (use the sentence element, as
    # sentenceObject still has all entities)
    for nameEntity in sentenceNode.findall("entity"):
        nameHeadOffset = nameEntity.get("headOffset")
        headTok = None
        for tok in sentenceObject.tokens:
            if tok.get("charOffset") == nameHeadOffset:
                headTok = tok
                break
        assert headTok is not None
        typedEntitiesByToken.setdefault(headTok.get("id"), {})[nameEntity.get("type")] = nameEntity

    # Then add entities defined by examples
    nextEntityNumber = IDUtils.getNextFreeId(sentenceNode.findall("entity"))
    for example in examples:
        prediction = predictionsByExample[example[0]]
        if prediction[0] == 1:
            continue
        triggerTokenId = example[3]["t1"]
        # The token gets a slot even when the example turns out to be a Cause
        # edge; the Cause branch below checks for the slot's existence.
        entitiesOfToken = typedEntitiesByToken.setdefault(triggerTokenId, {})
        predictedType = classSet.getName(prediction[0])
        # Skip Cause edges and honour the maximum of one entity per type per token
        if predictedType == "Cause" or predictedType in entitiesOfToken:
            continue
        newEntity = ET.Element("entity")
        entitiesOfToken[predictedType] = newEntity
        newEntity.attrib["isName"] = "False"
        for tok in sentenceObject.tokens:
            if tok.get("id") == triggerTokenId:
                headTok = tok
                break
        headOffset = headTok.get("charOffset")
        newEntity.attrib["charOffset"] = headOffset
        newEntity.attrib["headOffset"] = headOffset
        newEntity.attrib["text"] = headTok.get("text")
        newEntity.attrib["id"] = sentenceKey + ".e" + str(nextEntityNumber)
        newEntity.set("type", predictedType)
        nextEntityNumber += 1
        sentenceNode.append(newEntity)

    pairNumber = 0
    for example in examples:
        prediction = predictionsByExample[example[0]]
        if prediction[0] == 1:
            continue
        edgeClass = classSet.getName(prediction[0])
        sourceTokenId = example[3]["t1"]
        targetTokenId = example[3]["t2"]

        # Collect (e1 id, e2 element) pairs first, then emit them uniformly.
        endpoints = []
        if edgeClass == "Cause":
            pairType = "Cause"
            if sourceTokenId in typedEntitiesByToken and targetTokenId in typedEntitiesByToken:
                sources = typedEntitiesByToken[sourceTokenId]
                targets = typedEntitiesByToken[targetTokenId]
                for sourceType in sorted(sources):
                    if sourceType.find("egulation") == -1:
                        continue
                    for targetType in sorted(targets):
                        endpoints.append((sources[sourceType].get("id"), targets[targetType]))
        else:
            pairType = "Theme"
            if targetTokenId in typedEntitiesByToken:
                sourceId = typedEntitiesByToken[sourceTokenId][edgeClass].get("id")
                targets = typedEntitiesByToken[targetTokenId]
                onlyProteinTargets = edgeClass.find("egulation") == -1
                for targetType in sorted(targets):
                    if onlyProteinTargets and targetType != "Protein":
                        continue
                    endpoints.append((sourceId, targets[targetType]))

        for sourceId, targetEntity in endpoints:
            pair = ET.Element("interaction")
            pair.attrib["directed"] = "Unknown"
            pair.attrib["e1"] = sourceId
            pair.attrib["e2"] = targetEntity.get("id")
            pair.attrib["id"] = sentenceKey + ".i" + str(pairNumber)
            pair.attrib["predictions"] = self.getEdgePredictionString(example, prediction, classSet, classIds)
            pair.set("type", pairType)
            sentenceNode.append(pair)
            pairNumber += 1

    # re-attach the analyses-element
    if analysesNode is not None:
        sentenceNode.append(analysesNode)
def writeXMLSentence(self, examples, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=None):
    """
    Write the unmerged events of one sentence into its XML element.

    The analyses element ("sentenceanalyses" or "analyses") is detached while
    the sentence is edited and re-attached last. Pairs/interactions and the
    entities returned by self.removeNonNameEntities are taken out of the
    sentence, interactions of type "neg" are discarded, and the events are
    rebuilt through self.addEvent / self.addEntity:

      * an entity that has no example gets one "simple" pseudo-example per
        remaining interaction,
      * every example whose predicted class id is not 1 (1 is the negative
        class) contributes the interactions listed in example[3]["i"],
      * examples are added round by round once self.argumentEntitiesExist
        accepts their arguments. When a round adds nothing, the addition of
        the next pending example is forced.

    Nothing is rebuilt if no interaction is left, or if there are more than
    100 of them (a warning is written to stderr in that case). Event
    generation also stops with a warning once more than 100 new entities
    exist.

    The method stores its working state on self (sentenceId, entityCount,
    interactionCount, newEntities, newInteractions, entitiesByHeadByType)
    and increments self.counts. goldSentence is not used.
    """
    sentenceElement = sentenceObject.sentence
    self.sentenceId = sentenceElement.get("id")
    self.assertSameSentence(examples, self.sentenceId)
    # detach analyses-element
    sentenceAnalysesElement = sentenceElement.find("sentenceanalyses")
    if sentenceAnalysesElement is None:
        sentenceAnalysesElement = sentenceElement.find("analyses")
    if sentenceAnalysesElement is not None:
        sentenceElement.remove(sentenceAnalysesElement)
    # remove pairs and interactions
    interactions = self.removeChildren(sentenceElement, ["pair", "interaction"])
    # remove entities
    entities = self.removeNonNameEntities(sentenceElement)

    # filter interactions
    interactions = [interaction for interaction in interactions if interaction.get("type") != "neg"]

    # early out
    cutoff = 100
    if len(interactions) == 0 or len(interactions) > cutoff:
        # re-attach the analyses-element
        if sentenceAnalysesElement is not None:
            sentenceElement.append(sentenceAnalysesElement)
        if len(interactions) > cutoff:
            sys.stderr.write("Warning, sentence %s has more than %s interactions, removing all.\n"
                             % (sentenceObject.sentence.get("id"), cutoff))
        return

    interactionsByEntity = {}
    interactionsById = {}
    for entity in entities:
        interactionsByEntity[entity.get("id")] = []
    for interaction in interactions:
        e1Id = interaction.get("e1")
        if e1Id not in interactionsByEntity:
            interactionsByEntity[e1Id] = []
        interactionsByEntity[e1Id].append(interaction)
        interactionsById[interaction.get("id")] = interaction

    # NOTE! Following won't work for pairs
    self.entityCount = IDUtils.getNextFreeId(sentenceElement.findall("entity"))
    self.interactionCount = IDUtils.getNextFreeId(sentenceElement.findall("interaction"))
    self.newEntities = []
    self.newInteractions = []

    # Mapping for connecting the events: head offset -> type -> entities
    self.entitiesByHeadByType = {}
    for entity in sentenceObject.entities:
        # by offset
        offset = entity.get("headOffset")
        if offset not in self.entitiesByHeadByType:
            self.entitiesByHeadByType[offset] = {}
        # by type
        eType = entity.get("type")
        if entity.get("isName") != "True":
            # non-name entities only reserve an (emptied) slot for their type
            self.entitiesByHeadByType[offset][eType] = []
        else:  # add names to structure
            if eType not in self.entitiesByHeadByType[offset]:
                self.entitiesByHeadByType[offset][eType] = []
            self.entitiesByHeadByType[offset][eType].append(entity)

    entityKeys = sentenceObject.entitiesById.keys()
    exampleByEntityId = {}
    for example in examples:
        # Negative examples are registered too, so that their entity is not
        # treated as an unambiguous "simple" case below.
        eId = example[3]["e"]
        assert eId in entityKeys
        if eId not in exampleByEntityId:
            exampleByEntityId[eId] = []
        exampleByEntityId[eId].append(example)

    # An earlier, non-working attempt to keep only the strongest positive
    # example for multi-argument Binding events used to live here as
    # commented-out code.

    # Gather arguments for the simple, one-argument events
    argumentsByExample = {}
    positiveExamples = []
    exampleIdCount = 0
    for entity in entities:
        # If no example, case is unambiguous
        if entity.get("id") not in exampleByEntityId:
            simpleEventInteractions = interactionsByEntity[entity.get("id")]
            numCauses = 0
            numThemes = 0
            # iterate over a copy, interactions are removed inside the loop
            for interaction in simpleEventInteractions[:]:
                if self.isIntersentence(interaction):
                    sys.stdout.write("Warning, intersentence interaction for %s %s\n"
                                     % (entity.get("id"), entity.get("type")))
                    simpleEventInteractions.remove(interaction)
                    continue
                if interaction.get("type") == "neg":
                    simpleEventInteractions.remove(interaction)
                    continue
                iType = interaction.get("type")
                if iType == "Cause":
                    numCauses += 1
                elif iType == "Theme":
                    numThemes += 1
            eType = entity.get("type")
            assert (
                numThemes == 0
                or (numThemes != 0 and numCauses == 0)
                or (numThemes > 1 and eType != "Binding")
            ), (numThemes, numCauses, eType, entity.get("id"), [x[0] for x in examples], entityKeys)
            for interaction in simpleEventInteractions:
                self.counts["simple-" + eType + "-" + interaction.get("type")] += 1
                exampleId = "simple." + str(exampleIdCount)
                exampleIdCount += 1
                positiveExamples.append([exampleId, None, None, None])
                argumentsByExample[exampleId] = [interaction]

    # Gather arguments for predicted, unmerged events
    for example in examples:
        if predictionsByExample[example[0]][0] == 1:  # negative
            continue
        positiveExamples.append(example)
        arguments = []
        for iId in example[3]["i"].split(","):
            if iId == "":  # processes can have 0 arguments
                assert "etype" in example[3], example[3]
                assert example[3]["etype"] == "Process", example[3]
                break
            arg = interactionsById[iId]
            if self.isIntersentence(arg):
                continue
            assert arg.get("type") != "neg"
            arguments.append(arg)
        argumentsByExample[example[0]] = arguments

    # Loop until all positive examples are added. This process
    # assumes that the events (mostly) form a directed acyclic
    # graph, which can be written by "growing" the structure from
    # the "leaf" events, and consecutively adding levels of
    # nesting events.
    examplesLeft = len(positiveExamples)
    exampleAdded = {}
    for example in positiveExamples:
        exampleAdded[example[0]] = False
    forceAdd = False
    forcedCount = 0
    while examplesLeft > 0:
        # The same limit as for interactions applies to generated events.
        if len(self.newEntities) > cutoff:
            sys.stderr.write("Warning, sentence %s has generated more than %s events, skipping the rest.\n"
                             % (sentenceObject.sentence.get("id"), cutoff))
            break
        examplesAddedThisRound = 0
        # For each round, loop through the potentially remaining examples
        for example in positiveExamples:
            if len(self.newEntities) > cutoff:
                break
            if exampleAdded[example[0]]:  # This event has already been inserted
                continue
            arguments = argumentsByExample[example[0]]
            # An event can be added if all of its argument events have already
            # been added. Addition is forced if lack of argument events blocks
            # the process.
            if forceAdd or self.argumentEntitiesExist(arguments, sentenceObject):
                umType = "complex"  # mark the root entity in the output xml
                predictionStrength = None
                if example[0].find("simple") != -1:
                    umType = "simple"
                else:
                    # Prediction strength is only available for classified argument groups
                    predictionStrength = self.getPredictionStrength(example, predictionsByExample, classSet, classIds)
                # The features are stored under the key "etype" (see the
                # asserts above). Testing for "eType" here, as this code
                # previously did, made this branch unreachable, so
                # argument-less Process events were never put back as
                # entities.
                if (
                    umType != "simple"
                    and "etype" in example[3]
                    and example[3]["etype"] == "Process"
                    and len(arguments) == 0
                ):
                    origProcess = sentenceObject.entitiesById[example[3]["e"]]
                    # Put back the original entity
                    newProcess = self.addEntity(origProcess)
                    newProcess.set("umType", umType)
                    if predictionStrength is not None:
                        newProcess.set("umStrength", str(predictionStrength))
                else:  # example has arguments
                    self.addEvent(arguments, sentenceObject, umType, forceAdd, predictionStrength)
                exampleAdded[example[0]] = True
                examplesLeft -= 1
                examplesAddedThisRound += 1
                # only one addition is forced at a time
                forceAdd = False
        if examplesLeft > 0 and examplesAddedThisRound == 0:
            # If there are examples left, but nothing was added, this
            # means that some nested events are missing. Theoretically
            # this could also be because two events are referring to
            # each other, preventing each other's insertion. In any
            # case this is solved by simply forcing the addition of
            # the first non-inserted event, by creating 0-argument
            # entities for its argument events.
            forcedCount += 1
            forceAdd = True

    # Attach the new elements
    for element in self.newEntities + self.newInteractions:
        sentenceElement.append(element)

    # re-attach the analyses-element
    if sentenceAnalysesElement is not None:
        sentenceElement.append(sentenceAnalysesElement)
def writeXMLSentence(self, examples, predictionsByExample, sentenceObject, classSet, classIds):
    """
    Write interactions for one sentence, creating entities for dummy nodes.

    The "sentenceanalyses" child is detached during the edit and re-attached
    as the last child. self.removeChildren is called for "pair"/"interaction"
    elements and for "entity" elements with type "neg".

    For each end ("1" and "2") of each example whose "d<end>" feature is "T",
    a new entity is created once per (original entity id "e<end>", slot
    "l<end>") combination. It sits on the token "t<end>" and copies the type
    of the original entity. New entities are also registered in
    sentenceObject.entitiesById.

    The examples are then filtered by self.getValidExamples, and one
    "interaction" element per remaining example is appended, typed through
    self.setElementType.
    """
    self.assertSameSentence(examples)
    sentenceNode = sentenceObject.sentence
    sentenceKey = sentenceNode.get("id")
    # detach analyses-element
    analysesNode = sentenceNode.find("sentenceanalyses")
    if analysesNode is not None:
        sentenceNode.remove(analysesNode)
    # remove pairs and interactions
    self.removeChildren(sentenceNode, ["pair", "interaction"])
    # remove negative predicted entities
    self.removeChildren(sentenceNode, ["entity"], {"type":"neg"})

    # add required entities for dummy nodes with positive interactions
    dummies = {}  # original entity id -> slot label -> created entity element
    nextEntityNumber = IDUtils.getNextFreeId(sentenceNode.findall("entity"))
    for example in examples:
        # The looked-up prediction is not used in this loop; the lookup is
        # kept so that an example without a prediction still fails here.
        predictionsByExample[example[0]]
        features = example[3]
        assert features["d1"] in ["T","F"], ("Example d1 error:", example)
        assert features["d2"] in ["T","F"], ("Example d2 error:", example)
        for node in ("1", "2"):
            if features["d" + node] != "T":
                continue
            # Node is a dummy node
            originalId = features["e" + node]
            slotLabel = features["l" + node]
            slots = dummies.setdefault(originalId, {})
            if slotLabel in slots:
                continue
            # Create a real node for the empty slot
            created = ET.Element("entity")
            created.attrib["isName"] = "False"
            headTok = features["t" + node]
            for tok in sentenceObject.tokens:
                if tok.get("id") == headTok:
                    headTok = tok
                    break
            created.set("charOffset", headTok.get("charOffset"))
            created.set("headOffset", headTok.get("charOffset"))
            created.set("text", headTok.get("text"))
            created.set("id", sentenceKey + ".e" + str(nextEntityNumber))
            created.set("type", sentenceObject.entitiesById[originalId].get("type"))
            # Add element to sentence
            nextEntityNumber += 1
            sentenceNode.append(created)
            createdId = created.get("id")
            assert createdId not in sentenceObject.entitiesById
            sentenceObject.entitiesById[createdId] = created
            # Keep track of created dummies
            slots[slotLabel] = created

    # select examples for correct edge combinations
    examples = self.getValidExamples(examples, predictionsByExample, sentenceObject, dummies, classSet, classIds)

    # add interactions
    pairNumber = 0
    for example in examples:
        prediction = predictionsByExample[example[0]]
        features = example[3]
        pair = ET.Element("interaction")
        if features.get("discarded"):
            pair.attrib["discarded"] = "True"
        pair.attrib["directed"] = "Unknown"
        for node in ("1", "2"):
            if features["d" + node] == "F":
                # real entity: use its id directly
                pair.attrib["e" + node] = features["e" + node]
            else:
                # dummy node: point at the entity created for its slot
                pair.attrib["e" + node] = dummies[features["e" + node]][features["l" + node]].get("id")
        pair.attrib["id"] = sentenceKey + ".i" + str(pairNumber)
        self.setElementType(pair, prediction, classSet, classIds)
        sentenceNode.append(pair)
        pairNumber += 1

    # re-attach the analyses-element
    if analysesNode is not None:
        sentenceNode.append(analysesNode)
def run(input, output, eventDir, parse="split-mccc-preparsed", verbose=False):
    """
    Add event trigger entities from per-document event XML files to a corpus.

    input -- corpus source handed to ETUtils.ETFromObj
    output -- where to write the corpus root with ETUtils.write; nothing is
              written when this is None
    eventDir -- directory holding the "<document>.xml" event files read by
                loadEventXML
    parse -- passed through to FindHeads.findHeads
    verbose -- print per-sentence progress and missing sentences to stdout

    For every sentence whose text is a key of the document's event dictionary,
    one "entity" element (isName "False") is appended per event that passes
    keepEvent. Afterwards FindHeads.findHeads and removeDuplicates are run on
    the corpus, and the counters are printed.

    Returns the corpus tree.
    """
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    counts = defaultdict(int)
    for document in corpusRoot.findall("document"):
        # The event dictionary is loaded lazily, once per document, when the
        # first sentence of the document is seen.
        sentDict = None
        pmid = document.get("pmid")
        isPMC = False
        for sentence in document.findall("sentence"):
            counts["sentences"] += 1
            sentenceId = str(sentence.get("id")) + "/" + str( sentence.get("origId"))
            if verbose:
                print "Processing", sentenceId
            if sentDict == None:
                if sentence.get("origId") != None:
                    # The event file name is the part of the sentence's
                    # origId before the first "."
                    assert pmid == None
                    sentDict = loadEventXML( eventDir + "/" + sentence.get("origId").split(".")[0] + ".xml", verbose=verbose)
                else:
                    #pmid = sentence.get("pmid")
                    assert pmid != None
                    if pmid.startswith("PMC"):
                        # No event file is loaded for "PMC" documents; all of
                        # their sentences end up counted as missing.
                        isPMC = True
                        sentDict = {}
                    else:
                        assert pmid.startswith("PMID")
                        # The file name is whatever follows the first "-" of the pmid
                        sentDict = loadEventXML( eventDir + "/" + pmid.split("-", 1)[-1] + ".xml", verbose=verbose)
            # Events are looked up by the exact sentence text
            interactionXMLText = sentence.get("text")
            if not sentDict.has_key(interactionXMLText):
                counts["missing-sentences"] += 1
                if isPMC:
                    counts["missing-sentences-PMC"] += 1
                if verbose:
                    print "Missing sentence:", pmid, (sentenceId, sentDict, sentence.get("text"))
            else:
                # Detach the analyses element so the new entities are placed
                # before it; it is appended again below.
                sentenceAnalyses = sentence.find("sentenceanalyses")
                if sentenceAnalyses != None:
                    sentence.remove(sentenceAnalyses)
                entityIdCount = IDUtils.getNextFreeId( sentence.findall("entity"))
                events = sentDict[interactionXMLText]
                events.sort()
                for event in events:
                    # event[0]/event[1] become the "charOffset", event[2] the
                    # type and event[3] the text of the trigger
                    if not keepEvent(event[2]):
                        counts["filtered-triggers"] += 1
                        continue
                    trigger = ET.Element("entity")
                    trigger.set("isName", "False")
                    trigger.set("charOffset", str(event[0]) + "-" + str(event[1]))
                    trigger.set("type", str(event[2]))
                    trigger.set("text", str(event[3]))
                    trigger.set("source", "GENIA_event_annotation_0.9")
                    trigger.set("id", sentence.get("id") + ".e" + str(entityIdCount))
                    entityIdCount += 1
                    counts["added-triggers"] += 1
                    sentence.append(trigger)
                if sentenceAnalyses != None:
                    sentence.append(sentenceAnalyses)

    FindHeads.findHeads(corpusTree, parse, removeExisting=False)
    removeDuplicates(corpusRoot)
    print counts

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def writeXMLSentence(self, examples, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=None):
    """
    Write the predicted trigger entities of one sentence into its XML element.

    The analyses element ("sentenceanalyses" or "analyses") is detached while
    the sentence is edited and re-attached afterwards. Pairs/interactions and
    the entities returned by self.removeNonNameEntities are taken out of the
    sentence. For every example, one "entity" element is built per type
    returned by self.getElementTypes (merged classes are split), placed on the
    token whose id is example[3]["t"].

    examples -- examples of this sentence; example[3] holds the extra features
    predictionsByExample -- prediction for each example id (example[0])
    sentenceObject -- provides .sentence (the XML element) and .tokens
    classSet, classIds -- passed on to the prediction helper methods
    goldSentence -- optional; when given, its entities define the "goldType"
                    attribute ("neg" for tokens without a gold entity)

    When self.insertWeights is true, the predictions are copied onto the gold
    entities sharing the head offset, a new entity is only appended if it is
    not "neg" and no gold entity has its head offset, and the removed
    entities and interactions are appended back. Otherwise every built entity
    is appended.

    If any example has the feature "trigex" set to "bb",
    InteractionXML.ExtendTriggers.extend is run for "Bacterium" entities.
    """
    self.assertSameSentence(examples)

    extensionRequested = False

    sentenceElement = sentenceObject.sentence
    sentenceId = sentenceElement.get("id")
    # detach analyses-element
    sentenceAnalysesElement = sentenceElement.find("sentenceanalyses")
    if sentenceAnalysesElement is None:
        sentenceAnalysesElement = sentenceElement.find("analyses")
    if sentenceAnalysesElement is not None:
        sentenceElement.remove(sentenceAnalysesElement)
    # remove pairs and interactions
    interactions = self.removeChildren(sentenceElement, ["pair", "interaction"])
    # remove entities (the id counter is read before the removal)
    newEntityIdCount = IDUtils.getNextFreeId(sentenceElement.findall("entity"))
    nonNameEntities = self.removeNonNameEntities(sentenceElement)

    # gold sentence elements
    goldEntityTypeByHeadOffset = {}  # head offset -> merged gold type, or "neg"
    goldEntityByHeadOffset = {}      # head offset -> list of gold entities
    if goldSentence is not None:
        for entity in goldSentence.entities:
            goldEntityByHeadOffset.setdefault(entity.get("headOffset"), []).append(entity)
        for key in goldEntityByHeadOffset:
            # hand over a copy so the helper never shares the stored list
            goldEntityTypeByHeadOffset[key] = self.getMergedEntityType(list(goldEntityByHeadOffset[key]))
        for token in sentenceObject.tokens:
            if token.get("charOffset") not in goldEntityTypeByHeadOffset:
                goldEntityTypeByHeadOffset[token.get("charOffset")] = "neg"

    # add new pairs
    for example in examples:
        # Resolve the head token first: the "unmergeneg" handling below needs
        # the token of THIS example. Reading it before the lookup raised
        # UnboundLocalError on the first example and used the previous
        # example's token on later ones.
        headToken = example[3]["t"]
        for token in sentenceObject.tokens:
            if token.get("id") == headToken:
                headToken = token
                break
        unmergeEPINeg = None
        if "unmergeneg" in example[3] and example[3]["unmergeneg"] == "epi":
            unmergeEPINeg = headToken.get("text")
        if "trigex" in example[3] and example[3]["trigex"] == "bb":
            extensionRequested = True
        prediction = predictionsByExample[example[0]]
        predictionString = self.getPredictionStrengthString(prediction, classSet, classIds)
        for eType in self.getElementTypes(prediction, classSet, classIds, unmergeEPINegText=unmergeEPINeg):  # split merged classes
            entityElement = ET.Element("entity")
            entityElement.set("isName", "False")
            entityElement.set("charOffset", headToken.get("charOffset"))
            entityElement.set("headOffset", headToken.get("charOffset"))
            entityElement.set("text", headToken.get("text"))
            entityElement.set("id", sentenceId + ".e" + str(newEntityIdCount))
            entityElement.set("type", eType)
            entityElement.set("predictions", predictionString)
            headOffset = headToken.get("charOffset")
            if self.insertWeights:  # in other words, use gold types
                if headOffset in goldEntityByHeadOffset:
                    for entity in goldEntityByHeadOffset[headOffset]:
                        entity.set("predictions", entityElement.get("predictions"))
            if headOffset in goldEntityTypeByHeadOffset:
                entityElement.set("goldType", goldEntityTypeByHeadOffset[headOffset])
            if "goldIds" in example[3]:  # The entities for which this example was built
                entityElement.set("goldIds", example[3]["goldIds"])
            # In insertWeights mode only non-negative entities without a gold
            # entity at the same head are added; anything else is dropped.
            if (entityElement.get("type") != "neg" and entityElement.get("headOffset") not in goldEntityByHeadOffset) or not self.insertWeights:
                newEntityIdCount += 1
                sentenceElement.append(entityElement)

    # if only adding weights, re-attach interactions and gold entities
    if self.insertWeights:
        for entity in nonNameEntities:
            sentenceElement.append(entity)
        for interaction in interactions:
            sentenceElement.append(interaction)

    # re-attach the analyses-element
    if sentenceAnalysesElement is not None:
        sentenceElement.append(sentenceAnalysesElement)

    # Extend bacteria triggers
    if extensionRequested:
        InteractionXML.ExtendTriggers.extend(sentenceElement, entityTypes=["Bacterium"])
def writeXMLSentence(self, examples, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=None):
    """
    Write the predicted trigger entities of one sentence into its XML element.

    Processing order:
      1. detach the analyses element ("sentenceanalyses", else "analyses"),
      2. take pairs/interactions and the entities returned by
         self.removeNonNameEntities out of the sentence,
      3. if goldSentence is given, index its entities by head offset and
         compute a merged gold type per offset ("neg" for tokens without a
         gold entity),
      4. for every example, build one "entity" element per type returned by
         self.getElementTypes, on the token whose id is example[3]["t"],
      5. when self.insertWeights is true, append the removed entities and
         interactions back,
      6. re-attach the analyses element,
      7. run InteractionXML.ExtendTriggers.extend for "Bacterium" entities if
         any example carried the feature "trigex" == "bb".

    When self.insertWeights is true, predictions are also copied onto the
    gold entities with the same head offset, and a built entity is appended
    only if its type is not "neg" and no gold entity shares its head offset.
    When it is false, every built entity is appended.
    """
    self.assertSameSentence(examples)

    extensionRequested = False

    sentenceElement = sentenceObject.sentence
    sentenceId = sentenceElement.get("id")
    # detach analyses-element
    sentenceAnalysesElement = sentenceElement.find("sentenceanalyses")
    if sentenceAnalysesElement is None:
        sentenceAnalysesElement = sentenceElement.find("analyses")
    if sentenceAnalysesElement is not None:
        sentenceElement.remove(sentenceAnalysesElement)
    # remove pairs and interactions
    interactions = self.removeChildren(sentenceElement, ["pair", "interaction"])
    # remove entities (the id counter is read before the removal)
    newEntityIdCount = IDUtils.getNextFreeId(sentenceElement.findall("entity"))
    nonNameEntities = self.removeNonNameEntities(sentenceElement)

    # gold sentence elements
    goldEntityTypeByHeadOffset = {}
    goldEntityByHeadOffset = {}
    if goldSentence is not None:
        for entity in goldSentence.entities:
            headOffset = entity.get("headOffset")
            if headOffset not in goldEntityByHeadOffset:
                goldEntityByHeadOffset[headOffset] = []
            goldEntityByHeadOffset[headOffset].append(entity)
        for key, goldEntities in goldEntityByHeadOffset.items():
            # a copy is passed, so the stored entity list is never shared
            goldEntityTypeByHeadOffset[key] = self.getMergedEntityType(goldEntities[:])
        for token in sentenceObject.tokens:
            tokenOffset = token.get("charOffset")
            if tokenOffset not in goldEntityTypeByHeadOffset:
                goldEntityTypeByHeadOffset[tokenOffset] = "neg"

    # add new pairs
    for example in examples:
        features = example[3]
        # The head token has to be looked up before it is used for the
        # "unmergeneg" text. Doing it the other way round failed with
        # UnboundLocalError on the first example and picked up the previous
        # example's token afterwards.
        headToken = features["t"]
        for token in sentenceObject.tokens:
            if token.get("id") == headToken:
                headToken = token
                break
        unmergeEPINeg = None
        if "unmergeneg" in features and features["unmergeneg"] == "epi":
            unmergeEPINeg = headToken.get("text")
        if "trigex" in features and features["trigex"] == "bb":
            extensionRequested = True
        prediction = predictionsByExample[example[0]]
        predictionString = self.getPredictionStrengthString(prediction, classSet, classIds)
        elementTypes = self.getElementTypes(prediction, classSet, classIds, unmergeEPINegText=unmergeEPINeg)
        for eType in elementTypes:  # split merged classes
            entityElement = ET.Element("entity")
            entityElement.set("isName", "False")
            entityElement.set("charOffset", headToken.get("charOffset"))
            entityElement.set("headOffset", headToken.get("charOffset"))
            entityElement.set("text", headToken.get("text"))
            entityElement.set("id", sentenceId + ".e" + str(newEntityIdCount))
            entityElement.set("type", eType)
            entityElement.set("predictions", predictionString)
            headOffset = headToken.get("charOffset")
            if self.insertWeights and headOffset in goldEntityByHeadOffset:
                # in other words, use gold types: the gold entities receive
                # the prediction string of the predicted entity
                for entity in goldEntityByHeadOffset[headOffset]:
                    entity.set("predictions", entityElement.get("predictions"))
            if headOffset in goldEntityTypeByHeadOffset:
                entityElement.set("goldType", goldEntityTypeByHeadOffset[headOffset])
            if "goldIds" in features:  # The entities for which this example was built
                entityElement.set("goldIds", features["goldIds"])
            isNewPositive = (entityElement.get("type") != "neg"
                             and entityElement.get("headOffset") not in goldEntityByHeadOffset)
            # Entities failing this test (negatives, or heads already covered
            # by a gold entity in insertWeights mode) are not written.
            if isNewPositive or not self.insertWeights:
                newEntityIdCount += 1
                sentenceElement.append(entityElement)

    # if only adding weights, re-attach interactions and gold entities
    if self.insertWeights:
        for entity in nonNameEntities:
            sentenceElement.append(entity)
        for interaction in interactions:
            sentenceElement.append(interaction)

    # re-attach the analyses-element
    if sentenceAnalysesElement is not None:
        sentenceElement.append(sentenceAnalysesElement)

    # Extend bacteria triggers
    if extensionRequested:
        InteractionXML.ExtendTriggers.extend(sentenceElement, entityTypes=["Bacterium"])
def writeXMLSentence(self, examples, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=None): sentenceElement = sentenceObject.sentence self.sentenceId = sentenceElement.get("id") self.assertSameSentence(examples, self.sentenceId) # detach analyses-element sentenceAnalysesElement = None sentenceAnalysesElement = sentenceElement.find("sentenceanalyses") if sentenceAnalysesElement == None: sentenceAnalysesElement = sentenceElement.find("analyses") if sentenceAnalysesElement != None: sentenceElement.remove(sentenceAnalysesElement) # remove pairs and interactions interactions = self.removeChildren(sentenceElement, ["pair", "interaction"]) # remove entities entities = self.removeNonNameEntities(sentenceElement) # filter interactions interactionsToKeep = [] for interaction in interactions: if interaction.get("type") != "neg": interactionsToKeep.append(interaction) interactions = interactionsToKeep # early out cutoff = 100 if len(interactions) == 0 or len(interactions) > cutoff: # re-attach the analyses-element if sentenceAnalysesElement != None: sentenceElement.append(sentenceAnalysesElement) if len(interactions) > cutoff: print >> sys.stderr, "Warning, sentence", sentenceObject.sentence.get( "id" ), "has more than", cutoff, "interactions, removing all." return interactionsByEntity = {} interactionsById = {} for entity in entities: interactionsByEntity[entity.get("id")] = [] for interaction in interactions: e1Id = interaction.get("e1") if not interactionsByEntity.has_key(e1Id): interactionsByEntity[e1Id] = [] interactionsByEntity[e1Id].append(interaction) interactionsById[interaction.get("id")] = interaction # NOTE! 
Following won't work for pairs self.entityCount = IDUtils.getNextFreeId( sentenceElement.findall("entity")) self.interactionCount = IDUtils.getNextFreeId( sentenceElement.findall("interaction")) self.newEntities = [] self.newInteractions = [] # Mapping for connecting the events self.entitiesByHeadByType = {} #self.tokenByOffset = {} #for token in sentenceObject.tokens: # self.tokenByOffset[token.get("charOffset")] = token # self.entityByHeadByType[token.get("charOffset")] = {} for entity in sentenceObject.entities: # by offset offset = entity.get("headOffset") if not self.entitiesByHeadByType.has_key(offset): self.entitiesByHeadByType[offset] = {} # by type eType = entity.get("type") if entity.get("isName") != "True": self.entitiesByHeadByType[offset][eType] = [] else: # add names to structure if not self.entitiesByHeadByType[offset].has_key(eType): self.entitiesByHeadByType[offset][eType] = [] self.entitiesByHeadByType[offset][eType].append(entity) entityKeys = sentenceObject.entitiesById.keys() exampleByEntityId = {} for example in examples: #if predictionsByExample[example[0]][0] == 1: # negative # continue eId = example[3]["e"] assert eId in entityKeys if not exampleByEntityId.has_key(eId): exampleByEntityId[eId] = [] exampleByEntityId[eId].append(example) # This doesn't work, it was an attempt to include # only the positive example with the highest prediction strength # for key in sorted(exampleByEntityId.keys()): # eType = sentenceObject.entitiesById[key].get("type") # eExamples = exampleByEntityId[key] # if eType == "Binding" and len(eExamples) > 1: # maxArgs = -1 # maxStr = -999999999 # for example in eExamples: # if predictionsByExample[example[0]][0] == 1: # continue # numArgs = example[3]["i"].count(",") + 1 # if numArgs > maxArgs: # maxArgs = numArgs # predClass = predictionsByExample[example[0]][0] # predictionStrength = predictionsByExample[example[0]][predClass] # if predictionStrength > maxStr: # maxStr = predictionStrength # #print maxArgs, 
len(eExamples) # for example in eExamples: # if predictionsByExample[example[0]][0] == 1: # continue # predClass = predictionsByExample[example[0]][0] # predictionStrength = predictionsByExample[example[0]][predClass] # if predictionStrength != maxStr: # examples.remove(example) # #if example[3]["i"].count(",") + 1 < maxArgs: # # examples.remove(example) #self.newEntitiesById = {} #self.outEdgesByEntity = {} # Gather arguments for the simple, one-argument events argumentsByExample = {} positiveExamples = [] exampleIdCount = 0 for entity in entities: # If no example, case is unambiguous if entity.get("id") not in exampleByEntityId: simpleEventInteractions = interactionsByEntity[entity.get( "id")] numCauses = 0 numThemes = 0 for interaction in simpleEventInteractions[:]: if self.isIntersentence(interaction): print "Warning, intersentence interaction for", entity.get( "id"), entity.get("type") simpleEventInteractions.remove(interaction) continue if interaction.get("type") == "neg": simpleEventInteractions.remove(interaction) continue iType = interaction.get("type") if iType == "Cause": numCauses += 1 elif iType == "Theme": numThemes += 1 eType = entity.get("type") assert numThemes == 0 or (numThemes != 0 and numCauses == 0) or (numThemes > 1 and eType != "Binding"), ( numThemes, numCauses, eType, entity.get("id"), [ x[0] for x in examples ], entityKeys) #assert numThemes == 0 or (numThemes != 0 and numCauses == 0) or (numThemes > 1 and eType == "Binding"), (numThemes,numCauses,eType,entity.get("id")) for interaction in simpleEventInteractions: self.counts["simple-" + eType + "-" + interaction.get("type")] += 1 exampleId = "simple." 
+ str(exampleIdCount) exampleIdCount += 1 positiveExamples.append([exampleId, None, None, None]) argumentsByExample[exampleId] = [interaction] #self.addEvent([interaction], sentenceObject, "simple") # Gather arguments for predicted, unmerged events for example in examples: #print predictionsByExample[example[0]] if predictionsByExample[example[0]][0] == 1: # negative continue positiveExamples.append(example) arguments = [] for iId in example[3]["i"].split(","): if iId == "": # processes can have 0 arguments assert "etype" in example[3], example[3] assert example[3]["etype"] == "Process", example[3] break arg = interactionsById[iId] if self.isIntersentence(arg): continue assert arg.get("type") != "neg" arguments.append(arg) argumentsByExample[example[0]] = arguments # Loop until all positive examples are added. This process # assumes that the events (mostly) form a directed acyclic # graph, which can written by "growing" the structure from # the "leaf" events, and consecutively adding levels of # nesting events. examplesLeft = len(positiveExamples) exampleAdded = {} for example in positiveExamples: exampleAdded[example[0]] = False forceAdd = False forcedCount = 0 while examplesLeft > 0: if len(self.newEntities) > 100: print >> sys.stderr, "Warning, sentence", sentenceObject.sentence.get( "id" ), "has generated more than", cutoff, "events, skipping the rest." break examplesAddedThisRound = 0 # For each round, loop through the potentially remaining examples for example in positiveExamples: if len(self.newEntities) > 100: break if exampleAdded[ example[0]]: # This event has already been inserted continue arguments = argumentsByExample[example[0]] # An event can be added if all of its argument events have already # been added. Addition is forced if lack of argument events blocks # the process. 
if forceAdd or self.argumentEntitiesExist( arguments, sentenceObject): umType = "complex" # mark the root entity in the output xml predictionStrength = None if example[0].find("simple") != -1: umType = "simple" else: # Prediction strength is only available for classified argument groups predictionStrength = self.getPredictionStrength( example, predictionsByExample, classSet, classIds) #print example if umType != "simple" and "eType" in example[ 3] and example[3]["etype"] == "Process" and len( arguments) == 0: origProcess = sentenceObject.entitiesById[example[3] ["e"]] # Put back the original entity newProcess = self.addEntity(origProcess) newProcess.set("umType", umType) if predictionStrength != None: newProcess.set("umStrength", str(predictionStrength)) else: # example has arguments self.addEvent(arguments, sentenceObject, umType, forceAdd, predictionStrength) exampleAdded[example[0]] = True examplesLeft -= 1 examplesAddedThisRound += 1 forceAdd = False if examplesLeft > 0 and examplesAddedThisRound == 0: # If there are examples left, but nothing was added, this # means that some nested events are missing. Theoretically # this could also be because two events are referring to # each other, preventing each other's insertion. In any # case this is solved by simply forcing the addition of # the first non-inserted event, by creating 0-argument # entities for its argument events. forcedCount += 1 #print "Warning, forcing event addition" forceAdd = True # Attach the new elements for element in self.newEntities + self.newInteractions: sentenceElement.append(element) # re-attach the analyses-element if sentenceAnalysesElement != None: sentenceElement.append(sentenceAnalysesElement)
def run(input, output, eventDir, parse="split-mccc-preparsed", verbose=False): print >> sys.stderr, "Loading corpus", input corpusTree = ETUtils.ETFromObj(input) print >> sys.stderr, "Corpus file loaded" corpusRoot = corpusTree.getroot() counts = defaultdict(int) for document in corpusRoot.findall("document"): sentDict = None pmid = document.get("pmid") isPMC = False for sentence in document.findall("sentence"): counts["sentences"] += 1 sentenceId = str(sentence.get("id")) + "/" + str(sentence.get("origId")) if verbose: print "Processing", sentenceId if sentDict == None: if sentence.get("origId") != None: assert pmid == None sentDict = loadEventXML( eventDir + "/" + sentence.get("origId").split(".")[0] + ".xml" , verbose=verbose) else: #pmid = sentence.get("pmid") assert pmid != None if pmid.startswith("PMC"): isPMC = True sentDict = {} else: assert pmid.startswith("PMID") sentDict = loadEventXML( eventDir + "/" + pmid.split("-", 1)[-1] + ".xml" , verbose=verbose) interactionXMLText = sentence.get("text") if not sentDict.has_key(interactionXMLText): counts["missing-sentences"] += 1 if isPMC: counts["missing-sentences-PMC"] += 1 if verbose: print "Missing sentence:", pmid, (sentenceId, sentDict, sentence.get("text")) else: sentenceAnalyses = sentence.find("sentenceanalyses") if sentenceAnalyses != None: sentence.remove(sentenceAnalyses) entityIdCount = IDUtils.getNextFreeId(sentence.findall("entity")) events = sentDict[interactionXMLText] events.sort() for event in events: if not keepEvent(event[2]): counts["filtered-triggers"] += 1 continue trigger = ET.Element("entity") trigger.set("isName", "False") trigger.set("charOffset", str(event[0]) + "-" + str(event[1])) trigger.set("type", str(event[2])) trigger.set("text", str(event[3])) trigger.set("source", "GENIA_event_annotation_0.9") trigger.set("id", sentence.get("id") + ".e" + str(entityIdCount)) entityIdCount += 1 counts["added-triggers"] += 1 sentence.append(trigger) if sentenceAnalyses != None: 
sentence.append(sentenceAnalyses) FindHeads.findHeads(corpusTree, parse, removeExisting=False) removeDuplicates(corpusRoot) print counts if output != None: print >> sys.stderr, "Writing output to", output ETUtils.write(corpusRoot, output) return corpusTree
def _formatClassWeights(prediction, classSet, classIds):
    """Return prediction[1:] as comma separated "className:weight" pairs."""
    return ",".join(
        classSet.getName(classIds[i]) + ":" + str(weight)
        for i, weight in enumerate(prediction[1:]))


def _findTokenById(tokens, tokenId):
    """Return the first token element whose "id" is tokenId, or None."""
    for token in tokens:
        if token.get("id") == tokenId:
            return token
    return None


def _newTriggerEntity(sentenceObject, sentenceId, headTokenId, idNumber):
    """Create a non-name entity element located at the given head token."""
    headToken = _findTokenById(sentenceObject.tokens, headTokenId)
    entityElement = ET.Element("entity")
    entityElement.set("isName", "False")
    entityElement.set("charOffset", headToken.get("charOffset"))
    entityElement.set("headOffset", headToken.get("charOffset"))
    entityElement.set("text", headToken.get("text"))
    entityElement.set("id", sentenceId + ".e" + str(idNumber))
    return entityElement


def _writeExamplesToInteractionXML(examples, predictionsByExample, sentenceObject, classSet, classIds, xType):
    """
    Write the predictions of one sentence's examples into its XML element.

    The sentenceanalyses child is detached for the duration of the call and
    re-attached last, and all existing "pair" and "interaction" children are
    removed. What is then added depends on xType:

    "token" -- non-name entities are replaced by one entity per example,
               located at the token named by example[3]["t"]
    "edge" -- one interaction per example, from example[3]["e1"] to
              example[3]["e2"], with ids numbered from 0
    "trigger-event" -- one entity per non-negative example (token
              example[3]["et"]) plus Theme/Cause interactions to the first
              event created on the target token, or else to an existing
              name entity
    "event" -- delegated to process()
    None -- nothing is added

    Any other xType ends the program through sys.exit.

    Parameters:
    examples -- examples of a single sentence; example[0] is an id of the
                form "<sentence>.x<number>", example[3] a dictionary
    predictionsByExample -- maps example id to [predicted class, weights...];
                in "trigger-event", class 1 means negative
    sentenceObject -- provides .sentence (XML element) and .tokens
    classSet -- maps class ids to names with getName(), or None for binary
                classification, where the type becomes "True" or "False" and
                no "predictions" attribute is written
    classIds -- class ids in the order of the prediction weights
    """
    # All examples must belong to the same sentence
    currentSetMajorId = None
    for example in examples:
        majorId, _minorId = example[0].rsplit(".x", 1)
        if currentSetMajorId is None:
            currentSetMajorId = majorId
        else:
            assert currentSetMajorId == majorId, str(currentSetMajorId) + "/" + str(majorId)

    sentenceElement = sentenceObject.sentence
    sentenceId = sentenceElement.get("id")

    # detach analyses
    sentenceAnalysesElement = sentenceElement.find("sentenceanalyses")
    if sentenceAnalysesElement is not None:
        sentenceElement.remove(sentenceAnalysesElement)
    # remove pairs and interactions
    for tag in ("pair", "interaction"):
        for element in sentenceElement.findall(tag):
            sentenceElement.remove(element)

    if xType == "token":
        # remove entities that are interaction words
        for entityElement in sentenceElement.findall("entity"):
            if entityElement.get("isName") == "False":
                sentenceElement.remove(entityElement)
        # add new entities, numbered after the remaining ones
        newEntityIdCount = IDUtils.getNextFreeId(sentenceElement.findall("entity"))
        for example in examples:
            prediction = predictionsByExample[example[0]]
            entityElement = _newTriggerEntity(sentenceObject, sentenceId, example[3]["t"], newEntityIdCount)
            newEntityIdCount += 1
            if classSet is None: # binary classification
                entityElement.set("type", str(prediction[0] > 0))
            else:
                entityElement.set("type", classSet.getName(prediction[0]))
                entityElement.set("predictions", _formatClassWeights(prediction, classSet, classIds))
            sentenceElement.append(entityElement)
    elif xType == "edge":
        for pairCount, example in enumerate(examples):
            prediction = predictionsByExample[example[0]]
            pairElement = ET.Element("interaction")
            pairElement.set("directed", "Unknown")
            pairElement.set("e1", example[3]["e1"])
            pairElement.set("e2", example[3]["e2"])
            pairElement.set("id", sentenceId + ".i" + str(pairCount))
            if classSet is None: # binary classification
                pairElement.set("type", str(prediction[0] > 0))
            else:
                pairElement.set("type", classSet.getName(prediction[0]))
                pairElement.set("predictions", _formatClassWeights(prediction, classSet, classIds))
            sentenceElement.append(pairElement)
    elif xType == "trigger-event":
        # remove interaction words, remember the ids of the name entities
        existingEntities = set()
        for entityElement in sentenceElement.findall("entity"):
            if entityElement.get("isName") == "False":
                sentenceElement.remove(entityElement)
            else:
                existingEntities.add(entityElement.get("id"))
        newEntityIdCount = IDUtils.getNextFreeId(sentenceElement.findall("entity"))
        # Class 1 is the negative class; those examples produce nothing
        positiveExamples = [example for example in examples
                            if predictionsByExample[example[0]][0] != 1]
        # First create all events, so that edges can point to any of them
        eventsByToken = {}
        eventIdByExample = {}
        for example in positiveExamples:
            prediction = predictionsByExample[example[0]]
            headTokenId = example[3]["et"]
            entityElement = _newTriggerEntity(sentenceObject, sentenceId, headTokenId, newEntityIdCount)
            newEntityIdCount += 1
            eventIdByExample[example[0]] = entityElement.get("id")
            eventsByToken.setdefault(headTokenId, []).append(entityElement.get("id"))
            entityElement.set("type", example[3]["type"])
            entityElement.set("predictions", _formatClassWeights(prediction, classSet, classIds))
            sentenceElement.append(entityElement)
        # Then add the theme and cause edges
        pairCount = 0
        for example in positiveExamples:
            extra = example[3]
            for edgeType, entityKey, tokenKey in (("Theme", "t", "tt"), ("Cause", "c", "ct")):
                if entityKey not in extra:
                    continue
                # Prefer an event created on the target token, fall back to
                # an existing name entity, otherwise write no edge
                if extra[tokenKey] in eventsByToken:
                    targetId = eventsByToken[extra[tokenKey]][0]
                elif extra[entityKey] in existingEntities:
                    targetId = extra[entityKey]
                else:
                    continue
                if targetId is None:
                    continue
                pairElement = ET.Element("interaction")
                pairElement.set("directed", "Unknown")
                pairElement.set("e1", eventIdByExample[example[0]])
                pairElement.set("e2", targetId)
                pairElement.set("id", sentenceId + ".i" + str(pairCount))
                pairElement.set("type", edgeType)
                sentenceElement.append(pairElement)
                pairCount += 1
    elif xType == "event":
        # process() expects the examples keyed by sentence id
        examplesBySentence = {}
        if len(examples) > 0:
            examplesBySentence[sentenceId] = examples
        process(sentenceObject, examplesBySentence, classSet, classIds, predictionsByExample)
    elif xType is not None:
        sys.exit("Error, unknown xtype")

    # re-attach the analyses-element
    if sentenceAnalysesElement is not None:
        sentenceElement.append(sentenceAnalysesElement)