def writeXMLSentence(self, examples, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=None, exampleStyle=None, structureAnalyzer=None):
    """Rebuild one sentence element from classified examples.

    Detaches the parse-analyses element, strips the old interactions and
    non-name entities, then delegates the actual reconstruction to the
    mapping/insertion helper methods before re-attaching everything.

    :param examples: examples built for this sentence
    :param predictionsByExample: example id -> prediction
    :param sentenceObject: sentence wrapper (has .sentence, .entities, .entitiesById)
    :param classSet/classIds: class name <-> id mappings
    :param goldSentence, exampleStyle, structureAnalyzer: unused in this variant,
        kept for signature compatibility with the other writers
    """
    sentenceElement = sentenceObject.sentence
    self.sentenceId = sentenceElement.get("id")
    self.assertSameSentence(examples, self.sentenceId)
    # detach analyses-element ("sentenceanalyses" is the legacy tag name)
    sentenceAnalysesElement = sentenceElement.find("sentenceanalyses")
    if sentenceAnalysesElement is None:
        sentenceAnalysesElement = sentenceElement.find("analyses")
    if sentenceAnalysesElement is not None:
        sentenceElement.remove(sentenceAnalysesElement)
    # remove pairs and interactions
    interactions = self.removeChildren(sentenceElement, ["interaction"])
    arguments, relations = self.getInteractionsAndRelations(interactions)
    # remove entities (predicted ones only; named entities are kept)
    self.removeNonNameEntities(sentenceElement)
    _, interactionsById = self.mapInteractions(arguments + relations, sentenceObject.entities)
    # id counters continue from the highest id still present in the sentence
    self.entityCount = IDUtils.getNextFreeId(sentenceElement.findall("entity"))
    self.interactionCount = IDUtils.getNextFreeId(sentenceElement.findall("interaction"))
    self.newEntities = []
    self.newInteractions = []
    self.mapEntities(sentenceObject.entities)
    self.mapExamples(examples, sentenceObject)
    argumentsByExample = self.connectArgumentsToExamples(examples, predictionsByExample, interactionsById, sentenceObject.entitiesById)
    self.mapEntityDuplicates(sentenceObject.entities)
    self.insertExamples(examples, predictionsByExample, argumentsByExample, sentenceObject, classSet, classIds)
    self.insertRelations(relations, sentenceObject.entitiesById)
    # Attach the new elements
    for element in self.newEntities + self.newInteractions:
        sentenceElement.append(element)
    # re-attach the analyses-element
    if sentenceAnalysesElement is not None:
        sentenceElement.append(sentenceAnalysesElement)
def writeXMLSentence(self, examples, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=None, exampleStyle=None, structureAnalyzer=None):
    """Rebuild one sentence element, adding a predicted entity per example.

    Each example carries its own character offset and phrase type; the entity
    text is sliced directly from the sentence text using that offset.

    :param examples: examples built for this sentence
    :param predictionsByExample: example id -> prediction
    :param sentenceObject: sentence wrapper (has .sentence, .tokens)
    :param classSet/classIds: class name <-> id mappings
    :param goldSentence, exampleStyle, structureAnalyzer: unused in this variant,
        kept for signature compatibility with the other writers
    """
    self.assertSameSentence(examples)
    sentenceElement = sentenceObject.sentence
    sentenceId = sentenceElement.get("id")
    sentenceText = sentenceElement.get("text")
    # detach analyses-element ("sentenceanalyses" is the legacy tag name)
    sentenceAnalysesElement = sentenceElement.find("sentenceanalyses")
    if sentenceAnalysesElement is None:
        sentenceAnalysesElement = sentenceElement.find("analyses")
    if sentenceAnalysesElement is not None:
        sentenceElement.remove(sentenceAnalysesElement)
    # remove pairs and interactions
    self.removeChildren(sentenceElement, ["pair", "interaction"])
    # remove entities; new ids continue from the highest remaining one
    newEntityIdCount = IDUtils.getNextFreeId(sentenceElement.findall("entity"))
    self.removeNonNameEntities(sentenceElement)
    # add new pairs
    for example in examples:
        prediction = predictionsByExample[example[0]]
        entityElement = ET.Element("entity")
        #entityElement.attrib["given"] = "False"
        # resolve the head token id to the token element; fail loudly if missing
        headTokenId = example[3]["t"]
        headToken = None
        for token in sentenceObject.tokens:
            if token.get("id") == headTokenId:
                headToken = token
                break
        assert headToken is not None, example[3]
        entityElement.set("charOffset", example[3]["charOffset"])
        entityElement.set("headOffset", headToken.get("charOffset"))
        entityElement.set("phraseType", example[3]["ptype"])
        entOffset = Range.charOffsetToSingleTuple(example[3]["charOffset"])
        entityElement.set("text", sentenceText[entOffset[0]:entOffset[1]])
        entityElement.set("id", sentenceId + ".e" + str(newEntityIdCount))
        self.setElementType(entityElement, prediction, classSet, classIds)
        newEntityIdCount += 1
        sentenceElement.append(entityElement)
    # re-attach the analyses-element
    if sentenceAnalysesElement is not None:
        sentenceElement.append(sentenceAnalysesElement)
def writeXMLSentence(self, examples, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=None, exampleStyle=None, structureAnalyzer=None):
    """Rebuild one sentence element from classified examples.

    Detaches the parse-analyses element, strips the old interactions and
    non-name entities, then delegates the actual reconstruction to the
    mapping/insertion helper methods before re-attaching everything.

    :param examples: examples built for this sentence
    :param predictionsByExample: example id -> prediction
    :param sentenceObject: sentence wrapper (has .sentence, .entities, .entitiesById)
    :param classSet/classIds: class name <-> id mappings
    :param goldSentence, exampleStyle, structureAnalyzer: unused in this variant,
        kept for signature compatibility with the other writers
    """
    sentenceElement = sentenceObject.sentence
    self.sentenceId = sentenceElement.get("id")
    self.assertSameSentence(examples, self.sentenceId)
    # detach analyses-element ("sentenceanalyses" is the legacy tag name)
    sentenceAnalysesElement = sentenceElement.find("sentenceanalyses")
    if sentenceAnalysesElement is None:
        sentenceAnalysesElement = sentenceElement.find("analyses")
    if sentenceAnalysesElement is not None:
        sentenceElement.remove(sentenceAnalysesElement)
    # remove pairs and interactions
    interactions = self.removeChildren(sentenceElement, ["interaction"])
    arguments, relations = self.getInteractionsAndRelations(interactions)
    # remove entities (predicted ones only; named entities are kept)
    self.removeNonNameEntities(sentenceElement)
    _, interactionsById = self.mapInteractions(arguments + relations, sentenceObject.entities)
    # id counters continue from the highest id still present in the sentence
    self.entityCount = IDUtils.getNextFreeId(sentenceElement.findall("entity"))
    self.interactionCount = IDUtils.getNextFreeId(sentenceElement.findall("interaction"))
    self.newEntities = []
    self.newInteractions = []
    self.mapEntities(sentenceObject.entities)
    self.mapExamples(examples, sentenceObject)
    argumentsByExample = self.connectArgumentsToExamples(examples, predictionsByExample, interactionsById, sentenceObject.entitiesById)
    self.mapEntityDuplicates(sentenceObject.entities)
    self.insertExamples(examples, predictionsByExample, argumentsByExample, sentenceObject, classSet, classIds)
    self.insertRelations(relations, sentenceObject.entitiesById)
    # Attach the new elements
    for element in self.newEntities + self.newInteractions:
        sentenceElement.append(element)
    # re-attach the analyses-element
    if sentenceAnalysesElement is not None:
        sentenceElement.append(sentenceAnalysesElement)
def writeXMLSentence(self, examples, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=None, exampleStyle=None, structureAnalyzer=None):
    """Rebuild one sentence element, adding a predicted entity per example.

    Each example carries its own character offset and phrase type; the entity
    text is sliced directly from the sentence text using that offset.

    :param examples: examples built for this sentence
    :param predictionsByExample: example id -> prediction
    :param sentenceObject: sentence wrapper (has .sentence, .tokens)
    :param classSet/classIds: class name <-> id mappings
    :param goldSentence, exampleStyle, structureAnalyzer: unused in this variant,
        kept for signature compatibility with the other writers
    """
    self.assertSameSentence(examples)
    sentenceElement = sentenceObject.sentence
    sentenceId = sentenceElement.get("id")
    sentenceText = sentenceElement.get("text")
    # detach analyses-element ("sentenceanalyses" is the legacy tag name)
    sentenceAnalysesElement = sentenceElement.find("sentenceanalyses")
    if sentenceAnalysesElement is None:
        sentenceAnalysesElement = sentenceElement.find("analyses")
    if sentenceAnalysesElement is not None:
        sentenceElement.remove(sentenceAnalysesElement)
    # remove pairs and interactions
    self.removeChildren(sentenceElement, ["pair", "interaction"])
    # remove entities; new ids continue from the highest remaining one
    newEntityIdCount = IDUtils.getNextFreeId(sentenceElement.findall("entity"))
    self.removeNonNameEntities(sentenceElement)
    # add new pairs
    for example in examples:
        prediction = predictionsByExample[example[0]]
        entityElement = ET.Element("entity")
        #entityElement.attrib["given"] = "False"
        # resolve the head token id to the token element; fail loudly if missing
        headTokenId = example[3]["t"]
        headToken = None
        for token in sentenceObject.tokens:
            if token.get("id") == headTokenId:
                headToken = token
                break
        assert headToken is not None, example[3]
        entityElement.set("charOffset", example[3]["charOffset"])
        entityElement.set("headOffset", headToken.get("charOffset"))
        entityElement.set("phraseType", example[3]["ptype"])
        entOffset = Range.charOffsetToSingleTuple(example[3]["charOffset"])
        entityElement.set("text", sentenceText[entOffset[0]:entOffset[1]])
        entityElement.set("id", sentenceId + ".e" + str(newEntityIdCount))
        self.setElementType(entityElement, prediction, classSet, classIds)
        newEntityIdCount += 1
        sentenceElement.append(entityElement)
    # re-attach the analyses-element
    if sentenceAnalysesElement is not None:
        sentenceElement.append(sentenceAnalysesElement)
def writeXMLSentence(self, examples, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=None, exampleStyle=None, structureAnalyzer=None):
    """Rebuild one sentence element with trigger entities for predictions.

    One entity is created per predicted type per example (merged classes are
    split via getElementTypes). When self.insertWeights is set, gold entities
    receive the predicted confidence instead, and gold type information is
    recorded on the new elements.

    :param examples: examples built for this sentence
    :param predictionsByExample: example id -> prediction
    :param sentenceObject: sentence wrapper (has .sentence, .tokens)
    :param classSet/classIds: class name <-> id mappings
    :param goldSentence: optional gold sentence used for goldType/conf transfer
    :param exampleStyle: style dict; "names" triggers removal of all entities
    :param structureAnalyzer: used to flag event-type entities
    """
    self.assertSameSentence(examples)
    extensionRequested = False
    sentenceElement = sentenceObject.sentence
    sentenceId = sentenceElement.get("id")
    # detach analyses-element ("sentenceanalyses" is the legacy tag name)
    sentenceAnalysesElement = sentenceElement.find("sentenceanalyses")
    if sentenceAnalysesElement is None:
        sentenceAnalysesElement = sentenceElement.find("analyses")
    if sentenceAnalysesElement is not None:
        sentenceElement.remove(sentenceAnalysesElement)
    # remove pairs and interactions
    interactions = self.removeChildren(sentenceElement, ["pair", "interaction"])
    # remove entities; new ids continue from the highest remaining one
    newEntityIdCount = IDUtils.getNextFreeId(sentenceElement.findall("entity"))
    nonNameEntities = self.removeNonNameEntities(sentenceElement)
    # remove named entities if needed
    if exampleStyle is not None and "names" in exampleStyle and exampleStyle["names"]:
        # remove all entities, including names
        self.removeChildren(sentenceElement, ["entity"])
    # gold sentence elements, grouped by head offset
    goldEntityTypeByHeadOffset = {}
    goldEntityByHeadOffset = {}
    if goldSentence is not None:
        for entity in goldSentence.entities:
            headOffset = entity.get("headOffset")
            if headOffset not in goldEntityTypeByHeadOffset:
                goldEntityTypeByHeadOffset[headOffset] = []
                goldEntityByHeadOffset[headOffset] = []
            goldEntityTypeByHeadOffset[headOffset].append(entity)
            goldEntityByHeadOffset[headOffset].append(entity)
        # collapse each offset's entity list into a single merged type string
        for key in goldEntityTypeByHeadOffset:
            goldEntityTypeByHeadOffset[key] = self.getMergedEntityType(goldEntityTypeByHeadOffset[key])
        # tokens without a gold entity count as negatives
        for token in sentenceObject.tokens:
            if token.get("charOffset") not in goldEntityTypeByHeadOffset:
                goldEntityTypeByHeadOffset[token.get("charOffset")] = "neg"
    # add new pairs
    for example in examples:
        # Entity examples always refer to a single head token
        headTokenId = example[3]["t"]
        headToken = None
        for token in sentenceObject.tokens:
            if token.get("id") == headTokenId:
                headToken = token
                break
        assert headToken is not None, example[3]
        # Determine if additional processing is requested
        unmergeEPINeg = None
        if "unmergeneg" in example[3] and example[3]["unmergeneg"] == "epi":
            unmergeEPINeg = headToken.get("text")
        if "trigex" in example[3] and example[3]["trigex"] == "bb":
            extensionRequested = True
        # Make entities for positive predictions
        prediction = predictionsByExample[example[0]]
        predictionString = self.getPredictionStrengthString(prediction, classSet, classIds)
        for eType in self.getElementTypes(prediction, classSet, classIds, unmergeEPINegText=unmergeEPINeg):  # split merged classes
            entityElement = ET.Element("entity")
            #entityElement.set("given", "False")
            entityElement.set("charOffset", headToken.get("charOffset"))
            entityElement.set("headOffset", headToken.get("charOffset"))
            entityElement.set("text", headToken.get("text"))
            entityElement.set("id", sentenceId + ".e" + str(newEntityIdCount))
            entityElement.set("type", eType)
            entityElement.set("conf", predictionString)
            if structureAnalyzer.isEvent(eType):
                entityElement.set("event", "True")
            #self.setElementType(entityElement, prediction, classSet, classIds, unmergeEPINeg=unmergeEPINeg)
            if self.insertWeights:  # in other words, use gold types
                headOffset = headToken.get("charOffset")
                if headOffset in goldEntityByHeadOffset:
                    for entity in goldEntityByHeadOffset[headOffset]:
                        entity.set("conf", entityElement.get("conf"))
                if headToken.get("charOffset") in goldEntityTypeByHeadOffset:
                    entityElement.set("goldType", goldEntityTypeByHeadOffset[headToken.get("charOffset")])
            if "goldIds" in example[3]:  # The entities for which this example was built
                entityElement.set("goldIds", example[3]["goldIds"])
            # Append non-negatives without a gold counterpart, or everything
            # when not inserting weights
            if (entityElement.get("type") != "neg" and entityElement.get("headOffset") not in goldEntityByHeadOffset) or not self.insertWeights:
                newEntityIdCount += 1
                sentenceElement.append(entityElement)
            elif entityElement.get("type") == "neg":
                pass
                #newEntityIdCount += 1
                #sentenceElement.append(entityElement)
    # if only adding weights, re-attach interactions and gold entities
    if self.insertWeights:
        for entity in nonNameEntities:
            sentenceElement.append(entity)
        for interaction in interactions:
            sentenceElement.append(interaction)
    # re-attach the analyses-element
    if sentenceAnalysesElement is not None:
        sentenceElement.append(sentenceAnalysesElement)
    # Extend bacteria triggers
    if extensionRequested:
        Utils.InteractionXML.ExtendTriggers.extend(sentenceElement, entityTypes=["Bacterium"])
def writeXMLSentence(self, examples, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=None, exampleStyle=None, structureAnalyzer=None):
    """Rebuild one sentence element with trigger entities for predictions.

    Variant that honors an optional per-example "define_offset" override for
    the entity's charOffset. When self.insertWeights is set, gold entities
    receive the predicted confidence and gold type info is recorded.

    :param examples: examples built for this sentence
    :param predictionsByExample: example id -> prediction
    :param sentenceObject: sentence wrapper (has .sentence, .tokens)
    :param classSet/classIds: class name <-> id mappings
    :param goldSentence: optional gold sentence used for goldType/conf transfer
    :param exampleStyle: style dict; "names" triggers removal of all entities
    :param structureAnalyzer: used to flag event-type entities
    """
    self.assertSameSentence(examples)
    extensionRequested = False
    sentenceElement = sentenceObject.sentence
    sentenceId = sentenceElement.get("id")
    # detach analyses-element ("sentenceanalyses" is the legacy tag name)
    sentenceAnalysesElement = sentenceElement.find("sentenceanalyses")
    if sentenceAnalysesElement is None:
        sentenceAnalysesElement = sentenceElement.find("analyses")
    if sentenceAnalysesElement is not None:
        sentenceElement.remove(sentenceAnalysesElement)
    # remove pairs and interactions
    interactions = self.removeChildren(sentenceElement, ["pair", "interaction"])
    # remove entities; new ids continue from the highest remaining one
    newEntityIdCount = IDUtils.getNextFreeId(sentenceElement.findall("entity"))
    nonNameEntities = self.removeNonNameEntities(sentenceElement)
    # remove named entities if needed
    if exampleStyle is not None and "names" in exampleStyle and exampleStyle["names"]:
        # remove all entities, including names
        self.removeChildren(sentenceElement, ["entity"])
    # gold sentence elements, grouped by head offset
    goldEntityTypeByHeadOffset = {}
    goldEntityByHeadOffset = {}
    if goldSentence is not None:
        for entity in goldSentence.entities:
            headOffset = entity.get("headOffset")
            if headOffset not in goldEntityTypeByHeadOffset:
                goldEntityTypeByHeadOffset[headOffset] = []
                goldEntityByHeadOffset[headOffset] = []
            goldEntityTypeByHeadOffset[headOffset].append(entity)
            goldEntityByHeadOffset[headOffset].append(entity)
        # collapse each offset's entity list into a single merged type string
        for key in goldEntityTypeByHeadOffset:
            goldEntityTypeByHeadOffset[key] = self.getMergedEntityType(goldEntityTypeByHeadOffset[key])
        # tokens without a gold entity count as negatives
        for token in sentenceObject.tokens:
            if token.get("charOffset") not in goldEntityTypeByHeadOffset:
                goldEntityTypeByHeadOffset[token.get("charOffset")] = "neg"
    # add new pairs
    for example in examples:
        # Entity examples always refer to a single head token
        headTokenId = example[3]["t"]
        headToken = None
        for token in sentenceObject.tokens:
            if token.get("id") == headTokenId:
                headToken = token
                break
        assert headToken is not None, example[3]
        # Determine if additional processing is requested
        unmergeEPINeg = None
        if "unmergeneg" in example[3] and example[3]["unmergeneg"] == "epi":
            unmergeEPINeg = headToken.get("text")
        if "trigex" in example[3] and example[3]["trigex"] == "bb":
            extensionRequested = True
        # Make entities for positive predictions
        prediction = predictionsByExample[example[0]]
        predictionString = self.getPredictionStrengthString(prediction, classSet, classIds)
        for eType in self.getElementTypes(prediction, classSet, classIds, unmergeEPINegText=unmergeEPINeg):  # split merged classes
            entityElement = ET.Element("entity")
            #entityElement.set("given", "False")
            entityElement.set("charOffset", headToken.get("charOffset"))
            # an explicit offset in the example overrides the head token's
            if "define_offset" in example[3]:
                entityElement.set("charOffset", example[3]["define_offset"])
            entityElement.set("headOffset", headToken.get("charOffset"))
            entityElement.set("text", headToken.get("text"))
            entityElement.set("id", sentenceId + ".e" + str(newEntityIdCount))
            entityElement.set("type", eType)
            entityElement.set("conf", predictionString)
            if structureAnalyzer.isEvent(eType):
                entityElement.set("event", "True")
            #self.setElementType(entityElement, prediction, classSet, classIds, unmergeEPINeg=unmergeEPINeg)
            if self.insertWeights:  # in other words, use gold types
                headOffset = headToken.get("charOffset")
                if headOffset in goldEntityByHeadOffset:
                    for entity in goldEntityByHeadOffset[headOffset]:
                        entity.set("conf", entityElement.get("conf"))
                if headToken.get("charOffset") in goldEntityTypeByHeadOffset:
                    entityElement.set("goldType", goldEntityTypeByHeadOffset[headToken.get("charOffset")])
            if "goldIds" in example[3]:  # The entities for which this example was built
                entityElement.set("goldIds", example[3]["goldIds"])
            # NOTE(review): a sibling variant of this method uses "or not
            # self.insertWeights" here; with "and", nothing is appended while
            # inserting weights — confirm this is intentional.
            if (entityElement.get("type") != "neg" and entityElement.get("headOffset") not in goldEntityByHeadOffset) and not self.insertWeights:
                newEntityIdCount += 1
                sentenceElement.append(entityElement)
            elif entityElement.get("type") == "neg":
                pass
                #newEntityIdCount += 1
                #sentenceElement.append(entityElement)
    # if only adding weights, re-attach interactions and gold entities
    if self.insertWeights:
        for entity in nonNameEntities:
            sentenceElement.append(entity)
        for interaction in interactions:
            sentenceElement.append(interaction)
    # re-attach the analyses-element
    if sentenceAnalysesElement is not None:
        sentenceElement.append(sentenceAnalysesElement)
    # Extend bacteria triggers
    if extensionRequested:
        Utils.InteractionXML.ExtendTriggers.extend(sentenceElement, entityTypes=["Bacterium"])
def run(input, output, eventDir, parse="split-mccc-preparsed", verbose=False): print >> sys.stderr, "Loading corpus", input corpusTree = ETUtils.ETFromObj(input) print >> sys.stderr, "Corpus file loaded" corpusRoot = corpusTree.getroot() counts = defaultdict(int) for document in corpusRoot.findall("document"): sentDict = None pmid = document.get("pmid") isPMC = False for sentence in document.findall("sentence"): counts["sentences"] += 1 sentenceId = str(sentence.get("id")) + "/" + str(sentence.get("origId")) if verbose: print "Processing", sentenceId if sentDict == None: if sentence.get("origId") != None: assert pmid == None sentDict = loadEventXML( eventDir + "/" + sentence.get("origId").split(".")[0] + ".xml" , verbose=verbose) else: #pmid = sentence.get("pmid") assert pmid != None if pmid.startswith("PMC"): isPMC = True sentDict = {} else: assert pmid.startswith("PMID") sentDict = loadEventXML( eventDir + "/" + pmid.split("-", 1)[-1] + ".xml" , verbose=verbose) interactionXMLText = sentence.get("text") if not sentDict.has_key(interactionXMLText): counts["missing-sentences"] += 1 if isPMC: counts["missing-sentences-PMC"] += 1 if verbose: print "Missing sentence:", pmid, (sentenceId, sentDict, sentence.get("text")) else: sentenceAnalyses = sentence.find("sentenceanalyses") if sentenceAnalyses != None: sentence.remove(sentenceAnalyses) entityIdCount = IDUtils.getNextFreeId(sentence.findall("entity")) events = sentDict[interactionXMLText] events.sort() for event in events: if not keepEvent(event[2]): counts["filtered-triggers"] += 1 continue trigger = ET.Element("entity") trigger.set("isName", "False") trigger.set("charOffset", str(event[0]) + "-" + str(event[1])) trigger.set("type", str(event[2])) trigger.set("text", str(event[3])) trigger.set("source", "GENIA_event_annotation_0.9") trigger.set("id", sentence.get("id") + ".e" + str(entityIdCount)) entityIdCount += 1 counts["added-triggers"] += 1 sentence.append(trigger) if sentenceAnalyses != None: 
sentence.append(sentenceAnalyses) FindHeads.findHeads(corpusTree, parse, removeExisting=False) removeDuplicates(corpusRoot) print counts if output != None: print >> sys.stderr, "Writing output to", output ETUtils.write(corpusRoot, output) return corpusTree
def run(input, output, eventDir, parse="split-mccc-preparsed", verbose=False): print >> sys.stderr, "Loading corpus", input corpusTree = ETUtils.ETFromObj(input) print >> sys.stderr, "Corpus file loaded" corpusRoot = corpusTree.getroot() counts = defaultdict(int) for document in corpusRoot.findall("document"): sentDict = None pmid = document.get("pmid") isPMC = False for sentence in document.findall("sentence"): counts["sentences"] += 1 sentenceId = str(sentence.get("id")) + "/" + str(sentence.get("origId")) if verbose: print "Processing", sentenceId if sentDict == None: if sentence.get("origId") != None: assert pmid == None sentDict = loadEventXML( eventDir + "/" + sentence.get("origId").split(".")[0] + ".xml" , verbose=verbose) else: #pmid = sentence.get("pmid") assert pmid != None if pmid.startswith("PMC"): isPMC = True sentDict = {} else: assert pmid.startswith("PMID") sentDict = loadEventXML( eventDir + "/" + pmid.split("-", 1)[-1] + ".xml" , verbose=verbose) interactionXMLText = sentence.get("text") if not sentDict.has_key(interactionXMLText): counts["missing-sentences"] += 1 if isPMC: counts["missing-sentences-PMC"] += 1 if verbose: print "Missing sentence:", pmid, (sentenceId, sentDict, sentence.get("text")) else: sentenceAnalyses = sentence.find("sentenceanalyses") if sentenceAnalyses != None: sentence.remove(sentenceAnalyses) entityIdCount = IDUtils.getNextFreeId(sentence.findall("entity")) events = sentDict[interactionXMLText] events.sort() for event in events: if not keepEvent(event[2]): counts["filtered-triggers"] += 1 continue trigger = ET.Element("entity") #trigger.set("given", "False") trigger.set("charOffset", str(event[0]) + "-" + str(event[1])) trigger.set("type", str(event[2])) trigger.set("text", str(event[3])) trigger.set("source", "GENIA_event_annotation_0.9") trigger.set("id", sentence.get("id") + ".e" + str(entityIdCount)) entityIdCount += 1 counts["added-triggers"] += 1 sentence.append(trigger) if sentenceAnalyses != None: 
sentence.append(sentenceAnalyses) FindHeads.findHeads(corpusTree, parse, removeExisting=False) removeDuplicates(corpusRoot) print counts if output != None: print >> sys.stderr, "Writing output to", output ETUtils.write(corpusRoot, output) return corpusTree
def writeXMLSentence(self, examples, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=None, exampleStyle=None): sentenceElement = sentenceObject.sentence self.sentenceId = sentenceElement.get("id") self.assertSameSentence(examples, self.sentenceId) # detach analyses-element sentenceAnalysesElement = None sentenceAnalysesElement = sentenceElement.find("sentenceanalyses") if sentenceAnalysesElement == None: sentenceAnalysesElement = sentenceElement.find("analyses") if sentenceAnalysesElement != None: sentenceElement.remove(sentenceAnalysesElement) # remove pairs and interactions interactions = self.removeChildren(sentenceElement, ["pair", "interaction"]) # remove entities entities = self.removeNonNameEntities(sentenceElement) # filter interactions interactionsToKeep = [] for interaction in interactions: if interaction.get("type") != "neg": interactionsToKeep.append(interaction) interactions = interactionsToKeep # early out cutoff = 100 #if len(interactions) == 0 or len(interactions) > cutoff: if len(interactions) > cutoff: # re-attach the analyses-element if sentenceAnalysesElement != None: sentenceElement.append(sentenceAnalysesElement) #if len(interactions) > cutoff: print >> sys.stderr, "Warning, sentence", sentenceObject.sentence.get("id"), "has more than", cutoff, "interactions, removing all." return interactionsByEntity = {} interactionsById = {} for entity in entities: interactionsByEntity[entity.get("id")] = [] for interaction in interactions: e1Id = interaction.get("e1") if not interactionsByEntity.has_key(e1Id): interactionsByEntity[e1Id] = [] interactionsByEntity[e1Id].append(interaction) interactionsById[interaction.get("id")] = interaction # NOTE! 
Following won't work for pairs self.entityCount = IDUtils.getNextFreeId(sentenceElement.findall("entity")) self.interactionCount = IDUtils.getNextFreeId(sentenceElement.findall("interaction")) self.newEntities = [] self.newInteractions = [] # Mapping for connecting the events self.entitiesByHeadByType = {} #self.tokenByOffset = {} #for token in sentenceObject.tokens: # self.tokenByOffset[token.get("charOffset")] = token # self.entityByHeadByType[token.get("charOffset")] = {} for entity in sentenceObject.entities: # by offset offset = entity.get("headOffset") if not self.entitiesByHeadByType.has_key(offset): self.entitiesByHeadByType[offset] = {} # by type eType = entity.get("type") if entity.get("isName") != "True": self.entitiesByHeadByType[offset][eType] = [] else: # add names to structure if not self.entitiesByHeadByType[offset].has_key(eType): self.entitiesByHeadByType[offset][eType] = [] self.entitiesByHeadByType[offset][eType].append(entity) entityKeys = sentenceObject.entitiesById.keys() exampleByEntityId = {} for example in examples: #if predictionsByExample[example[0]][0] == 1: # negative # continue eId = example[3]["e"] assert eId in entityKeys if not exampleByEntityId.has_key(eId): exampleByEntityId[eId] = [] exampleByEntityId[eId].append(example) # This doesn't work, it was an attempt to include # only the positive example with the highest prediction strength # for key in sorted(exampleByEntityId.keys()): # eType = sentenceObject.entitiesById[key].get("type") # eExamples = exampleByEntityId[key] # if eType == "Binding" and len(eExamples) > 1: # maxArgs = -1 # maxStr = -999999999 # for example in eExamples: # if predictionsByExample[example[0]][0] == 1: # continue # numArgs = example[3]["i"].count(",") + 1 # if numArgs > maxArgs: # maxArgs = numArgs # predClass = predictionsByExample[example[0]][0] # predictionStrength = predictionsByExample[example[0]][predClass] # if predictionStrength > maxStr: # maxStr = predictionStrength # #print maxArgs, 
len(eExamples) # for example in eExamples: # if predictionsByExample[example[0]][0] == 1: # continue # predClass = predictionsByExample[example[0]][0] # predictionStrength = predictionsByExample[example[0]][predClass] # if predictionStrength != maxStr: # examples.remove(example) # #if example[3]["i"].count(",") + 1 < maxArgs: # # examples.remove(example) #self.newEntitiesById = {} #self.outEdgesByEntity = {} # Gather arguments for the simple, one-argument events argumentsByExample = {} positiveExamples = [] exampleIdCount = 0 for entity in entities: # If no example, case is unambiguous if entity.get("id") not in exampleByEntityId: simpleEventInteractions = interactionsByEntity[entity.get("id")] numCauses = 0 numThemes = 0 for interaction in simpleEventInteractions[:]: if self.isIntersentence(interaction): print "Warning, intersentence interaction for", entity.get("id"), entity.get("type") simpleEventInteractions.remove(interaction) continue if interaction.get("type") == "neg": simpleEventInteractions.remove(interaction) continue iType = interaction.get("type") if iType == "Cause": numCauses += 1 elif iType == "Theme": numThemes += 1 eType = entity.get("type") assert numThemes == 0 or (numThemes != 0 and numCauses == 0) or (numThemes > 1 and eType != "Binding"), (numThemes,numCauses,eType,entity.get("id"), [x[0] for x in examples], entityKeys) #assert numThemes == 0 or (numThemes != 0 and numCauses == 0) or (numThemes > 1 and eType == "Binding"), (numThemes,numCauses,eType,entity.get("id")) for interaction in simpleEventInteractions: self.counts["simple-" + eType + "-" + interaction.get("type")] += 1 exampleId = "simple." 
+ str(exampleIdCount) exampleIdCount += 1 positiveExamples.append([exampleId,None,None,None]) argumentsByExample[exampleId] = [interaction] #self.addEvent([interaction], sentenceObject, "simple") # Gather arguments for predicted, unmerged events for example in examples: #print predictionsByExample[example[0]] if predictionsByExample[example[0]][0] == 1: # negative continue positiveExamples.append(example) arguments = [] for iId in example[3]["i"].split(","): if iId == "": # processes can have 0 arguments assert "etype" in example[3], example[3] assert example[3]["etype"] == "Process", example[3] break arg = interactionsById[iId] if self.isIntersentence(arg): continue assert arg.get("type") != "neg" arguments.append(arg) argumentsByExample[example[0]] = arguments # Loop until all positive examples are added. This process # assumes that the events (mostly) form a directed acyclic # graph, which can written by "growing" the structure from # the "leaf" events, and consecutively adding levels of # nesting events. examplesLeft = len(positiveExamples) exampleAdded = {} for example in positiveExamples: exampleAdded[example[0]] = False forceAdd = False forcedCount = 0 while examplesLeft > 0: if len(self.newEntities) > 100: print >> sys.stderr, "Warning, sentence", sentenceObject.sentence.get("id"), "has generated more than", cutoff, "events, skipping the rest." break examplesAddedThisRound = 0 # For each round, loop through the potentially remaining examples for example in positiveExamples: if len(self.newEntities) > 100: break if exampleAdded[example[0]]: # This event has already been inserted continue arguments = argumentsByExample[example[0]] # An event can be added if all of its argument events have already # been added. Addition is forced if lack of argument events blocks # the process. 
if forceAdd or self.argumentEntitiesExist(arguments, sentenceObject): umType = "complex" # mark the root entity in the output xml predictionStrength = None if example[0].find("simple") != -1: umType = "simple" else: # Prediction strength is only available for classified argument groups predictionStrength = self.getPredictionStrength(example, predictionsByExample, classSet, classIds) #print example if umType != "simple" and "etype" in example[3] and example[3]["etype"] == "Process" and len(arguments) == 0: origProcess = sentenceObject.entitiesById[example[3]["e"]] # Put back the original entity newProcess = self.addEntity(origProcess) newProcess.set("umType", umType) if predictionStrength != None: newProcess.set("umStrength", str(predictionStrength)) else: # example has arguments self.addEvent(arguments, sentenceObject, umType, forceAdd, predictionStrength, exampleNotes=example[3]) exampleAdded[example[0]] = True examplesLeft -= 1 examplesAddedThisRound += 1 forceAdd = False if examplesLeft > 0 and examplesAddedThisRound == 0: # If there are examples left, but nothing was added, this # means that some nested events are missing. Theoretically # this could also be because two events are referring to # each other, preventing each other's insertion. In any # case this is solved by simply forcing the addition of # the first non-inserted event, by creating 0-argument # entities for its argument events. forcedCount += 1 #print "Warning, forcing event addition" forceAdd = True # Attach the new elements for element in self.newEntities + self.newInteractions: sentenceElement.append(element) # re-attach the analyses-element if sentenceAnalysesElement != None: sentenceElement.append(sentenceAnalysesElement)