def _markNamedEntities(self):
    """
    This method is used to define which tokens belong to _named_ entities.
    Named entities are sometimes masked when testing learning of interactions,
    to prevent the system from making a trivial decision based on commonly
    interacting names.
    """
    self.tokenIsName = {}
    self.tokenIsEntity = {}
    self.tokenIsEntityHead = {}
    # Initialize the dictionaries
    for token in self.tokens:
        self.tokenIsName[token] = False
        self.tokenIsEntity[token] = False
        self.tokenIsEntityHead[token] = []
    for entity in self.entities:
        entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
        entityHeadOffset = Range.charOffsetToSingleTuple(entity.get("headOffset"))
        for token in self.tokens:
            tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
            for entityOffset in entityOffsets:
                if Range.overlap(entityOffset, tokenOffset):
                    self.tokenIsEntity[token] = True
                    if entity.get("isName") != None:
                        if entity.get("isName") == "True":
                            self.tokenIsName[token] = True
                    else:
                        entity.set("isName", "True")
                        self.tokenIsName[token] = True
            if Range.overlap(entityHeadOffset, tokenOffset):
                self.tokenIsEntityHead[token].append(entity)
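# The Range helpers used throughout this module operate on interaction-XML
# offset strings. A minimal sketch of the assumed format (this is NOT the
# actual Range module, just an illustration inferred from its usage here):
# each span is "begin-end", and discontinuous entities join several spans
# with a separator, e.g. "0-5,10-14".
def _charOffsetToTuplesSketch(charOffset, rangeSep=","):
    # "0-5,10-14" -> [(0, 5), (10, 14)]
    tuples = []
    for span in charOffset.split(rangeSep):
        begin, end = span.split("-")
        tuples.append((int(begin), int(end)))
    return tuples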
def fixAltOffsets(input, output=None):
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    docCount = 0
    sentencesCreated = 0
    sentences = [x for x in corpusRoot.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "FixAltOffsets")
    fixCount = 0
    # fix spans
    for sentence in sentences:
        counter.update(1, "Fixing AltOffsets for sentence (" + sentence.get("id") + "): ")
        sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        for entity in sentence.findall("entity"):
            altOffsetString = entity.get("altOffset")
            if altOffsetString == None:
                continue
            #print altOffsetString
            altOffsets = Range.charOffsetToTuples(altOffsetString)
            assert len(altOffsets) == 1
            for i in range(len(altOffsets)):
                altOffset = altOffsets[i]
                altOffsets[i] = (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0])
            entity.set("altOffset", Range.tuplesToCharOffset(altOffsets))
            fixCount += 1

    print >> sys.stderr, "Fixed", fixCount, "altOffsets"

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
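# Worked example of the remapping above: with a sentence charOffset of
# "100-180" (sentOffset == (100, 180)), an entity altOffset of "145-152"
# becomes (145 - 100, 152 - 100) == (45, 52), i.e. the altOffset attribute
# is rewritten from document-relative to sentence-relative coordinates.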
def _markNamedEntities(self):
    """
    This method is used to define which tokens belong to _named_ entities.
    Named entities are sometimes masked when testing learning of interactions,
    to prevent the system from making a trivial decision based on commonly
    interacting names. This function assumes that all given entities are
    named entities.
    """
    self.tokenIsName = {}
    self.tokenIsEntity = {}
    self.tokenIsEntityHead = {}
    # Initialize the dictionaries
    for token in self.tokens:
        self.tokenIsName[token] = False
        self.tokenIsEntity[token] = False
        self.tokenIsEntityHead[token] = []
    for entity in self.entities:
        entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
        entityHeadOffset = Range.charOffsetToSingleTuple(entity.get("headOffset"))
        for token in self.tokens:
            tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
            for entityOffset in entityOffsets:
                if Range.overlap(entityOffset, tokenOffset):
                    self.tokenIsEntity[token] = True
                    if entity.get("given") == "True":
                        self.tokenIsName[token] = True
                    # if entity.get("given") != None:
                    #     if entity.get("given") == "True":
                    #         self.tokenIsName[token] = True
                    # else:
                    #     entity.set("given", "True")
                    #     self.tokenIsName[token] = True
            if Range.overlap(entityHeadOffset, tokenOffset):
                self.tokenIsEntityHead[token].append(entity)
def getAttributes(element):
    attrib = element.attrib.copy()
    #attrib[TAGKEY] = element.tag
    for key in attrib:
        if "offset" in key.lower():
            attrib[key] = Range.charOffsetToTuples(attrib[key])
            if len(attrib[key]) == 1:
                attrib[key] = attrib[key][0]
    return attrib
def moveElements(document):
    entMap = {}
    entSentence = {}
    entSentenceIndex = {}
    sentences = document.findall("sentence")
    sentenceCount = 0
    for sentence in sentences:
        sentenceOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        # Move entities
        entCount = 0
        for entity in document.findall("entity"):
            entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
            overlaps = False
            for entityOffset in entityOffsets:
                if Range.overlap(sentenceOffset, entityOffset):
                    overlaps = True
                    break
            if overlaps:
                document.remove(entity)
                sentence.append(entity)
                entityId = entity.get("id")
                entityIdLastPart = entityId.rsplit(".", 1)[-1]
                if entityIdLastPart.startswith("e"):
                    entity.set("id", sentence.get("id") + "." + entityIdLastPart)
                    entMap[entityId] = sentence.get("id") + "." + entityIdLastPart
                else:
                    entity.set("docId", entityId)
                    entity.set("id", sentence.get("id") + ".e" + str(entCount))
                    entMap[entityId] = sentence.get("id") + ".e" + str(entCount)
                entSentence[entityId] = sentence
                entSentenceIndex[entityId] = sentenceCount
                #newEntityOffset = (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0])
                newEntityOffsets = []
                for entityOffset in entityOffsets:
                    newEntityOffsets.append((entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0]))
                entity.set("origOffset", entity.get("charOffset"))
                #entity.set("charOffset", str(newEntityOffset[0]) + "-" + str(newEntityOffset[1]))
                entity.set("charOffset", Range.tuplesToCharOffset(newEntityOffsets))
                entCount += 1
        sentenceCount += 1
    # Move interactions
    intCount = 0
    for interaction in document.findall("interaction"):
        #if entSentenceIndex[interaction.get("e1")] < entSentenceIndex[interaction.get("e2")]:
        #    targetSentence = entSentence[interaction.get("e1")]
        #else:
        #    targetSentence = entSentence[interaction.get("e2")]
        # Interactions always go to a sentence by e1, as this is the event they are an argument of.
        # If an intersentence interaction is a relation, this shouldn't matter.
        targetSentence = entSentence[interaction.get("e1")]
        document.remove(interaction)
        targetSentence.append(interaction)
        interaction.set("id", targetSentence.get("id") + ".i" + str(intCount))
        interaction.set("e1", entMap[interaction.get("e1")])
        interaction.set("e2", entMap[interaction.get("e2")])
        intCount += 1
def processElements(xml):
    for ddi in xml.getiterator("ddi"):
        ddi.tag = "interaction"
    for entity in xml.getiterator("entity"):
        entity.set("given", "True")
        # Reformat disjoint character offsets and update character range format for TEES 2.0+
        charOffsets = Range.charOffsetToTuples(entity.get("charOffset"), rangeSep=";")
        updatedCharOffsets = []
        for charOffset in charOffsets:
            updatedCharOffsets.append((charOffset[0], charOffset[1] + 1))
        entity.set("charOffset", Range.tuplesToCharOffset(updatedCharOffsets))
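# Worked example of the conversion above: a DDI entity with
# charOffset == "115-119;125-130" (";"-separated, end-inclusive) parses to
# [(115, 119), (125, 130)] and is stored as "115-120,125-131" in the
# end-exclusive TEES 2.0+ format (presumably with "," as the default
# separator). makeDDI13SubmissionFile below performs the inverse
# (end - 1, rangeSep=";") when writing submission files.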
def getEntityHeadToken(entity, tokens, tokenHeadScores):
    if entity.get("headOffset") != None:
        charOffsets = Range.charOffsetToTuples(entity.get("headOffset"))
    elif entity.get("charOffset") != "":
        charOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
    else:
        charOffsets = []
    # Each entity can consist of multiple syntactic tokens, covered by its
    # charOffset-range. One of these must be chosen as the head token.
    headTokens = [] # potential head tokens
    for token in tokens:
        tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
        for offset in charOffsets:
            if Range.overlap(offset, tokenOffset):
                headTokens.append(token)
    if len(headTokens) == 1: # An unambiguous head token was found
        selectedHeadToken = headTokens[0]
    else: # One head token must be chosen from the candidates
        selectedHeadToken = findHeadToken(headTokens, tokenHeadScores)
        #if verbose:
        #    print >> sys.stderr, "Selected head:", token.attrib["id"], token.attrib["text"]
    assert selectedHeadToken != None, entity.get("id")
    return selectedHeadToken
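# findHeadToken is defined elsewhere. A minimal sketch of the assumed
# behavior, inferred from how tokenHeadScores is used here: pick the
# candidate token whose head score is highest, i.e. the token closest to
# the root of the dependency parse. This is an illustration, not the
# actual implementation.
def _findHeadTokenSketch(candidateTokens, tokenHeadScores):
    if len(candidateTokens) == 0:
        return None
    best = candidateTokens[0]
    for token in candidateTokens[1:]:
        if tokenHeadScores[token] > tokenHeadScores[best]:
            best = token
    return best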
def makeDDI13SubmissionFile(input, output, mode="interactions", idfilter=None):
    xml = ETUtils.ETFromObj(input)
    outFile = open(output, "wt")
    for sentence in xml.getiterator("sentence"):
        sentenceId = sentence.get("id")
        if idfilter != None and idfilter not in sentenceId:
            continue
        # Output entities
        if mode == "entities":
            for entity in sentence.findall("entity"):
                if entity.get("type") != "neg":
                    outFile.write(sentenceId)
                    offsets = Range.charOffsetToTuples(entity.get("charOffset"))
                    for i in range(len(offsets)):
                        offsets[i] = (offsets[i][0], offsets[i][1] - 1)
                    outFile.write("|" + Range.tuplesToCharOffset(offsets, rangeSep=";"))
                    outFile.write("|" + entity.get("text"))
                    outFile.write("|" + entity.get("type"))
                    outFile.write("\n")
        if mode == "interactions":
            # First determine which pairs interact
            intMap = defaultdict(lambda: defaultdict(lambda: None))
            for interaction in sentence.findall("interaction"):
                # Map both directions to discard edge directionality. This isn't actually needed,
                # since MultiEdgeExampleBuilder builds entity pairs in the same order as this function,
                # but it shouldn't hurt to include it, and the mapping now works regardless of pair direction.
                if interaction.get("type") != "neg" and interaction.get("given") != "True":
                    intMap[interaction.get("e1")][interaction.get("e2")] = interaction
                    intMap[interaction.get("e2")][interaction.get("e1")] = interaction
            # Then write all pairs to the output file
            entities = sentence.findall("entity")
            for i in range(0, len(entities) - 1):
                for j in range(i + 1, len(entities)):
                    eIId = entities[i].get("id")
                    eJId = entities[j].get("id")
                    outFile.write(sentenceId + "|" + eIId + "|" + eJId + "|")
                    if intMap[eIId][eJId] != None:
                        interaction = intMap[eIId][eJId]
                        assert interaction.get("type") != "neg"
                        outFile.write("1|" + interaction.get("type") + "\n")
                    else:
                        outFile.write("0|null\n")
    outFile.close()
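# The submission file written above is pipe-separated, one line per item:
# in "entities" mode:     <sentence id>|<offsets, end-inclusive, ";"-separated>|<text>|<type>
# in "interactions" mode: <sentence id>|<e1 id>|<e2 id>|<1 or 0>|<type or null>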
def updateXML(root, removeAnalyses=True):
    counts = defaultdict(int)
    for document in root.findall("document"):
        sentencePos = 0
        counts["documents"] += 1
        for sentence in document.findall("sentence"):
            counts["sentences"] += 1
            # Remove the original parses
            analyses = sentence.find("sentenceanalyses")
            if analyses != None:
                counts["analyses"] += 1
                if removeAnalyses:
                    counts["removed-analyses"] += 1
                    sentence.remove(analyses)
            # Add an artificial sentence offset so that sentences can be exported as a single document
            sentenceText = sentence.get("text")
            sentence.set("charOffset", Range.tuplesToCharOffset((sentencePos, sentencePos + len(sentenceText))))
            # Update the character offsets of all entities from the old format (begin,end) to the new one (begin,end+1)
            for entity in sentence.findall("entity"):
                counts["entities"] += 1
                offsets = [(x[0], x[1] + 1) for x in Range.charOffsetToTuples(entity.get("charOffset"))]
                entityText = entity.get("text")
                for offset, entitySpan in zip(offsets, [sentenceText[x[0]:x[1]] for x in offsets]):
                    counts["entity-offsets"] += 1
                    lenOffset = offset[1] - offset[0]
                    offsetText, entityText = entityText[:lenOffset].strip(), entityText[lenOffset:].strip()
                    assert offsetText == entitySpan, (offsets, (entity.get("text"), entitySpan), (offsetText, entityText), sentenceText)
                entity.set("charOffset", Range.tuplesToCharOffset(offsets))
            # Convert positive pairs into interaction elements
            numInteractions = 0
            for pair in sentence.findall("pair"):
                counts["pairs"] += 1
                sentence.remove(pair)
                if pair.get("interaction") == "True":
                    del pair.attrib["interaction"]
                    pair.set("id", pair.get("id").rsplit(".", 1)[0] + ".i" + str(numInteractions))
                    pair.set("type", "PPI")
                    ET.SubElement(sentence, "interaction", pair.attrib)
                    numInteractions += 1
                    counts["interactions"] += 1
            sentencePos += len(sentenceText) + 1
    print >> sys.stderr, "Updated Interaction XML format:", dict(counts)
    return root
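# Example of the offset update above: an old-format entity span (10, 14)
# (end-inclusive) becomes (10, 15) (end-exclusive), so that
# sentenceText[10:15] is exactly the entity text. Sentences get running
# offsets with a one-character gap: a 20-character first sentence gets
# charOffset "0-20" and the next sentence starts at position 21.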
def mergeSentences(input, output, verbose=False):
    print >> sys.stderr, "Merging sentences into documents"
    print >> sys.stderr, "Loading corpus file", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()

    counts = defaultdict(int)
    for document in corpusRoot.findall("document"):
        counts["documents"] += 1
        # Check that the document has only sentence elements as children
        children = [x for x in document]
        docChildTypes = sorted(set([x.tag for x in children]))
        if len(docChildTypes) == 0:
            counts["documents-with-no-sentences"] += 1
            continue
        elif len(docChildTypes) > 1 or docChildTypes[0] != "sentence":
            raise Exception("Document '" + str(document.get("id")) + "' has non-sentence children: " + str(docChildTypes))
        # Process all the child sentence elements
        docId = document.get("id")
        interactions = []
        entities = []
        entityById = {}
        interactionById = {}
        combinedText = ""
        calculatedOffset = (0, 0)
        for sentence in children:
            document.remove(sentence)
            sentenceText = sentence.get("head", "") + sentence.get("text", "") + sentence.get("tail", "")
            sentOffset = sentence.get("charOffset")
            if sentence == children[0]:
                noDefinedOffsets = sentOffset == None
            elif (sentOffset == None) != noDefinedOffsets:
                raise Exception("Only some sentences in document '" + docId + "' have defined offsets")
            if sentOffset == None:
                if sentence != children[-1]:
                    sentenceText = sentenceText + " "
                calculatedOffset = (calculatedOffset[1], calculatedOffset[1] + len(sentenceText))
                sentOffset = calculatedOffset
            else:
                sentOffset = Range.charOffsetToSingleTuple(sentOffset)
            combinedText += sentenceText
            # Collect and update the entity elements
            for entity in sentence.findall("entity"):
                # Map sentence-level entity offsets to document level
                for offsetKey in ("charOffset", "headOffset"):
                    if entity.get(offsetKey) != None:
                        offset = Range.charOffsetToTuples(entity.get(offsetKey))
                        for i in range(len(offset)):
                            offset[i] = (offset[i][0] + sentOffset[0], offset[i][1] + sentOffset[0])
                        entity.set(offsetKey, Range.tuplesToCharOffset(offset))
                # Compare mapped offsets to origOffset, if available
                if entity.get("origOffset") != None:
                    if entity.get("charOffset") != entity.get("origOffset"):
                        raise Exception("Document '" + str(document.get("id")) + "' entity '" + str(entity.get("id")) + "' new charOffset differs from origOffset: " + str([entity.get("charOffset"), entity.get("origOffset")]))
                    counts["checked-origOffsets"] += 1
                    del entity.attrib["origOffset"]
                assert entity.get("id") not in entityById
                entityById[entity.get("id")] = entity # For re-mapping the interaction 'e1' and 'e2' attributes
                entities.append(entity)
                counts["moved-entities"] += 1
            # Collect and update the interaction elements
            for interaction in sentence.findall("interaction"):
                assert interaction.get("id") not in interactionById
                interactionById[interaction.get("id")] = interaction # For re-mapping the interaction 'siteOf' attributes
                interactions.append(interaction)
                counts["moved-interactions"] += 1
        # Check that the combined sentence text matches the document text, if available
        if document.get("text") != None and document.get("text") != combinedText:
            if combinedText == document.get("text")[0:len(combinedText)] and document.get("text")[len(combinedText):].strip() == "":
                if verbose:
                    print >> sys.stderr, "Warning, document '" + document.get("id") + "' text has trailing whitespace not included in the combined sentence text"
                combinedText = document.get("text")
                counts["missing-trailing-whitespace"] += 1
            else:
                raise Exception("Document '" + str(document.get("id")) + "' text differs from combined sentence text: " + str([document.get("text"), combinedText]))
            counts["checked-document-texts"] += 1
        # Check that the entities' texts match the document text
        for entity in entities:
            offset = Range.charOffsetToTuples(entity.get("charOffset"))
            if len(offset) == 1: # Compare only continuous entities
                if not Range.contains((0, len(combinedText)), offset[0]):
                    raise Exception("Document '" + str(document.get("id")) + "' entity '" + str(entity.get("id")) + "' offset is not contained in combined sentence text: " + str([entity.attrib, offset, [0, len(combinedText)], combinedText]))
                combTextSpan = combinedText[offset[0][0]:offset[0][1]]
                if entity.get("text") != combTextSpan:
                    raise Exception("Document '" + str(document.get("id")) + "' entity '" + str(entity.get("id")) + "' text does not match combined sentence text: " + str([entity.get("text"), combTextSpan]))
                counts["checked-charOffsets"] += 1
        # Set the combined text as the document text
        document.set("text", combinedText)
        # Update entity and interaction ids (not done earlier so that possible error messages will refer to original ids, also because of siteOf-remapping)
        for i in range(len(entities)):
            entities[i].set("id", docId + ".e" + str(i)) # Update the id for the document level
        for i in range(len(interactions)):
            interactions[i].set("id", docId + ".i" + str(i)) # Update the id for the document level
        # Update interaction e1 and e2 ids (cannot be done earlier because interactions may refer to entities from multiple sentences)
        for i in range(len(interactions)):
            interaction = interactions[i]
            for entKey in ("e1", "e2"):
                interaction.set(entKey, entityById[interaction.get(entKey)].get("id"))
            if interaction.get("siteOf") != None:
                interaction.set("siteOf", interactionById[interaction.get("siteOf")].get("id"))
        # Add the entity and interaction elements to the document
        document.extend(entities)
        document.extend(interactions)
    print >> sys.stderr, "Counts:", dict(counts)

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
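# Example of the offset mapping above: an entity at sentence-relative
# charOffset "5-9" inside a sentence whose sentOffset is (30, 70) is moved
# to document-relative "35-39". This is the inverse of the document-to-
# sentence mapping performed by moveElements below.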
def moveElements(document):
    entMap = {}
    entSentence = {}
    entSentenceIndex = {}
    sentences = document.findall("sentence")
    sentenceCount = 0
    for sentence in sentences:
        sentenceOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        # Move entities
        entCount = 0
        for entity in document.findall("entity"):
            entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
            overlaps = False
            for entityOffset in entityOffsets:
                if Range.overlap(sentenceOffset, entityOffset):
                    overlaps = True
                    break
            if overlaps:
                document.remove(entity)
                sentence.append(entity)
                entityId = entity.get("id")
                entityIdLastPart = entityId.rsplit(".", 1)[-1]
                if entityIdLastPart.startswith("e"):
                    entity.set("id", sentence.get("id") + "." + entityIdLastPart)
                    entMap[entityId] = sentence.get("id") + "." + entityIdLastPart
                else:
                    entity.set("docId", entityId)
                    entity.set("id", sentence.get("id") + ".e" + str(entCount))
                    entMap[entityId] = sentence.get("id") + ".e" + str(entCount)
                entSentence[entityId] = sentence
                entSentenceIndex[entityId] = sentenceCount
                #newEntityOffset = (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0])
                newEntityOffsets = []
                for entityOffset in entityOffsets:
                    newOffset = (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0])
                    newOffset = (max(0, newOffset[0]), max(0, newOffset[1]))
                    if newOffset != (0, 0):
                        assert newOffset[1] > newOffset[0], (entity.attrib, entityOffsets, sentenceOffset)
                        newEntityOffsets.append((entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0]))
                assert len(newEntityOffsets) > 0, (entity.attrib, entityOffsets, sentenceOffset)
                entity.set("origOffset", entity.get("charOffset"))
                #entity.set("charOffset", str(newEntityOffset[0]) + "-" + str(newEntityOffset[1]))
                entity.set("charOffset", Range.tuplesToCharOffset(newEntityOffsets))
                entCount += 1
        sentenceCount += 1
    if len([x for x in document.findall("entity")]) != 0:
        raise Exception("Sentence splitting does not cover the entire document")
    # Move interactions
    intCount = 0
    interactions = []
    interactionOldToNewId = {}
    for interaction in document.findall("interaction"):
        interactions.append(interaction)
        #if entSentenceIndex[interaction.get("e1")] < entSentenceIndex[interaction.get("e2")]:
        #    targetSentence = entSentence[interaction.get("e1")]
        #else:
        #    targetSentence = entSentence[interaction.get("e2")]
        # Interactions always go to a sentence by e1, as this is the event they are an argument of.
        # If an intersentence interaction is a relation, this shouldn't matter.
        targetSentence = entSentence[interaction.get("e1")]
        document.remove(interaction)
        targetSentence.append(interaction)
        newId = targetSentence.get("id") + ".i" + str(intCount)
        interactionOldToNewId[interaction.get("id")] = newId
        interaction.set("id", newId)
        interaction.set("e1", entMap[interaction.get("e1")])
        interaction.set("e2", entMap[interaction.get("e2")])
        intCount += 1
    for interaction in interactions:
        if interaction.get("siteOf") != None:
            interaction.set("siteOf", interactionOldToNewId[interaction.get("siteOf")])
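# Example of the id remapping above, using hypothetical TEES-style ids:
# a document-level entity "DOC.d1.e5" that falls inside sentence
# "DOC.d1.s2" keeps its last id part and becomes "DOC.d1.s2.e5"; an entity
# whose id does not end in an "e"-part is renumbered as "DOC.d1.s2.e<n>"
# and its original id is preserved in the "docId" attribute.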
def addEntitiesToSTDoc(doc, docElement, tMap, eMap, entityElementMap, useOrigIds=False):
    containerElements = [docElement] + [x for x in docElement.getiterator("sentence")]
    for containerElement in containerElements:
        for entity in containerElement.findall("entity"):
            eType = entity.get("type")
            if eType == "neg": # skip negative predictions if they are present
                continue
            assert entity.get("id") != None
            entityElementMap[entity.get("id")] = entity
            entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
            ann = Annotation()
            ann.type = eType
            if useOrigIds:
                entityOrigId = entity.get("origId")
                if entityOrigId != None and entityOrigId.find(".") != -1: # fix gluing of doc and ann id
                    entityOrigId = entityOrigId.rsplit(".", 1)[-1]
                if entityOrigId != None:
                    if entityOrigId[0] == "E": # a special id denoting a numbered, but triggerless event
                        ann.eventId = entityOrigId
                        ann.id = None
                    else:
                        ann.id = entityOrigId
            ann.text = entity.get("text")
            if entity.get("normalization") != None:
                ann.normalization = entity.get("normalization")
            #assert entityOffset[1] - entityOffset[0] in [len(ann.text), len(ann.text) - 1], (ann.text, entityOffset)
            ann.charOffsets = entityOffsets
            #ann.charBegin = entityOffset[0]
            #ann.charEnd = entityOffset[0] + len(ann.text) # entityOffset[1] + 1
            if containerElement.tag == "sentence":
                # Entity offsets are relative to the container element; for sentences,
                # they must be mapped to document level.
                sentenceOffset = Range.charOffsetToSingleTuple(containerElement.get("charOffset"))
                for i in range(len(ann.charOffsets)):
                    ann.charOffsets[i] = (ann.charOffsets[i][0] + sentenceOffset[0], ann.charOffsets[i][1] + sentenceOffset[0])
                #ann.charBegin += sentenceOffset[0]
                #ann.charEnd += sentenceOffset[0]
            #    idStem = entity.get("id").split(".e", 1)[0]
            #    if sentenceOffsets.has_key(idStem):
            #        sentenceOffset = sentenceOffsets[idStem]
            #        ann.charBegin += sentenceOffset[0]
            #        ann.charEnd += sentenceOffset[0]
            if entity.get("speculation") == "True":
                ann.speculation = True
            if entity.get("negation") == "True":
                ann.negation = True
            ann.extra = getExtraFromElement(entity) # add all scores and extra data
            if entity.get("given") == "True":
                # Remember to use original id for names!
                if entity.get("origId") != None:
                    ann.id = entity.get("origId").rsplit(".", 1)[-1]
                    assert ann.id[0].isupper(), ann.id
                    for c in ann.id[1:]:
                        assert c.isdigit(), ann.id
                doc.proteins.append(ann)
                tMap[entity.get("id")] = ann
                # The part below is dangerous, and incompatibilities should be handled rather
                # by not converting to the shared task format when it cannot be done
                #if entity.get("origId") != None:
                #    # Attempt to process origId, assuming it corresponds to the BioNLP Shared Task format
                #    nonNamedEntityOrigId = entity.get("origId").rsplit(".", 1)[-1]
                #    if len(nonNamedEntityOrigId) > 1 and nonNamedEntityOrigId[0].isupper() and nonNamedEntityOrigId[1:].isdigit():
                #        ann.id = nonNamedEntityOrigId
                #stDoc.proteins.append(ann)
            else: # a predicted protein or trigger
                duplicateAnn = findDuplicateForSTTrigger(ann, doc.triggers)
                if duplicateAnn == None:
                    doc.triggers.append(ann)
                    tMap[entity.get("id")] = ann
                    # Add confidence scores
                    #ann.extra = getExtraFromElement(entity, ["conf"])
                    #ann.triggerScores = entity.get("predictions")
                    #ann.unmergingScores = entity.get("umStrength")
                    #ann.speculationScores = entity.get("modPred")
                    #ann.negationScores = entity.get("modPred")
                    # Events with 0 interactions (such as some Process-type events) would not be formed when constructing events based on interactions
                    if entity.get("event") == "True":
                        event = makeSTEvent(ann, entityElementMap[entity.get("id")])
                        eMap[entity.get("id")] = event
                        doc.events.append(event)
                else: # a duplicate trigger already exists
                    tMap[entity.get("id")] = duplicateAnn
def extend(input, output=None, entityTypes=["Bacterium"], verbose=False):
    if not (ET.iselement(input) and input.tag == "sentence"):
        print >> sys.stderr, "Loading corpus file", input
        corpusTree = ETUtils.ETFromObj(input)
        corpusRoot = corpusTree.getroot()

    bacteriaTokens = ExampleBuilders.PhraseTriggerExampleBuilder.getBacteriaTokens()

    if not (ET.iselement(input) and input.tag == "sentence"):
        sentences = corpusRoot.getiterator("sentence")
    else:
        sentences = [input]
    counts = defaultdict(int)
    for sentence in sentences:
        incorrectCount = 0
        sentenceText = sentence.get("text")
        tokens = tokenize(sentenceText)
        for entity in sentence.findall("entity"):
            counts["all-entities"] += 1
            if entity.get("type") not in entityTypes:
                continue
            headOffset = entity.get("headOffset")
            if headOffset == None:
                if verbose:
                    print "WARNING, no head offset for entity", entity.get("id")
                headOffset = entity.get("charOffset")
            headOffset = Range.charOffsetToTuples(headOffset)[0]
            charOffset = entity.get("charOffset")
            assert charOffset != None, "WARNING, no character offset for entity " + str(entity.get("id"))
            charOffset = Range.charOffsetToTuples(charOffset)[0]
            tokPos = [0, 0]
            tokIndex = None
            # find main token
            for i in range(len(tokens)):
                token = tokens[i]
                tokPos[1] = tokPos[0] + len(token) # - 1
                if Range.overlap(headOffset, tokPos):
                    tokIndex = i
                    break
                tokPos[0] += len(token)
            assert tokIndex != None, (entity.get("id"), entity.get("text"), tokens)
            skip = False
            if tokPos[0] < headOffset[0]:
                tokPos = headOffset
                skip = True
            if not skip:
                # Extend before
                beginIndex = tokIndex
                for i in range(tokIndex - 1, -1, -1):
                    token = tokens[i]
                    if token.isspace():
                        continue
                    if not isBacteriaToken(token, bacteriaTokens, i - tokIndex):
                        beginIndex = i + 1
                        break
                    if i == 0:
                        beginIndex = i
                while tokens[beginIndex].isspace() or isExtraWord(tokens[beginIndex], toLower=False):
                    beginIndex += 1
                    if beginIndex >= tokIndex:
                        beginIndex = tokIndex
                        break
                # Extend after
                endIndex = tokIndex
                if tokens[tokIndex][-1] != ",":
                    endIndex = tokIndex
                    for i in range(tokIndex + 1, len(tokens)):
                        token = tokens[i]
                        if token.isspace():
                            continue
                        if not isBacteriaToken(token, bacteriaTokens, i - tokIndex):
                            endIndex = i - 1
                            break
                        if i == len(tokens) - 1:
                            endIndex = i
                    while tokens[endIndex].isspace():
                        endIndex -= 1
                # Modify range
                if tokIndex > beginIndex:
                    for token in reversed(tokens[beginIndex:tokIndex]):
                        tokPos[0] -= len(token)
                if tokIndex < endIndex:
                    for token in tokens[tokIndex + 1:endIndex + 1]:
                        tokPos[1] += len(token)
                # Attempt to remove trailing periods and commas
                while not sentenceText[tokPos[1] - 1].isalnum():
                    tokPos[1] -= 1
                    if tokPos[1] < tokPos[0] + 1:
                        tokPos[1] = tokPos[0] + 1
                        break
                while not sentenceText[tokPos[0]].isalnum():
                    tokPos[0] += 1
                    if tokPos[0] >= tokPos[1]:
                        tokPos[0] = tokPos[1] - 1
                        break
            # Split merged names
            #newPos = [tokPos[0], tokPos[1]]
            #for split in sentenceText[tokPos[0]:tokPos[1]+1].split("/"):
            #    newPos[0] += len(split)
            #    if
            # Insert changed charOffset
            counts["entities"] += 1
            newOffset = tuple(tokPos)
            newOffsetString = Range.tuplesToCharOffset([newOffset])
            if verbose:
                print "Entity", entity.get("id"),
                #print [entity.get("text"), sentenceText[headOffset[0]:headOffset[1]+1], sentenceText[newOffset[0]:newOffset[1]+1]],
                print [entity.get("text"), sentenceText[headOffset[0]:headOffset[1]], sentenceText[newOffset[0]:newOffset[1]]],
                print [entity.get("charOffset"), entity.get("headOffset"), newOffsetString], "Sent:", len(sentence.get("text")),
            if newOffset != headOffset:
                counts["extended"] += 1
                if verbose:
                    print "EXTENDED",
            if newOffset == charOffset:
                counts["correct"] += 1
                if verbose:
                    print "CORRECT"
            else:
                counts["incorrect"] += 1
                incorrectCount += 1
                if verbose:
                    print "INCORRECT"
            entity.set("charOffset", newOffsetString)
            #entity.set("text", sentenceText[newOffset[0]:newOffset[1]+1])
            entity.set("text", sentenceText[newOffset[0]:newOffset[1]])
        if incorrectCount > 0 and verbose:
            print "TOKENS:", "|".join(tokens)
            print "--------------------------------"
    if verbose:
        print counts

    if not (ET.iselement(input) and input.tag == "sentence"):
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusRoot, output)
        return corpusTree
def mapEntity(self, entityElement, verbose=False):
    """
    Determine the head token for a named entity or trigger. The head token is the token
    closest to the root for the subtree of the dependency parse spanned by the text of
    the element.

    @param entityElement: a semantic node (trigger or named entity)
    @type entityElement: cElementTree.Element
    @param verbose: Print selected head tokens on screen
    @type verbose: boolean
    """
    headOffset = None
    if entityElement.get("headOffset") != None:
        headOffset = Range.charOffsetToSingleTuple(entityElement.get("headOffset"))
    if entityElement.get("charOffset") != "":
        charOffsets = Range.charOffsetToTuples(entityElement.get("charOffset"))
    else:
        charOffsets = []
    # Each entity can consist of multiple syntactic tokens, covered by its
    # charOffset-range. One of these must be chosen as the head token.
    headTokens = [] # potential head tokens
    for token in self.tokens:
        #print token.attrib["id"], token.attrib["charOffset"]
        tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
        if headOffset != None and entityElement.get("type") != "Binding":
            # A head token can already be defined in the headOffset-attribute.
            # However, depending on the tokenization, even this range may
            # contain multiple tokens. Still, it can always be assumed that
            # if headOffset is defined, the correct head token is in this range.
            if Range.overlap(headOffset, tokenOffset):
                headTokens.append(token)
        else:
            for offset in charOffsets:
                if Range.overlap(offset, tokenOffset):
                    headTokens.append(token)
    if len(headTokens) == 1: # An unambiguous head token was found
        token = headTokens[0]
    else: # One head token must be chosen from the candidates
        selHead = None
        if entityElement.get("type") == "Binding":
            for t in headTokens:
                compText = t.get("text").lower()
                if compText.find("bind") != -1 or compText.find("complex") != -1:
                    selHead = t
                    #print "Head:", selHead.get("text"), "/", entityElement.get("text"), entityElement.get("headOffset"), selHead.get("charOffset")
                    entityElement.set("headOffset", selHead.get("charOffset"))
                    break
        if selHead == None:
            token = self.findHeadToken(headTokens)
        else:
            token = selHead
        if verbose:
            print >> sys.stderr, "Selected head:", token.get("id"), token.get("text")
    #assert token != None, entityElement.get("id")
    if token != None:
        # The ElementTree entity-element is modified by setting the headOffset attribute
        if entityElement.get("headOffset") == None or entityElement.get("headOffset") != token.get("charOffset"):
            entityElement.set("headOffset", token.get("charOffset"))
        if not self.entitiesByToken.has_key(token):
            self.entitiesByToken[token] = []
        self.entitiesByToken[token].append(entityElement)
    else:
        print >> sys.stderr, "Warning, no tokens for entity", entityElement.get("id")
    return token
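# Example of the selection above: for a Binding trigger spanning the phrase
# "complex formation", the candidate token whose lowercased text contains
# "complex" is preferred as the head; for other types the choice falls back
# to findHeadToken, which (per the docstring above) prefers the token
# closest to the root of the dependency parse.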
def mapEntity(self, entityElement, verbose=False):
    """
    Determine the head token for a named entity or trigger. The head token is the token
    closest to the root for the subtree of the dependency parse spanned by the text of
    the element.

    @param entityElement: a semantic node (trigger or named entity)
    @type entityElement: cElementTree.Element
    @param verbose: Print selected head tokens on screen
    @type verbose: boolean
    """
    headOffset = None
    if entityElement.get("headOffset") != None:
        headOffset = Range.charOffsetToSingleTuple(entityElement.get("headOffset"))
    if entityElement.get("charOffset") != "":
        charOffsets = Range.charOffsetToTuples(entityElement.get("charOffset"))
    else:
        charOffsets = []
    # Each entity can consist of multiple syntactic tokens, covered by its
    # charOffset-range. One of these must be chosen as the head token.
    headTokens = [] # potential head tokens
    for token in self.tokens:
        #print token.attrib["id"], token.attrib["charOffset"]
        tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
        if headOffset != None and entityElement.get("type") != "Binding":
            # A head token can already be defined in the headOffset-attribute.
            # However, depending on the tokenization, even this range may
            # contain multiple tokens. Still, it can always be assumed that
            # if headOffset is defined, the correct head token is in this range.
            if Range.overlap(headOffset, tokenOffset):
                headTokens.append(token)
        else:
            for offset in charOffsets:
                if Range.overlap(offset, tokenOffset):
                    headTokens.append(token)
    if len(headTokens) == 1: # An unambiguous head token was found
        token = headTokens[0]
    else: # One head token must be chosen from the candidates
        selHead = None
        if entityElement.get("type") == "Binding":
            for t in headTokens:
                compText = t.get("text").lower()
                for bindWord in ("bind", "complex", "homo", "hetero", "dimer"):
                    if bindWord in compText:
                        selHead = t
                        break
                if selHead != None:
                    break
            # if compText.find("bind") != -1 or compText.find("complex") != -1:
            #     selHead = t
            #     #print "Head:", selHead.get("text"), "/", entityElement.get("text"), entityElement.get("headOffset"), selHead.get("charOffset")
            #     entityElement.set("headOffset", selHead.get("charOffset"))
            #     break
        # elif "egulation" in entityElement.get("type"):
        #     self.getTokenHeadScores()
        #     regulationHeads = [x for x in headTokens if self.tokenHeadScores[x] >= 1]
        #     if len(regulationHeads) > 0:
        #         selHead = regulationHeads[-1]
        if selHead == None:
            token = self.findHeadToken(headTokens)
        else:
            token = selHead
        if verbose:
            print >> sys.stderr, "Selected head:", token.get("id"), token.get("text")
    #assert token != None, entityElement.get("id")
    if token != None:
        # The ElementTree entity-element is modified by setting the headOffset attribute
        if entityElement.get("headOffset") == None or entityElement.get("headOffset") != token.get("charOffset"):
            entityElement.set("headOffset", token.get("charOffset"))
        if not self.entitiesByToken.has_key(token):
            self.entitiesByToken[token] = []
        self.entitiesByToken[token].append(entityElement)
    else:
        print >> sys.stderr, "Warning, no tokens for entity", entityElement.get("id")
    return token