def fixAltOffsets(input, output=None):
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    docCount = 0
    sentencesCreated = 0
    sentences = [x for x in corpusRoot.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "FixAltOffsets")
    fixCount = 0
    # fix spans
    for sentence in sentences:
        counter.update(1, "Fixing AltOffsets for sentence ("+sentence.get("id")+"): ")
        sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        for entity in sentence.findall("entity"):
            altOffsetString = entity.get("altOffset")
            if altOffsetString == None:
                continue
            #print altOffsetString
            altOffsets = Range.charOffsetToTuples(altOffsetString)
            assert len(altOffsets) == 1
            for i in range(len(altOffsets)):
                altOffset = altOffsets[i]
                altOffsets[i] = (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0])
            entity.set("altOffset", Range.tuplesToCharOffset(altOffsets))
            fixCount += 1
    print >> sys.stderr, "Fixed", fixCount, "altOffsets"
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
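# A minimal sketch of the arithmetic fixAltOffsets applies, written without
# the Range helpers (which are assumed, from their use above, to map a string
# like "45-52" to the tuple (45, 52) and back). The helper below is
# hypothetical, for illustration only.
def _toSentenceRelative(altOffset, sentOffset):
    # shift a document-level (begin, end) span into sentence coordinates
    return (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0])

assert _toSentenceRelative((45, 52), (40, 100)) == (5, 12)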
def _markNamedEntities(self):
    """
    This method is used to define which tokens belong to _named_ entities.
    Named entities are sometimes masked when testing learning of interactions,
    to prevent the system making a trivial decision based on commonly
    interacting names. This function assumes that all given entities are
    named entities.
    """
    self.tokenIsName = {}
    self.tokenIsEntity = {}
    self.tokenIsEntityHead = {}
    # Initialize the dictionaries
    for token in self.tokens:
        self.tokenIsName[token] = False
        self.tokenIsEntity[token] = False
        self.tokenIsEntityHead[token] = []
    for entity in self.entities:
        entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
        entityHeadOffset = Range.charOffsetToSingleTuple(entity.get("headOffset"))
        for token in self.tokens:
            tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
            for entityOffset in entityOffsets:
                if Range.overlap(entityOffset, tokenOffset):
                    self.tokenIsEntity[token] = True
                    if entity.get("given") == "True":
                        self.tokenIsName[token] = True
                    #if entity.get("given") != None:
                    #    if entity.get("given") == "True":
                    #        self.tokenIsName[token] = True
                    #    else:
                    #        entity.set("given", "True")
                    #        self.tokenIsName[token] = True
            if Range.overlap(entityHeadOffset, tokenOffset):
                self.tokenIsEntityHead[token].append(entity)
def makeINSubPhrases(phrases, tokens, phraseDict, filter=None):
    newPhrases = []
    for phrase in phrases:
        if filter != None and phrase.get("type") not in filter:
            continue
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        phraseBegin = int(phrase.get("begin"))
        phraseEnd = int(phrase.get("end"))
        prevToken = None
        tokCount = 0
        for token in tokens[phraseBegin:phraseEnd+1]:
            if token.get("POS") == "IN" and prevToken != None:
                newPhraseOffset = (phraseOffset[0], Range.charOffsetToSingleTuple(prevToken.get("charOffset"))[-1])
                newPhrase = makePhrase(phrase.get("type") + "-IN", newPhraseOffset, phraseBegin, phraseBegin + tokCount - 1)
                if not phraseDict.has_key(newPhraseOffset):
                    #print "NEW PHRASE:", ETUtils.toStr(newPhrase)
                    newPhrases.append(newPhrase)
                    phraseDict[newPhraseOffset] = [newPhrase]
            prevToken = token
            tokCount += 1
    return newPhrases
def getMetaMapFeatures(self, token, sentenceGraph, features):
    analyses = sentenceGraph.sentenceElement.find("analyses")
    if analyses == None:
        return
    metamap = analyses.find("metamap")
    if metamap == None:
        return
    tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
    skipAttr = set(["charOffset", "text"])
    for phrase in metamap.findall("phrase"):
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        if Range.overlap(tokenOffset, phraseOffset):
            attr = phrase.attrib
            attrNames = sorted(attr.keys())
            for attrName in attrNames:
                if attrName in skipAttr:
                    continue
                elif attrName == "score":
                    features["_metamap_score"] = 0.001 * abs(int(attr[attrName]))
                else:
                    attrValues = attr[attrName].split(",")
                    for attrValue in attrValues:
                        features["_metamap_"+attrName+"_"+attrValue.replace(" ", "-")] = 1
def insertElements(corpus, specAnn):
    for document in corpus.iter('document'):
        docId = document.get("origId")
        assert docId in specAnn, docId
        for sentence in document.iter('sentence'):
            sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
            analyses = sentence.find("analyses")
            if analyses is None: # test explicitly: an Element with no children is also falsy
                analyses = ET.SubElement(sentence, "analyses")
            #entitiesElement = sentence.find("entities")
            # Find the container
            container = analyses.find("entities") #None
            #for entitiesElement in entitiesElements:
            #    if entitiesElement.get("source") == "SPECIES":
            #        container = entitiesElement
            #        break
            if container is None:
                container = ET.SubElement(analyses, "entities")
                #container.set("source", "SPECIES")
            # Map the spans
            for span in specAnn[docId][:]:
                offset = span.get("offset")
                if Range.overlap(offset, sentOffset):
                    if sentOffset[0] > offset[0] or sentOffset[1] < offset[1]:
                        continue
                    specAnn[docId].remove(span)
                    charOffset = (offset[0] - sentOffset[0], offset[1] - sentOffset[0])
                    matchingText = sentence.get("text")[charOffset[0]:charOffset[1]]
                    spanText = span.get("text")
                    #print matchingText, spanText
                    assert matchingText == spanText, (matchingText, spanText, charOffset)
                    span.set("charOffset", "-".join([str(x) for x in charOffset]))
                    assert not "--" in span.get("charOffset"), [str(x) for x in charOffset]
                    del span.attrib["offset"]
                    #span.set("offset", "")
                    container.append(span)
def _markNamedEntities(self):
    """
    This method is used to define which tokens belong to _named_ entities.
    Named entities are sometimes masked when testing learning of interactions,
    to prevent the system making a trivial decision based on commonly
    interacting names.
    """
    self.tokenIsName = {}
    self.tokenIsEntity = {}
    self.tokenIsEntityHead = {}
    # Initialize the dictionaries
    for token in self.tokens:
        self.tokenIsName[token] = False
        self.tokenIsEntity[token] = False
        self.tokenIsEntityHead[token] = []
    for entity in self.entities:
        entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
        entityHeadOffset = Range.charOffsetToSingleTuple(entity.get("headOffset"))
        for token in self.tokens:
            tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
            for entityOffset in entityOffsets:
                if Range.overlap(entityOffset, tokenOffset):
                    self.tokenIsEntity[token] = True
                    if entity.get("isName") != None:
                        if entity.get("isName") == "True":
                            self.tokenIsName[token] = True
                    else: # entities with no isName attribute default to named
                        entity.set("isName", "True")
                        self.tokenIsName[token] = True
            if Range.overlap(entityHeadOffset, tokenOffset):
                self.tokenIsEntityHead[token].append(entity)
def getHeads(corpus):
    corpus = ETUtils.ETFromObj(corpus)
    headDict = {}
    headDict["None"] = {}
    for sentence in corpus.getiterator("sentence"):
        headOffsetStrings = set()
        for entity in sentence.findall("entity"):
            eType = entity.get("type")
            if not headDict.has_key(eType):
                headDict[eType] = {}
            eText = entity.get("text")
            headOffset = entity.get("headOffset")
            headOffsetStrings.add(headOffset)
            headOffset = Range.charOffsetToSingleTuple(headOffset)
            charOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
            if headOffset == charOffset:
                if not headDict[eType].has_key(eText):
                    headDict[eType][eText] = 0
                headDict[eType][eText] += 1
            else:
                # slice the head span out of the entity text; the head offset is
                # assumed to share the entity's coordinate system, with charOffset
                # marking the entity's start
                headText = eText[headOffset[0]-charOffset[0]:headOffset[1]-charOffset[0]+1]
                if not headDict[eType].has_key(headText):
                    headDict[eType][headText] = 0
                headDict[eType][headText] += 1
        # tokens are assumed to be nested under the sentence's analyses element
        for token in sentence.getiterator("token"):
            if not token.get("charOffset") in headOffsetStrings: # token is not the head of any entity
                headText = token.get("text")
                if not headDict["None"].has_key(headText):
                    headDict["None"][headText] = 0
                headDict["None"][headText] += 1
    return headDict
def moveElements(document):
    entMap = {}
    entSentence = {}
    entSentenceIndex = {}
    sentences = document.findall("sentence")
    sentenceCount = 0
    for sentence in sentences:
        sentenceOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        # Move entities
        entCount = 0
        for entity in document.findall("entity"):
            entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
            overlaps = False
            for entityOffset in entityOffsets:
                if Range.overlap(sentenceOffset, entityOffset):
                    overlaps = True
                    break
            if overlaps:
                document.remove(entity)
                sentence.append(entity)
                entityId = entity.get("id")
                entityIdLastPart = entityId.rsplit(".", 1)[-1]
                if entityIdLastPart.startswith("e"):
                    entity.set("id", sentence.get("id") + "." + entityIdLastPart)
                    entMap[entityId] = sentence.get("id") + "." + entityIdLastPart
                else:
                    entity.set("docId", entityId)
                    entity.set("id", sentence.get("id") + ".e" + str(entCount))
                    entMap[entityId] = sentence.get("id") + ".e" + str(entCount)
                entSentence[entityId] = sentence
                entSentenceIndex[entityId] = sentenceCount
                #newEntityOffset = (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0])
                newEntityOffsets = []
                for entityOffset in entityOffsets:
                    newEntityOffsets.append((entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0]))
                entity.set("origOffset", entity.get("charOffset"))
                #entity.set("charOffset", str(newEntityOffset[0]) + "-" + str(newEntityOffset[1]))
                entity.set("charOffset", Range.tuplesToCharOffset(newEntityOffsets))
                entCount += 1
        sentenceCount += 1
    # Move interactions
    intCount = 0
    for interaction in document.findall("interaction"):
        #if entSentenceIndex[interaction.get("e1")] < entSentenceIndex[interaction.get("e2")]:
        #    targetSentence = entSentence[interaction.get("e1")]
        #else:
        #    targetSentence = entSentence[interaction.get("e2")]
        # Interactions go to a sentence always by e1, as this is the event they are an argument of.
        # If an intersentence interaction is a relation, this shouldn't matter.
        targetSentence = entSentence[interaction.get("e1")]
        document.remove(interaction)
        targetSentence.append(interaction)
        interaction.set("id", targetSentence.get("id") + ".i" + str(intCount))
        interaction.set("e1", entMap[interaction.get("e1")])
        interaction.set("e2", entMap[interaction.get("e2")])
        intCount += 1
def selectBestMatch(entity, phrases):
    entOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
    if entity.get("altOffset") != None:
        entOffset = Range.charOffsetToSingleTuple(entity.get("altOffset"))
    best = (sys.maxint, None)
    for phrase in phrases:
        matchValue = Range.mismatch(entOffset, Range.charOffsetToSingleTuple(phrase.get("charOffset")))
        if best[0] > matchValue:
            best = (matchValue, phrase)
    return best[1]
def exportChemProtPredictions(xml, outPath, fileTypes="predictions", setNames=None):
    if fileTypes == "all":
        fileTypes = ["predictions", "abstracts", "entities", "relations"]
    elif isinstance(fileTypes, basestring):
        fileTypes = fileTypes.split(",")
    for fileType in fileTypes:
        if fileType not in ["predictions", "abstracts", "entities", "relations"]:
            raise Exception("Unknown ChemProt file type '" + str(fileType) + "'")
    xml = ETUtils.ETFromObj(xml)
    #with open(outPath, "wt") as f
    outFiles = {}
    openFiles = {}
    for document in xml.getiterator("document"):
        docId = document.get("origId")
        setName = document.get("set")
        if setNames != None:
            setName = setNames.get(setName, setName)
        if setName not in outFiles:
            outFiles[setName] = {}
        outFile = openOutFile(setName, outPath, "abstracts", fileTypes, outFiles, openFiles)
        if outFile != None:
            docText = document.get("text")
            #assert docText.count("\t") == 1, (docText.count("\t"), document.attrib)
            #title, abstract = docText.split("\t")
            #titleLength = document.get("titleLength")
            titleOffset = Range.charOffsetToSingleTuple(document.get("titleOffset"))
            assert titleOffset[0] == 0
            outFile.write("\t".join([docId, docText[:titleOffset[1]], docText[titleOffset[1]+1:]]) + "\n")
        entityById = {}
        for entity in document.getiterator("entity"):
            outFile = openOutFile(setName, outPath, "entities", fileTypes, outFiles, openFiles)
            if outFile != None:
                eType = entity.get("type")
                if entity.get("normalized") != None and entity.get("type") == "GENE":
                    eType += "-Y" if entity.get("normalized") == "True" else "-N"
                offset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
                outFile.write("\t".join([docId, entity.get("origId"), eType, str(offset[0]), str(offset[1]), entity.get("text")]) + "\n")
            assert entity.get("id") not in entityById
            entityById[entity.get("id")] = entity
        for interaction in document.getiterator("interaction"):
            e1 = entityById[interaction.get("e1")]
            e2 = entityById[interaction.get("e2")]
            outFile = openOutFile(setName, outPath, "relations", fileTypes, outFiles, openFiles)
            if outFile != None:
                evaluated = "X"
                if interaction.get("evaluated") != None:
                    evaluated = "Y " if interaction.get("evaluated") == "True" else "N "
                outFile.write("\t".join([docId, interaction.get("type"), evaluated, interaction.get("relType"), "Arg1:" + e1.get("origId"), "Arg2:" + e2.get("origId")]) + "\n")
            outFile = openOutFile(setName, outPath, "predictions", fileTypes, outFiles, openFiles)
            if outFile != None:
                outFile.write("\t".join([docId, interaction.get("type"), "Arg1:" + e1.get("origId"), "Arg2:" + e2.get("origId")]) + "\n")
    print >> sys.stderr, "Closing output files"
    for f in openFiles.values():
        f.close()
    return xml
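# Example of the tab-separated "predictions" line the function above emits,
# built from the same fields (the document id, relation type, and origIds
# here are illustrative, not taken from real data):
exampleLine = "\t".join(["10471277", "CPR:3", "Arg1:T1", "Arg2:T22"])
assert exampleLine == "10471277\tCPR:3\tArg1:T1\tArg2:T22"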
def getTokens(self, entity, tokenTuples):
    offset = entity.get("charOffset")
    assert offset != None
    offset = Range.charOffsetToSingleTuple(offset)
    match = []
    for tokenTuple in tokenTuples:
        if Range.overlap(offset, tokenTuple[0]):
            match.append(tokenTuple[1].get("text"))
        elif len(match) > 0: # passed end
            break
    return match
def getNECounts(phrases, entities):
    counts = {}
    for phrase in phrases:
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        counts[phrase] = 0
        for entity in entities:
            if entity.get("given") != "True": # only check names
                continue
            if Range.contains(phraseOffset, Range.charOffsetToSingleTuple(entity.get("charOffset"))):
                counts[phrase] += 1
    return counts
def processElements(xml):
    for ddi in xml.getiterator("ddi"):
        ddi.tag = "interaction"
    for entity in xml.getiterator("entity"):
        entity.set("given", "True")
        # Reformat disjoint character offsets and update character range format for TEES 2.0+
        charOffsets = Range.charOffsetToTuples(entity.get("charOffset"), rangeSep=";")
        updatedCharOffsets = []
        for charOffset in charOffsets:
            updatedCharOffsets.append((charOffset[0], charOffset[1]+1))
        entity.set("charOffset", Range.tuplesToCharOffset(updatedCharOffsets))
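# Sketch of the offset migration processElements performs: the +1 on each
# span end suggests a move from end-inclusive to end-exclusive offsets for
# TEES 2.0+. Plain tuples stand in for the Range helpers here.
oldOffsets = [(0, 6), (12, 18)]                      # presumed end-inclusive spans
newOffsets = [(b, e + 1) for (b, e) in oldOffsets]   # end-exclusive spans
assert newOffsets == [(0, 7), (12, 19)]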
def fixEntities(xml):
    counts = defaultdict(int)
    for sentence in xml.getiterator("sentence"):
        sText = sentence.get("text")
        for entity in sentence.findall("entity"):
            charOffset = entity.get("charOffset")
            if charOffset == "-":
                assert False, str(entity)
                sentence.remove(entity)
                counts["removed-invalid"] += 1
            else:
                charOffset = Range.charOffsetToSingleTuple(charOffset)
                # fix length
                realLength = len(entity.get("text"))
                lenDiff = (charOffset[1] - charOffset[0] + 1) - realLength
                if lenDiff != 0: # a non-zero difference means the offset length is wrong
                    counts["incorrect-ent-offset"] += 1
                    counts["incorrect-ent-offset-diff"+str(lenDiff)] += 1
                    if abs(lenDiff) > 2:
                        print "Warning, lenDiff:", (lenDiff, charOffset, sText, entity.get("text"), entity.get("id"))
                charOffset = (charOffset[0], charOffset[0] + realLength)
                # find starting position
                entIndex = sText.find(entity.get("text"), charOffset[0])
                if entIndex == -1:
                    for i in [-1, -2, -3]:
                        entIndex = sText.find(entity.get("text"), charOffset[0]+i)
                        if entIndex != -1:
                            break
                if entIndex != 0: # could be lowercase
                    sTextLower = sText.lower()
                    for i in [0, -1, -2, -3]:
                        lowerEntIndex = sTextLower.find(entity.get("text"), charOffset[0]+i)
                        if lowerEntIndex != -1:
                            break
                    if lowerEntIndex != -1 and abs(lowerEntIndex - charOffset[0]) < abs(entIndex - charOffset[0]):
                        entIndex = lowerEntIndex
                assert entIndex != -1, (charOffset, sText, entity.get("text"), entity.get("id"))
                indexDiff = entIndex - charOffset[0]
                if indexDiff != 0:
                    counts["incorrect-ent-index"] += 1
                    counts["incorrect-ent-index-diff"+str(indexDiff)] += 1
                    print "Warning, indexDiff:", (indexDiff, charOffset, sText, entity.get("text"), entity.get("id"))
                # move offset
                charOffset = (charOffset[0]+indexDiff, charOffset[1]+indexDiff)
                # validate new offset
                sEntity = sText[charOffset[0]:charOffset[1]]
                assert sEntity == entity.get("text") or sEntity.lower() == entity.get("text"), (charOffset, sText, entity.get("text"), entity.get("id"))
                entity.set("charOffset", Range.tuplesToCharOffset((charOffset[0], charOffset[1])))
                entity.set("given", "True")
        for interaction in sentence.findall("interaction"):
            interaction.set("type", "DDI")
    print "Fix counts:", counts
def getRelativePosition(self, entity1Range, entity2Range, token):
    offset = Range.charOffsetToSingleTuple(token.get("charOffset"))
    if Range.overlap(entity1Range, offset):
        return "Entity1"
    if Range.overlap(entity2Range, offset):
        return "Entity2"
    entitiesRange = (min(entity1Range[0], entity2Range[0]), max(entity1Range[1], entity2Range[1]))
    if offset[1] < entitiesRange[0]:
        return "Fore"
    elif offset[1] > entitiesRange[1]:
        return "After"
    else:
        return "Between"
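# Standalone sketch of the classification above, using plain (begin, end)
# tuples and a simple intersection test in place of Range.overlap (whose
# exact semantics are assumed here):
def _relativePosition(e1Range, e2Range, tokenRange):
    overlaps = lambda a, b: a[0] < b[1] and b[0] < a[1] # assumed overlap test
    if overlaps(e1Range, tokenRange):
        return "Entity1"
    if overlaps(e2Range, tokenRange):
        return "Entity2"
    lo = min(e1Range[0], e2Range[0])
    hi = max(e1Range[1], e2Range[1])
    if tokenRange[1] < lo:
        return "Fore"
    elif tokenRange[1] > hi:
        return "After"
    return "Between"

assert _relativePosition((10, 15), (30, 35), (0, 5)) == "Fore"
assert _relativePosition((10, 15), (30, 35), (20, 25)) == "Between"
assert _relativePosition((10, 15), (30, 35), (40, 45)) == "After"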
def getMatchingPhrases(entity, phraseOffsets, phraseDict):
    matches = []
    if entity.get("isName") == "True":
        return []
    maxOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
    minOffset = entity.get("altOffset")
    if minOffset != None:
        minOffset = Range.charOffsetToSingleTuple(minOffset)
    else:
        minOffset = maxOffset
    for phraseOffset in phraseOffsets:
        if Range.contains(maxOffset, phraseOffset) and Range.contains(phraseOffset, minOffset):
            matches.extend(phraseDict[phraseOffset])
    return matches
def makeDDI13SubmissionFile(input, output, mode="interactions", idfilter=None):
    xml = ETUtils.ETFromObj(input)
    outFile = open(output, "wt")
    for sentence in xml.getiterator("sentence"):
        sentenceId = sentence.get("id")
        if idfilter != None and idfilter not in sentenceId:
            continue
        # Output entities
        if mode == "entities":
            for entity in sentence.findall("entity"):
                if entity.get("type") != "neg":
                    outFile.write(sentenceId)
                    offsets = Range.charOffsetToTuples(entity.get("charOffset"))
                    for i in range(len(offsets)):
                        offsets[i] = (offsets[i][0], offsets[i][1]-1)
                    outFile.write("|" + Range.tuplesToCharOffset(offsets, rangeSep=";"))
                    outFile.write("|" + entity.get("text"))
                    outFile.write("|" + entity.get("type"))
                    outFile.write("\n")
        if mode == "interactions":
            # First determine which pairs interact
            intMap = defaultdict(lambda: defaultdict(lambda: None))
            for interaction in sentence.findall("interaction"):
                # Make mapping both ways to discard edge directionality. This isn't actually needed,
                # since MultiEdgeExampleBuilder builds entity pairs in the same order as this function,
                # but it shouldn't harm to include it, and now it works regardless of pair direction.
                if interaction.get("type") != "neg" and interaction.get("given") != "True":
                    intMap[interaction.get("e1")][interaction.get("e2")] = interaction
                    intMap[interaction.get("e2")][interaction.get("e1")] = interaction
            # Then write all pairs to the output file
            entities = sentence.findall("entity")
            for i in range(0, len(entities)-1):
                for j in range(i+1, len(entities)):
                    eIId = entities[i].get("id")
                    eJId = entities[j].get("id")
                    outFile.write(sentenceId + "|" + eIId + "|" + eJId + "|")
                    if intMap[eIId][eJId] != None:
                        interaction = intMap[eIId][eJId]
                        assert interaction.get("type") != "neg"
                        outFile.write("1|" + interaction.get("type") + "\n")
                    else:
                        outFile.write("0|null\n")
    outFile.close()
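# Example of the pipe-separated entity line written in "entities" mode:
# sentence id, offsets converted back to end-inclusive and rejoined with ";",
# surface text, and entity type (all values illustrative):
exampleEntityLine = "|".join(["DDI-DrugBank.d0.s0", "0-6;12-18", "aspirin", "drug"])
assert exampleEntityLine == "DDI-DrugBank.d0.s0|0-6;12-18|aspirin|drug"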
def addParseElements(doc, docEl):
    if docEl.tag != "sentence":
        return
    sentAnalysesEl = ET.SubElement(docEl, "analyses")
    #parsesEl = ET.SubElement(sentAnalysesEl, "parses")
    parseEl = ET.SubElement(sentAnalysesEl, "parse")
    #tokenizationsEl = ET.SubElement(sentAnalysesEl, "tokenizations")
    tokenizationEl = ET.SubElement(sentAnalysesEl, "tokenization")
    parseEl.set("parser", "gold")
    parseEl.set("tokenizer", "gold")
    tokenizationEl.set("tokenizer", "gold")
    tokenMap = {}
    for word in doc.words:
        tokEl = ET.SubElement(tokenizationEl, "token")
        tokEl.set("id", word.id)
        tokEl.set("text", word.text)
        tokEl.set("POS", "None")
        assert len(word.charOffsets) == 1, (word, word.charOffsets)
        tokEl.set("charOffset", Range.tuplesToCharOffset(word.charOffsets))
        tokenMap[word.id] = tokEl
    for dep in doc.dependencies:
        depEl = ET.SubElement(parseEl, "dependency")
        depEl.set("id", dep.id)
        depEl.set("type", dep.type)
        assert len(dep.arguments) == 2
        depEl.set("t1", dep.arguments[0].target.id)
        depEl.set("t2", dep.arguments[1].target.id)
        if dep.type.find(":") != -1:
            word1Type, word2Type = dep.type.split("(")[0].split(":")[-1].split("-")
            tokenMap[dep.arguments[0].target.id].set("POS", word1Type)
            tokenMap[dep.arguments[1].target.id].set("POS", word2Type)
def makeEntityElement(ann, idCount, docEl):
    entEl = ET.Element("entity")
    entEl.set("type", ann.type)
    entEl.set("text", ann.text)
    # identifiers
    protId = docEl.get("id") + ".e" + str(idCount)
    entEl.set("id", protId)
    if ann.id != None:
        entEl.set("origId", docEl.get("origId") + "." + str(ann.id))
    # offsets
    entEl.set("charOffset", Range.tuplesToCharOffset(ann.charOffsets))
    if len(ann.alternativeOffsets) > 0:
        altOffs = []
        for alternativeOffset in ann.alternativeOffsets:
            altOffs.append(str(alternativeOffset[0]) + "-" + str(alternativeOffset[1]-1))
        entEl.set("altOffset", ",".join(altOffs))
    if ann.normalization != None:
        entEl.set("normalization", ann.normalization)
    addExtraToElement(entEl, ann.extra)
    # determine if given data
    assert ann.fileType in ["a1", "a2", "rel"], ann.fileType
    if ann.fileType == "a1": #protein.isName():
        entEl.set("given", "True")
    #else:
    #    entEl.set("given", "False")
    return entEl
def addSentence(self, sentenceGraph):
    if sentenceGraph == None:
        return
    tokens = sorted([(Range.charOffsetToSingleTuple(x.get("charOffset")), x) for x in sentenceGraph.tokens])
    indexByTokenId = {tokens[i][1].get("id"):i for i in range(len(tokens))}
    assert len(indexByTokenId) == len(tokens) # check that there were no duplicate ids
    entityById = {x.get("id"):x for x in sentenceGraph.entities}
    events = {}
    for interaction in sentenceGraph.interactions:
        e1Id = interaction.get("e1")
        e2Id = interaction.get("e2")
        e1 = entityById[e1Id]
        e2 = entityById[e2Id]
        t1 = sentenceGraph.entityHeadTokenByEntity[e1]
        t2 = sentenceGraph.entityHeadTokenByEntity[e2]
        index1 = indexByTokenId[t1.get("id")]
        index2 = indexByTokenId[t2.get("id")]
        intSpan = abs(index1 - index2)
        self.interactionSpans[intSpan] = self.interactionSpans.get(intSpan, 0) + 1
        self.intSpan["min"] = min(self.intSpan.get("min"), intSpan)
        self.intSpan["max"] = max(self.intSpan.get("max"), intSpan)
        if interaction.get("event") == "True":
            if e1Id not in events:
                events[e1Id] = {"min":9999, "max":-9999}
            events[e1Id]["min"] = min(events[e1Id]["min"], index1, index2)
            events[e1Id]["max"] = max(events[e1Id]["max"], index1, index2)
    for eventId in sorted(events.keys()):
        eventSpan = events[eventId]["max"] - events[eventId]["min"]
        self.eventSpans[eventSpan] = self.eventSpans.get(eventSpan, 0) + 1
        self.eventSpan["min"] = min(self.eventSpan.get("min"), eventSpan)
        self.eventSpan["max"] = max(self.eventSpan.get("max"), eventSpan)
def prepareTokens(self, tokens):
    tokenTuples = []
    for token in tokens:
        tokenTuples.append((Range.charOffsetToSingleTuple(token.get("charOffset")), token))
    return tokenTuples
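# Usage sketch for prepareTokens: it pairs every token with its parsed
# (begin, end) offset so that getTokens (above) can scan the pairs in
# document order and break once it has passed the entity's span. The stub
# below stands in for an ElementTree token element; the "begin-end" offset
# string format is an assumption carried over from the rest of this file.
class _StubToken:
    def __init__(self, offset, text):
        self._attrib = {"charOffset": "%d-%d" % offset, "text": text}
    def get(self, key):
        return self._attrib.get(key)

_stubTokens = [_StubToken((0, 3), "The"), _StubToken((4, 9), "NF-kB")]
# prepareTokens(_stubTokens) would return [((0, 3), token0), ((4, 9), token1)].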
def getPatterns(self, e1, e2):
    e1Range = Range.charOffsetToSingleTuple(e1.get("charOffset"))
    e2Range = Range.charOffsetToSingleTuple(e2.get("charOffset"))
    tokenPositions = {}
    for token in self.sentenceGraph.tokens:
        tokenPositions[token.get("id")] = self.getRelativePosition(e1Range, e2Range, token)
    prevTokenText = None
    prevToken2Text = None
    prevPosition = None
    patternForeBetween = {}
    patternBetween = {}
    patternBetweenAfter = {}
    for token in self.sentenceGraph.tokens:
        if self.sentenceGraph.tokenIsName[token]:
            continue
        id = token.get("id")
        text = token.get("text").lower()
        if prevPosition != tokenPositions[id]:
            prevTokenText = None
            prevToken2Text = None
        if tokenPositions[id] == "Fore":
            self.addToPattern(patternForeBetween, text, prevTokenText, prevToken2Text)
        elif tokenPositions[id] == "Between":
            self.addToPattern(patternForeBetween, text, prevTokenText, prevToken2Text)
            self.addToPattern(patternBetween, text, prevTokenText, prevToken2Text)
            self.addToPattern(patternBetweenAfter, text, prevTokenText, prevToken2Text)
        elif tokenPositions[id] == "After":
            self.addToPattern(patternBetweenAfter, text, prevTokenText, prevToken2Text)
        prevPosition = tokenPositions[id]
        #if tokenPositions[id].find("Entity") != -1:
        prevToken2Text = prevTokenText
        prevTokenText = text
    return patternForeBetween, patternBetween, patternBetweenAfter
def getPhraseDict(phrases):
    phraseDict = {}
    # Define offsets
    for phrase in phrases:
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        if not phraseDict.has_key(phraseOffset):
            phraseDict[phraseOffset] = []
        phraseDict[phraseOffset].append(phrase)
    return phraseDict
def makeDETSubPhrases(phrases, tokens, phraseDict, filter=None):
    newPhrases = []
    for phrase in phrases:
        if filter != None and phrase.get("type") not in filter:
            continue
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        phraseBegin = int(phrase.get("begin"))
        phraseEnd = int(phrase.get("end"))
        if phraseBegin > 0 and tokens[phraseBegin-1].get("POS") == "DT":
            newPhraseOffset = (Range.charOffsetToSingleTuple(tokens[phraseBegin-1].get("charOffset"))[0], phraseOffset[1])
            newPhrase = makePhrase("DT-" + phrase.get("type"), newPhraseOffset, phraseBegin-1, phraseEnd)
            if not phraseDict.has_key(newPhraseOffset):
                #print "NEW PHRASE:", ETUtils.toStr(newPhrase)
                newPhrases.append(newPhrase)
                phraseDict[newPhraseOffset] = [newPhrase]
    return newPhrases
def getAttributes(element):
    attrib = element.attrib.copy()
    #attrib[TAGKEY] = element.tag
    for key in attrib:
        if "offset" in key.lower():
            attrib[key] = Range.charOffsetToTuples(attrib[key])
            if len(attrib[key]) == 1:
                attrib[key] = attrib[key][0]
    return attrib
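# Sketch of what getAttributes does to offset-valued attributes, with a
# stand-in parser for Range.charOffsetToTuples (assumed to split a string
# like "b1-e1,b2-e2" into (begin, end) tuples): a single span is then
# unwrapped from its list, while multiple spans remain a list.
_parse = lambda s: [tuple(int(x) for x in p.split("-")) for p in s.split(",")]
assert _parse("4-9") == [(4, 9)]
assert _parse("4-9,12-15") == [(4, 9), (12, 15)]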
def moveElements(document):
    entMap = {}
    entSentence = {}
    entSentenceIndex = {}
    sentences = document.findall("sentence")
    sentenceCount = 0
    for sentence in sentences:
        sentenceOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        # Move entities
        entCount = 0
        for entity in document.findall("entity"):
            entityOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
            if Range.overlap(sentenceOffset, entityOffset):
                document.remove(entity)
                sentence.append(entity)
                entityId = entity.get("id")
                entityIdLastPart = entityId.rsplit(".", 1)[-1]
                if entityIdLastPart.startswith("e"):
                    entity.set("id", sentence.get("id") + "." + entityIdLastPart)
                    entMap[entityId] = sentence.get("id") + "." + entityIdLastPart
                else:
                    entity.set("docId", entityId)
                    entity.set("id", sentence.get("id") + ".e" + str(entCount))
                    entMap[entityId] = sentence.get("id") + ".e" + str(entCount)
                entSentence[entityId] = sentence
                entSentenceIndex[entityId] = sentenceCount
                newEntityOffset = (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0])
                entity.set("origOffset", entity.get("charOffset"))
                entity.set("charOffset", str(newEntityOffset[0]) + "-" + str(newEntityOffset[1]))
                entCount += 1
        sentenceCount += 1
    # Move interactions
    intCount = 0
    for interaction in document.findall("interaction"):
        if entSentenceIndex[interaction.get("e1")] < entSentenceIndex[interaction.get("e2")]:
            targetSentence = entSentence[interaction.get("e1")]
        else:
            targetSentence = entSentence[interaction.get("e2")]
        document.remove(interaction)
        targetSentence.append(interaction)
        interaction.set("id", targetSentence.get("id") + ".i" + str(intCount))
        interaction.set("e1", entMap[interaction.get("e1")])
        interaction.set("e2", entMap[interaction.get("e2")])
        intCount += 1
def updateXML(root, removeAnalyses=True):
    counts = defaultdict(int)
    for document in root.findall("document"):
        sentencePos = 0
        counts["documents"] += 1
        for sentence in document.findall("sentence"):
            counts["sentences"] += 1
            # Remove the original parses
            analyses = sentence.find("sentenceanalyses")
            if analyses != None:
                counts["analyses"] += 1
                if removeAnalyses:
                    counts["removed-analyses"] += 1
                    sentence.remove(analyses)
            # Add an artificial sentence offset so that sentences can be exported as a single document
            sentenceText = sentence.get("text")
            sentence.set("charOffset", Range.tuplesToCharOffset((sentencePos, sentencePos + len(sentenceText))))
            # Update the character offsets of all entities from the old format (begin,end) to the new one (begin,end+1)
            for entity in sentence.findall("entity"):
                counts["entities"] += 1
                offsets = [(x[0], x[1] + 1) for x in Range.charOffsetToTuples(entity.get("charOffset"))]
                entityText = entity.get("text")
                for offset, entitySpan in zip(offsets, [sentenceText[x[0]:x[1]] for x in offsets]):
                    counts["entity-offsets"] += 1
                    lenOffset = offset[1] - offset[0]
                    offsetText, entityText = entityText[:lenOffset].strip(), entityText[lenOffset:].strip()
                    assert offsetText == entitySpan, (offsets, (entity.get("text"), entitySpan), (offsetText, entityText), sentenceText)
                entity.set("charOffset", Range.tuplesToCharOffset(offsets))
            # Convert positive pairs into interaction elements
            numInteractions = 0
            for pair in sentence.findall("pair"):
                counts["pairs"] += 1
                sentence.remove(pair)
                if pair.get("interaction") == "True":
                    del pair.attrib["interaction"]
                    pair.set("id", pair.get("id").rsplit(".", 1)[0] + ".i" + str(numInteractions))
                    pair.set("type", "PPI")
                    ET.SubElement(sentence, "interaction", pair.attrib)
                    numInteractions += 1
                    counts["interactions"] += 1
            sentencePos += len(sentenceText) + 1
    print >> sys.stderr, "Updated Interaction XML format:", dict(counts)
    return root
def makeTokenSubPhrases(tokens, phraseDict, includePOS=["PRP$", "IN", "WP$"]):
    newPhrases = []
    for i in range(len(tokens)):
        token = tokens[i]
        tokPOS = token.get("POS")
        if tokPOS in includePOS:
            tokOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
            if not phraseDict.has_key(tokOffset):
                newPhrase = makePhrase("TOK-t" + tokPOS, tokOffset, i, i)
                newPhrases.append(newPhrase)
                phraseDict[tokOffset] = [newPhrase]
    return newPhrases
def writeXMLSentence(self, examples, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=None, exampleStyle=None, structureAnalyzer=None):
    self.assertSameSentence(examples)
    sentenceElement = sentenceObject.sentence
    sentenceId = sentenceElement.get("id")
    sentenceText = sentenceElement.get("text")
    # detach analyses-element
    sentenceAnalysesElement = sentenceElement.find("sentenceanalyses")
    if sentenceAnalysesElement == None:
        sentenceAnalysesElement = sentenceElement.find("analyses")
    if sentenceAnalysesElement != None:
        sentenceElement.remove(sentenceAnalysesElement)
    # remove pairs and interactions
    interactions = self.removeChildren(sentenceElement, ["pair", "interaction"])
    # remove entities
    newEntityIdCount = IDUtils.getNextFreeId(sentenceElement.findall("entity"))
    nonNameEntities = self.removeNonNameEntities(sentenceElement)
    # add new pairs
    for example in examples:
        prediction = predictionsByExample[example[0]]
        entityElement = ET.Element("entity")
        #entityElement.attrib["given"] = "False"
        headToken = example[3]["t"]
        for token in sentenceObject.tokens:
            if token.get("id") == headToken:
                headToken = token
                break
        entityElement.set("charOffset", example[3]["charOffset"])
        entityElement.set("headOffset", headToken.get("charOffset"))
        entityElement.set("phraseType", example[3]["ptype"])
        entOffset = Range.charOffsetToSingleTuple(example[3]["charOffset"])
        entityElement.set("text", sentenceText[entOffset[0]:entOffset[1]])
        entityElement.set("id", sentenceId + ".e" + str(newEntityIdCount))
        self.setElementType(entityElement, prediction, classSet, classIds)
        newEntityIdCount += 1
        sentenceElement.append(entityElement)
    # re-attach the analyses-element
    if sentenceAnalysesElement != None:
        sentenceElement.append(sentenceAnalysesElement)
def fixIndices(phrases, tokens):
    fixCount = 0
    phraseCount = 0
    for phrase in phrases:
        fixed = False
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        phraseBegin = int(phrase.get("begin"))
        phraseEnd = int(phrase.get("end"))
        for i in range(len(tokens)):
            token = tokens[i]
            tokOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
            if tokOffset[0] == phraseOffset[0]:
                if phraseBegin != i:
                    phrase.set("begin", str(i))
                    fixed = True
            if tokOffset[1] == phraseOffset[1]:
                if phraseEnd != i:
                    phrase.set("end", str(i))
                    fixed = True
                break
        if fixed:
            fixCount += 1
        phraseCount += 1
def getEntityHeadToken(entity, tokens, tokenHeadScores):
    if entity.get("headOffset") != None:
        charOffsets = Range.charOffsetToTuples(entity.get("headOffset"))
    elif entity.get("charOffset") != "":
        charOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
    else:
        charOffsets = []
    # Each entity can consist of multiple syntactic tokens, covered by its
    # charOffset-range. One of these must be chosen as the head token.
    headTokens = [] # potential head tokens
    for token in tokens:
        tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
        for offset in charOffsets:
            if Range.overlap(offset, tokenOffset):
                headTokens.append(token)
    if len(headTokens) == 1: # An unambiguous head token was found
        selectedHeadToken = headTokens[0]
    else: # One head token must be chosen from the candidates
        selectedHeadToken = findHeadToken(headTokens, tokenHeadScores)
        #if verbose:
        #    print >> sys.stderr, "Selected head:", token.attrib["id"], token.attrib["text"]
    assert selectedHeadToken != None, entity.get("id")
    return selectedHeadToken
def removeNamedEntityPhrases(entities, phrases, phraseDict):
    neOffsets = set()
    for entity in entities:
        if entity.get("given") != "True":
            continue
        neOffsets.add(entity.get("charOffset"))
    phrasesToKeep = []
    for phrase in phrases:
        phraseOffset = phrase.get("charOffset")
        if phraseOffset in neOffsets:
            phraseOffsetTuple = Range.charOffsetToSingleTuple(phraseOffset)
            if phraseOffsetTuple in phraseDict:
                del phraseDict[phraseOffsetTuple]
        else:
            phrasesToKeep.append(phrase)
    #print >> sys.stderr, "Removed", len(phrases) - len(phrasesToKeep), "named entity phrases"
    return phrasesToKeep
def markNamedEntities(self, entityElements):
    """
    Marks tokens belonging to named entities
    """
    namedEntityTokens = []
    for entityElement in entityElements:
        offsets = []
        offsetStrings = entityElement.attrib["charOffset"].split(",")
        for offsetString in offsetStrings:
            charFrom, charTo = offsetString.split("-")
            offset = (int(charFrom), int(charTo))
            offsets.append(offset)
        for k, v in self.tokensById.iteritems():
            for offset in offsets:
                if Range.overlap(offset, v.charOffset):
                    v.entities.append(entityElement.attrib["id"])
                    namedEntityTokens.append(v.id)
    return namedEntityTokens
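# Sketch of the offset parsing used in markNamedEntities: a comma-separated
# charOffset attribute such as "0-3,10-17" yields one (begin, end) tuple per
# disjoint span. The overlap test below is an assumption consistent with
# end-inclusive offsets, not a verified Range.overlap implementation.
offsets = [tuple(int(x) for x in part.split("-")) for part in "0-3,10-17".split(",")]
assert offsets == [(0, 3), (10, 17)]
_overlapsInclusive = lambda a, b: a[0] <= b[1] and b[0] <= a[1]
assert _overlapsInclusive((0, 3), (2, 5))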
def convert(metamapEl, sentenceEl):
    """
    Convert MetaMap XML into phrase-elements
    """
    newMetamapEl = ET.Element("metamap") # make a new metamap element
    utteranceCount = 0
    for utterance in metamapEl.getiterator("Utterance"): # process all utterances (sentences)
        utteranceCount += 1
        #print "UT:", utterance.find("UttText").text
        uttOffsetBegin = int(utterance.find("UttStartPos").text)
        for phrase in utterance.getiterator("Phrase"): # process all phrases for each utterance
            #print "Phrase:", phrase.find("PhraseText").text
            phraseEl = ET.Element("phrase")
            phraseOffset = [int(phrase.find("PhraseStartPos").text), int(phrase.find("PhraseStartPos").text) + int(phrase.find("PhraseLength").text)]
            phraseOffset = [phraseOffset[0] - uttOffsetBegin, phraseOffset[1] - uttOffsetBegin]
            phraseEl.set("charOffset", Range.tuplesToCharOffset(phraseOffset))
            phraseEl.set("text", phrase.find("PhraseText").text)
            for candidate in phrase.getiterator("Candidate"): # process first candidate of each phrase
                phraseEl.set("score", candidate.find("CandidateScore").text)
                phraseEl.set("cui", candidate.find("CandidateCUI").text)
                phraseEl.set("matched", candidate.find("CandidateMatched").text)
                phraseEl.set("preferred", candidate.find("CandidatePreferred").text)
                semTypes = set()
                for semType in candidate.getiterator("SemType"):
                    semTypes.add(semType.text)
                phraseEl.set("semTypes", ",".join(sorted(list(semTypes))))
                sources = set()
                for source in candidate.getiterator("Source"):
                    sources.add(source.text)
                phraseEl.set("sources", ",".join(sorted(list(sources))))
                break
            if phraseEl.get("matched") != None: # include only matched phrases as new elements
                newMetamapEl.append(phraseEl)
                #print ET.tostring(phraseEl, "utf-8")
    if utteranceCount > 1:
        print >> sys.stderr, "Warning, sentence", sentenceEl.get("id"), "has", utteranceCount, "utterances"
    return newMetamapEl