def makeEntityElement(ann, idCount, docEl):
    """
    Build an Interaction XML "entity" element from an annotation object.

    ann: annotation providing type, text, id, charOffsets, alternativeOffsets,
         normalization, extra and fileType attributes
    idCount: running index used as the numeric part of the new entity id
    docEl: parent element whose "id" (and "origId") prefix the entity ids

    Returns the new element; it is not attached to docEl.
    """
    element = ET.Element("entity")
    element.set("type", ann.type)
    element.set("text", ann.text)
    # identifiers
    element.set("id", docEl.get("id") + ".e" + str(idCount))
    if ann.id is not None:
        element.set("origId", docEl.get("origId") + "." + str(ann.id))
    # offsets
    element.set("charOffset", Range.tuplesToCharOffset(ann.charOffsets))
    if len(ann.alternativeOffsets) > 0:
        # The end of each alternative span is written decremented by one
        spans = [str(span[0]) + "-" + str(span[1] - 1) for span in ann.alternativeOffsets]
        element.set("altOffset", ",".join(spans))
    if ann.normalization is not None:
        element.set("normalization", ann.normalization)
    addExtraToElement(element, ann.extra)
    # determine if given data
    assert ann.fileType in ["a1", "a2", "rel"], ann.fileType
    if ann.fileType == "a1":
        # Only annotations read from a1 files are marked as given
        element.set("given", "True")
    return element
def addParseElements(doc, docEl):
    """
    Add gold-standard tokenization and dependency parse elements to a sentence.

    doc: object providing "words" and "dependencies" sequences
    docEl: target element; anything other than a "sentence" element is ignored

    Tokens get the placeholder POS "None"; when a dependency type contains a
    colon, the POS tags of its two tokens are taken from the type string.
    """
    if docEl.tag != "sentence":
        return
    analyses = ET.SubElement(docEl, "analyses")
    parse = ET.SubElement(analyses, "parse")
    tokenization = ET.SubElement(analyses, "tokenization")
    parse.set("parser", "gold")
    parse.set("tokenizer", "gold")
    tokenization.set("tokenizer", "gold")

    # Tokens by word id, so dependencies can update their POS tags later
    tokenById = {}
    for word in doc.words:
        token = ET.SubElement(tokenization, "token")
        token.set("id", word.id)
        token.set("text", word.text)
        token.set("POS", "None")
        assert len(word.charOffsets) == 1, (word, word.charOffsets)
        token.set("charOffset", Range.tuplesToCharOffset(word.charOffsets))
        tokenById[word.id] = token

    for dep in doc.dependencies:
        dependency = ET.SubElement(parse, "dependency")
        dependency.set("id", dep.id)
        dependency.set("type", dep.type)
        assert len(dep.arguments) == 2
        t1 = dep.arguments[0].target.id
        t2 = dep.arguments[1].target.id
        dependency.set("t1", t1)
        dependency.set("t2", t2)
        if dep.type.find(":") == -1:
            continue
        # The text between the last colon and the first "(" is "POS1-POS2"
        posPair = dep.type.split("(")[0].split(":")[-1]
        word1Type, word2Type = posPair.split("-")
        tokenById[t1].set("POS", word1Type)
        tokenById[t2].set("POS", word2Type)
def makeEntityElement(ann, idCount, docEl):
    """
    Create an "entity" element describing the annotation ann.

    The element id is docEl's id followed by ".e" and idCount. The annotation's
    own id, if any, is stored in "origId" prefixed with docEl's origId.
    The returned element is not appended to docEl.
    """
    entity = ET.Element("entity")
    entity.set("type", ann.type)
    entity.set("text", ann.text)

    # identifiers
    entityId = docEl.get("id") + ".e" + str(idCount)
    entity.set("id", entityId)
    if ann.id is not None:
        entity.set("origId", docEl.get("origId") + "." + str(ann.id))

    # offsets
    entity.set("charOffset", Range.tuplesToCharOffset(ann.charOffsets))
    if len(ann.alternativeOffsets) > 0:
        altOffsetStrings = []
        for altOffset in ann.alternativeOffsets:
            begin = str(altOffset[0])
            end = str(altOffset[1] - 1)  # end is written decremented by one
            altOffsetStrings.append(begin + "-" + end)
        entity.set("altOffset", ",".join(altOffsetStrings))

    if ann.normalization is not None:
        entity.set("normalization", ann.normalization)
    addExtraToElement(entity, ann.extra)

    # determine if given data
    assert ann.fileType in ["a1", "a2", "rel"], ann.fileType
    isGiven = ann.fileType == "a1"
    if isGiven:
        entity.set("given", "True")
    return entity
def fixAltOffsets(input, output=None):
    """
    Shift entity "altOffset" attributes by the start of their sentence.

    input: corpus object accepted by ETUtils.ETFromObj
    output: if not None, the modified corpus is written there

    Every entity with an altOffset must have exactly one alternative span.
    Returns the corpus tree.
    """
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    sentences = list(corpusRoot.getiterator("sentence"))
    counter = ProgressCounter(len(sentences), "FixAltOffsets")
    numFixed = 0
    # fix spans
    for sentence in sentences:
        counter.update(1, "Fixing AltOffsets for sentence (" + sentence.get("id") + "): ")
        sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        for entity in sentence.findall("entity"):
            altOffsetString = entity.get("altOffset")
            if altOffsetString is None:
                continue
            altOffsets = Range.charOffsetToTuples(altOffsetString)
            assert len(altOffsets) == 1
            # Subtract the sentence start from both ends of each span
            shifted = [(span[0] - sentOffset[0], span[1] - sentOffset[0]) for span in altOffsets]
            entity.set("altOffset", Range.tuplesToCharOffset(shifted))
            numFixed += 1

    print >> sys.stderr, "Fixed", numFixed, "altOffsets"

    if output is not None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def addParseElements(doc, docEl):
    """
    Attach an "analyses" element with gold tokens and dependencies to docEl.

    Nothing is done unless docEl is a "sentence" element. One "token" element
    is made per word in doc.words and one "dependency" element per item in
    doc.dependencies (each must have exactly two arguments).
    """
    if docEl.tag != "sentence":
        return

    sentAnalysesEl = ET.SubElement(docEl, "analyses")
    parseEl = ET.SubElement(sentAnalysesEl, "parse")
    tokenizationEl = ET.SubElement(sentAnalysesEl, "tokenization")
    for key, value in (("parser", "gold"), ("tokenizer", "gold")):
        parseEl.set(key, value)
    tokenizationEl.set("tokenizer", "gold")

    tokens = {}  # word id -> token element
    for word in doc.words:
        tokEl = ET.SubElement(tokenizationEl, "token")
        tokEl.set("id", word.id)
        tokEl.set("text", word.text)
        tokEl.set("POS", "None")  # replaced below if a dependency defines it
        assert len(word.charOffsets) == 1, (word, word.charOffsets)
        tokEl.set("charOffset", Range.tuplesToCharOffset(word.charOffsets))
        tokens[word.id] = tokEl

    for dep in doc.dependencies:
        depEl = ET.SubElement(parseEl, "dependency")
        depEl.set("id", dep.id)
        depEl.set("type", dep.type)
        assert len(dep.arguments) == 2
        firstId = dep.arguments[0].target.id
        secondId = dep.arguments[1].target.id
        depEl.set("t1", firstId)
        depEl.set("t2", secondId)
        if ":" in dep.type:
            # Part before "(" and after the last ":" holds the two POS tags
            beforeParen = dep.type.split("(")[0]
            word1Type, word2Type = beforeParen.split(":")[-1].split("-")
            tokens[firstId].set("POS", word1Type)
            tokens[secondId].set("POS", word2Type)
def moveElements(document):
    """
    Move document-level entities and interactions into sentence elements.

    An entity goes to the first sentence whose charOffset overlaps any of the
    entity's spans. Its id is rebuilt on the sentence id, its offsets are made
    sentence-relative and the original value is kept in "origOffset".
    Interactions follow the sentence of their e1 entity and have their id,
    e1 and e2 attributes rewritten to the new ids.
    """
    entMap = {}  # old entity id -> new entity id
    entSentence = {}  # old entity id -> sentence element
    entSentenceIndex = {}  # old entity id -> index of the sentence
    sentenceCount = 0
    for sentence in document.findall("sentence"):
        sentenceOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        sentBegin = sentenceOffset[0]
        # Move entities
        entCount = 0
        for entity in document.findall("entity"):
            entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
            if not any(Range.overlap(sentenceOffset, span) for span in entityOffsets):
                continue
            document.remove(entity)
            sentence.append(entity)
            entityId = entity.get("id")
            entityIdLastPart = entityId.rsplit(".", 1)[-1]
            if entityIdLastPart.startswith("e"):
                # Keep the existing "eN" suffix
                newId = sentence.get("id") + "." + entityIdLastPart
            else:
                # Remember the unrecognized id and number the entity by position
                entity.set("docId", entityId)
                newId = sentence.get("id") + ".e" + str(entCount)
            entity.set("id", newId)
            entMap[entityId] = newId
            entSentence[entityId] = sentence
            entSentenceIndex[entityId] = sentenceCount
            newEntityOffsets = [(span[0] - sentBegin, span[1] - sentBegin) for span in entityOffsets]
            entity.set("origOffset", entity.get("charOffset"))
            entity.set("charOffset", Range.tuplesToCharOffset(newEntityOffsets))
            entCount += 1
        sentenceCount += 1
    # Move interactions
    # Interactions go to a sentence always by e1, as this is the event they are an argument of.
    # If an intersentence interaction is a relation, this shouldn't matter.
    for intCount, interaction in enumerate(document.findall("interaction")):
        targetSentence = entSentence[interaction.get("e1")]
        document.remove(interaction)
        targetSentence.append(interaction)
        interaction.set("id", targetSentence.get("id") + ".i" + str(intCount))
        for key in ("e1", "e2"):
            interaction.set(key, entMap[interaction.get(key)])
def updateXML(root, removeAnalyses=True):
    """
    Update an old-format Interaction XML tree in place.

    For every sentence: optionally removes the "sentenceanalyses" element,
    sets an artificial document-level charOffset, converts entity offsets from
    (begin,end) to (begin,end+1) and replaces "pair" elements with
    "interaction" elements of type "PPI" for the positive pairs.

    Returns root.
    """
    counts = defaultdict(int)
    for document in root.findall("document"):
        counts["documents"] += 1
        sentencePos = 0
        for sentence in document.findall("sentence"):
            counts["sentences"] += 1
            # Remove the original parses
            analyses = sentence.find("sentenceanalyses")
            if analyses is not None:
                counts["analyses"] += 1
                if removeAnalyses:
                    counts["removed-analyses"] += 1
                    sentence.remove(analyses)
            # Add an artificial sentence offset so that sentences can be exported as a single document
            sentenceText = sentence.get("text")
            sentenceEnd = sentencePos + len(sentenceText)
            sentence.set("charOffset", Range.tuplesToCharOffset((sentencePos, sentenceEnd)))
            # Update the character offsets of all entities from the old format (begin,end) to the new one (begin,end+1)
            for entity in sentence.findall("entity"):
                counts["entities"] += 1
                offsets = [(x[0], x[1] + 1) for x in Range.charOffsetToTuples(entity.get("charOffset"))]
                spans = [sentenceText[x[0]:x[1]] for x in offsets]
                # Consume the entity text piece by piece; each piece must equal its sentence span
                entityText = entity.get("text")
                for offset, entitySpan in zip(offsets, spans):
                    counts["entity-offsets"] += 1
                    lenOffset = offset[1] - offset[0]
                    offsetText = entityText[:lenOffset].strip()
                    entityText = entityText[lenOffset:].strip()
                    assert offsetText == entitySpan, (offsets, (entity.get("text"), entitySpan), (offsetText, entityText), sentenceText)
                entity.set("charOffset", Range.tuplesToCharOffset(offsets))
            # Convert positive pairs into interaction elements
            numInteractions = 0
            for pair in sentence.findall("pair"):
                counts["pairs"] += 1
                sentence.remove(pair)
                if pair.get("interaction") != "True":
                    continue
                del pair.attrib["interaction"]
                idPrefix = pair.get("id").rsplit(".", 1)[0]
                pair.set("id", idPrefix + ".i" + str(numInteractions))
                pair.set("type", "PPI")
                ET.SubElement(sentence, "interaction", pair.attrib)
                numInteractions += 1
                counts["interactions"] += 1
            # Leave a gap of one character between consecutive sentences
            sentencePos += len(sentenceText) + 1
    print >> sys.stderr, "Updated Interaction XML format:", dict(counts)
    return root
def processElements(xml):
    """
    Rename "ddi" elements to "interaction" and update all entity elements.

    Each entity is marked as given, and its semicolon-separated character
    offsets are rewritten with every span end incremented by one.
    """
    for ddi in xml.getiterator("ddi"):
        ddi.tag = "interaction"
    for entity in xml.getiterator("entity"):
        entity.set("given", "True")
        # Reformat disjoint character offsets and update character range format for TEES 2.0+
        oldSpans = Range.charOffsetToTuples(entity.get("charOffset"), rangeSep=";")
        newSpans = [(span[0], span[1] + 1) for span in oldSpans]
        entity.set("charOffset", Range.tuplesToCharOffset(newSpans))
def processElements(xml):
    """
    Convert DDI-style elements in xml to the Interaction XML conventions.

    All "ddi" elements get the tag "interaction". All "entity" elements get
    given="True" and a charOffset whose span ends are increased by one.
    """
    for ddiElement in xml.getiterator("ddi"):
        ddiElement.tag = "interaction"

    for entityElement in xml.getiterator("entity"):
        entityElement.set("given", "True")
        # Reformat disjoint character offsets and update character range format for TEES 2.0+
        parsed = Range.charOffsetToTuples(entityElement.get("charOffset"), rangeSep=";")
        updated = []
        for begin_end in parsed:
            begin = begin_end[0]
            end = begin_end[1] + 1
            updated.append((begin, end))
        entityElement.set("charOffset", Range.tuplesToCharOffset(updated))
def fixEntities(xml):
    """
    Realign entity character offsets with the sentence text.

    For each entity in each sentence, the span length is reset to the length
    of the entity text, the text is searched for in the sentence near the
    stated start position (also against a lowercased sentence), and the
    offset is moved to where the text was found. Every entity is marked as
    given and every interaction gets the type "DDI". Mismatch statistics are
    collected and printed at the end.

    xml: tree or element providing getiterator("sentence")
    """
    counts = defaultdict(int)
    for sentence in xml.getiterator("sentence"):
        sText = sentence.get("text")
        for entity in sentence.findall("entity"):
            charOffset = entity.get("charOffset")
            if charOffset == "-":
                # NOTE(review): the assertion always fails, so the two lines below
                # only run when assertions are disabled (python -O)
                assert False, str(entity)
                sentence.remove(entity)
                counts["removed-invalid"] += 1
            else:
                charOffset = Range.charOffsetToSingleTuple(charOffset)
                # fix length
                realLength = len(entity.get("text"))
                # The "+ 1" treats the stated end offset as inclusive
                lenDiff = (charOffset[1] - charOffset[0] + 1) - realLength
                # NOTE(review): this compares the length difference to the text
                # length; "lenDiff != 0" was presumably intended -- confirm. As
                # written, nearly every entity is counted as incorrect.
                if lenDiff != realLength:
                    counts["incorrect-ent-offset"] += 1
                    counts["incorrect-ent-offset-diff"+str(lenDiff)] += 1
                    if abs(lenDiff) > 2:
                        print "Warning, lenDiff:", (lenDiff, charOffset, sText, entity.get("text"), entity.get("id"))
                # From here on the end offset is exclusive: begin + text length
                charOffset = (charOffset[0], charOffset[0] + realLength)
                # find starting position
                entIndex = sText.find(entity.get("text"), charOffset[0])
                if entIndex == -1:
                    # Retry with the search start moved up to three characters earlier
                    for i in [-1,-2,-3]:
                        entIndex = sText.find(entity.get("text"), charOffset[0]+i)
                        if entIndex != -1:
                            break
                # NOTE(review): the test is "!= 0", not "== -1", so the lowercase
                # search also runs when an exact match was already found
                if entIndex != 0: # could be lowercase
                    sTextLower = sText.lower()
                    for i in [0,-1,-2,-3]:
                        lowerEntIndex = sTextLower.find(entity.get("text"), charOffset[0]+i)
                        if lowerEntIndex != -1:
                            break
                    # Prefer the lowercase match only if it is closer to the stated start
                    if lowerEntIndex != -1 and abs(lowerEntIndex - charOffset[0]) < abs(entIndex - charOffset[0]):
                        entIndex = lowerEntIndex
                assert entIndex != -1, (charOffset, sText, entity.get("text"), entity.get("id"))
                indexDiff = entIndex - charOffset[0]
                if indexDiff != 0:
                    counts["incorrect-ent-index"] += 1
                    counts["incorrect-ent-index-diff"+str(indexDiff)] += 1
                    print "Warning, indexDiff:", (indexDiff, charOffset, sText, entity.get("text"), entity.get("id"))
                # move offset
                charOffset = (charOffset[0]+indexDiff, charOffset[1]+indexDiff)
                # validate new offset
                sEntity = sText[charOffset[0]:charOffset[1]]
                assert sEntity == entity.get("text") or sEntity.lower() == entity.get("text"), (charOffset, sText, entity.get("text"), entity.get("id"))
                entity.set("charOffset", Range.tuplesToCharOffset( (charOffset[0], charOffset[1])))
                entity.set("given", "True")
        for interaction in sentence.findall("interaction"):
            interaction.set("type", "DDI")
    print "Fix counts:", counts
def makeDDI13SubmissionFile(input, output, mode="interactions", idfilter=None):
    """
    Write a pipe-separated DDI'13 submission file from an Interaction XML corpus.

    input: corpus object accepted by ETUtils.ETFromObj
    output: path of the text file to write
    mode: "entities" writes one line "sentenceId|offsets|text|type" per entity
          whose type is not "neg"; "interactions" writes one line
          "sentenceId|e1|e2|1|type" or "sentenceId|e1|e2|0|null" for every
          pair of entities in a sentence
    idfilter: if not None, only sentences whose id contains this string are used
    """
    xml = ETUtils.ETFromObj(input)
    # The context manager closes the file even if writing raises an exception
    with open(output, "wt") as outFile:
        for sentence in xml.getiterator("sentence"):
            sentenceId = sentence.get("id")
            if idfilter is not None and idfilter not in sentenceId:
                continue
            # Output entities
            if mode == "entities":
                for entity in sentence.findall("entity"):
                    if entity.get("type") == "neg":
                        continue
                    # Span ends are written decremented by one
                    offsets = [(x[0], x[1] - 1) for x in Range.charOffsetToTuples(entity.get("charOffset"))]
                    # Build the whole line first so that a failure cannot leave a partial line
                    line = sentenceId
                    line += "|" + Range.tuplesToCharOffset(offsets, rangeSep=";")
                    line += "|" + entity.get("text")
                    line += "|" + entity.get("type")
                    outFile.write(line + "\n")
            if mode == "interactions":
                # First determine which pairs interact
                intMap = defaultdict(lambda: defaultdict(lambda: None))
                for interaction in sentence.findall("interaction"):
                    # Make mapping both ways to discard edge directionality. This isn't actually needed,
                    # since MultiEdgeExampleBuilder builds entity pairs in the same order as this function,
                    # but shouldn't harm to include it and now it works regardless of pair direction.
                    if interaction.get("type") != "neg" and interaction.get("given") != "True":
                        intMap[interaction.get("e1")][interaction.get("e2")] = interaction
                        intMap[interaction.get("e2")][interaction.get("e1")] = interaction
                # Then write all pairs to the output file
                entities = sentence.findall("entity")
                for i in range(0, len(entities) - 1):
                    for j in range(i + 1, len(entities)):
                        eIId = entities[i].get("id")
                        eJId = entities[j].get("id")
                        interaction = intMap[eIId][eJId]
                        if interaction is not None:
                            assert interaction.get("type") != "neg"
                            outFile.write(sentenceId + "|" + eIId + "|" + eJId + "|1|" + interaction.get("type") + "\n")
                        else:
                            outFile.write(sentenceId + "|" + eIId + "|" + eJId + "|0|null\n")
def makeDDI13SubmissionFile(input, output, mode="interactions", idfilter=None):
    """
    Export entities or entity pairs of a corpus in the DDI'13 submission format.

    input: corpus object accepted by ETUtils.ETFromObj
    output: path of the text file to write
    mode: "entities" -> lines "sentenceId|offsets|text|type" for entities whose
          type is not "neg"; "interactions" -> lines "sentenceId|e1|e2|1|type"
          for interacting entity pairs and "sentenceId|e1|e2|0|null" for the rest
    idfilter: if not None, sentences whose id does not contain it are skipped
    """
    xml = ETUtils.ETFromObj(input)
    # Use a context manager so the file handle is released on errors as well
    with open(output, "wt") as outFile:
        for sentence in xml.getiterator("sentence"):
            sentenceId = sentence.get("id")
            if idfilter is not None and idfilter not in sentenceId:
                continue
            # Output entities
            if mode == "entities":
                for entity in sentence.findall("entity"):
                    if entity.get("type") == "neg":
                        continue
                    # Span ends are written decremented by one
                    offsets = [(x[0], x[1] - 1) for x in Range.charOffsetToTuples(entity.get("charOffset"))]
                    # Assemble the full line before writing to avoid partial lines on failure
                    fields = [sentenceId,
                              Range.tuplesToCharOffset(offsets, rangeSep=";"),
                              entity.get("text"),
                              entity.get("type")]
                    outFile.write("|".join(fields) + "\n")
            if mode == "interactions":
                # First determine which pairs interact
                intMap = defaultdict(lambda: defaultdict(lambda: None))
                for interaction in sentence.findall("interaction"):
                    # Make mapping both ways to discard edge directionality. This isn't actually needed,
                    # since MultiEdgeExampleBuilder builds entity pairs in the same order as this function,
                    # but shouldn't harm to include it and now it works regardless of pair direction.
                    if interaction.get("type") != "neg" and interaction.get("given") != "True":
                        intMap[interaction.get("e1")][interaction.get("e2")] = interaction
                        intMap[interaction.get("e2")][interaction.get("e1")] = interaction
                # Then write all pairs to the output file
                entities = sentence.findall("entity")
                for i in range(0, len(entities) - 1):
                    for j in range(i + 1, len(entities)):
                        eIId = entities[i].get("id")
                        eJId = entities[j].get("id")
                        prefix = sentenceId + "|" + eIId + "|" + eJId + "|"
                        interaction = intMap[eIId][eJId]
                        if interaction is not None:
                            assert interaction.get("type") != "neg"
                            outFile.write(prefix + "1|" + interaction.get("type") + "\n")
                        else:
                            outFile.write(prefix + "0|null\n")
def convert(metamapEl, sentenceEl):
    """
    Convert MetaMap XML into phrase-elements

    metamapEl: element containing MetaMap "Utterance" elements
    sentenceEl: sentence element, used only for its id in the warning message

    Returns a new "metamap" element holding one "phrase" element per phrase
    that has at least one candidate; only the first candidate is used.
    """
    result = ET.Element("metamap")
    utteranceCount = 0
    for utterance in metamapEl.getiterator("Utterance"):
        utteranceCount += 1
        uttBegin = int(utterance.find("UttStartPos").text)
        for phrase in utterance.getiterator("Phrase"):
            phraseEl = ET.Element("phrase")
            # Phrase span, made relative to the start of the utterance
            begin = int(phrase.find("PhraseStartPos").text)
            end = int(phrase.find("PhraseStartPos").text) + int(phrase.find("PhraseLength").text)
            phraseOffset = [begin - uttBegin, end - uttBegin]
            phraseEl.set("charOffset", Range.tuplesToCharOffset(phraseOffset))
            phraseEl.set("text", phrase.find("PhraseText").text)
            for candidate in phrase.getiterator("Candidate"):
                # Simple one-to-one attributes copied from the candidate
                for attribute, tag in (("score", "CandidateScore"),
                                       ("cui", "CandidateCUI"),
                                       ("matched", "CandidateMatched"),
                                       ("preferred", "CandidatePreferred")):
                    phraseEl.set(attribute, candidate.find(tag).text)
                # Multi-valued attributes are deduplicated, sorted and comma-joined
                semTypes = set(x.text for x in candidate.getiterator("SemType"))
                phraseEl.set("semTypes", ",".join(sorted(semTypes)))
                sources = set(x.text for x in candidate.getiterator("Source"))
                phraseEl.set("sources", ",".join(sorted(sources)))
                break  # process first candidate of each phrase
            # include only matched phrases as new elements
            if phraseEl.get("matched") is not None:
                result.append(phraseEl)
    if utteranceCount > 1:
        print >> sys.stderr, "Warning, sentence", sentenceEl.get("id"), "has", utteranceCount, "utterances"
    return result
def fixAltOffsets(input, output=None):
    """
    Make entity altOffset attributes relative to their sentence.

    Loads the corpus from input, subtracts the sentence start offset from the
    single alternative span of every entity that has an "altOffset"
    attribute, and optionally writes the result to output.

    Returns the corpus tree.
    """
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    sentences = [x for x in corpusRoot.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "FixAltOffsets")
    fixCount = 0

    # fix spans
    for sentence in sentences:
        counter.update(1, "Fixing AltOffsets for sentence (" + sentence.get("id") + "): ")
        sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        entitiesWithAlt = [x for x in sentence.findall("entity") if x.get("altOffset") is not None]
        for entity in entitiesWithAlt:
            altOffsets = Range.charOffsetToTuples(entity.get("altOffset"))
            assert len(altOffsets) == 1
            for index, altOffset in enumerate(altOffsets):
                altOffsets[index] = (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0])
            entity.set("altOffset", Range.tuplesToCharOffset(altOffsets))
            fixCount += 1

    print >> sys.stderr, "Fixed", fixCount, "altOffsets"

    if output is not None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def mergeSentences(input, output, verbose=False):
    """
    Merge the sentence elements of each document into the document itself.

    The sentences of every document are removed, their texts (head + text +
    tail) are concatenated into the document text, and their entities and
    interactions are moved to the document level with offsets and ids
    re-mapped accordingly.

    input: corpus object accepted by ETUtils.ETFromObj
    output: if not None, the modified corpus is written there
    verbose: print a warning for documents with trailing whitespace that is
             missing from the combined sentence text

    Returns the corpus tree. Raises Exception if a document has non-sentence
    children, if only some of its sentences define a charOffset, or if the
    merged texts or offsets are inconsistent with the existing attributes.
    """
    print >> sys.stderr, "Merging sentences into documents"
    print >> sys.stderr, "Loading corpus file", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()

    counts = defaultdict(int)
    for document in corpusRoot.findall("document"):
        counts["documents"] += 1
        # Check that the document has only sentence elements as children
        children = [x for x in document]
        docChildTypes = sorted(set([x.tag for x in children]))
        if len(docChildTypes) == 0:
            counts["documents-with-no-sentences"] += 1
            continue
        elif len(docChildTypes) > 1 or docChildTypes[0] != "sentence":
            raise Exception("Document '" + str(document.get("id")) + "' has non-sentence children: " + str(docChildTypes))
        # Process all the child sentence elements
        docId = document.get("id")
        interactions = []
        entities = []
        entityById = {}
        interactionById = {}
        combinedText = ""
        calculatedOffset = (0, 0)
        for sentence in children:
            document.remove(sentence)
            sentenceText = sentence.get("head", "") + sentence.get("text", "") + sentence.get("tail", "")
            sentOffset = sentence.get("charOffset")
            # Either all sentences of a document define an offset or none of them do
            if sentence == children[0]:
                noDefinedOffsets = sentOffset == None
            elif (sentOffset == None) != noDefinedOffsets:
                raise Exception("Only some sentences in document '" + docId + "' have defined offsets")
            if sentOffset == None:
                # Without offsets, sentences are joined with a single space
                if sentence != children[-1]:
                    sentenceText = sentenceText + " "
                calculatedOffset = (calculatedOffset[1], calculatedOffset[1] + len(sentenceText))
                sentOffset = calculatedOffset
            else:
                sentOffset = Range.charOffsetToSingleTuple(sentOffset)
            combinedText += sentenceText
            # Collect and update the entity elements
            for entity in sentence.findall("entity"):
                # Map sentence-level entity offsets to document level
                for offsetKey in ("charOffset", "headOffset"):
                    if entity.get(offsetKey) != None:
                        offset = Range.charOffsetToTuples(entity.get(offsetKey))
                        for i in range(len(offset)):
                            offset[i] = (offset[i][0] + sentOffset[0], offset[i][1] + sentOffset[0])
                        entity.set(offsetKey, Range.tuplesToCharOffset(offset))
                # Compare mapped offsets to origOffset, if available
                if entity.get("origOffset") != None:
                    if entity.get("charOffset") != entity.get("origOffset"):
                        raise Exception("Document '" + str(document.get("id")) + "' entity '" + str(entity.get("id")) + "' new charOffset differs from origOffset: " + str([entity.get("charOffset"), entity.get("origOffset")]))
                    counts["checked-origOffsets"] += 1
                    del entity.attrib["origOffset"]
                assert entity.get("id") not in entityById
                entityById[entity.get("id")] = entity # For re-mapping the interaction 'e1' and 'e2' attributes
                entities.append(entity)
                counts["moved-entities"] += 1
            # Collect and update the interaction elements
            for interaction in sentence.findall("interaction"):
                assert interaction.get("id") not in interactionById
                interactionById[interaction.get("id")] = interaction # For re-mapping the interaction 'siteOf' attributes
                interactions.append(interaction)
                counts["moved-interactions"] += 1
        # Check that the combined sentence text matches the document text, if available
        if document.get("text") != None and document.get("text") != combinedText:
            if combinedText == document.get("text")[0:len(combinedText)] and document.get("text")[len(combinedText):].strip() == "":
                if verbose:
                    print >> sys.stderr, "Warning, document '" + document.get("id") + "' text has trailing whitespace not included in the combined sentence text"
                combinedText = document.get("text")
                counts["missing-trailing-whitespace"] += 1
            else:
                raise Exception("Document '" + str(document.get("id")) + "' text differs from combined sentence text: " + str([document.get("text"), combinedText]))
            counts["checked-document-texts"] += 1
        # Check that the entities' texts match the document text
        for entity in entities:
            offset = Range.charOffsetToTuples(entity.get("charOffset"))
            if len(offset) == 1: # Compare only continuous entities
                if not Range.contains((0, len(combinedText)), offset[0]):
                    raise Exception("Document '" + str(document.get("id")) + "' entity '" + str(entity.get("id")) + "' offset is not contained in combined sentence text: " + str([entity.attrib, offset, [0, len(combinedText)], combinedText]))
                combTextSpan = combinedText[offset[0][0]:offset[0][1]]
                if entity.get("text") != combTextSpan:
                    raise Exception("Document '" + str(document.get("id")) + "' entity '" + str(entity.get("id")) + "' text does not match combined sentence text: " + str([entity.get("text"), combTextSpan]))
                counts["checked-charOffsets"] += 1
        # Set the combined text as the document text
        document.set("text", combinedText)
        # Update entity and interaction ids (not done earlier so that possible error messages will refer to original ids, also because of siteOf-remapping)
        for i, entity in enumerate(entities):
            entity.set("id", docId + ".e" + str(i)) # Update the id for the document level
        # Every collected interaction must be renumbered here: the e1/e2/siteOf
        # re-mapping below reads the new ids through entityById/interactionById.
        for i, interaction in enumerate(interactions):
            interaction.set("id", docId + ".i" + str(i)) # Update the id for the document level
        # Update interaction e1 and e2 ids (cannot be done earlier because interactions may refer to entities from multiple sentences)
        for interaction in interactions:
            for entKey in ("e1", "e2"):
                interaction.set(entKey, entityById[interaction.get(entKey)].get("id"))
            if interaction.get("siteOf") != None:
                interaction.set("siteOf", interactionById[interaction.get("siteOf")].get("id"))
        # Add the entity and interaction elements to the document
        document.extend(entities)
        document.extend(interactions)
    print >> sys.stderr, "Counts:", dict(counts)

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def moveElements(document):
    """
    Distribute document-level entities and interactions into the sentences.

    Each entity is moved to the first sentence that overlaps one of its spans,
    gets a sentence-based id and sentence-relative offsets, and keeps its old
    charOffset in "origOffset". Interactions are moved to the sentence of their
    e1 entity; their id, e1, e2 and siteOf attributes are rewritten to the new
    ids.

    Raises Exception if an entity is left at the document level after all
    sentences have been processed.
    """
    entMap = {}  # old entity id -> new entity id
    entSentence = {}  # old entity id -> sentence element
    entSentenceIndex = {}  # old entity id -> index of the sentence
    sentenceCount = 0
    for sentence in document.findall("sentence"):
        sentenceOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        # Move entities
        entCount = 0
        for entity in document.findall("entity"):
            entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
            if not any(Range.overlap(sentenceOffset, span) for span in entityOffsets):
                continue
            document.remove(entity)
            sentence.append(entity)
            entityId = entity.get("id")
            entityIdLastPart = entityId.rsplit(".", 1)[-1]
            if entityIdLastPart.startswith("e"):
                newEntityId = sentence.get("id") + "." + entityIdLastPart
            else:
                # Keep the unrecognized id and number the entity by position
                entity.set("docId", entityId)
                newEntityId = sentence.get("id") + ".e" + str(entCount)
            entity.set("id", newEntityId)
            entMap[entityId] = newEntityId
            entSentence[entityId] = sentence
            entSentenceIndex[entityId] = sentenceCount
            newEntityOffsets = []
            for entityOffset in entityOffsets:
                shifted = (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0])
                # The clamped span is used only for validation; a span that
                # clamps to (0, 0) is dropped, otherwise the unclamped shifted
                # span is stored.
                clamped = (max(0, shifted[0]), max(0, shifted[1]))
                if clamped == (0, 0):
                    continue
                assert clamped[1] > clamped[0], (entity.attrib, entityOffsets, sentenceOffset)
                newEntityOffsets.append(shifted)
            assert len(newEntityOffsets) > 0, (entity.attrib, entityOffsets, sentenceOffset)
            entity.set("origOffset", entity.get("charOffset"))
            entity.set("charOffset", Range.tuplesToCharOffset(newEntityOffsets))
            entCount += 1
        sentenceCount += 1
    if len(document.findall("entity")) != 0:
        raise Exception("Sentence splitting does not cover the entire document")
    # Move interactions
    # Interactions go to a sentence always by e1, as this is the event they are an argument of.
    # If an intersentence interaction is a relation, this shouldn't matter.
    interactions = document.findall("interaction")
    interactionOldToNewId = {}
    for intCount, interaction in enumerate(interactions):
        targetSentence = entSentence[interaction.get("e1")]
        document.remove(interaction)
        targetSentence.append(interaction)
        newId = targetSentence.get("id") + ".i" + str(intCount)
        interactionOldToNewId[interaction.get("id")] = newId
        interaction.set("id", newId)
        for key in ("e1", "e2"):
            interaction.set(key, entMap[interaction.get(key)])
    # siteOf can be re-mapped only after all interactions have their new ids
    for interaction in interactions:
        siteOf = interaction.get("siteOf")
        if siteOf is not None:
            interaction.set("siteOf", interactionOldToNewId[siteOf])
def extend(input, output=None, entityTypes=["Bacterium"], verbose=False):
    """
    Extend entity spans from their head token to neighbouring name tokens.

    For each entity whose type is in entityTypes, the token overlapping the
    head offset is located and the span is grown backwards and forwards over
    tokens accepted by isBacteriaToken. Non-alphanumeric characters are then
    trimmed from both ends, and the entity's "charOffset" and "text"
    attributes are overwritten with the new span. The new span is compared
    with the entity's previous charOffset to count "correct"/"incorrect"
    results.

    input: a corpus object accepted by ETUtils.ETFromObj, or a single
           "sentence" element
    output: if not None (and input is a corpus), the corpus is written there
    entityTypes: entity types to process (the default list is only read here)
    verbose: print per-entity diagnostics and the final counts

    Returns the corpus tree when input is a corpus; when input is a single
    sentence element nothing is returned (None).
    """
    if not (ET.iselement(input) and input.tag == "sentence"):
        print >> sys.stderr, "Loading corpus file", input
        corpusTree = ETUtils.ETFromObj(input)
        corpusRoot = corpusTree.getroot()

    bacteriaTokens = ExampleBuilders.PhraseTriggerExampleBuilder.getBacteriaTokens()

    if not (ET.iselement(input) and input.tag == "sentence"):
        sentences = corpusRoot.getiterator("sentence")
    else:
        sentences = [input]
    counts = defaultdict(int)
    for sentence in sentences:
        incorrectCount = 0
        sentenceText = sentence.get("text")
        # presumably the tokens (whitespace included) concatenate back to the
        # sentence text, as positions below advance by len(token) -- verify
        tokens = tokenize(sentenceText)
        for entity in sentence.findall("entity"):
            counts["all-entities"] += 1
            if entity.get("type") not in entityTypes:
                continue
            headOffset = entity.get("headOffset")
            if headOffset == None:
                if verbose: print "WARNING, no head offset for entity", entity.get("id")
                # Fall back to the full entity span
                headOffset = entity.get("charOffset")
            headOffset = Range.charOffsetToTuples(headOffset)[0]
            charOffset = entity.get("charOffset")
            # NOTE(review): this checks charOffset although the message says "head offset"
            assert charOffset != None, "WARNING, no head offset for entity " + str(entity.get("id"))
            charOffset = Range.charOffsetToTuples(charOffset)[0]
            tokPos = [0,0]
            tokIndex = None
            # find main token
            for i in range(len(tokens)):
                token = tokens[i]
                tokPos[1] = tokPos[0] + len(token) # - 1
                if Range.overlap(headOffset, tokPos):
                    tokIndex = i
                    break
                tokPos[0] += len(token)
            assert tokIndex != None, (entity.get("id"), entity.get("text"), tokens)
            skip = False
            # If the token starts before the head offset, keep the head span as is
            if tokPos[0] < headOffset[0]:
                tokPos = headOffset
                skip = True
            if not skip:
                # Extend before
                beginIndex = tokIndex
                for i in range(tokIndex-1, -1, -1):
                    token = tokens[i]
                    if token.isspace():
                        continue
                    if not isBacteriaToken(token, bacteriaTokens, i - tokIndex):
                        beginIndex = i + 1
                        break
                    if i == 0:
                        beginIndex = i
                # Skip leading whitespace and extra words, but never past the main token
                while tokens[beginIndex].isspace() or isExtraWord(tokens[beginIndex], toLower=False):
                    beginIndex += 1
                    if beginIndex >= tokIndex:
                        beginIndex = tokIndex
                        break
                # Extend after
                endIndex = tokIndex
                # A main token ending in a comma is not extended forwards
                if tokens[tokIndex][-1] != ",":
                    endIndex = tokIndex
                    for i in range(tokIndex+1, len(tokens)):
                        token = tokens[i]
                        if token.isspace():
                            continue
                        if not isBacteriaToken(token, bacteriaTokens, i - tokIndex):
                            endIndex = i - 1
                            break
                        if i == len(tokens) - 1:
                            endIndex = i
                    # Drop trailing whitespace tokens
                    while tokens[endIndex].isspace():
                        endIndex -= 1
                # Modify range
                if tokIndex > beginIndex:
                    for token in reversed(tokens[beginIndex:tokIndex]):
                        tokPos[0] -= len(token)
                if tokIndex < endIndex:
                    for token in tokens[tokIndex+1:endIndex+1]:
                        tokPos[1] += len(token)
                # Attempt to remove trailing periods and commas
                while not sentenceText[tokPos[1] - 1].isalnum():
                    tokPos[1] -= 1
                    # Keep the span at least one character long
                    if tokPos[1] < tokPos[0] + 1:
                        tokPos[1] = tokPos[0] + 1
                        break
                while not sentenceText[tokPos[0]].isalnum():
                    tokPos[0] += 1
                    if tokPos[0] >= tokPos[1]:
                        tokPos[0] = tokPos[1] - 1
                        break
                # Split merged names
                #newPos = [tokPos[0], tokPos[1]]
                #for split in sentenceText[tokPos[0]:tokPos[1]+1].split("/"):
                #    newPos[0] += len(split)
                #    if
            # Insert changed charOffset
            counts["entities"] += 1
            newOffset = tuple(tokPos)
            newOffsetString = Range.tuplesToCharOffset([newOffset])
            if verbose:
                print "Entity", entity.get("id"),
                #print [entity.get("text"), sentenceText[headOffset[0]:headOffset[1]+1], sentenceText[newOffset[0]:newOffset[1]+1]],
                print [entity.get("text"), sentenceText[headOffset[0]:headOffset[1]], sentenceText[newOffset[0]:newOffset[1]]],
                print [entity.get("charOffset"), entity.get("headOffset"), newOffsetString], "Sent:", len(sentence.get("text")),
            if newOffset != headOffset:
                counts["extended"] += 1
                if verbose: print "EXTENDED",
            # The entity's previous charOffset serves as the reference span
            if newOffset == charOffset:
                counts["correct"] += 1
                if verbose: print "CORRECT"
            else:
                counts["incorrect"] += 1
                incorrectCount += 1
                if verbose: print "INCORRECT"
            entity.set("charOffset", newOffsetString)
            #entity.set("text", sentenceText[newOffset[0]:newOffset[1]+1])
            entity.set("text", sentenceText[newOffset[0]:newOffset[1]])
        if incorrectCount > 0 and verbose:
            print "TOKENS:", "|".join(tokens)
            print "--------------------------------"
    if verbose:
        print counts

    if not (ET.iselement(input) and input.tag == "sentence"):
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusRoot, output)
        return corpusTree
def convertChemProt(inDirs=None, setNames=None, outPath=None, goldTestSet=True, downloadDir=None, extractDir=None, redownload=False, debug=False):
    """
    Convert ChemProt TSV files into an Interaction XML corpus.

    Parameters:
      inDirs      -- list of directories to scan for "*.tsv" files whose name
                     contains "_abstracts", "_entities" or "_relations". When
                     None, the TRAIN, DEVEL and TEST (TEST_GOLD if goldTestSet)
                     packages are fetched with downloadFile from the
                     Settings.URL["CP17_<set>"] entries that are not None.
      setNames    -- optional dict mapping a dataset id parsed from a file
                     name to the id stored in the "set" attribute.
      outPath     -- if not None, the corpus is written there with ETUtils.write.
      goldTestSet -- download "TEST_GOLD" instead of "TEST".
      downloadDir, redownload -- passed through to downloadFile.
      extractDir  -- base directory for extracted downloads; a temporary
                     directory is created when this is None.
      debug       -- keep the temporary directory instead of removing it.

    Returns an ET.ElementTree whose root is the "corpus" element.

    Raises AssertionError on duplicate data files, duplicate entity ids,
    unexpected entity types or malformed relation arguments, and KeyError
    when an entity or relation row names a document that was not read.
    Entities whose offsets do not reproduce their text, and relations with a
    missing argument entity, are counted as errors and skipped.
    """
    tempDir = None
    if inDirs is None:
        print >> sys.stderr, "---------------", "Downloading ChemProt files", "---------------"
        if extractDir is None:
            tempDir = tempfile.mkdtemp()
        inDirs = []
        for setName in ("TRAIN", "DEVEL", "TEST"):
            if goldTestSet and setName == "TEST":
                setName = "TEST_GOLD"
            url = Settings.URL["CP17_" + setName]
            if url is not None:
                currentExtractDir = extractDir if extractDir else tempDir
                currentExtractDir = os.path.join(currentExtractDir, setName.lower())
                inDirs.append(downloadFile(url, downloadDir, currentExtractDir, redownload))

    print >> sys.stderr, "Reading ChemProt corpus from input", inDirs, "using dataset mapping", setNames
    dataSets = OrderedDict()
    for inDir in inDirs:
        print >> sys.stderr, "Reading input directory", inDir
        filenames = os.listdir(inDir)
        filetypes = ["_abstracts", "_entities", "_relations"]
        # Collect the file paths for the data types
        dirDataSets = set()
        for filename in filenames:
            if not (filename.endswith(".tsv") and any([x in filename for x in filetypes])):
                continue
            # "<set>[_gs]_<type>.tsv" -> dataset id and data type
            dataSetId, dataType = filename.replace("_gs", "").rsplit("_", 1)
            if setNames is not None:
                dataSetId = setNames.get(dataSetId, dataSetId)
            dirDataSets.add(dataSetId)
            dataType = dataType.split(".")[0]
            if dataSetId not in dataSets:
                dataSets[dataSetId] = {}
            assert dataType not in dataSets[dataSetId]
            dataSets[dataSetId][dataType] = os.path.join(inDir, filename)
        print >> sys.stderr, "Found ChemProt datasets", list(dirDataSets), "at", inDir
    print >> sys.stderr, "Read datasets:", dataSets.keys()

    # Build the Interaction XML
    print >> sys.stderr, "Converting to Interaction XML"
    corpusName = "CP17"
    corpus = ET.Element("corpus", {"source": corpusName})
    counts = defaultdict(int)
    docById = {}
    entityById = {}  # docId -> {entity origId -> entity element}
    entitiesByDoc = {}  # docId -> set of every entity id seen, aligned or not
    docsWithErrors = set()
    for dataSetId in sorted(dataSets.keys()):
        prevCounts = copy.copy(counts)
        print >> sys.stderr, "---", "Building elements for dataset", dataSetId, "---"
        dataSet = dataSets[dataSetId]
        counts["sets"] += 1

        with open(dataSet["abstracts"], "rt") as f:
            print >> sys.stderr, "Adding document elements for dataset", dataSetId
            for row in UnicodeDictReader(f, delimiter="\t", fieldnames=["id", "title", "abstract"], quoting=csv.QUOTE_NONE):
                document = ET.Element("document", {"id": corpusName + ".d" + str(counts["documents"]), "origId": row["id"], "set": dataSetId})
                document.set("text", row["title"] + " " + row["abstract"])
                document.set("titleOffset", Range.tuplesToCharOffset((0, len(row["title"]))))
                origId = document.get("origId")
                if origId in docById:
                    # A repeated document must be identical to the first copy
                    assert document.get("text") == docById[origId].get("text")
                    assert document.get("titleOffset") == docById[origId].get("titleOffset")
                    counts["duplicate-documents"] += 1
                else:
                    corpus.append(document)
                    docById[origId] = document
                    counts["documents"] += 1

        with open(dataSet["entities"], "rt") as f:
            print >> sys.stderr, "Adding entity elements for dataset", dataSetId
            for row in UnicodeDictReader(f, delimiter="\t", fieldnames=["docId", "id", "type", "begin", "end", "text"], quoting=csv.QUOTE_NONE):
                docId = row["docId"]
                document = docById[docId]
                assert row["type"] in ("CHEMICAL", "GENE-Y", "GENE-N")
                # Check for duplicate entities
                if docId not in entitiesByDoc:
                    entitiesByDoc[docId] = set()
                assert row["id"] not in entitiesByDoc[docId]
                entitiesByDoc[docId].add(row["id"])
                # Determine the offset
                offset = (int(row["begin"]), int(row["end"]))
                docSpan = document.get("text")[offset[0]:offset[1]]
                if docSpan == row["text"]:
                    entity = ET.SubElement(document, "entity", {"id": document.get("id") + ".e" + str(len(document.findall("entity")))})
                    entity.set("given", "True")
                    entity.set("origId", row["id"])
                    entity.set("type", row["type"].split("-")[0])
                    entity.set("normalized", "True" if row["type"].endswith("-Y") else "False")
                    entity.set("charOffset", Range.tuplesToCharOffset((offset[0], offset[1])))
                    entity.set("text", row["text"])
                    if docId not in entityById:
                        entityById[docId] = {}
                    assert entity.get("origId") not in entityById[docId]
                    entityById[docId][entity.get("origId")] = entity
                    counts["entities"] += 1
                else:
                    print >> sys.stderr, "Alignment error in document", docId, (offset, docSpan, row)
                    counts["entities-error"] += 1
                    docsWithErrors.add(docId)

        if "relations" in dataSet:
            print >> sys.stderr, "Adding relation elements for dataset", dataSetId
            with open(dataSet["relations"], "rt") as f:
                for row in UnicodeDictReader(f, delimiter="\t", fieldnames=["docId", "group", "groupEval", "type", "arg1", "arg2"], quoting=csv.QUOTE_NONE):
                    for argId in ("1", "2"):
                        assert row["arg" + argId].startswith("Arg" + argId + ":")
                        row["arg" + argId] = row["arg" + argId][5:]  # drop the "ArgN:" prefix
                    docId = row["docId"]
                    document = docById[docId]
                    # A document with no successfully aligned entity has no
                    # entry in entityById; handle it like any other missing
                    # argument instead of failing with KeyError.
                    docEntities = entityById.get(docId, {})
                    e1 = docEntities.get(row["arg1"])
                    e2 = docEntities.get(row["arg2"])
                    if e1 is not None and e2 is not None:
                        interaction = ET.SubElement(document, "interaction", {"id": document.get("id") + ".i" + str(len(document.findall("interaction")))})
                        interaction.set("directed", "True")
                        interaction.set("type", row["group"])
                        interaction.set("relType", row["type"])
                        row["groupEval"] = row["groupEval"].strip()
                        assert row["groupEval"] in ("Y", "N")
                        interaction.set("evaluated", "True" if row["groupEval"] == "Y" else "False")
                        interaction.set("e1", e1.get("id"))
                        interaction.set("e2", e2.get("id"))
                        counts["interactions"] += 1
                    else:
                        counts["interaction-error"] += 1
                        docsWithErrors.add(docId)
        else:
            print >> sys.stderr, "No relations for dataset", dataSetId
        # Report only the counters that changed while processing this dataset
        print >> sys.stderr, "dataset", dataSetId, {x: counts[x] - prevCounts.get(x, 0) for x in counts if counts[x] - prevCounts.get(x, 0) > 0}

    if len(docsWithErrors) > 0:
        counts["documents-with-errors"] = len(docsWithErrors)
    print >> sys.stderr, "---", "All Datasets Done", "---"
    print >> sys.stderr, "ChemProt conversion:", dict(counts)
    if tempDir is not None and not debug:
        print >> sys.stderr, "Removing temporary directory", tempDir
        shutil.rmtree(tempDir)
    if outPath is not None:
        ETUtils.write(corpus, outPath)
    return ET.ElementTree(corpus)
def extend(input, output=None, entityTypes=["Bacterium"], verbose=False):
    """
    Widen the character span of selected entities around their head token.

    For every entity whose "type" is in entityTypes, the token overlapping the
    entity's head offset is located, the span is grown over neighbouring
    tokens accepted by isBacteriaToken, non-alphanumeric characters are
    trimmed from both ends, and the entity's "charOffset" and "text"
    attributes are overwritten with the result.

    Parameters:
      input       -- either a single "sentence" element, or anything accepted
                     by ETUtils.ETFromObj, from which all sentences are read.
      output      -- when input is not a sentence element and this is not
                     None, the corpus root is written there with ETUtils.write.
      entityTypes -- entity types to process. The default is a list literal;
                     it is only read in this function, never modified.
      verbose     -- print per-entity diagnostics and the final counts.

    Returns the corpus tree when input was not a sentence element. When a
    single sentence element is passed, the element is modified in place and
    no value is returned.
    """
    # Whole-corpus mode: load the tree. Single-sentence mode skips loading.
    if not (ET.iselement(input) and input.tag == "sentence"):
        print >> sys.stderr, "Loading corpus file", input
        corpusTree = ETUtils.ETFromObj(input)
        corpusRoot = corpusTree.getroot()

    bacteriaTokens = ExampleBuilders.PhraseTriggerExampleBuilder.getBacteriaTokens()

    if not (ET.iselement(input) and input.tag == "sentence"):
        sentences = corpusRoot.getiterator("sentence")
    else:
        sentences = [input]
    counts = defaultdict(int)
    for sentence in sentences:
        incorrectCount = 0
        sentenceText = sentence.get("text")
        tokens = tokenize(sentenceText)
        for entity in sentence.findall("entity"):
            counts["all-entities"] += 1
            if entity.get("type") not in entityTypes:
                continue
            # Fall back to the full span when no head offset is stored
            headOffset = entity.get("headOffset")
            if headOffset == None:
                if verbose:
                    print "WARNING, no head offset for entity", entity.get("id")
                headOffset = entity.get("charOffset")
            headOffset = Range.charOffsetToTuples(headOffset)[0]
            charOffset = entity.get("charOffset")
            assert charOffset != None, "WARNING, no head offset for entity " + str(entity.get("id"))
            charOffset = Range.charOffsetToTuples(charOffset)[0]
            # tokPos is the [begin, end) character span of the current token;
            # token lengths are summed, so tokens are assumed to tile the
            # sentence text without gaps -- TODO confirm against tokenize()
            tokPos = [0, 0]
            tokIndex = None
            # find main token
            for i in range(len(tokens)):
                token = tokens[i]
                tokPos[1] = tokPos[0] + len(token)  # - 1
                if Range.overlap(headOffset, tokPos):
                    tokIndex = i
                    break
                tokPos[0] += len(token)
            assert tokIndex != None, (entity.get("id"), entity.get("text"), tokens)
            # If the head starts inside the token rather than at its start,
            # keep the head offset unchanged and do not extend at all.
            skip = False
            if tokPos[0] < headOffset[0]:
                tokPos = headOffset
                skip = True
            if not skip:
                # Extend before
                beginIndex = tokIndex
                for i in range(tokIndex - 1, -1, -1):
                    token = tokens[i]
                    if token.isspace():
                        continue
                    if not isBacteriaToken(token, bacteriaTokens, i - tokIndex):
                        beginIndex = i + 1
                        break
                    if i == 0:
                        beginIndex = i
                # Drop leading whitespace and extra words, but never move
                # past the main token
                while tokens[beginIndex].isspace() or isExtraWord(tokens[beginIndex], toLower=False):
                    beginIndex += 1
                    if beginIndex >= tokIndex:
                        beginIndex = tokIndex
                        break
                # Extend after
                endIndex = tokIndex
                # A main token ending in a comma is not extended to the right
                if tokens[tokIndex][-1] != ",":
                    endIndex = tokIndex
                    for i in range(tokIndex + 1, len(tokens)):
                        token = tokens[i]
                        if token.isspace():
                            continue
                        if not isBacteriaToken(token, bacteriaTokens, i - tokIndex):
                            endIndex = i - 1
                            break
                        if i == len(tokens) - 1:
                            endIndex = i
                    # NOTE(review): nesting of this loop under the comma check
                    # was reconstructed from a whitespace-collapsed source --
                    # confirm against the canonical file
                    while tokens[endIndex].isspace():
                        endIndex -= 1
                # Modify range
                # Convert the token index range back into character positions
                if tokIndex > beginIndex:
                    for token in reversed(tokens[beginIndex:tokIndex]):
                        tokPos[0] -= len(token)
                if tokIndex < endIndex:
                    for token in tokens[tokIndex + 1:endIndex + 1]:
                        tokPos[1] += len(token)
                # Attempt to remove trailing periods and commas
                # (both loops keep the span at least one character long)
                while not sentenceText[tokPos[1] - 1].isalnum():
                    tokPos[1] -= 1
                    if tokPos[1] < tokPos[0] + 1:
                        tokPos[1] = tokPos[0] + 1
                        break
                while not sentenceText[tokPos[0]].isalnum():
                    tokPos[0] += 1
                    if tokPos[0] >= tokPos[1]:
                        tokPos[0] = tokPos[1] - 1
                        break
                # Split merged names
                #newPos = [tokPos[0], tokPos[1]]
                #for split in sentenceText[tokPos[0]:tokPos[1]+1].split("/"):
                #    newPos[0] += len(split)
                #    if
            # Insert changed charOffset
            counts["entities"] += 1
            newOffset = tuple(tokPos)
            newOffsetString = Range.tuplesToCharOffset([newOffset])
            if verbose:
                # Trailing commas keep the whole report on one output line
                print "Entity", entity.get("id"),
                #print [entity.get("text"), sentenceText[headOffset[0]:headOffset[1]+1], sentenceText[newOffset[0]:newOffset[1]+1]],
                print [entity.get("text"), sentenceText[headOffset[0]:headOffset[1]], sentenceText[newOffset[0]:newOffset[1]]],
                print [entity.get("charOffset"), entity.get("headOffset"), newOffsetString], "Sent:", len(sentence.get("text")),
            if newOffset != headOffset:
                counts["extended"] += 1
                if verbose:
                    print "EXTENDED",
            # "correct" means the new span equals the charOffset the entity
            # had before this function changed it
            if newOffset == charOffset:
                counts["correct"] += 1
                if verbose:
                    print "CORRECT"
            else:
                counts["incorrect"] += 1
                incorrectCount += 1
                if verbose:
                    print "INCORRECT"
            entity.set("charOffset", newOffsetString)
            #entity.set("text", sentenceText[newOffset[0]:newOffset[1]+1])
            entity.set("text", sentenceText[newOffset[0]:newOffset[1]])
        if incorrectCount > 0 and verbose:
            print "TOKENS:", "|".join(tokens)
            print "--------------------------------"
    if verbose:
        print counts

    # Only whole-corpus mode writes output and returns the tree
    if not (ET.iselement(input) and input.tag == "sentence"):
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusRoot, output)
        return corpusTree
def fixEntities(xml): counts = defaultdict(int) for sentence in xml.getiterator("sentence"): sText = sentence.get("text") for entity in sentence.findall("entity"): charOffset = entity.get("charOffset") if charOffset == "-": assert False, str(entity) sentence.remove(entity) counts["removed-invalid"] += 1 else: charOffset = Range.charOffsetToSingleTuple(charOffset) # fix length realLength = len(entity.get("text")) lenDiff = (charOffset[1] - charOffset[0] + 1) - realLength if lenDiff != realLength: counts["incorrect-ent-offset"] += 1 counts["incorrect-ent-offset-diff" + str(lenDiff)] += 1 if abs(lenDiff) > 2: print "Warning, lenDiff:", (lenDiff, charOffset, sText, entity.get("text"), entity.get("id")) charOffset = (charOffset[0], charOffset[0] + realLength) # find starting position entIndex = sText.find(entity.get("text"), charOffset[0]) if entIndex == -1: for i in [-1, -2, -3]: entIndex = sText.find(entity.get("text"), charOffset[0] + i) if entIndex != -1: break if entIndex != 0: # could be lowercase sTextLower = sText.lower() for i in [0, -1, -2, -3]: lowerEntIndex = sTextLower.find( entity.get("text"), charOffset[0] + i) if lowerEntIndex != -1: break if lowerEntIndex != -1 and abs( lowerEntIndex - charOffset[0]) < abs(entIndex - charOffset[0]): entIndex = lowerEntIndex assert entIndex != -1, (charOffset, sText, entity.get("text"), entity.get("id")) indexDiff = entIndex - charOffset[0] if indexDiff != 0: counts["incorrect-ent-index"] += 1 counts["incorrect-ent-index-diff" + str(indexDiff)] += 1 print "Warning, indexDiff:", (indexDiff, charOffset, sText, entity.get("text"), entity.get("id")) # move offset charOffset = (charOffset[0] + indexDiff, charOffset[1] + indexDiff) # validate new offset sEntity = sText[charOffset[0]:charOffset[1]] assert sEntity == entity.get("text") or sEntity.lower( ) == entity.get("text"), (charOffset, sText, entity.get("text"), entity.get("id")) entity.set( "charOffset", Range.tuplesToCharOffset((charOffset[0], charOffset[1]))) 
entity.set("given", "True") for interaction in sentence.findall("interaction"): interaction.set("type", "DDI") print "Fix counts:", counts
def moveElements(document):
    """
    Move a document's entities and interactions into its sentence elements.

    Each entity that is a direct child of document is moved to the first
    sentence whose "charOffset" overlaps any of the entity's offsets. It is
    given a sentence-based id, its previous offset is stored in "origOffset",
    and "charOffset" is rewritten relative to the start of the sentence.
    Each interaction is then moved into the sentence holding its "e1" entity,
    renumbered, and its "e1", "e2" and "siteOf" references are remapped.

    Raises Exception if an entity overlaps no sentence, AssertionError if an
    entity ends up with an empty or inverted relative span, and KeyError if
    an interaction refers to an id that was not remapped.
    """
    newIdByOldId = {}
    sentenceByEntity = {}
    sentenceIndexByEntity = {}
    for sentenceIndex, sentence in enumerate(document.findall("sentence")):
        sentenceSpan = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        sentenceStart = sentenceSpan[0]
        # Move entities
        movedCount = 0
        for entity in document.findall("entity"):
            spans = Range.charOffsetToTuples(entity.get("charOffset"))
            if not any(Range.overlap(sentenceSpan, span) for span in spans):
                continue
            document.remove(entity)
            sentence.append(entity)
            # An id ending in ".eN" keeps that suffix; any other id is kept
            # in "docId" and replaced by a per-sentence running number.
            oldId = entity.get("id")
            suffix = oldId.rsplit(".", 1)[-1]
            if suffix.startswith("e"):
                newId = sentence.get("id") + "." + suffix
            else:
                entity.set("docId", oldId)
                newId = sentence.get("id") + ".e" + str(movedCount)
            entity.set("id", newId)
            newIdByOldId[oldId] = newId
            sentenceByEntity[oldId] = sentence
            sentenceIndexByEntity[oldId] = sentenceIndex
            # Shift the spans to sentence coordinates. A span that clamps to
            # (0, 0) is dropped; the stored value is the unclamped shift.
            relativeSpans = []
            for span in spans:
                shifted = (span[0] - sentenceStart, span[1] - sentenceStart)
                clamped = (max(0, shifted[0]), max(0, shifted[1]))
                if clamped == (0, 0):
                    continue
                assert clamped[1] > clamped[0], (entity.attrib, spans, sentenceSpan)
                relativeSpans.append(shifted)
            assert len(relativeSpans) > 0, (entity.attrib, spans, sentenceSpan)
            entity.set("origOffset", entity.get("charOffset"))
            entity.set("charOffset", Range.tuplesToCharOffset(relativeSpans))
            movedCount += 1
    if len(list(document.findall("entity"))) != 0:
        raise Exception("Sentence splitting does not cover the entire document")
    # Move interactions
    # Interactions always follow e1, as this is the event they are an
    # argument of. For an intersentence relation this shouldn't matter.
    movedInteractions = []
    renamedInteractions = {}
    for number, interaction in enumerate(document.findall("interaction")):
        movedInteractions.append(interaction)
        e1Id = interaction.get("e1")
        target = sentenceByEntity[e1Id]
        document.remove(interaction)
        target.append(interaction)
        renumbered = target.get("id") + ".i" + str(number)
        renamedInteractions[interaction.get("id")] = renumbered
        interaction.set("id", renumbered)
        interaction.set("e1", newIdByOldId[e1Id])
        interaction.set("e2", newIdByOldId[interaction.get("e2")])
    # Second pass: siteOf may point at an interaction renamed later
    for interaction in movedInteractions:
        siteOf = interaction.get("siteOf")
        if siteOf != None:
            interaction.set("siteOf", renamedInteractions[siteOf])
def convertChemProt(inDirs=None, setNames=None, outPath=None, goldTestSet=True, downloadDir=None, extractDir=None, redownload=False, debug=False):
    """
    Convert ChemProt TSV files into an Interaction XML corpus.

    Parameters:
      inDirs      -- list of directories to scan for "*.tsv" files whose name
                     contains "_abstracts", "_entities" or "_relations". When
                     None, the TRAIN, DEVEL and TEST (TEST_GOLD if goldTestSet)
                     packages are fetched with downloadFile from the
                     Settings.URL["CP17_<set>"] entries that are not None.
      setNames    -- optional dict mapping a dataset id parsed from a file
                     name to the id stored in the "set" attribute.
      outPath     -- if not None, the corpus is written there with ETUtils.write.
      goldTestSet -- download "TEST_GOLD" instead of "TEST".
      downloadDir, redownload -- passed through to downloadFile.
      extractDir  -- base directory for extracted downloads; a temporary
                     directory is created when this is None.
      debug       -- keep the temporary directory instead of removing it.

    Returns an ET.ElementTree whose root is the "corpus" element.

    Raises AssertionError on duplicate data files, duplicate entity ids,
    unexpected entity types or malformed relation arguments, and KeyError
    when an entity or relation row names a document that was not read.
    Entities whose offsets do not reproduce their text, and relations with a
    missing argument entity, are counted as errors and skipped.
    """
    tempDir = None
    if inDirs is None:
        print >> sys.stderr, "---------------", "Downloading ChemProt files", "---------------"
        if extractDir is None:
            tempDir = tempfile.mkdtemp()
        inDirs = []
        for setName in ("TRAIN", "DEVEL", "TEST"):
            if goldTestSet and setName == "TEST":
                setName = "TEST_GOLD"
            url = Settings.URL["CP17_" + setName]
            if url is not None:
                currentExtractDir = extractDir if extractDir else tempDir
                currentExtractDir = os.path.join(currentExtractDir, setName.lower())
                inDirs.append(downloadFile(url, downloadDir, currentExtractDir, redownload))

    print >> sys.stderr, "Reading ChemProt corpus from input", inDirs, "using dataset mapping", setNames
    dataSets = OrderedDict()
    for inDir in inDirs:
        print >> sys.stderr, "Reading input directory", inDir
        filenames = os.listdir(inDir)
        filetypes = ["_abstracts", "_entities", "_relations"]
        # Collect the file paths for the data types
        dirDataSets = set()
        for filename in filenames:
            if not (filename.endswith(".tsv") and any([x in filename for x in filetypes])):
                continue
            # "<set>[_gs]_<type>.tsv" -> dataset id and data type
            dataSetId, dataType = filename.replace("_gs", "").rsplit("_", 1)
            if setNames is not None:
                dataSetId = setNames.get(dataSetId, dataSetId)
            dirDataSets.add(dataSetId)
            dataType = dataType.split(".")[0]
            if dataSetId not in dataSets:
                dataSets[dataSetId] = {}
            assert dataType not in dataSets[dataSetId]
            dataSets[dataSetId][dataType] = os.path.join(inDir, filename)
        print >> sys.stderr, "Found ChemProt datasets", list(dirDataSets), "at", inDir
    print >> sys.stderr, "Read datasets:", dataSets.keys()

    # Build the Interaction XML
    print >> sys.stderr, "Converting to Interaction XML"
    corpusName = "CP17"
    corpus = ET.Element("corpus", {"source": corpusName})
    counts = defaultdict(int)
    docById = {}
    entityById = {}  # docId -> {entity origId -> entity element}
    entitiesByDoc = {}  # docId -> set of every entity id seen, aligned or not
    docsWithErrors = set()
    for dataSetId in sorted(dataSets.keys()):
        prevCounts = copy.copy(counts)
        print >> sys.stderr, "---", "Building elements for dataset", dataSetId, "---"
        dataSet = dataSets[dataSetId]
        counts["sets"] += 1

        with open(dataSet["abstracts"], "rt") as f:
            print >> sys.stderr, "Adding document elements for dataset", dataSetId
            for row in UnicodeDictReader(f, delimiter="\t", fieldnames=["id", "title", "abstract"], quoting=csv.QUOTE_NONE):
                document = ET.Element("document", {"id": corpusName + ".d" + str(counts["documents"]), "origId": row["id"], "set": dataSetId})
                document.set("text", row["title"] + " " + row["abstract"])
                document.set("titleOffset", Range.tuplesToCharOffset((0, len(row["title"]))))
                origId = document.get("origId")
                if origId in docById:
                    # A repeated document must be identical to the first copy
                    assert document.get("text") == docById[origId].get("text")
                    assert document.get("titleOffset") == docById[origId].get("titleOffset")
                    counts["duplicate-documents"] += 1
                else:
                    corpus.append(document)
                    docById[origId] = document
                    counts["documents"] += 1

        with open(dataSet["entities"], "rt") as f:
            print >> sys.stderr, "Adding entity elements for dataset", dataSetId
            for row in UnicodeDictReader(f, delimiter="\t", fieldnames=["docId", "id", "type", "begin", "end", "text"], quoting=csv.QUOTE_NONE):
                docId = row["docId"]
                document = docById[docId]
                assert row["type"] in ("CHEMICAL", "GENE-Y", "GENE-N")
                # Check for duplicate entities
                if docId not in entitiesByDoc:
                    entitiesByDoc[docId] = set()
                assert row["id"] not in entitiesByDoc[docId]
                entitiesByDoc[docId].add(row["id"])
                # Determine the offset
                offset = (int(row["begin"]), int(row["end"]))
                docSpan = document.get("text")[offset[0]:offset[1]]
                if docSpan == row["text"]:
                    entity = ET.SubElement(document, "entity", {"id": document.get("id") + ".e" + str(len(document.findall("entity")))})
                    entity.set("given", "True")
                    entity.set("origId", row["id"])
                    entity.set("type", row["type"].split("-")[0])
                    entity.set("normalized", "True" if row["type"].endswith("-Y") else "False")
                    entity.set("charOffset", Range.tuplesToCharOffset((offset[0], offset[1])))
                    entity.set("text", row["text"])
                    if docId not in entityById:
                        entityById[docId] = {}
                    assert entity.get("origId") not in entityById[docId]
                    entityById[docId][entity.get("origId")] = entity
                    counts["entities"] += 1
                else:
                    print >> sys.stderr, "Alignment error in document", docId, (offset, docSpan, row)
                    counts["entities-error"] += 1
                    docsWithErrors.add(docId)

        if "relations" in dataSet:
            print >> sys.stderr, "Adding relation elements for dataset", dataSetId
            with open(dataSet["relations"], "rt") as f:
                for row in UnicodeDictReader(f, delimiter="\t", fieldnames=["docId", "group", "groupEval", "type", "arg1", "arg2"], quoting=csv.QUOTE_NONE):
                    for argId in ("1", "2"):
                        assert row["arg" + argId].startswith("Arg" + argId + ":")
                        row["arg" + argId] = row["arg" + argId][5:]  # drop the "ArgN:" prefix
                    docId = row["docId"]
                    document = docById[docId]
                    # A document with no successfully aligned entity has no
                    # entry in entityById; handle it like any other missing
                    # argument instead of failing with KeyError.
                    docEntities = entityById.get(docId, {})
                    e1 = docEntities.get(row["arg1"])
                    e2 = docEntities.get(row["arg2"])
                    if e1 is not None and e2 is not None:
                        interaction = ET.SubElement(document, "interaction", {"id": document.get("id") + ".i" + str(len(document.findall("interaction")))})
                        interaction.set("directed", "True")
                        interaction.set("type", row["group"])
                        interaction.set("relType", row["type"])
                        row["groupEval"] = row["groupEval"].strip()
                        assert row["groupEval"] in ("Y", "N")
                        interaction.set("evaluated", "True" if row["groupEval"] == "Y" else "False")
                        interaction.set("e1", e1.get("id"))
                        interaction.set("e2", e2.get("id"))
                        counts["interactions"] += 1
                    else:
                        counts["interaction-error"] += 1
                        docsWithErrors.add(docId)
        else:
            print >> sys.stderr, "No relations for dataset", dataSetId
        # Report only the counters that changed while processing this dataset
        print >> sys.stderr, "dataset", dataSetId, {x: counts[x] - prevCounts.get(x, 0) for x in counts if counts[x] - prevCounts.get(x, 0) > 0}

    if len(docsWithErrors) > 0:
        counts["documents-with-errors"] = len(docsWithErrors)
    print >> sys.stderr, "---", "All Datasets Done", "---"
    print >> sys.stderr, "ChemProt conversion:", dict(counts)
    if tempDir is not None and not debug:
        print >> sys.stderr, "Removing temporary directory", tempDir
        shutil.rmtree(tempDir)
    if outPath is not None:
        ETUtils.write(corpus, outPath)
    return ET.ElementTree(corpus)