def processCorpus(inPath, outPath, sourceSet, newSets, seed=1):
    print >> sys.stderr, "Loading corpus file", inPath
    corpusTree = ETUtils.ETFromObj(inPath)
    corpusRoot = corpusTree.getroot()
    rand = random.Random(seed)
    documents = corpusRoot.findall("document")
    counts = {"old":defaultdict(int), "new":defaultdict(int)}
    for document in documents:
        counts["old"][document.get("set")] += 1
        if sourceSet != None and document.get("set") != sourceSet:
            counts["new"][document.get("set")] += 1
            continue
        value = rand.random()
        document.set("setValue", str(value))
        document.set("origSet", document.get("set", ""))
        for setName, cutoff in newSets:
            if value <= cutoff:
                document.set("set", setName)
                break
        counts["new"][document.get("set")] += 1
    #for key in counts:
    #    counts[key] = dict(counts[key])
    print "MakeSets result:", "old=" + str(dict(counts["old"])) + ", new=" + str(dict(counts["new"]))
    if outPath != None:
        ETUtils.write(corpusRoot, outPath)
    return corpusTree
def parse(self, parserName, input, output=None, debug=False, reparse=False, stanfordParserDir=None, stanfordParserArgs=None, action="convert", outputFormat=None, memory=None):
    #global stanfordParserDir, stanfordParserArgs
    assert action in ("convert", "penn", "dep")
    if stanfordParserDir == None:
        stanfordParserDir = Settings.STANFORD_PARSER_DIR
    # Run the parser process
    corpusTree, corpusRoot = self.getCorpus(input)
    workdir = tempfile.mkdtemp()
    inPath = self.makeInputFile(corpusRoot, workdir, parserName, reparse, action, debug)
    outPath = self.runProcess(stanfordParserArgs, stanfordParserDir, inPath, workdir, action, outputFormat, memory)
    self.printStderr(outPath)
    # Insert the parses
    if action in ("convert", "dep"):
        #self.insertDependencyParses(outPath, corpusRoot, parserName, {"stanford-mode":action}, addTimeStamp=True, skipExtra=0, removeExisting=True)
        self.insertStanfordDependencyParses(outPath, corpusRoot, parserName, skipParsed=reparse, removeExisting=reparse)
    elif action == "penn":
        self.insertPennTrees(outPath, corpusRoot, parserName)
    # Remove work directory
    if not debug:
        shutil.rmtree(workdir)
    else:
        print >> sys.stderr, "Parser IO files at", workdir
    # Write the output XML file
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def removeUnconnectedEntities(input, output=None):
    input = ETUtils.ETFromObj(input)
    root = input.getroot()
    removed = 0
    preserved = 0
    for document in root.findall("document"):
        sentMap = {} # allow for intersentence interactions
        for sentence in document.findall("sentence"):
            sentMap[sentence.get("id")] = sentence
        connected = set()
        for interaction in document.getiterator("interaction"):
            connected.add(interaction.get("e1"))
            connected.add(interaction.get("e2"))
        entities = []
        for entity in document.getiterator("entity"):
            entities.append(entity)
        for entity in entities:
            if entity.get("given") == "True": # never remove named entities
                continue
            eId = entity.get("id")
            if eId not in connected:
                if eId.find(".s") != -1: # sentence level entity
                    sentMap[eId.rsplit(".", 1)[0]].remove(entity)
                else: # document level entity
                    document.remove(entity)
                removed += 1
            else:
                preserved += 1
    print >> sys.stderr, "Removed", removed, "entities, preserved", preserved, "entities"
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(root, output)
    return input
def findHeadsSyntactic(corpus, parse, tokenization):
    """
    Determine the head token for a named entity or trigger. The head token is the token
    closest to the root of the subtree of the dependency parse spanned by the text of the element.

    @param corpus: a corpus element (cElementTree.Element) whose sentences are processed
    @param parse: name of the parse to use
    @param tokenization: name of the tokenization to use
    """
    counts = [0, 0]
    sentences = [x for x in corpus.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "SYNTAX")
    for sentence in sentences:
        counter.update()
        tokElement = ETUtils.getElementByAttrib(sentence, "sentenceanalyses/tokenizations/tokenization", {"tokenizer":tokenization})
        parseElement = ETUtils.getElementByAttrib(sentence, "sentenceanalyses/parses/parse", {"parser":parse})
        if tokElement == None or parseElement == None:
            print >> sys.stderr, "Warning, sentence", sentence.get("id"), "missing parse or tokenization"
            continue
        tokens = tokElement.findall("token")
        tokenHeadScores = getTokenHeadScores(tokens, parseElement.findall("dependency"), sentenceId=sentence.get("id"))
        for entity in sentence.findall("entity"):
            if entity.get("headOffset") == None:
                headToken = getEntityHeadToken(entity, tokens, tokenHeadScores)
                # The ElementTree entity-element is modified by setting the headOffset attribute
                entity.set("headOffset", headToken.get("charOffset"))
                entity.set("headMethod", "Syntax")
                entity.set("headString", headToken.get("text"))
                counts[0] += 1
    return counts
def catenateElements(input1, input2, output):
    print >> sys.stderr, "##### Catenate interaction XML as elements #####"
    c1 = RecalculateIds.recalculateIds(input1, None, False, 0)
    numDocs = len(c1.getroot().findall("document"))
    print >> sys.stderr, "Documents in input 1:", numDocs
    c2 = RecalculateIds.recalculateIds(input2, None, False, numDocs)
    print >> sys.stderr, "Appending documents"
    c1Root = c1.getroot()
    for document in c2.getroot().findall("document"):
        c1Root.append(document)
    print >> sys.stderr, "Validating ids"
    ids = set()
    for tag in ("entity", "interaction", "sentence", "document"):
        for element in c1Root.getiterator(tag):
            elementId = element.get("id")
            assert elementId not in ids
            ids.add(elementId)
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(c1Root, output)
    return c1
def mergeAll(input, output=None, debug=False, iterate=False):
    if iterate:
        origItems = defaultdict(int)
        removedItems = defaultdict(int)
        for docSentences in SentenceElements.getCorpusIterator(input, output):
            entitiesByType, duplicatesRemovedByType = mergeDuplicateEntities(docSentences, debug)
            for key in entitiesByType:
                origItems[key] += entitiesByType[key]
            for key in duplicatesRemovedByType:
                removedItems[key] += duplicatesRemovedByType[key]
            interactionsByType, duplicatesRemovedByType = mergeDuplicateInteractions(docSentences, debug)
            for key in interactionsByType:
                origItems[key] += interactionsByType[key]
            for key in duplicatesRemovedByType:
                removedItems[key] += duplicatesRemovedByType[key]
        printStats(origItems, removedItems)
        return None
    else:
        corpusElements = CorpusElements.loadCorpus(input, removeIntersentenceInteractions=False)
        print >> sys.stderr, "Merging duplicate entities"
        entitiesByType, duplicatesRemovedByType = mergeDuplicateEntities(corpusElements.sentences, debug)
        printStats(entitiesByType, duplicatesRemovedByType)
        print >> sys.stderr, "Merging duplicate interactions"
        interactionsByType, duplicatesRemovedByType = mergeDuplicateInteractions(corpusElements.sentences, debug)
        printStats(interactionsByType, duplicatesRemovedByType)
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusElements.rootElement, output)
        return corpusElements
def run(cls, inFile, multiplier=1.0, outFile=None, targetLabel="neg", binary=False):
    """inFile can be a string with a file name (.xml or .xml.gz), an ElementTree, an Element, or an open input stream.
    multiplier adjusts the level of boosting of the non-negative predictions; it is a real number in (0, inf).
    A multiplier of 1.0 does nothing, <1.0 decreases negative class confidence, >1.0 increases negative class confidence.
    The root of the modified tree is returned and, if outFile is a string, also written out to outFile."""
    print >> sys.stderr, "##### Recall adjust with multiplier " + str(multiplier)[:5] + " #####"
    tree = ETUtils.ETFromObj(inFile)
    if not ET.iselement(tree):
        assert isinstance(tree, ET.ElementTree)
        root = tree.getroot()
    else:
        root = tree
    if multiplier != -1:
        if binary:
            print >> sys.stderr, "Recall binary mode"
            classRanges = getClassRanges(root.getiterator("entity"))
            assert len(classRanges.keys()) in [0, 2]
            if len(classRanges.keys()) == 0:
                print >> sys.stderr, "Warning, recall adjustment skipped because no prediction weights found"
        else:
            print >> sys.stderr, "Recall multiclass mode"
            classRanges = None
        for entityNode in root.getiterator("entity"):
            adjustEntity(entityNode, targetLabel, multiplier, classRanges)
    if outFile:
        ETUtils.write(root, outFile)
    return tree
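# Usage sketch (not part of the original class): how the recall-adjustment classmethod above
# might be called on a classified corpus. The RecallAdjust class name and the file names are
# illustrative assumptions.
#
#   adjustedTree = RecallAdjust.run("classified.xml.gz", multiplier=0.7,
#                                   outFile="classified-adjusted.xml.gz", targetLabel="neg")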
def classify(self, data, model, output, parse=None, task=None, goldData=None, workDir=None, fromStep=None, omitSteps=None, validate=False):
    model = self.openModel(model, "r")
    self.enterState(self.STATE_CLASSIFY)
    self.setWorkDir(workDir)
    if workDir == None:
        self.setTempWorkDir()
    model = self.openModel(model, "r")
    if parse == None:
        parse = self.getStr(self.tag+"parse", model)
    workOutputTag = os.path.join(self.workDir, os.path.basename(output) + "-")
    xml = self.classifyToXML(data, model, None, workOutputTag,
                             model.get(self.tag+"classifier-model", defaultIfNotExist=None), goldData, parse,
                             float(model.getStr("recallAdjustParameter", defaultIfNotExist=1.0)))
    if validate:
        self.structureAnalyzer.load(model)
        self.structureAnalyzer.validate(xml)
        ETUtils.write(xml, output+"-pred.xml.gz")
    else:
        shutil.copy2(workOutputTag+self.tag+"pred.xml.gz", output+"-pred.xml.gz")
    EvaluateInteractionXML.run(self.evaluator, xml, data, parse)
    stParams = self.getBioNLPSharedTaskParams(self.bioNLPSTParams, model)
    if stParams["convert"]: #self.useBioNLPSTFormat:
        extension = ".zip" if (stParams["convert"] == "zip") else ".tar.gz"
        Utils.STFormat.ConvertXML.toSTFormat(xml, output+"-events" + extension, outputTag=stParams["a2Tag"], writeExtra=(stParams["scores"] == True))
        if stParams["evaluate"]: #self.stEvaluator != None:
            if task == None:
                task = self.getStr(self.tag+"task", model)
            self.stEvaluator.evaluate(output+"-events" + extension, task)
    self.deleteTempWorkDir()
    self.exitState()
def parse(self, input, output=None, tokenizationName=None, parseName="McCC", requireEntities=False, skipIds=[], skipParsed=True, timeout=600, makePhraseElements=True, debug=False, pathParser=None, pathBioModel="AUTO", addTimeStamp=True):
    print >> sys.stderr, "BLLIP parser"
    corpusTree, corpusRoot = self.getCorpus(input)
    workdir = tempfile.mkdtemp()
    infileName, numCorpusSentences = self.makeInputFile(workdir, corpusRoot, requireEntities, skipIds, skipParsed, tokenizationName, debug)
    bllipOutput = self.runProcess(infileName, workdir, pathParser, pathBioModel, tokenizationName, timeout)
    self.insertPennTrees(bllipOutput, corpusRoot, parseName, requireEntities, skipIds, skipParsed)
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    # Remove work directory
    if not debug:
        shutil.rmtree(workdir)
    else:
        print >> sys.stderr, "Parser IO files at", workdir
    return corpusTree
def processCorpus(inputFilename, outputFilename, rules):
    print >> sys.stderr, "Deleting elements, rules =", rules
    print >> sys.stderr, "Loading corpus file", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()
    for eType in rules.keys():
        for attrRule in rules[eType].keys():
            if type(rules[eType][attrRule]) in types.StringTypes:
                rules[eType][attrRule] = rules[eType][attrRule].split("|")
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = defaultdict(int)
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, rules, countsByType)
    print >> sys.stderr, "Deleted elements"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ":", countsByType[k]
    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
def fixAltOffsets(input, output=None):
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    docCount = 0
    sentencesCreated = 0
    sentences = [x for x in corpusRoot.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "FixAltOffsets")
    fixCount = 0
    # fix spans
    for sentence in sentences:
        counter.update(1, "Fixing AltOffsets for sentence ("+sentence.get("id")+"): ")
        sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        for entity in sentence.findall("entity"):
            altOffsetString = entity.get("altOffset")
            if altOffsetString == None:
                continue
            #print altOffsetString
            altOffsets = Range.charOffsetToTuples(altOffsetString)
            assert len(altOffsets) == 1
            for i in range(len(altOffsets)):
                altOffset = altOffsets[i]
                altOffsets[i] = (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0])
            entity.set("altOffset", Range.tuplesToCharOffset(altOffsets))
            fixCount += 1
    print >> sys.stderr, "Fixed", fixCount, "altOffsets"
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def loadEventXML(path, verbose=False):
    xml = ETUtils.ETFromObj(path)
    sentDict = {}
    for sentence in xml.getiterator("sentence"):
        sentenceText = getText(sentence).strip()
        if not sentDict.has_key(sentenceText):
            sentDict[sentenceText] = []
    for event in xml.getiterator("event"):
        sentenceText = getText(event).strip()
        if not sentDict.has_key(sentenceText):
            sentDict[sentenceText] = []
        events = sentDict[sentenceText]
        clue = event.find("clue")
        clueTuple = getClue(clue)
        eventType = event.find("type").get("class")
        if eventType == "Protein_amino_acid_phosphorylation":
            eventType = "Phosphorylation"
        if type(clueTuple) == types.StringType:
            if verbose:
                print "Event", eventType, "clue with no clueType:", ETUtils.toStr(clue)
        else:
            assert sentenceText[clueTuple[1]:clueTuple[2]+1] == clueTuple[0], (sentenceText, sentenceText[clueTuple[1]:clueTuple[2]+1], clueTuple)
            event = (clueTuple[1], clueTuple[2], eventType, clueTuple[0])
            if event not in events:
                events.append(event)
    return sentDict
def makeSubset(input, output=None, ratio=1.0, seed=0):
    if ratio == 1.0:
        if output != None:
            shutil.copy2(input, output)
            return output
        else:
            return input
    totalFolds = 100
    selectedFolds = int(ratio * 100.0)
    print >> sys.stderr, "====== Making subset ======"
    print >> sys.stderr, "Subset for", input, "ratio", ratio, "seed", seed
    xml = ETUtils.ETFromObj(input).getroot()
    count = 0
    sentCount = 0
    for document in xml.findall("document"):
        sentCount += len(document.findall("sentence"))
        count += 1
    division = Core.Split.getFolds(count, totalFolds, seed)
    #print division, selectedFolds - 1
    index = 0
    removeCount = 0
    sentRemoveCount = 0
    for document in xml.findall("document"):
        if division[index] > selectedFolds - 1:
            xml.remove(document)
            sentRemoveCount += len(document.findall("sentence"))
            removeCount += 1
        index += 1
    print >> sys.stderr, "Subset", "doc:", count, "removed:", removeCount, "sent:", sentCount, "sentremoved:", sentRemoveCount
    xml.set("subsetRatio", str(ratio))
    xml.set("subsetSeed", str(seed))
    if output != None:
        ETUtils.write(xml, output)
    return output
def removeUnconnectedEntities(input, output=None):
    input = ETUtils.ETFromObj(input)
    root = input.getroot()
    removed = 0
    preserved = 0
    for document in root.findall("document"):
        sentMap = {} # allow for intersentence interactions
        for sentence in document.findall("sentence"):
            sentMap[sentence.get("id")] = sentence
        connected = set()
        for interaction in document.getiterator("interaction"):
            connected.add(interaction.get("e1"))
            connected.add(interaction.get("e2"))
        entities = []
        for entity in document.getiterator("entity"):
            entities.append(entity)
        for entity in entities:
            if entity.get("isName") == "True": # never remove named entities
                continue
            eId = entity.get("id")
            if eId not in connected:
                if eId.find(".s") != -1: # sentence level entity
                    sentMap[eId.rsplit(".", 1)[0]].remove(entity)
                else: # document level entity
                    document.remove(entity)
                removed += 1
            else:
                preserved += 1
    print >> sys.stderr, "Removed", removed, "entities, preserved", preserved, "entities"
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(root, output)
    return input
def processCorpus(inputFilename, outputFilename, rules):
    print >> sys.stderr, "Deleting elements, rules =", rules
    print >> sys.stderr, "Loading corpus file", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()
    for eType in rules.keys():
        for attrRule in rules[eType].keys():
            rules[eType][attrRule] = rules[eType][attrRule].split("|")
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = defaultdict(int)
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, rules, countsByType)
    print >> sys.stderr, "Deleted elements"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ":", countsByType[k]
    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
def process(input, output=None):
    download("/tmp/extract", "/tmp/download")
    specAnn = readResources("/tmp/extract")
    insertElements(input.getroot(), specAnn)
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(input.getroot(), output)
    return input
def test(extractPath, downloadPath, inCorpusPath, outCorpusPath):
    download(extractPath, downloadPath)
    specAnn = readResources(extractPath)
    inCorpus = ETUtils.ETFromObj(inCorpusPath)
    insertElements(inCorpus.getroot(), specAnn)
    ETUtils.write(inCorpus.getroot(), outCorpusPath)

#process("/tmp/extract", "/tmp/download", "/home/jari/Dropbox/data/BioNLP16/corpora/BB_EVENT_16-devel.xml", "/tmp/ner.xml")
def insertParses(self, parseDir, input, output=None, parseName="McCC", extensions=None, subDirs=None, debug=False, skipParsed=False, docMatchKeys=None, conllFormat=None, splitting=True, unescapeFormats="AUTO", tokenMerging=True, extMap=None, sdFailedFormat="empty", origIdType=None, posTags=None):
    corpusTree, corpusRoot = self.getCorpus(input)
    if not os.path.exists(parseDir):
        raise Exception("Cannot find parse input '" + str(parseDir) + "'")
    if not os.path.isdir(parseDir):
        raise Exception("Parse input '" + str(parseDir) + "' is not a directory")
    if extensions == None:
        extensions = self.allExt
    elif isinstance(extensions, basestring):
        extensions = extensions.split(",")
    extensions = [x for x in extensions if x in self.allExt]
    unescapeFormats = self.getUnescapeFormats(unescapeFormats)
    if docMatchKeys == None:
        docMatchKeys = ["origId", "pmid", "id"]
    elif isinstance(docMatchKeys, basestring):
        docMatchKeys = docMatchKeys.split(",")
    print >> sys.stderr, "Inserting parses from file types:", extensions
    counts = defaultdict(int)
    files = self.getParseFiles(parseDir, extensions, subDirs, counts, extMap=extMap, origIdType=origIdType)
    typeCounts = {x:defaultdict(int) for x in extensions}
    # Make document elements if needed
    documents = [x for x in corpusRoot.findall("document")]
    if len(documents) == 0:
        typeCounts["document-generation"] = defaultdict(int)
        documents = self.prepareDocuments(corpusRoot, files)
    counter = ProgressCounter(len(files), "Parse Insertion")
    # Insert parses and make sentence elements if needed
    typeCounts["sentence-splitting"] = defaultdict(int)
    print >> sys.stderr, "Inserting parses for", len(files), "out of total", len(documents), "documents"
    for document in documents:
        counts["document"] += 1
        matchFound = False
        for docMatchValue in [document.get(x) for x in docMatchKeys if document.get(x) != None]:
            if docMatchValue in files:
                if matchFound:
                    raise Exception("Multiple matching parses for document " + str(document.attrib) + " using keys " + str(docMatchKeys))
                matchFound = True
                counter.update(1, "Inserting parses for (" + document.get("id") + "/" + str(docMatchValue) + "): ")
                counts["document-match"] += 1
                for ext in extensions:
                    if ext not in files[docMatchValue]:
                        continue
                    counts[ext + "-match"] += 1
                    sentences = [x for x in self.getSentences(document, skipParsed=skipParsed)]
                    self.insertParse(document, sentences, ext, files[docMatchValue][ext], parseName, splitting, typeCounts, conllFormat, unescapeFormats=unescapeFormats, tokenMerging=tokenMerging, sdFailedFormat=sdFailedFormat, posTags=posTags)
        if not matchFound:
            counts["document-no-match"] += 1
    if len(typeCounts["sentence-splitting"]) > 0:
        print >> sys.stderr, "Sentence Splitting Counts", dict(typeCounts["sentence-splitting"])
    print >> sys.stderr, "Counts", dict(counts)
    for ext in extensions:
        if len(typeCounts[ext]) > 0:
            print >> sys.stderr, "Counts for type '" + ext + "':", dict(typeCounts[ext])
    # Write the output XML file
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def processCorpus(input, outDir, stem=None, tail=".xml", mergedSets=[], saveCombined=False, verbose=False):
    newCorpora = {}
    print >> sys.stderr, "Loading corpus file", input
    corpusRoot = ETUtils.ETFromObj(input).getroot()

    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {}
    for document in documents:
        counter.update()
        docSet = document.get("set")
        if docSet == None:
            if verbose:
                print >> sys.stderr, "Warning, no set defined for document", document.get("id")
            if not countsByType.has_key("No set"):
                countsByType["No set"] = 0
            countsByType["No set"] += 1
            continue
        elif not newCorpora.has_key(docSet):
            newCorpora[docSet] = ET.Element("corpus")
            for k, v in corpusRoot.attrib.iteritems():
                newCorpora[docSet].set(k, v)
            countsByType[docSet] = 0
        newCorpora[docSet].append(document)
        countsByType[docSet] += 1

    # Make merged sets
    for mergedSet in mergedSets:
        tag = "-and-".join(sorted(mergedSet))
        if not newCorpora.has_key(tag):
            newCorpora[tag] = ET.Element("corpus")
            for k, v in corpusRoot.attrib.iteritems():
                newCorpora[tag].set(k, v)
            countsByType[tag] = 0
        for componentSet in mergedSet:
            for element in newCorpora[componentSet].findall("document"):
                newCorpora[tag].append(element)
                countsByType[tag] += 1

    print >> sys.stderr, "Documents per set"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + str(k) + ":", countsByType[k]

    if stem == None:
        outDir, stem = os.path.dirname(outDir), os.path.basename(outDir)
    if not os.path.exists(outDir):
        os.makedirs(outDir)

    print >> sys.stderr, "Writing output files to directory", outDir
    if saveCombined:
        print >> sys.stderr, "Saving combined input to", stem + tail
        ETUtils.write(corpusRoot, stem + tail)
    else:
        print >> sys.stderr, "Combined input not saved"
    for docSet in sorted(newCorpora.keys()):
        outFilename = os.path.join(outDir, stem + "-" + docSet + tail)
        print >> sys.stderr, "Writing set", docSet, "to", outFilename
        ETUtils.write(newCorpora[docSet], outFilename)
def addMTMX(input, mtmxDir, output=None):
    from collections import defaultdict
    # read interaction XML
    print "Reading interaction XML"
    counts = defaultdict(int)
    xml = ETUtils.ETFromObj(input).getroot()
    docById = {}
    for document in xml.getiterator("document"):
        docId = document.get("origId")
        assert docId not in docById
        docById[docId] = document
        counts["document"] += 1
    for entity in xml.getiterator("entity"):
        counts["entity"] += 1

    # read MTMX files
    print "Processing MTMX"
    for filename in sorted(os.listdir(mtmxDir)):
        if filename.endswith(".xml"):
            print >> sys.stderr, filename,
            fileId = filename.split("_")[0]
            if fileId not in docById:
                print >> sys.stderr, "skipped"
                continue
            else:
                print >> sys.stderr, "processing"
            doc = docById[fileId]
            entityByOrigId = {}
            for entity in doc.getiterator("entity"):
                assert entity.get("origId") not in entityByOrigId, entity.get("origId")
                entityByOrigId[entity.get("origId")] = entity
            mtmx = ETUtils.ETFromObj(os.path.join(mtmxDir, filename)).getroot()
            for phrase in mtmx.getiterator("PHRASE"):
                if phrase.get("ID") in entityByOrigId:
                    entity = entityByOrigId[phrase.get("ID")]
                    mapCount = 0
                    for map in phrase.getiterator("MAP"):
                        if (map.get("NAME").lower() == entity.get("text").lower()) or (map.get("NAME_SHORT").lower() == entity.get("text").lower()):
                            if entity.get("mtmxProb") != None:
                                if int(entity.get("mtmxProb")) > int(map.get("PROB")):
                                    break
                                else:
                                    counts["mapped-multi"] += 1
                                    counts["mapped-multi-"+str(mapCount)] += 1
                                    #print filename, phrase.get("ID")
                            else:
                                counts["mapped-at-least-once"] += 1
                            entity.set("mtmxProb", str(map.get("PROB")))
                            entity.set("mtmxCui", str(map.get("CUI")))
                            entity.set("mtmxName", str(map.get("NAME")))
                            entity.set("mtmxNameShort", str(map.get("NAME_SHORT")))
                            entity.set("mtmxSemTypes", str(map.get("SEMTYPES")))
                            counts["mappings"] += 1
                            mapCount += 1
    print >> sys.stderr, counts
    if output != None:
        ETUtils.write(xml, output)
def convertDDI13(outDir, downloadDir=None, datasets=["DDI13_TRAIN", "DDI13_TEST_TASK_9.1", "DDI13_TEST_TASK_9.2"], redownload=False, insertParses=True, parse=False, makeIntermediateFiles=True, debug=False):
    cwd = os.getcwd()
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI13-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'13 corpus", "======================="

    tempdir = tempfile.mkdtemp()
    downloaded = downloadFiles(downloadDir, tempdir, redownload)

    for dataset in datasets:
        corpusTree = getCorpusXML()
        xml = corpusTree.getroot()
        print >> sys.stderr, "Merging input XMLs"
        assert downloaded[dataset] != None
        combineXML(xml, "train", downloaded[dataset], subDirs=["DrugBank", "MedLine", "NER"])
        print >> sys.stderr, "Processing elements"
        processElements(xml)

        if dataset == "DDI13_TRAIN":
            print >> sys.stderr, "Dividing training set into folds"
            divideSets(xml, "train", 10)
        else:
            for doc in xml.getiterator("document"):
                doc.set("set", "test")

        if parse:
            print >> sys.stderr, "Parsing"
            parseXML(corpusTree, os.path.join(tempdir, "parsing"), debug)
        elif insertParses:
            assert parse == False
            print >> sys.stderr, "Inserting McCC parses"
            Tools.BLLIPParser.insertParses(corpusTree, downloaded[dataset + "_TEES_PARSES"], None, extraAttributes={"source":"TEES"})
            print >> sys.stderr, "Inserting Stanford conversions"
            Tools.StanfordParser.insertParses(corpusTree, downloaded[dataset + "_TEES_PARSES"], None, extraAttributes={"stanfordSource":"TEES"})

        # Check what was produced by the conversion
        print >> sys.stderr, "---------------", "Corpus Structure Analysis", "---------------"
        analyzer = StructureAnalyzer()
        analyzer.analyze([xml])
        print >> sys.stderr, analyzer.toString()

        if "9.1" in dataset:
            outFileName = os.path.join(outDir, "DDI13-test-task9.1.xml")
        elif "9.2" in dataset:
            outFileName = os.path.join(outDir, "DDI13-test-task9.2.xml")
        else:
            outFileName = os.path.join(outDir, "DDI13-train.xml")
        print >> sys.stderr, "Writing output to", outFileName
        ETUtils.write(xml, outFileName)

    Stream.closeLog(logFileName)
    if not debug and tempdir != None:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)
def findHeads(input, parse, tokenization=None, output=None, removeExisting=True, iterate=False):
    if iterate:
        from Utils.ProgressCounter import ProgressCounter
        import InteractionXML.SentenceElements as SentenceElements
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        print >> sys.stderr, "Removing existing head offsets"
        removeCount = 0
        counter = ProgressCounter(None, "Find heads")
        counter.showMilliseconds = True
        for sentences in SentenceElements.getCorpusIterator(input, output, parse, tokenization):
            for sentence in sentences:
                if removeExisting:
                    for e in sentence.sentence.findall("entity"):
                        if e.get("headOffset") != None:
                            removeCount += 1
                            del e.attrib["headOffset"]
                graph = SentenceGraph.SentenceGraph(sentence.sentence, sentence.tokens, sentence.dependencies)
                graph.mapInteractions(sentence.entities, sentence.interactions)
                # Make sure every parse gets head scores
                #if graph.tokenHeadScores == None:
                #    graph.getTokenHeadScores()
            counter.update(len(sentences), "Finding heads ("+sentences[-1].sentence.get("id")+"): ")
        print >> sys.stderr, "Removed head offsets from", removeCount, "entities"
    else:
        xml = ETUtils.ETFromObj(input)
        if removeExisting:
            print >> sys.stderr, "Removing existing head offsets"
            removeCount = 0
            xml = ETUtils.ETFromObj(input)
            for d in xml.getroot().findall("document"):
                for s in d.findall("sentence"):
                    for e in s.findall("entity"):
                        if e.get("headOffset") != None:
                            removeCount += 1
                            del e.attrib["headOffset"]
            print >> sys.stderr, "Removed head offsets from", removeCount, "entities"

        # SentenceGraph automatically calculates head offsets and adds them to entities if they are missing
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        corpusElements = SentenceGraph.loadCorpus(xml, parse, tokenization)

        # Make sure every parse gets head scores
        for sentence in corpusElements.sentences:
            if sentence.sentenceGraph == None:
                continue
            if sentence.sentenceGraph.tokenHeadScores == None:
                sentence.sentenceGraph.getTokenHeadScores()

        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusElements.rootElement, output)
        return xml
def validateCorpus(input, output, strict=True):
    print >> sys.stderr, "Validating XML"
    print >> sys.stderr, "Loading corpus file", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()
    counts = validate(corpusRoot, strict)
    print >> sys.stderr, "Corpus validated:", dict(counts)
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def getCorpusIterator(input, output, parse, tokenization=None, removeNameInfo=False, removeIntersentenceInteractions=True):
    import Utils.ElementTreeUtils as ETUtils
    from Utils.InteractionXML.SentenceElements import SentenceElements
    #import xml.etree.cElementTree as ElementTree

    if output != None:
        etWriter = ETUtils.ETWriter(output)
    for eTuple in ETUtils.ETIteratorFromObj(input, ("start", "end")):
        element = eTuple[1]
        if eTuple[0] in ["end", "memory"] and element.tag == "document":
            sentences = []
            for sentenceElement in element.findall("sentence"):
                #print ElementTree.tostring(sentenceElement)
                sentence = SentenceElements(sentenceElement, parse, tokenization, removeIntersentenceInteractions=removeIntersentenceInteractions)
                if len(sentence.tokens) == 0: # or len(sentence.dependencies) == 0:
                    sentence.sentenceGraph = None
                else:
                    # Construct the basic SentenceGraph (only syntactic information)
                    graph = SentenceGraph(sentence.sentence, sentence.tokens, sentence.dependencies)
                    # Add semantic information, i.e. the interactions
                    graph.mapInteractions(sentence.entities, sentence.interactions)
                    graph.interSentenceInteractions = sentence.interSentenceInteractions
                    #duplicateInteractionEdgesRemoved += graph.duplicateInteractionEdgesRemoved
                    sentence.sentenceGraph = graph
                    graph.parseElement = sentence.parseElement
                    graph.documentElement = element
                sentences.append(sentence)
            yield sentences
            if output != None:
                etWriter.write(element)
        elif element.tag == "corpus" and output != None:
            if eTuple[0] == "start":
                etWriter.begin(element)
            else:
                etWriter.end(element)
        if eTuple[0] == "end" and element.tag in ["document", "corpus"]:
            element.clear()
    if output != None:
        etWriter.close()
def negateEvents(input, output=None, verbose=False):
    if not (ET.iselement(input) and input.tag == "sentence"):
        print >> sys.stderr, "Loading corpus file", input
        corpusTree = ETUtils.ETFromObj(input)
        corpusRoot = corpusTree.getroot()
    if not (ET.iselement(input) and input.tag == "sentence"):
        sentences = corpusRoot.getiterator("sentence")
    else:
        sentences = [input]
    counts = defaultdict(int)
    for sentence in sentences:
        for entity in sentence.findall("entity"):
            counts["all-entities"] += 1
            eType = entity.get("type")
            if not isNegatableEPITrigger(eType):
                counts["out-of-scope"] += 1
                continue
            eBaseType = getEPIBaseType(eType)
            eText = entity.get("text").lower()
            eNewType = determineNewType(eType, eText)
            # Insert changed charOffset
            counts["entities"] += 1
            if verbose:
                print "Entity", entity.get("id"), [entity.get("text")], [eType, eBaseType, eNewType],
            if eNewType != eBaseType:
                counts["negated"] += 1
                if verbose:
                    print "NEGATED",
            if eNewType == eType:
                counts["correct"] += 1
                if verbose:
                    print "CORRECT"
            else:
                counts["incorrect"] += 1
                if eNewType == eBaseType:
                    counts["incorrect-pos"] += 1
                else:
                    counts["incorrect-neg"] += 1
                if verbose:
                    print "INCORRECT"
            entity.set("type", eNewType)
    if verbose:
        print counts
    if not (ET.iselement(input) and input.tag == "sentence"):
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusRoot, output)
        return corpusTree
def makeConfigXML(workdir, bannerDir, oldVersion=True):
    conf = ET.Element("banner-configuration")
    banner = ET.SubElement(conf, "banner")
    eval = ET.SubElement(banner, "eval")
    datasetName = ET.SubElement(eval, "datasetName").text = "banner.eval.dataset.BC2GMDataset"
    # Dataset
    dataset = ET.SubElement(eval, "dataset")
    ET.SubElement(dataset, "sentenceFilename").text = workdir + "/input.txt"
    ET.SubElement(dataset, "mentionTestFilename").text = workdir + "/empty.eval"
    ET.SubElement(dataset, "mentionAlternateFilename").text = workdir + "/empty.eval"
    codecs.open(os.path.join(workdir, "empty.eval"), "wt", "utf-8").close()
    # More eval level stuff
    ET.SubElement(eval, "idInputFilename").text = workdir + "/ids.txt"
    ET.SubElement(eval, "rawInputFilename").text = workdir + "/raw.txt"
    ET.SubElement(eval, "trainingInputFilename").text = workdir + "/training.txt"
    ET.SubElement(eval, "outputFilename").text = workdir + "/output.txt"
    codecs.open(os.path.join(workdir, "output.txt"), "wt", "utf-8").close()
    ET.SubElement(eval, "inContextAnalysisFilename").text = workdir + "/contextAnalysis.html"
    ET.SubElement(eval, "mentionFilename").text = workdir + "/mention.txt"
    ET.SubElement(eval, "modelFilename").text = bannerDir + "/output/model_BC2GM.bin"
    ET.SubElement(eval, "lemmatiserDataDirectory").text = bannerDir + "/nlpdata/lemmatiser"
    ET.SubElement(eval, "posTaggerDataDirectory").text = bannerDir + "/nlpdata/tagger"
    ET.SubElement(eval, "posTagger").text = "dragon.nlp.tool.HeppleTagger"
    ET.SubElement(eval, "tokenizer").text = "banner.tokenization.SimpleTokenizer"
    ET.SubElement(eval, "useParenthesisPostProcessing").text = "true"
    ET.SubElement(eval, "useLocalAbbreviationPostProcessing").text = "true"
    ET.SubElement(eval, "useNumericNormalization").text = "true"
    ET.SubElement(eval, "tagFormat").text = "IOB"
    ET.SubElement(eval, "crfOrder").text = "2"
    if not oldVersion:
        ET.SubElement(eval, "mentionTypes").text = "Required"
        ET.SubElement(eval, "sameTypeOverlapOption").text = "Exception"
        ET.SubElement(eval, "differentTypeOverlapOption").text = "Exception"
    ET.SubElement(eval, "dictionaryTagger").text = "banner.tagging.dictionary.DictionaryTagger"
    # End eval element
    tagging = ET.SubElement(banner, "tagging")
    dictionary = ET.SubElement(tagging, "dictionary")
    dictionaryTagger = ET.SubElement(dictionary, "DictionaryTagger")
    ET.SubElement(dictionaryTagger, "filterContainedMentions").text = "true"
    ET.SubElement(dictionaryTagger, "normalizeMixedCase").text = "false"
    ET.SubElement(dictionaryTagger, "normalizeDigits").text = "false"
    ET.SubElement(dictionaryTagger, "canonize").text = "false"
    ET.SubElement(dictionaryTagger, "generate2PartVariations").text = "true"
    ET.SubElement(dictionaryTagger, "dropEndParentheticals").text = "false"
    ET.SubElement(dictionaryTagger, "dictionaryFile").text = bannerDir + "/dict/single.txt"
    ET.SubElement(dictionaryTagger, "dictionaryType").text = "GENE"
    # Write to file
    filename = workdir + "/banner_config.xml"
    ETUtils.write(conf, filename)
    return filename
def getSubset(input, output=None, fraction=1.0, seed=0, ids=None, attributes=None, invert=False, targetElementTag="document"):
    distribution = None
    if ids == None and attributes == None:
        print >> sys.stderr, "No id-file, using pseudorandom distribution"
        distribution = getSample(getElementCounts(input, [targetElementTag])[targetElementTag], fraction, seed)
    elif attributes != None:
        print >> sys.stderr, "Selecting subset with attributes:", attributes
        for key in attributes:
            assert type(attributes[key]) in (types.ListType, types.TupleType), attributes
    counts = defaultdict(int)
    outWriter = None
    if output != None:
        outWriter = ETUtils.ETWriter(output)
    targetElementCount = 0
    skip = False
    for event in ETUtils.ETIteratorFromObj(input, ("start", "end")):
        if event[0] == "start":
            if event[1].tag == targetElementTag:
                skip = select(targetElementCount, distribution, event[1], ids, attributes, invert)
                targetElementCount += 1
            if not skip:
                outWriter.begin(event[1])
                counts[event[1].tag + ":kept"] += 1
            else:
                counts[event[1].tag + ":removed"] += 1
        elif event[0] == "end":
            if not skip:
                outWriter.end(event[1])
            if event[1].tag == targetElementTag:
                skip = False
    if output != None:
        outWriter.close()
        ETUtils.encodeNewlines(output)
    print >> sys.stderr, "Subset for " + str(input) + ": " + str(counts)
def loadDrugBank(filename, preTag="{http://drugbank.ca}", verbose=False):
    data = defaultdict(lambda: defaultdict(list))
    print "Loading DrugBank XML from", filename
    xml = ETUtils.ETFromObj(filename)
    print "Processing DrugBank XML"
    root = xml.getroot()
    assert root.tag == preTag + "drugs", root.tag
    for drug in root.findall(preTag + "drug"):
        id = drug.find(preTag + "drugbank-id").text
        name = drug.find(preTag + "name").text
        if verbose:
            print id, name
        assert id not in data
        data[id]["name"] = name
        data[id]["id"] = id
        # TODO: Enzymes & targets
        # TODO: hydrophobicity
        getNestedItems(drug, "synonym", data[id], preTag)
        getNestedItems(drug, "brand", data[id], preTag)
        getNestedItems(drug, "group", data[id], preTag)
        getNestedItems(drug, "category", data[id], preTag, "categories")
        interactions = drug.find(preTag + "drug-interactions").findall(preTag + "drug-interaction")
        for interaction in interactions:
            data[id]["interaction"].append([
                interaction.find(preTag + "drug").text,
                interaction.find(preTag + "name").text,
                interaction.find(preTag + "description").text,
            ])
    return data
def addSets(corpus, xml, evalStandardDownloadPath, evalStandardPackageDir="ppi-eval-standard"):
    #evalStandardExtractPath = os.path.join(tempfile.gettempdir(), "PPIEvalStandard")
    evalStandardPath = os.path.join(tempfile.gettempdir(), evalStandardPackageDir)
    if not os.path.exists(evalStandardPath):
        print >> sys.stderr, "Extracting evaluation standard from", evalStandardDownloadPath
        Utils.Download.extractPackage(evalStandardDownloadPath, tempfile.gettempdir())
    print >> sys.stderr, "Using extracted evaluation standard at", evalStandardPath
    assert os.path.exists(evalStandardPath)
    docSets = {}
    for dataSet in "train", "test":
        dataSetXMLPath = os.path.join(evalStandardPath, dataSet, corpus + "-" + dataSet + ".xml")
        print >> sys.stderr, "Loading evaluation standard XML from", dataSetXMLPath
        dataSetXML = ETUtils.ETFromObj(dataSetXMLPath)
        for document in dataSetXML.getroot().findall("document"):
            assert document.get("id") not in docSets
            docSets[document.get("id")] = {"set":dataSet, "element":document}
    print >> sys.stderr, "Assigning sets"
    counts = defaultdict(int)
    for document in xml.findall("document"):
        counts["documents"] += 1
        docId = document.get("id")
        if docId in docSets:
            counts["documents-in-eval-standard"] += 1
            document.set("set", docSets[docId]["set"])
            if document.get("origId") != None and docSets[docId]["element"].get("origId") != None:
                assert document.get("origId") == docSets[docId]["element"].get("origId"), docId
                counts["documents-match-by-origId"] += 1
            counts["eval-standard-set:" + docSets[docId]["set"]] += 1
        else:
            print >> sys.stderr, "Warning, removing document", document.get("id"), "which is not included in the PPI evaluation standard"
            counts["missing-from-eval-standard"] += 1
            xml.remove(document)
    print >> sys.stderr, "PPI Evaluation Standard sets for corpus", corpus, "documents:", dict(counts)
    return xml
def convertCorpus(corpus, outDir=None, downloadDir=None, redownload=False, removeAnalyses=True, develFraction=0.3, logPath=None):
    assert corpus in PPI_CORPORA
    if logPath == "AUTO":
        logPath = outDir + "/conversion/" + corpus + "-conversion-log.txt" if outDir != None else None
    if logPath:
        Stream.openLog(logPath)
    print >> sys.stderr, "==========", "Converting PPI corpus", corpus, "=========="
    downloaded = downloadCorpus(corpus, outDir, downloadDir, redownload)
    print >> sys.stderr, "---------------", "Updating Interaction XML format", "---------------"
    print >> sys.stderr, "Loading", downloaded[corpus + "_LEARNING_FORMAT"]
    xml = ETUtils.ETFromObj(downloaded[corpus + "_LEARNING_FORMAT"])
    root = xml.getroot()
    updateXML(root, removeAnalyses)
    print >> sys.stderr, "---------------", "Adding sets from the PPI evaluation standard", "---------------"
    addSets(corpus, root, downloaded["PPI_EVALUATION_STANDARD"])
    if develFraction > 0.0:
        print >> sys.stderr, "---------------", "Generating devel set", "---------------"
        MakeSets.processCorpus(xml, None, "train", [("devel", develFraction), ("train", 1.0)], 1)
    if outDir != None:
        print >> sys.stderr, "---------------", "Writing corpus", "---------------"
        #if intermediateFiles:
        #    print >> sys.stderr, "Writing combined corpus"
        #    ETUtils.write(xml, os.path.join(outDir, corpus + ".xml"))
        print >> sys.stderr, "Dividing into sets"
        Utils.InteractionXML.DivideSets.processCorpus(xml, outDir, corpus, ".xml")
    if logPath != None:
        Stream.closeLog(logPath)
    return xml
def mergeCorpora(corpusIds, outputId, inputDir, outDir):
    merged = Catenate.catenateElements(corpusIds, inputDir)
    for dataSet in ("devel", "train"):
        renameElements(merged[dataSet].getroot(), {"Localization":"Lives_In",
                                                   "Host":"Habitat",
                                                   "HostPart":"Habitat",
                                                   "Food":"Habitat",
                                                   "Soil":"Habitat",
                                                   "Medical":"Habitat",
                                                   "Water":"Habitat",
                                                   "Bacterium":"Bacteria"})
        DeleteElements.removeElements(merged[dataSet].getroot(), {"interaction":{"type":"PartOf"}})
        if outDir != None:
            outPath = os.path.join(outDir, outputId + "-" + dataSet + ".xml")
            print "Writing set", dataSet, "to", outPath
            ETUtils.write(merged[dataSet].getroot(), outPath)
def makeDDISubmissionFile(input, output):
    xml = ETUtils.ETFromObj(input)
    outFile = open(output, "wt")
    for sentence in xml.getiterator("sentence"):
        # First determine which pairs interact
        intMap = defaultdict(lambda: defaultdict(lambda: None))
        for interaction in sentence.findall("interaction"):
            # Make mapping both ways to discard edge directionality. This isn't actually needed,
            # since MultiEdgeExampleBuilder builds entity pairs in the same order as this function,
            # but it shouldn't harm to include it and now it works regardless of pair direction.
            if interaction.get("type") != "neg":
                intMap[interaction.get("e1")][interaction.get("e2")] = interaction
                intMap[interaction.get("e2")][interaction.get("e1")] = interaction
        # Then write all pairs to the output file
        entities = sentence.findall("entity")
        for i in range(0, len(entities) - 1):
            for j in range(i + 1, len(entities)):
                eIId = entities[i].get("id")
                eJId = entities[j].get("id")
                outFile.write(eIId + "\t" + eJId + "\t")
                if intMap[eIId][eJId] != None:
                    outFile.write("1\n")
                else:
                    outFile.write("0\n")
    outFile.close()
def loadDocs(inDir, idStart=0):
    print "Loading documents from", inDir
    sentences = {"positive":[], "negative":[]}
    docCounts = {}
    docById = {}
    documents = []
    for filename in sorted(os.listdir(inDir)):
        if filename.endswith(".xml"):
            print "Reading", filename,
            xml = ETUtils.ETFromObj(os.path.join(inDir, filename))
            for document in xml.getiterator("document"):
                counts = [0, 0]
                for sentence in document.findall("sentence"):
                    #sentence.set("document.get("origId") + "." + sentence.get("origId"))
                    truePairs = False
                    for pair in sentence.findall("pair"):
                        if pair.get("interaction") == "true":
                            truePairs = True
                            break
                    if truePairs:
                        counts[0] += 1
                        sentences["positive"].append(sentence)
                    else:
                        counts[1] += 1
                        sentences["negative"].append(sentence)
                assert document.get("id") not in docCounts
                docCounts[document.get("id")] = counts
                docById[document.get("id")] = document
                documents.append(document)
                print counts,
                #print ETUtils.toStr(document)
            print
    print "Positive sentences:", len(sentences["positive"])
    print "Negative sentences:", len(sentences["negative"])
    return documents, docById, docCounts
def getEmptyCorpus(xml, deletionRules=None, removeNames=False):
    """
    A convenience function for getting an empty corpus, useful for testing
    for information leaks in the event extraction process.
    """
    if type(xml) in types.StringTypes:
        # XML is read from disk, so it's a new copy and can be safely modified
        xml = ETUtils.ETFromObj(xml)
    else:
        # XML is already an object in memory. To prevent problems with other users of it, a copy
        # is created before deleting elements.
        xml = copy.deepcopy(xml)
    if deletionRules == None: # use default rules for BioNLP Shared Task
        # We remove all interactions, and all entities that are not named entities.
        # This leaves only the gold standard protein/gene names.
        if removeNames:
            deletionRules = {"interaction":{}, "entity":{}}
        else:
            deletionRules = {"interaction":{}, "entity":{"given":(None, "False")}}
    # Remove elements and return the emptied XML
    return processCorpus(xml, None, deletionRules)
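# Usage sketch (not part of the original module): creating leak-test copies of a corpus, with and
# without the given gene/protein names. The input file name is an illustrative assumption.
#
#   emptyXML = getEmptyCorpus("GE11-devel.xml")
#   namelessXML = getEmptyCorpus("GE11-devel.xml", removeNames=True)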
def analyze(self, inputs, model=None):
    self._init()
    if type(inputs) in types.StringTypes:
        inputs = [inputs]
    for xml in inputs:
        print >> sys.stderr, "Analyzing", xml
        xml = ETUtils.ETFromObj(xml)

        for document in xml.getiterator("document"):
            # Collect elements into dictionaries
            entityById = {}
            for entity in document.getiterator("entity"):
                entityById[entity.get("id")] = entity
            interactions = []
            interactionsByE1 = defaultdict(list)
            for interaction in document.getiterator("interaction"):
                interactions.append(interaction)
                interactionsByE1[interaction.get("e1")].append(interaction)
            siteOfTypes = self.buildSiteOfMap(interactions, interactionsByE1, entityById)
            # Add entity elements to analysis
            for entity in document.getiterator("entity"):
                self.addEntityElement(entity, interactionsByE1)
            # Add interaction elements to analysis
            for interaction in interactions:
                self.addInteractionElement(interaction, entityById, siteOfTypes[interaction])
            # Calculate event definition argument limits from event instances
            for event in self.events.values():
                event.countArguments()
    self._updateSupportingAnalyses()
    if model != None:
        self.save(model)
def processCorpus(input, attrs=["text"]):
    print attrs
    print >> sys.stderr, "Loading corpus file", input
    corpusRoot = ETUtils.ETFromObj(input).getroot()
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {}
    interactors = {}
    for document in documents:
        entDict = {}
        for entity in document.getiterator("entity"):
            entDict[entity.get("id")] = entity
        for interaction in document.getiterator("interaction"):
            e1 = entDict[interaction.get("e1")]
            e2 = entDict[interaction.get("e2")]
            # form identifier tuples
            e1Tuple = []
            for attr in attrs:
                e1Tuple.append(e1.get(attr))
            e1Tuple = tuple(e1Tuple)
            e2Tuple = []
            for attr in attrs:
                e2Tuple.append(e2.get(attr))
            e2Tuple = tuple(e2Tuple)
            interactors = [e1Tuple, e2Tuple]
            #interactors.sort()
            print interactors
def combineXML(corpusXML, setName, dataDirs, subDirs=["DrugBank", "MedLine"]):
    # Add all documents into one XML
    ids = {}
    if isinstance(dataDirs, basestring):
        dataDirs = [dataDirs]
    for dataDir in dataDirs:
        if dataDir.startswith(".") or dataDir.startswith("_"):
            continue
        for subDir in [""] + subDirs:
            inDir = dataDir + "/" + subDir
            if "/." in dataDir or "/_" in dataDir: # attempt to survive the junk directories
                continue
            if os.path.exists(inDir):
                for filename in sorted(os.listdir(inDir)):
                    if filename.endswith(".xml"):
                        print >> sys.stderr, "Reading", filename
                        xml = ETUtils.ETFromObj(os.path.join(inDir, filename))
                        document = xml.getroot()
                        assert document.tag == "document"
                        assert document.get("id") not in ids, (document.get("id"), os.path.join(inDir, filename), ids[document.get("id")])
                        ids[document.get("id")] = os.path.join(inDir, filename)
                        document.set("source", os.path.join(subDir, filename))
                        if setName != None:
                            document.set("set", setName)
                        corpusXML.append(document)
def convert(self, input, dataSetNames=None, corpusName=None, output=None):
    if isinstance(input, basestring) and (os.path.isdir(input) or input.endswith(".tar.gz") or input.endswith(".txt") or "," in input):
        print >> sys.stderr, "Converting ST-format to Interaction XML"
        # Get input file (or files)
        dataSetDirs = input
        documents = []
        if type(dataSetDirs) in types.StringTypes:
            dataSetDirs = dataSetDirs.split(",")
        # Get the list of "train", "devel" etc names for these sets
        if dataSetNames == None:
            dataSetNames = []
        elif type(dataSetNames) in types.StringTypes:
            dataSetNames = dataSetNames.split(",")
        # Convert all input files into one corpus
        for dataSetDir, dataSetName in itertools.izip_longest(dataSetDirs, dataSetNames, fillvalue=None):
            print >> sys.stderr, "Reading", dataSetDir, "set,",
            docs = Utils.STFormat.STTools.loadSet(dataSetDir, dataSetName)
            print >> sys.stderr, len(docs), "documents"
            documents.extend(docs)
        print >> sys.stderr, "Resolving equivalences"
        Utils.STFormat.Equiv.process(documents)
        if corpusName == None:
            corpusName = "TEES"
        self.xml = Utils.STFormat.ConvertXML.toInteractionXML(documents, corpusName, output)
    else:
        print >> sys.stderr, "Processing source as interaction XML"
        self.xml = ETUtils.ETFromObj(input)
    return self.xml
def process(input, output=None, preprocess=True, debug=False):
    """
    Run MetaMap.
    """
    counter = ProgressCounter(id="MetaMap")
    # Create working directory
    workdir = tempfile.mkdtemp()
    outWriter = None
    if output != None:
        outWriter = ETUtils.ETWriter(output)
    # Loop iteratively over elements
    skip = False
    for event, element in ETUtils.ETIteratorFromObj(input, ("start", "end")):
        if event == "start": # element start message, element may not be fully read yet
            if element.tag == "sentence":
                sentence = element
                counter.update(1, "Processing MetaMap ("+sentence.get("id")+"): ")
                # Run metamap for the sentence element
            elif element.tag == "metamap": # skip the metamap element to remove the original one
                skip = True
            if not skip and output != None:
                outWriter.begin(element)
        elif event == "end": # element is fully read in memory
            if not skip and output != None:
                outWriter.end(element)
            if element.tag == "metamap":
                skip = False # write elements again after this one
                if preprocess:
                    element = convert(element, sentence)
                outWriter.write(element) # insert the new metamap element into the output stream
    if output != None:
        print >> sys.stderr, "Writing output to", output
        outWriter.close()
        ETUtils.encodeNewlines(output)
    if debug:
        print >> sys.stderr, "Work directory preserved for debugging at", workdir
    else:
        shutil.rmtree(workdir)
    return output
def processCorpus(inputFilename, outputFilename, rules, reverse=False):
    print >> sys.stderr, "Deleting elements, rules =", rules
    print >> sys.stderr, "Loading corpus file", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()
    countsByType = defaultdict(int)
    removeElements(corpusRoot, rules, reverse, countsByType)
    print >> sys.stderr, "Deleted elements"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ":", countsByType[k]
    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
def parse(self, parserName, input, output=None, debug=False, reparse=False, syntaxNetDir=None, modelDir=None):
    # Run the parser process
    if syntaxNetDir == None:
        syntaxNetDir = Settings.SYNTAXNET_DIR
    corpusTree, corpusRoot = self.getCorpus(input)
    workdir = tempfile.mkdtemp()
    inPath = self.makeInputFile(corpusRoot, workdir)
    outPath = ProcessUtils.runSentenceProcess(self.run, syntaxNetDir, inPath, workdir, True, "SyntaxNetParser", "Parsing", processArgs={"modelDir":modelDir})
    self.insertCoNLLParses(outPath, corpusRoot, parserName, unescaping=True, conllFormat="conllx")
    # Remove work directory
    if not debug:
        shutil.rmtree(workdir)
    else:
        print >> sys.stderr, "Parser IO files at", workdir
    # Write the output XML file
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def mixSets(input, output, docOrigIds, sourceSet, targetSet):
    print >> sys.stderr, "Mixing Sets", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()

    if docOrigIds != None:
        for document in corpusRoot.getiterator("document"):
            docId = document.get("pmid")
            if docId == None:
                docId = document.get("origId")
            if docId in docOrigIds:
                assert document.get("set") == sourceSet
                document.set("set", targetSet)
                docOrigIds.remove(docId)
        assert len(docOrigIds) == 0, docOrigIds

    sentenceIds = None
    if sentenceIds != None:
        for document in corpusRoot.getiterator("document"):
            removed = []
            for sentence in document.findall("sentence"):
                assert document.get("set") == sourceSet
                sentenceId = sentence.get("id")
                if sentenceId in sentenceIds:
                    document.remove(sentence)
                    removed.append(sentence)
                    sentenceIds.remove(sentenceId)
            if len(removed) > 0:
                newDoc = ET.Element("document")
                for attr in document.attrib:
                    newDoc.set(attr, document.get(attr))
                newDoc.set("id", None)
                newDoc.set("set", targetSet)
                for sentence in removed:
                    newDoc.append(sentence)
                corpusRoot.append(newDoc)
        assert len(sentenceIds) == 0, sentenceIds

    RecalculateIds.recalculateIds(corpusTree, onlyWithinSentence=False)

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def convert(inPath, outDir, corpusId, directed, negatives, preprocess, preprocessorParameters=None, debug=False, clear=False, constParser="BLLIP-BIO", depParser="STANFORD-CONVERT", logging=True):
    assert negatives in ("INCLUDE", "SKIP", "REVERSE_POS")
    # Download the corpus if needed
    if inPath == None:
        if not hasattr(Settings, "SE10T8_CORPUS"):
            SemEval2010Task8Tools.install()
        inPath = Settings.SE10T8_CORPUS
    assert os.path.exists(inPath)

    # Prepare the output directory
    if not os.path.exists(outDir):
        print "Making output directory", outDir
        os.makedirs(outDir)
    elif clear:
        print "Removing output directory", outDir
        shutil.rmtree(outDir)

    # Start logging
    if logging:
        Stream.openLog(os.path.join(outDir, "log.txt"), clear=clear)

    # Read and process the corpus files
    archive = zipfile.ZipFile(inPath, 'r')
    usedIds = set()
    tree = None
    for fileName, setName in [("SemEval2010_task8_all_data/SemEval2010_task8_training/TRAIN_FILE.TXT", "train"),
                              ("SemEval2010_task8_all_data/SemEval2010_task8_testing_keys/TEST_FILE_FULL.TXT", "test")]:
        print "Processing file", fileName, "as set", setName
        f = archive.open(fileName)
        tree = processLines(f.readlines(), setName, directed=directed, negatives=negatives, usedIds=usedIds, tree=tree, corpusId=corpusId)
        f.close()

    # Divide the training set into training and development sets
    MakeSets.processCorpus(tree, None, "train", [("train", 0.7), ("devel", 1.0)], 1)
    # Write out the converted corpus
    convertedPath = os.path.join(outDir, corpusId + "-converted.xml")
    ETUtils.write(tree.getroot(), convertedPath)

    # Preprocess the converted corpus
    if preprocess:
        outPath = os.path.join(outDir, corpusId + ".xml")
        preprocessor = Preprocessor(constParser, depParser)
        preprocessor.setArgForAllSteps("debug", debug)
        preprocessor.stepArgs("CONVERT")["corpusName"] = corpusId
        preprocessor.process(convertedPath, outPath, preprocessorParameters, omitSteps=["SPLIT-SENTENCES", "NER", "SPLIT-NAMES"])

    # Stop logging
    if logging:
        Stream.closeLog(os.path.join(outDir, "log.txt"))
def processCorpus(input, output, rules):
    if rules == None:
        raise Exception("No mapping rules defined")
    elif isinstance(rules, basestring):
        rules = eval(rules)
    print >> sys.stderr, "Mapping attributes, rules =", rules
    print >> sys.stderr, "Loading corpus file", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()
    counts = defaultdict(int)
    for key in sorted(rules.keys()):
        mapAttributes(corpusRoot, key, rules[key], counts)
    print >> sys.stderr, "Mapped", dict(counts)
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree