def processCorpus(inputFilename, outputFilename, rules):
    print >> sys.stderr, "Deleting elements, rules =", rules
    print >> sys.stderr, "Loading corpus file", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()
    for eType in rules.keys():
        for attrRule in rules[eType].keys():
            rules[eType][attrRule] = rules[eType][attrRule].split("|")
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = defaultdict(int)
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, rules, countsByType)
    print >> sys.stderr, "Deleted elements"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, " " + k + ":", countsByType[k]
    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
def mergeAll(input, output=None, debug=False, iterate=False):
    if iterate:
        origItems = defaultdict(int)
        removedItems = defaultdict(int)
        for docSentences in SentenceElements.getCorpusIterator(input, output):
            entitiesByType, duplicatesRemovedByType = mergeDuplicateEntities(docSentences, debug)
            for key in entitiesByType:
                origItems[key] += entitiesByType[key]
            for key in duplicatesRemovedByType:
                removedItems[key] += duplicatesRemovedByType[key]
            interactionsByType, duplicatesRemovedByType = mergeDuplicateInteractions(docSentences, debug)
            for key in interactionsByType:
                origItems[key] += interactionsByType[key]
            for key in duplicatesRemovedByType:
                removedItems[key] += duplicatesRemovedByType[key]
        printStats(origItems, removedItems)
        return None
    else:
        corpusElements = CorpusElements.loadCorpus(input, removeIntersentenceInteractions=False)
        print >> sys.stderr, "Merging duplicate entities"
        entitiesByType, duplicatesRemovedByType = mergeDuplicateEntities(corpusElements.sentences, debug)
        printStats(entitiesByType, duplicatesRemovedByType)
        print >> sys.stderr, "Merging duplicate interactions"
        interactionsByType, duplicatesRemovedByType = mergeDuplicateInteractions(corpusElements.sentences, debug)
        printStats(interactionsByType, duplicatesRemovedByType)
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusElements.rootElement, output)
        return corpusElements
def classify(self, data, model, output, parse=None, task=None, goldData=None,
             workDir=None, fromStep=None, omitSteps=None, validate=False):
    model = self.openModel(model, "r")
    self.enterState(self.STATE_CLASSIFY)
    self.setWorkDir(workDir)
    if workDir == None:
        self.setTempWorkDir()
    model = self.openModel(model, "r")
    if parse == None:
        parse = self.getStr(self.tag+"parse", model)
    workOutputTag = os.path.join(self.workDir, os.path.basename(output) + "-")
    xml = self.classifyToXML(data, model, None, workOutputTag,
                             model.get(self.tag+"classifier-model", defaultIfNotExist=None),
                             goldData, parse,
                             float(model.getStr("recallAdjustParameter", defaultIfNotExist=1.0)))
    if (validate):
        self.structureAnalyzer.load(model)
        self.structureAnalyzer.validate(xml)
        ETUtils.write(xml, output+"-pred.xml.gz")
    else:
        shutil.copy2(workOutputTag+self.tag+"pred.xml.gz", output+"-pred.xml.gz")
    EvaluateInteractionXML.run(self.evaluator, xml, data, parse)
    stParams = self.getBioNLPSharedTaskParams(self.bioNLPSTParams, model)
    if stParams["convert"]: #self.useBioNLPSTFormat:
        extension = ".zip" if (stParams["convert"] == "zip") else ".tar.gz"
        Utils.STFormat.ConvertXML.toSTFormat(xml, output+"-events" + extension,
                                             outputTag=stParams["a2Tag"],
                                             writeExtra=(stParams["scores"] == True))
        if stParams["evaluate"]: #self.stEvaluator != None:
            if task == None:
                task = self.getStr(self.tag+"task", model)
            self.stEvaluator.evaluate(output+"-events" + extension, task)
    self.deleteTempWorkDir()
    self.exitState()
def catenateElements(input1, input2, output):
    print >> sys.stderr, "##### Catenate interaction XML as elements #####"
    c1 = RecalculateIds.recalculateIds(input1, None, False, 0)
    numDocs = len(c1.getroot().findall("document"))
    print >> sys.stderr, "Documents in input 1:", numDocs
    c2 = RecalculateIds.recalculateIds(input2, None, False, numDocs)
    print >> sys.stderr, "Appending documents"
    c1Root = c1.getroot()
    for document in c2.getroot().findall("document"):
        c1Root.append(document)
    print >> sys.stderr, "Validating ids"
    ids = set()
    for element in c1Root.getiterator("entity"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)
    for element in c1Root.getiterator("interaction"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)
    for element in c1Root.getiterator("sentence"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)
    for element in c1Root.getiterator("document"):
        id = element.get("id")
        assert not id in ids
        ids.add(id)
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(c1Root, output)
    return c1
def removeUnconnectedEntities(input, output=None):
    input = ETUtils.ETFromObj(input)
    root = input.getroot()
    removed = 0
    preserved = 0
    for document in root.findall("document"):
        sentMap = {} # allow for intersentence interactions
        for sentence in document.findall("sentence"):
            sentMap[sentence.get("id")] = sentence
        connected = set()
        for interaction in document.getiterator("interaction"):
            connected.add(interaction.get("e1"))
            connected.add(interaction.get("e2"))
        entities = []
        for entity in document.getiterator("entity"):
            entities.append(entity)
        for entity in entities:
            if entity.get("isName") == "True": # never remove named entities
                continue
            eId = entity.get("id")
            if eId not in connected:
                if eId.find(".s") != -1: # sentence level entity
                    sentMap[eId.rsplit(".", 1)[0]].remove(entity)
                else: # document level entity
                    document.remove(entity)
                removed += 1
            else:
                preserved += 1
    print >> sys.stderr, "Removed", removed, "entities, preserved", preserved, "entities"
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(root, output)
    return input
def removeUnconnectedEntities(input, output=None):
    input = ETUtils.ETFromObj(input)
    root = input.getroot()
    removed = 0
    preserved = 0
    for document in root.findall("document"):
        sentMap = {} # allow for intersentence interactions
        for sentence in document.findall("sentence"):
            sentMap[sentence.get("id")] = sentence
        connected = set()
        for interaction in document.getiterator("interaction"):
            connected.add(interaction.get("e1"))
            connected.add(interaction.get("e2"))
        entities = []
        for entity in document.getiterator("entity"):
            entities.append(entity)
        for entity in entities:
            if entity.get("given") == "True": # never remove named entities
                continue
            eId = entity.get("id")
            if eId not in connected:
                if eId.find(".s") != -1: # sentence level entity
                    sentMap[eId.rsplit(".", 1)[0]].remove(entity)
                else: # document level entity
                    document.remove(entity)
                removed += 1
            else:
                preserved += 1
    print >> sys.stderr, "Removed", removed, "entities, preserved", preserved, "entities"
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(root, output)
    return input
def processCorpus(inPath, outPath, sourceSet, newSets, seed=1):
    print >> sys.stderr, "Loading corpus file", inPath
    corpusTree = ETUtils.ETFromObj(inPath)
    corpusRoot = corpusTree.getroot()
    rand = random.Random(seed)
    documents = corpusRoot.findall("document")
    counts = {"old":defaultdict(int), "new":defaultdict(int)}
    for document in documents:
        counts["old"][document.get("set")] += 1
        if sourceSet != None and document.get("set") != sourceSet:
            counts["new"][document.get("set")] += 1
            continue
        value = rand.random()
        document.set("setValue", str(value))
        document.set("origSet", document.get("set", ""))
        for setName, cutoff in newSets:
            if value <= cutoff:
                document.set("set", setName)
                break
        counts["new"][document.get("set")] += 1
    #for key in counts:
    #    counts[key] = dict(counts[key])
    print "MakeSets result:", "old=" + str(dict(counts["old"])) + ", new=" + str(dict(counts["new"]))
    if outPath != None:
        ETUtils.write(corpusRoot, outPath)
    return corpusTree
def run(cls, inFile, multiplier=1.0, outFile=None, targetLabel="neg", binary=False):
    """inFile can be a string with file name (.xml or .xml.gz) or an ElementTree or an Element or an open input stream.
    multiplier adjusts the level of boosting the non-negative predictions, it is a real number (0,inf);
    multiplier 1.0 does nothing, <1.0 decreases negative class confidence, >1.0 increases negative class confidence.
    The root of the modified tree is returned and, if outFile is a string, written out to outFile as well."""
    print >> sys.stderr, "##### Recall adjust with multiplier " + str(multiplier)[:5] + " #####"
    tree = ETUtils.ETFromObj(inFile)
    if not ET.iselement(tree):
        assert isinstance(tree, ET.ElementTree)
        root = tree.getroot()
    else:
        root = tree
    if multiplier != -1:
        if binary:
            print >> sys.stderr, "Recall binary mode"
            classRanges = getClassRanges(root.getiterator("entity"))
            assert len(classRanges.keys()) in [0, 2]
            if len(classRanges.keys()) == 0:
                print >> sys.stderr, "Warning, recall adjustment skipped because no prediction weights found"
        else:
            print >> sys.stderr, "Recall multiclass mode"
            classRanges = None
        for entityNode in root.getiterator("entity"):
            adjustEntity(entityNode, targetLabel, multiplier, classRanges)
    if outFile:
        ETUtils.write(root, outFile)
    return tree
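# Minimal usage sketch for the recall-adjustment classmethod above (not part of the
# original module). The class name RecallAdjust, its import path, and the file names
# are illustrative assumptions; only the multiplier semantics follow the docstring:
# a multiplier below 1.0 lowers the confidence of the "neg" class, which boosts
# recall of the non-negative classes.
#
# from Utils.InteractionXML.RecallAdjust import RecallAdjust   # assumed location
# tree = RecallAdjust.run("predictions.xml.gz",                # hypothetical input file
#                         multiplier=0.7,                      # <1.0 favors non-negative classes
#                         outFile="recall-adjusted.xml.gz",    # hypothetical output file
#                         targetLabel="neg")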
def makeSubset(input, output=None, ratio=1.0, seed=0):
    if ratio == 1.0:
        if output != None:
            shutil.copy2(input, output)
            return output
        else:
            return input
    totalFolds = 100
    selectedFolds = int(ratio * 100.0)
    print >> sys.stderr, "====== Making subset ======"
    print >> sys.stderr, "Subset for ", input, "ratio", ratio, "seed", seed
    xml = ETUtils.ETFromObj(input).getroot()
    count = 0
    sentCount = 0
    for document in xml.findall("document"):
        sentCount += len(document.findall("sentence"))
        count += 1
    division = Core.Split.getFolds(count, totalFolds, seed)
    #print division, selectedFolds - 1
    index = 0
    removeCount = 0
    sentRemoveCount = 0
    for document in xml.findall("document"):
        if division[index] > selectedFolds - 1:
            xml.remove(document)
            sentRemoveCount += len(document.findall("sentence"))
            removeCount += 1
        index += 1
    print >> sys.stderr, "Subset", "doc:", count, "removed:", removeCount, "sent:", sentCount, "sentremoved:", sentRemoveCount
    xml.set("subsetRatio", str(ratio))
    xml.set("subsetSeed", str(seed))
    if output != None:
        ETUtils.write(xml, output)
    return output
def fixAltOffsets(input, output=None):
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    docCount = 0
    sentencesCreated = 0
    sentences = [x for x in corpusRoot.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "FixAltOffsets")
    fixCount = 0
    # fix spans
    for sentence in sentences:
        counter.update(1, "Fixing AltOffsets for sentence ("+sentence.get("id")+"): ")
        sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        for entity in sentence.findall("entity"):
            altOffsetString = entity.get("altOffset")
            if altOffsetString == None:
                continue
            #print altOffsetString
            altOffsets = Range.charOffsetToTuples(altOffsetString)
            assert len(altOffsets) == 1
            for i in range(len(altOffsets)):
                altOffset = altOffsets[i]
                altOffsets[i] = (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0])
            entity.set("altOffset", Range.tuplesToCharOffset(altOffsets))
            fixCount += 1
    print >> sys.stderr, "Fixed", fixCount, "altOffsets"
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def parse(self, input, output=None, tokenizationName=None, parseName="McCC",
          requireEntities=False, skipIds=[], skipParsed=True, timeout=600,
          makePhraseElements=True, debug=False, pathParser=None,
          pathBioModel="AUTO", addTimeStamp=True):
    print >> sys.stderr, "BLLIP parser"
    corpusTree, corpusRoot = self.getCorpus(input)
    workdir = tempfile.mkdtemp()
    infileName, numCorpusSentences = self.makeInputFile(workdir, corpusRoot, requireEntities, skipIds, skipParsed, tokenizationName, debug)
    bllipOutput = self.runProcess(infileName, workdir, pathParser, pathBioModel, tokenizationName, timeout)
    self.insertPennTrees(bllipOutput, corpusRoot, parseName, requireEntities, skipIds, skipParsed)
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    # Remove work directory
    if not debug:
        shutil.rmtree(workdir)
    else:
        print >> sys.stderr, "Parser IO files at", workdir
    return corpusTree
def processCorpus(inputFilename, outputFilename, rules):
    print >> sys.stderr, "Deleting elements, rules =", rules
    print >> sys.stderr, "Loading corpus file", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()
    for eType in rules.keys():
        for attrRule in rules[eType].keys():
            if type(rules[eType][attrRule]) in types.StringTypes:
                rules[eType][attrRule] = rules[eType][attrRule].split("|")
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = defaultdict(int)
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, rules, countsByType)
    print >> sys.stderr, "Deleted elements"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, " " + k + ":", countsByType[k]
    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
def parse(self, parserName, input, output=None, debug=False, reparse=False,
          stanfordParserDir=None, stanfordParserArgs=None, action="convert",
          outputFormat=None, memory=None):
    #global stanfordParserDir, stanfordParserArgs
    assert action in ("convert", "penn", "dep")
    if stanfordParserDir == None:
        stanfordParserDir = Settings.STANFORD_PARSER_DIR
    # Run the parser process
    corpusTree, corpusRoot = self.getCorpus(input)
    workdir = tempfile.mkdtemp()
    inPath = self.makeInputFile(corpusRoot, workdir, parserName, reparse, action, debug)
    outPath = self.runProcess(stanfordParserArgs, stanfordParserDir, inPath, workdir, action, outputFormat, memory)
    self.printStderr(outPath)
    # Insert the parses
    if action in ("convert", "dep"):
        #self.insertDependencyParses(outPath, corpusRoot, parserName, {"stanford-mode":action}, addTimeStamp=True, skipExtra=0, removeExisting=True)
        self.insertStanfordDependencyParses(outPath, corpusRoot, parserName, skipParsed=reparse, removeExisting=reparse)
    elif action == "penn":
        self.insertPennTrees(outPath, corpusRoot, parserName)
    # Remove work directory
    if not debug:
        shutil.rmtree(workdir)
    else:
        print >> sys.stderr, "Parser IO files at", workdir
    # Write the output XML file
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def process(input, output=None):
    download("/tmp/extract", "/tmp/download")
    specAnn = readResources("/tmp/extract")
    insertElements(input.getroot(), specAnn)
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(input.getroot(), output)
    return input
def test(extractPath, downloadPath, inCorpusPath, outCorpusPath):
    download(extractPath, downloadPath)
    specAnn = readResources(extractPath)
    inCorpus = ETUtils.ETFromObj(inCorpusPath)
    insertElements(inCorpus.getroot(), specAnn)
    ETUtils.write(inCorpus.getroot(), outCorpusPath)

#process("/tmp/extract", "/tmp/download", "/home/jari/Dropbox/data/BioNLP16/corpora/BB_EVENT_16-devel.xml", "/tmp/ner.xml")
def insertParses(self, parseDir, input, output=None, parseName="McCC", extensions=None, subDirs=None,
                 debug=False, skipParsed=False, docMatchKeys=None, conllFormat=None, splitting=True,
                 unescapeFormats="AUTO", tokenMerging=True, extMap=None, sdFailedFormat="empty",
                 origIdType=None, posTags=None):
    corpusTree, corpusRoot = self.getCorpus(input)
    if not os.path.exists(parseDir):
        raise Exception("Cannot find parse input '" + str(parseDir) + "'")
    if not os.path.isdir(parseDir):
        raise Exception("Parse input '" + str(parseDir) + "' is not a directory")
    if extensions == None:
        extensions = self.allExt
    elif isinstance(extensions, basestring):
        extensions = extensions.split(",")
    extensions = [x for x in extensions if x in self.allExt]
    unescapeFormats = self.getUnescapeFormats(unescapeFormats)
    if docMatchKeys == None:
        docMatchKeys = ["origId", "pmid", "id"]
    elif isinstance(docMatchKeys, basestring):
        docMatchKeys = docMatchKeys.split(",")
    print >> sys.stderr, "Inserting parses from file types:", extensions
    counts = defaultdict(int)
    files = self.getParseFiles(parseDir, extensions, subDirs, counts, extMap=extMap, origIdType=origIdType)
    typeCounts = {x:defaultdict(int) for x in extensions}
    # Make document elements if needed
    documents = [x for x in corpusRoot.findall("document")]
    if len(documents) == 0:
        typeCounts["document-generation"] = defaultdict(int)
        documents = self.prepareDocuments(corpusRoot, files)
    counter = ProgressCounter(len(files), "Parse Insertion")
    # Insert parses and make sentence elements if needed
    typeCounts["sentence-splitting"] = defaultdict(int)
    print >> sys.stderr, "Inserting parses for", len(files), "out of total", len(documents), "documents"
    for document in documents:
        counts["document"] += 1
        matchFound = False
        for docMatchValue in [document.get(x) for x in docMatchKeys if document.get(x) != None]:
            if docMatchValue in files:
                if matchFound:
                    raise Exception("Multiple matching parses for document " + str(document.attrib) + " using keys " + str(docMatchKeys))
                matchFound = True
                counter.update(1, "Inserting parses for (" + document.get("id") + "/" + str(docMatchValue) + "): ")
                counts["document-match"] += 1
                for ext in extensions:
                    if ext not in files[docMatchValue]:
                        continue
                    counts[ext + "-match"] += 1
                    sentences = [x for x in self.getSentences(document, skipParsed=skipParsed)]
                    self.insertParse(document, sentences, ext, files[docMatchValue][ext], parseName,
                                     splitting, typeCounts, conllFormat, unescapeFormats=unescapeFormats,
                                     tokenMerging=tokenMerging, sdFailedFormat=sdFailedFormat, posTags=posTags)
        if not matchFound:
            counts["document-no-match"] += 1
    if len(typeCounts["sentence-splitting"]) > 0:
        print >> sys.stderr, "Sentence Splitting Counts", dict(typeCounts["sentence-splitting"])
    print >> sys.stderr, "Counts", dict(counts)
    for ext in extensions:
        if len(typeCounts[ext]) > 0:
            print >> sys.stderr, "Counts for type '" + ext + "':", dict(typeCounts[ext])
    # Write the output XML file
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def processCorpus(input, outDir, stem=None, tail=".xml", mergedSets=[], saveCombined=False, verbose=False):
    newCorpora = {}
    print >> sys.stderr, "Loading corpus file", input
    corpusRoot = ETUtils.ETFromObj(input).getroot()
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {}
    for document in documents:
        counter.update()
        docSet = document.get("set")
        if docSet == None:
            if verbose:
                print >> sys.stderr, "Warning, no set defined for document", document.get("id")
            if not countsByType.has_key("No set"):
                countsByType["No set"] = 0
            countsByType["No set"] += 1
            continue
        elif not newCorpora.has_key(docSet):
            newCorpora[docSet] = ET.Element("corpus")
            for k, v in corpusRoot.attrib.iteritems():
                newCorpora[docSet].set(k, v)
            countsByType[docSet] = 0
        newCorpora[docSet].append(document)
        countsByType[docSet] += 1
    # Make merged sets
    for mergedSet in mergedSets:
        tag = "-and-".join(sorted(mergedSet))
        if not newCorpora.has_key(tag):
            newCorpora[tag] = ET.Element("corpus")
            for k, v in corpusRoot.attrib.iteritems():
                newCorpora[tag].set(k, v)
            countsByType[tag] = 0
        for componentSet in mergedSet:
            for element in newCorpora[componentSet].findall("document"):
                newCorpora[tag].append(element)
                countsByType[tag] += 1
    print >> sys.stderr, "Documents per set"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, " " + str(k) + ":", countsByType[k]
    if stem == None:
        outDir, stem = os.path.dirname(outDir), os.path.basename(outDir)
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    print >> sys.stderr, "Writing output files to directory", outDir
    if saveCombined:
        print >> sys.stderr, "Saving combined input to", stem + tail
        ETUtils.write(corpusRoot, stem + tail)
    else:
        print >> sys.stderr, "Combined input not saved"
    for docSet in sorted(newCorpora.keys()):
        outFilename = os.path.join(outDir, stem + "-" + docSet + tail)
        print >> sys.stderr, "Writing set", docSet, "to", outFilename
        ETUtils.write(newCorpora[docSet], outFilename)
def addMTMX(input, mtmxDir, output=None):
    from collections import defaultdict
    # read interaction XML
    print "Reading interaction XML"
    counts = defaultdict(int)
    xml = ETUtils.ETFromObj(input).getroot()
    docById = {}
    for document in xml.getiterator("document"):
        docId = document.get("origId")
        assert docId not in docById
        docById[docId] = document
        counts["document"] += 1
    for entity in xml.getiterator("entity"):
        counts["entity"] += 1
    # read MTMX files
    print "Processing MTMX"
    for filename in sorted(os.listdir(mtmxDir)):
        if filename.endswith(".xml"):
            print >> sys.stderr, filename,
            fileId = filename.split("_")[0]
            if fileId not in docById:
                print >> sys.stderr, "skipped"
                continue
            else:
                print >> sys.stderr, "processing"
            doc = docById[fileId]
            entityByOrigId = {}
            for entity in doc.getiterator("entity"):
                assert entity.get("origId") not in entityByOrigId, entity.get("origId")
                entityByOrigId[entity.get("origId")] = entity
            mtmx = ETUtils.ETFromObj(os.path.join(mtmxDir, filename)).getroot()
            for phrase in mtmx.getiterator("PHRASE"):
                if phrase.get("ID") in entityByOrigId:
                    entity = entityByOrigId[phrase.get("ID")]
                    mapCount = 0
                    for map in phrase.getiterator("MAP"):
                        if (map.get("NAME").lower() == entity.get("text").lower()) or (map.get("NAME_SHORT").lower() == entity.get("text").lower()):
                            if entity.get("mtmxProb") != None:
                                if int(entity.get("mtmxProb")) > int(map.get("PROB")):
                                    break
                                else:
                                    counts["mapped-multi"] += 1
                                    counts["mapped-multi-"+str(mapCount)] += 1
                                    #print filename, phrase.get("ID")
                            else:
                                counts["mapped-at-least-once"] += 1
                            entity.set("mtmxProb", str(map.get("PROB")))
                            entity.set("mtmxCui", str(map.get("CUI")))
                            entity.set("mtmxName", str(map.get("NAME")))
                            entity.set("mtmxNameShort", str(map.get("NAME_SHORT")))
                            entity.set("mtmxSemTypes", str(map.get("SEMTYPES")))
                            counts["mappings"] += 1
                            mapCount += 1
    print >> sys.stderr, counts
    if output != None:
        ETUtils.write(xml, output)
def convertDDI13(outDir, downloadDir=None, datasets=["DDI13_TRAIN", "DDI13_TEST_TASK_9.1", "DDI13_TEST_TASK_9.2"],
                 redownload=False, insertParses=True, parse=False, makeIntermediateFiles=True, debug=False):
    cwd = os.getcwd()
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI13-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'13 corpus", "======================="
    tempdir = tempfile.mkdtemp()
    downloaded = downloadFiles(downloadDir, tempdir, redownload)
    for dataset in datasets:
        corpusTree = getCorpusXML()
        xml = corpusTree.getroot()
        print >> sys.stderr, "Merging input XMLs"
        assert downloaded[dataset] != None
        combineXML(xml, "train", downloaded[dataset], subDirs=["DrugBank", "MedLine", "NER"])
        print >> sys.stderr, "Processing elements"
        processElements(xml)
        if dataset == "DDI13_TRAIN":
            print >> sys.stderr, "Dividing training set into folds"
            divideSets(xml, "train", 10)
        else:
            for doc in xml.getiterator("document"):
                doc.set("set", "test")
        if parse:
            print >> sys.stderr, "Parsing"
            parseXML(corpusTree, os.path.join(tempdir, "parsing"), debug)
        elif insertParses:
            assert parse == False
            print >> sys.stderr, "Inserting McCC parses"
            Tools.BLLIPParser.insertParses(corpusTree, downloaded[dataset + "_TEES_PARSES"], None, extraAttributes={"source":"TEES"})
            print >> sys.stderr, "Inserting Stanford conversions"
            Tools.StanfordParser.insertParses(corpusTree, downloaded[dataset + "_TEES_PARSES"], None, extraAttributes={"stanfordSource":"TEES"})
        # Check what was produced by the conversion
        print >> sys.stderr, "---------------", "Corpus Structure Analysis", "---------------"
        analyzer = StructureAnalyzer()
        analyzer.analyze([xml])
        print >> sys.stderr, analyzer.toString()
        if "9.1" in dataset:
            outFileName = os.path.join(outDir, "DDI13-test-task9.1.xml")
        elif "9.2" in dataset:
            outFileName = os.path.join(outDir, "DDI13-test-task9.2.xml")
        else:
            outFileName = os.path.join(outDir, "DDI13-train.xml")
        print >> sys.stderr, "Writing output to", outFileName
        ETUtils.write(xml, outFileName)
    Stream.closeLog(logFileName)
    if not debug and tempdir != None:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)
def findHeads(input, parse, tokenization=None, output=None, removeExisting=True, iterate=False):
    if iterate:
        from Utils.ProgressCounter import ProgressCounter
        import InteractionXML.SentenceElements as SentenceElements
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        print >> sys.stderr, "Removing existing head offsets"
        removeCount = 0
        counter = ProgressCounter(None, "Find heads")
        counter.showMilliseconds = True
        for sentences in SentenceElements.getCorpusIterator(input, output, parse, tokenization):
            for sentence in sentences:
                if removeExisting:
                    for e in sentence.sentence.findall("entity"):
                        if e.get("headOffset") != None:
                            removeCount += 1
                            del e.attrib["headOffset"]
                graph = SentenceGraph.SentenceGraph(sentence.sentence, sentence.tokens, sentence.dependencies)
                graph.mapInteractions(sentence.entities, sentence.interactions)
                # Make sure every parse gets head scores
                #if graph.tokenHeadScores == None:
                #    graph.getTokenHeadScores()
            counter.update(len(sentences), "Finding heads ("+sentences[-1].sentence.get("id")+"): ")
        print >> sys.stderr, "Removed head offsets from", removeCount, "entities"
    else:
        xml = ETUtils.ETFromObj(input)
        if removeExisting:
            print >> sys.stderr, "Removing existing head offsets"
            removeCount = 0
            xml = ETUtils.ETFromObj(input)
            for d in xml.getroot().findall("document"):
                for s in d.findall("sentence"):
                    for e in s.findall("entity"):
                        if e.get("headOffset") != None:
                            removeCount += 1
                            del e.attrib["headOffset"]
            print >> sys.stderr, "Removed head offsets from", removeCount, "entities"
        # SentenceGraph automatically calculates head offsets and adds them to entities if they are missing
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        corpusElements = SentenceGraph.loadCorpus(xml, parse, tokenization)
        # Make sure every parse gets head scores
        for sentence in corpusElements.sentences:
            if sentence.sentenceGraph == None:
                continue
            if sentence.sentenceGraph.tokenHeadScores == None:
                sentence.sentenceGraph.getTokenHeadScores()
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusElements.rootElement, output)
        return xml
def validateCorpus(input, output, strict=True):
    print >> sys.stderr, "Validating XML"
    print >> sys.stderr, "Loading corpus file", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()
    counts = validate(corpusRoot, strict)
    print >> sys.stderr, "Corpus validated:", dict(counts)
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def makeConfigXML(workdir, bannerDir, oldVersion=True):
    conf = ET.Element("banner-configuration")
    banner = ET.SubElement(conf, "banner")
    eval = ET.SubElement(banner, "eval")
    datasetName = ET.SubElement(eval, "datasetName").text = "banner.eval.dataset.BC2GMDataset"
    # Dataset
    dataset = ET.SubElement(eval, "dataset")
    ET.SubElement(dataset, "sentenceFilename").text = workdir + "/input.txt"
    ET.SubElement(dataset, "mentionTestFilename").text = workdir + "/empty.eval"
    ET.SubElement(dataset, "mentionAlternateFilename").text = workdir + "/empty.eval"
    codecs.open(os.path.join(workdir, "empty.eval"), "wt", "utf-8").close()
    # More eval level stuff
    ET.SubElement(eval, "idInputFilename").text = workdir + "/ids.txt"
    ET.SubElement(eval, "rawInputFilename").text = workdir + "/raw.txt"
    ET.SubElement(eval, "trainingInputFilename").text = workdir + "/training.txt"
    ET.SubElement(eval, "outputFilename").text = workdir + "/output.txt"
    codecs.open(os.path.join(workdir, "output.txt"), "wt", "utf-8").close()
    ET.SubElement(eval, "inContextAnalysisFilename").text = workdir + "/contextAnalysis.html"
    ET.SubElement(eval, "mentionFilename").text = workdir + "/mention.txt"
    ET.SubElement(eval, "modelFilename").text = bannerDir + "/output/model_BC2GM.bin"
    ET.SubElement(eval, "lemmatiserDataDirectory").text = bannerDir + "/nlpdata/lemmatiser"
    ET.SubElement(eval, "posTaggerDataDirectory").text = bannerDir + "/nlpdata/tagger"
    ET.SubElement(eval, "posTagger").text = "dragon.nlp.tool.HeppleTagger"
    ET.SubElement(eval, "tokenizer").text = "banner.tokenization.SimpleTokenizer"
    ET.SubElement(eval, "useParenthesisPostProcessing").text = "true"
    ET.SubElement(eval, "useLocalAbbreviationPostProcessing").text = "true"
    ET.SubElement(eval, "useNumericNormalization").text = "true"
    ET.SubElement(eval, "tagFormat").text = "IOB"
    ET.SubElement(eval, "crfOrder").text = "2"
    if not oldVersion:
        ET.SubElement(eval, "mentionTypes").text = "Required"
        ET.SubElement(eval, "sameTypeOverlapOption").text = "Exception"
        ET.SubElement(eval, "differentTypeOverlapOption").text = "Exception"
        ET.SubElement(eval, "dictionaryTagger").text = "banner.tagging.dictionary.DictionaryTagger"
    # End eval element
    tagging = ET.SubElement(banner, "tagging")
    dictionary = ET.SubElement(tagging, "dictionary")
    dictionaryTagger = ET.SubElement(dictionary, "DictionaryTagger")
    ET.SubElement(dictionaryTagger, "filterContainedMentions").text = "true"
    ET.SubElement(dictionaryTagger, "normalizeMixedCase").text = "false"
    ET.SubElement(dictionaryTagger, "normalizeDigits").text = "false"
    ET.SubElement(dictionaryTagger, "canonize").text = "false"
    ET.SubElement(dictionaryTagger, "generate2PartVariations").text = "true"
    ET.SubElement(dictionaryTagger, "dropEndParentheticals").text = "false"
    ET.SubElement(dictionaryTagger, "dictionaryFile").text = bannerDir + "/dict/single.txt"
    ET.SubElement(dictionaryTagger, "dictionaryType").text = "GENE"
    # Write to file
    filename = workdir + "/banner_config.xml"
    ETUtils.write(conf, workdir + "/banner_config.xml")
    return workdir + "/banner_config.xml"
def negateEvents(input, output=None, verbose=False):
    if not (ET.iselement(input) and input.tag == "sentence"):
        print >> sys.stderr, "Loading corpus file", input
        corpusTree = ETUtils.ETFromObj(input)
        corpusRoot = corpusTree.getroot()
    if not (ET.iselement(input) and input.tag == "sentence"):
        sentences = corpusRoot.getiterator("sentence")
    else:
        sentences = [input]
    counts = defaultdict(int)
    for sentence in sentences:
        for entity in sentence.findall("entity"):
            counts["all-entities"] += 1
            eType = entity.get("type")
            if not isNegatableEPITrigger(eType):
                counts["out-of-scope"] += 1
                continue
            eBaseType = getEPIBaseType(eType)
            eText = entity.get("text").lower()
            eNewType = determineNewType(eType, eText)
            # Insert changed charOffset
            counts["entities"] += 1
            if verbose:
                print "Entity", entity.get("id"), [entity.get("text")], [eType, eBaseType, eNewType],
            if eNewType != eBaseType:
                counts["negated"] += 1
                if verbose:
                    print "NEGATED",
            if eNewType == eType:
                counts["correct"] += 1
                if verbose:
                    print "CORRECT"
            else:
                counts["incorrect"] += 1
                if eNewType == eBaseType:
                    counts["incorrect-pos"] += 1
                else:
                    counts["incorrect-neg"] += 1
                if verbose:
                    print "INCORRECT"
            entity.set("type", eNewType)
    if verbose:
        print counts
    if not (ET.iselement(input) and input.tag == "sentence"):
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusRoot, output)
        return corpusTree
def mergeCorpora(corpusIds, outputId, inputDir, outDir):
    merged = Catenate.catenateElements(corpusIds, inputDir)
    for dataSet in ("devel", "train"):
        renameElements(merged[dataSet].getroot(), {"Localization":"Lives_In",
                                                   "Host":"Habitat",
                                                   "HostPart":"Habitat",
                                                   "Food":"Habitat",
                                                   "Soil":"Habitat",
                                                   "Medical":"Habitat",
                                                   "Water":"Habitat",
                                                   "Bacterium":"Bacteria"})
        DeleteElements.removeElements(merged[dataSet].getroot(), {"interaction":{"type":"PartOf"}})
        if outDir != None:
            outPath = os.path.join(outDir, outputId + "-" + dataSet + ".xml")
            print "Writing set", dataSet, "to", outPath
            ETUtils.write(merged[dataSet].getroot(), outPath)
def processCorpus(inputFilename, outputFilename, rules, reverse=False):
    print >> sys.stderr, "Deleting elements, rules =", rules
    print >> sys.stderr, "Loading corpus file", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()
    countsByType = defaultdict(int)
    removeElements(corpusRoot, rules, reverse, countsByType)
    print >> sys.stderr, "Deleted elements"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, " " + k + ":", countsByType[k]
    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
def mergeSets(input, corpusDir=None, output=None, allowNone=False):
    # Find the files
    if isinstance(input, dict):
        filenames = [{"path":input[x], "set":x} for x in input]
    else:
        if corpusDir == None:
            if os.path.dirname(input):
                corpusDir = os.path.dirname(input)
                input = os.path.basename(input)
            else:
                corpusDir = os.path.normpath(Settings.DATAPATH + "/corpora")
        print >> sys.stderr, "Searching for corpus files at " + corpusDir + " using pattern " + input
        filenames = [{"path":os.path.join(corpusDir, x), "set":None} for x in getMatchingFiles(input, corpusDir)]
    # Merge the files
    print >> sys.stderr, "Merging input files", filenames
    if len(filenames) == 0:
        if allowNone:
            print >> sys.stderr, "Nothing to merge"
            return
        else:
            raise Exception("No input files found for merging")
    newRoot = None
    counts = defaultdict(int)
    for filename in filenames:
        print >> sys.stderr, "Merging file", filename["path"]
        xml = ETUtils.ETFromObj(filename["path"]).getroot()
        if newRoot == None:
            newRoot = ET.Element("corpus", xml.attrib)
        else:
            assert newRoot.attrib == xml.attrib
        for doc in xml.iter("document"):
            assert doc.get("set") != None, doc.attrib
            if filename["set"] != None:
                assert filename["set"] == doc.get("set")
            counts["set=" + doc.get("set")] += 1
            counts["set(" + filename["path"] + ")=" + doc.get("set")] += 1
        for element in xml:
            newRoot.append(element)
    print >> sys.stderr, dict(counts)
    if output != None:
        print "Writing merged corpus to", output
        ETUtils.write(newRoot, output)
    return ET.ElementTree(newRoot)
def mixSets(input, output, docOrigIds, sourceSet, targetSet):
    print >> sys.stderr, "Mixing Sets", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()
    if docOrigIds != None:
        for document in corpusRoot.getiterator("document"):
            docId = document.get("pmid")
            if docId == None:
                docId = document.get("origId")
            if docId in docOrigIds:
                assert document.get("set") == sourceSet
                document.set("set", targetSet)
                docOrigIds.remove(docId)
        assert len(docOrigIds) == 0, docOrigIds
    sentenceIds = None
    if sentenceIds != None:
        for document in corpusRoot.getiterator("document"):
            removed = []
            for sentence in document.findall("sentence"):
                assert document.get("set") == sourceSet
                sentenceId = sentence.get("id")
                if sentenceId in sentenceIds:
                    document.remove(sentence)
                    removed.append(sentence)
                    sentenceIds.remove(sentenceId)
            if len(removed) > 0:
                newDoc = ET.Element("document")
                for attr in document.attrib:
                    newDoc.set(attr, document.get(attr))
                newDoc.set("id", None)
                newDoc.set("set", targetSet)
                for sentence in removed:
                    newDoc.append(sentence)
                corpusRoot.append(newDoc)
        assert len(sentenceIds) == 0
    RecalculateIds.recalculateIds(corpusTree, onlyWithinSentence=False)
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def convert(inPath, outDir, corpusId, directed, negatives, preprocess, preprocessorParameters=None,
            debug=False, clear=False, constParser="BLLIP-BIO", depParser="STANFORD-CONVERT", logging=True):
    assert negatives in ("INCLUDE", "SKIP", "REVERSE_POS")
    # Download the corpus if needed
    if inPath == None:
        if not hasattr(Settings, "SE10T8_CORPUS"):
            SemEval2010Task8Tools.install()
        inPath = Settings.SE10T8_CORPUS
    assert os.path.exists(inPath)
    # Prepare the output directory
    if not os.path.exists(outDir):
        print "Making output directory", outDir
        os.makedirs(outDir)
    elif clear:
        print "Removing output directory", outDir
        shutil.rmtree(outDir)
    # Start logging
    if logging:
        Stream.openLog(os.path.join(outDir, "log.txt"), clear=clear)
    # Read and process the corpus files
    archive = zipfile.ZipFile(inPath, 'r')
    usedIds = set()
    tree = None
    for fileName, setName in [("SemEval2010_task8_all_data/SemEval2010_task8_training/TRAIN_FILE.TXT", "train"),
                              ("SemEval2010_task8_all_data/SemEval2010_task8_testing_keys/TEST_FILE_FULL.TXT", "test")]:
        print "Processing file", fileName, "as set", setName
        f = archive.open(fileName)
        tree = processLines(f.readlines(), setName, directed=directed, negatives=negatives, usedIds=usedIds, tree=tree, corpusId=corpusId)
        f.close()
    # Divide the training set into training and development sets
    MakeSets.processCorpus(tree, None, "train", [("train", 0.7), ("devel", 1.0)], 1)
    # Write out the converted corpus
    convertedPath = os.path.join(outDir, corpusId + "-converted.xml")
    ETUtils.write(tree.getroot(), convertedPath)
    # Preprocess the converted corpus
    if preprocess:
        outPath = os.path.join(outDir, corpusId + ".xml")
        preprocessor = Preprocessor(constParser, depParser)
        preprocessor.setArgForAllSteps("debug", debug)
        preprocessor.stepArgs("CONVERT")["corpusName"] = corpusId
        preprocessor.process(convertedPath, outPath, preprocessorParameters, omitSteps=["SPLIT-SENTENCES", "NER", "SPLIT-NAMES"])
    # Stop logging
    if logging:
        Stream.closeLog(os.path.join(outDir, "log.txt"))
def parse(self, parserName, input, output=None, debug=False, reparse=False, syntaxNetDir=None, modelDir=None):
    # Run the parser process
    if syntaxNetDir == None:
        syntaxNetDir = Settings.SYNTAXNET_DIR
    corpusTree, corpusRoot = self.getCorpus(input)
    workdir = tempfile.mkdtemp()
    inPath = self.makeInputFile(corpusRoot, workdir)
    outPath = ProcessUtils.runSentenceProcess(self.run, syntaxNetDir, inPath, workdir, True,
                                              "SyntaxNetParser", "Parsing",
                                              processArgs={"modelDir":modelDir})
    self.insertCoNLLParses(outPath, corpusRoot, parserName, unescaping=True, conllFormat="conllx")
    # Remove work directory
    if not debug:
        shutil.rmtree(workdir)
    else:
        print >> sys.stderr, "Parser IO files at", workdir
    # Write the output XML file
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def processCorpus(input, output, rules):
    print >> sys.stderr, "Deleting attributes, rules =", rules
    print >> sys.stderr, "Loading corpus file", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()
    countsByType = {}
    for key in sorted(rules.keys()):
        for attribute in rules[key]:
            countsByType[key + ":" + attribute] = 0
        removeAttributes(corpusRoot, key, rules[key], countsByType)
    print >> sys.stderr, "Removed"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, " " + k + ":", countsByType[k]
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def processCorpus(input, output, rules):
    if rules == None:
        raise Exception("No mapping rules defined")
    elif isinstance(rules, basestring):
        rules = eval(rules)
    print >> sys.stderr, "Mapping attributes, rules =", rules
    print >> sys.stderr, "Loading corpus file", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()
    counts = defaultdict(int)
    for key in sorted(rules.keys()):
        mapAttributes(corpusRoot, key, rules[key], counts)
    print >> sys.stderr, "Mapped", dict(counts)
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def toInteractionXML(documents, corpusName="CORPUS", output=None):
    corpusRoot = ET.Element("corpus")
    corpusRoot.set("source", corpusName)
    docCounter = 0
    for doc in documents:
        docEl = addDocumentElement(doc, corpusRoot, docCounter, corpusName)
        docCounter += 1
        # prepare mapping structures
        tMap = {}
        eventMap = {}
        for event in doc.events:
            eventMap[event.id] = event
        # write elements
        addEntityElements(doc, docEl, tMap, eventMap)
        addInteractionElements(doc, docEl, tMap)
        addParseElements(doc, docEl)
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return ET.ElementTree(corpusRoot)
def splitMergedElements(inputFilename, outputFilename=None):
    print >> sys.stderr, "##### Split elements with merged types #####"
    print >> sys.stderr, "Loading corpus", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {"entity":[0,0], "interaction":[0,0], "pair":[0,0]}
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, countsByType)
    print >> sys.stderr, "Results"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, " " + k + ": removed", countsByType[k][0], "created", countsByType[k][1]
    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
def processCorpus(inputFilename, outputFilename, rules):
    print >> sys.stderr, "Loading corpus file", inputFilename
    if inputFilename.rsplit(".", 1)[-1] == "gz":
        import gzip
        corpusTree = ET.parse(gzip.open(inputFilename))
    else:
        corpusTree = ET.parse(inputFilename)
    corpusRoot = corpusTree.getroot()
    countsByType = {}
    for key in sorted(rules.keys()):
        for attribute in rules[key]:
            countsByType[key + ":" + attribute] = 0
        removeAttributes(corpusRoot, key, rules[key], countsByType)
    print >> sys.stderr, "Removed"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, " " + k + ":", countsByType[k]
    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
def mergeSentences(input, output, verbose=False):
    print >> sys.stderr, "Merging sentences into documents"
    print >> sys.stderr, "Loading corpus file", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()
    counts = defaultdict(int)
    for document in corpusRoot.findall("document"):
        counts["documents"] += 1
        # Check that the document has only sentence elements as children
        children = [x for x in document]
        docChildTypes = sorted(set([x.tag for x in children]))
        if len(docChildTypes) == 0:
            counts["documents-with-no-sentences"] += 1
            continue
        elif len(docChildTypes) > 1 or docChildTypes[0] != "sentence":
            raise Exception("Document '" + str(document.get("id")) + "' has non-sentence children: " + str(docChildTypes))
        # Process all the child sentence elements
        docId = document.get("id")
        interactions = []
        entities = []
        entityById = {}
        interactionById = {}
        combinedText = ""
        calculatedOffset = (0, 0)
        for sentence in children:
            document.remove(sentence)
            sentenceText = sentence.get("head", "") + sentence.get("text", "") + sentence.get("tail", "")
            sentOffset = sentence.get("charOffset")
            if sentence == children[0]:
                noDefinedOffsets = sentOffset == None
            elif (sentOffset == None) != noDefinedOffsets:
                raise Exception("Only some sentences in document '" + docId + "' have defined offsets")
            if sentOffset == None:
                if sentence != children[-1]:
                    sentenceText = sentenceText + " "
                calculatedOffset = (calculatedOffset[1], calculatedOffset[1] + len(sentenceText))
                sentOffset = calculatedOffset
            else:
                sentOffset = Range.charOffsetToSingleTuple(sentOffset)
            combinedText += sentenceText
            # Collect and update the entity elements
            for entity in sentence.findall("entity"):
                # Map sentence-level entity offsets to document level
                for offsetKey in ("charOffset", "headOffset"):
                    if entity.get(offsetKey) != None:
                        offset = Range.charOffsetToTuples(entity.get(offsetKey))
                        for i in range(len(offset)):
                            offset[i] = (offset[i][0] + sentOffset[0], offset[i][1] + sentOffset[0])
                        entity.set(offsetKey, Range.tuplesToCharOffset(offset))
                # Compare mapped offsets to origOffset, if available
                if entity.get("origOffset") != None:
                    if entity.get("charOffset") != entity.get("origOffset"):
                        raise Exception("Document '" + str(document.get("id")) + "' entity '" + str(entity.get("id")) + "' new charOffset differs from origOffset: " + str([entity.get("charOffset"), entity.get("origOffset")]))
                    counts["checked-origOffsets"] += 1
                    del entity.attrib["origOffset"]
                assert entity.get("id") not in entityById
                entityById[entity.get("id")] = entity # For re-mapping the interaction 'e1' and 'e2' attributes
                entities.append(entity)
                counts["moved-entities"] += 1
            # Collect and update the interaction elements
            for interaction in sentence.findall("interaction"):
                assert interaction.get("id") not in interactionById
                interactionById[interaction.get("id")] = interaction # For re-mapping the interaction 'siteOf' attributes
                interactions.append(interaction)
                counts["moved-interactions"] += 1
        # Check that the combined sentence text matches the document text, if available
        if document.get("text") != None and document.get("text") != combinedText:
            if combinedText == document.get("text")[0:len(combinedText)] and document.get("text")[len(combinedText):].strip() == "":
                if verbose:
                    print >> sys.stderr, "Warning, document '" + document.get("id") + "' text has trailing whitespace not included in the combined sentence text"
                combinedText = document.get("text")
                counts["missing-trailing-whitespace"] += 1
            else:
                raise Exception("Document '" + str(document.get("id")) + "' text differs from combined sentence text: " + str([document.get("text"), combinedText]))
            counts["checked-document-texts"] += 1
        # Check that the entities' texts match the document text
        for entity in entities:
            offset = Range.charOffsetToTuples(entity.get("charOffset"))
            if len(offset) == 1: # Compare only continuous entities
                if not Range.contains((0, len(combinedText)), offset[0]):
                    raise Exception("Document '" + str(document.get("id")) + "' entity '" + str(entity.get("id")) + "' offset is not contained in combined sentence text: " + str([entity.attrib, offset, [0, len(combinedText)], combinedText]))
                combTextSpan = combinedText[offset[0][0]:offset[0][1]]
                if entity.get("text") != combTextSpan:
                    raise Exception("Document '" + str(document.get("id")) + "' entity '" + str(entity.get("id")) + "' text does not match combined sentence text: " + str([entity.get("text"), combTextSpan]))
                counts["checked-charOffsets"] += 1
        # Set the combined text as the document text
        document.set("text", combinedText)
        # Update entity and interaction ids (not done earlier so that possible error messages will refer to original ids, also because of siteOf-remapping)
        for i in range(len(entities)):
            entities[i].set("id", docId + ".e" + str(i)) # Update the id for the document level
        for i in range(len(interactions)):
            interactions[i].set("id", docId + ".i" + str(i)) # Update the id for the document level
        # Update interaction e1 and e2 ids (cannot be done earlier because interactions may refer to entities from multiple sentences)
        for i in range(len(interactions)):
            interaction = interactions[i]
            for entKey in ("e1", "e2"):
                interaction.set(entKey, entityById[interaction.get(entKey)].get("id"))
            if interaction.get("siteOf") != None:
                interaction.set("siteOf", interactionById[interaction.get("siteOf")].get("id"))
        # Add the entity and interaction elements to the document
        document.extend(entities)
        document.extend(interactions)
    print >> sys.stderr, "Counts:", dict(counts)
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def writeXML(self, examples, predictions, corpus, outputFile, classSet=None, parse=None,
             tokenization=None, goldCorpus=None, exampleStyle=None, structureAnalyzer=None):
    """
    Writes task 3 examples to interaction XML. Assumes task 3 classification
    is done with the SVMMulticlass Classifier, used for two classes.
    """
    print >> sys.stderr, "Adding task 3 to Interaction XML"
    examples, predictions = self.loadExamples(examples, predictions)
    if type(classSet) == types.StringType: # class names are in file
        classSet = IdSet(filename=classSet)
    classIds = None
    if classSet != None:
        classIds = classSet.getIds()
    corpusTree = ETUtils.ETFromObj(corpus)
    corpusRoot = corpusTree.getroot()
    # Determine subtask
    task3Type = None
    for example in examples:
        assert example[3].has_key("t3type")
        task3Type = example[3]["t3type"]
        break
    if task3Type == None:
        if outputFile != None:
            print >> sys.stderr, "Writing corpus to", outputFile
            ETUtils.write(corpusRoot, outputFile)
        return corpusTree
    assert task3Type in ["multiclass", "speculation", "negation"]
    # Remove the task 3 subtask information if it already exists
    for entity in corpusRoot.getiterator("entity"):
        if task3Type == "multiclass":
            entity.set("speculation", "False")
            entity.set("negation", "False")
        elif task3Type == "speculation":
            entity.set("speculation", "False")
        else: # task3Type == "negation"
            entity.set("negation", "False")
    specMap = {}
    negMap = {}
    for example, prediction in itertools.izip(examples, predictions):
        assert example[3]["xtype"] == "task3"
        if example[3]["t3type"] == "multiclass":
            if isinstance(prediction, dict):
                encoded = prediction["prediction"]
                predictedModifiers = [classSet.getName(i) for i in range(len(encoded)) if encoded[i] == 1]
            else:
                predictedClassName = classSet.getName(prediction[0])
                predictedModifiers = ""
                if predictedClassName != "neg":
                    predictedModifiers = predictedClassName.split("---")
            if "negation" in predictedModifiers:
                assert not negMap.has_key(example[3]["entity"])
                negMap[example[3]["entity"]] = (True, prediction)
            if "speculation" in predictedModifiers:
                assert not specMap.has_key(example[3]["entity"])
                specMap[example[3]["entity"]] = (True, prediction)
        else:
            if example[3]["t3type"] == "speculation":
                map = specMap
            else:
                map = negMap
            if prediction[0] != 1:
                assert not map.has_key(example[3]["entity"])
                map[example[3]["entity"]] = (True, prediction)
            else:
                assert not map.has_key(example[3]["entity"])
                map[example[3]["entity"]] = (False, prediction)
    for entity in corpusRoot.getiterator("entity"):
        eId = entity.get("id")
        if task3Type == "multiclass":
            if specMap.has_key(eId):
                entity.set("speculation", str(specMap[eId][0]))
                entity.set("modConf", self.getPredictionStrengthString(specMap[eId][1], classSet, classIds))
            if negMap.has_key(eId):
                entity.set("negation", str(negMap[eId][0]))
                entity.set("modConf", self.getPredictionStrengthString(negMap[eId][1], classSet, classIds))
        else:
            if task3Type == "speculation":
                if specMap.has_key(eId):
                    entity.set("speculation", str(specMap[eId][0]))
                    entity.set("specConf", self.getPredictionStrengthString(specMap[eId][1], classSet, classIds, [""]))
            elif task3Type == "negation":
                if negMap.has_key(eId):
                    entity.set("negation", str(negMap[eId][0]))
                    entity.set("negConf", self.getPredictionStrengthString(negMap[eId][1], classSet, classIds, ["", "speculation"]))
    # Write corpus
    if outputFile != None:
        print >> sys.stderr, "Writing corpus to", outputFile
        ETUtils.write(corpusRoot, outputFile)
    return corpusTree
optparser.add_option("-d", "--debug", default=False, action="store_true", dest="debug", help="Debug mode") optparser.add_option("-v", "--validate", default=None, dest="validate", help="validate input", metavar="FILE") (options, args) = optparser.parse_args() s = StructureAnalyzer() if options.load: s.load(None, options.input) else: s.analyze(options.input.split(",")) print >> sys.stderr, "--- Structure Analysis ----" print >> sys.stderr, s.toString() if options.validate != None: print >> sys.stderr, "--- Validation ----" xml = ETUtils.ETFromObj(options.validate) s.validate(xml, simulation=False, debug=options.debug) if options.output != None: ETUtils.write(xml, options.output) elif options.output != None: print >> sys.stderr, "Structure analysis saved to", options.output s.save(None, options.output)
def mainFunc(input, output=None, parseName="McCC", tokenizationName=None, newParseName=None, newTokenizationName=None, logFileName=None, removeOld=True):
    print >> sys.stderr, "Protein Name Splitter"
    if logFileName != None:
        print >> sys.stderr, "Writing log to", logFileName
        logFile = open(logFileName, "wt")
    else:
        logFile = None
    #if input.endswith(".gz"):
    #    inFile = gzip.GzipFile(input)
    #else:
    #    inFile = open(input)
    tree = ETUtils.ETFromObj(input)

    if tokenizationName == None:
        tokenizationName = parseName

    #tree = ElementTree.parse(inFile)
    root = tree.getroot()

    sentences = [x for x in root.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "Split Protein Names")
    counter.showMilliseconds = True
    missingTokCount = 0
    for sentence in sentences:
        sId = sentence.get("id")
        counter.update(1, "Splitting names (" + sId + "): ")

        tok = getTokenization(tokenizationName, sentence, sId, remove=removeOld)
        if tok == None:
            missingTokCount += 1
            continue
        assert tok is not None, "Missing tokenization '%s' in sentence %s!" % (tokenizationName, sId)

        parse = getParse(parseName, tokenizationName, sentence, sId, remove=removeOld)
        assert parse is not None, "Missing parse '%s' in sentence %s!" % (parseName, sId)

        split = splitTokens(tok, sentence, logFile)

        # Default names
        if removeOld:
            if newTokenizationName == None:
                newTokenizationName = tok.get("tokenizer")
            if newParseName == None:
                newParseName = parse.get("parser")
        else:
            if newTokenizationName == None:
                newTokenizationName = "split-" + tok.get("tokenizer")
            if newParseName == None:
                newParseName = "split-" + parse.get("parser")

        # add a new tokenization with the split tokens.
        splittok = addTokenization(newTokenizationName, sentence, sId)
        addTokensToTree(split, splittok)
        for a in tok.attrib:
            if splittok.get(a) == None:
                splittok.set(a, tok.get(a))
        #splittok.set("split-")

        # make a mapping from original to split token ids. Store the
        # head token when given.
        tokenIdMap = {}
        for t in split:
            if t.head:
                head = t.head
                # traverse
                while head.head is not None:
                    assert head.head != t, "Cyclic heads"
                    head = head.head
                # should match (nah, punctuation problems)
                #assert t.origId not in tokenIdMap or tokenIdMap[t.origId] == head.id, "Head conflict"
                tokenIdMap[t.origId] = head.id
            else:
                # only allow overwrite of existing entry if the current token
                # is not punctuation.
                if t.origId not in tokenIdMap or not t.isPunct():
                    tokenIdMap[t.origId] = t.id

        # make a copy of the specified parse that refers to the split tokens
        # instead of the originals.
        newparse = addParse(newParseName, newTokenizationName, sentence, sId)
        for a in parse.attrib:
            if newparse.get(a) == None:
                newparse.set(a, parse.get(a))
        newparse.set("ProteinNameSplitter", "True")
        splittok.set("ProteinNameSplitter", "True")

        depSeqId = 0 #1
        for d in parse.getiterator("dependency"):
            t1, t2, dType = d.get("t1"), d.get("t2"), d.get("type")
            assert t1 in tokenIdMap and t2 in tokenIdMap, "INTERNAL ERROR"

            dep = ElementTree.SubElement(newparse, "dependency")
            dep.set("t1", tokenIdMap[t1])
            dep.set("t2", tokenIdMap[t2])
            dep.set("type", dType)
            dep.set("id", "sd_%d" % depSeqId)
            depSeqId += 1

        # Add in new dependencies between the split parts.
        for t in [x for x in split if x.head is not None]:
            dep = ElementTree.SubElement(newparse, "dependency")
            dep.set("t1", t.head.id)
            dep.set("t2", t.id)
            dep.set("type", t.depType)
            dep.set("split", "PNS")
            dep.set("id", "spd_%d" % depSeqId)
            depSeqId += 1

        for phrase in parse.getiterator("phrase"):
            newparse.append(phrase)

        # debugging
        #print >> sys.stderr, "NEW DEP IN", sId

    print >> sys.stderr, "Tokenization missing from", missingTokCount, "sentences"

    #indent(root)
    if logFile != None:
        logFile.close()

    # debugging
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(tree, output)
    return tree
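# Hedged usage sketch, not the module's real command-line handling: drives
# mainFunc above on a single interaction XML file. The file names are
# hypothetical placeholders; parseName="McCC" simply matches the default.
if __name__ == "__main__":
    splitTree = mainFunc("corpus.xml.gz", output="corpus-split.xml.gz",
                         parseName="McCC", logFileName="split.log")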