def convert(corpora, outDir=None, downloadDir=None, redownload=False, makeIntermediateFiles=True, evaluate=False, processEquiv=True, addAnalyses=True):
    global bioNLP13AnalysesTempDir
    if outDir == None: # default output location
        outDir = os.path.normpath(Settings.DATAPATH + "/corpora")
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    else:
        assert os.path.isdir(outDir)
    count = 1
    for corpus in corpora:
        print >> sys.stderr, "=======================", "Converting BioNLP Shared Task", corpus, "corpus ("+str(count)+"/"+str(len(corpora))+")", "======================="
        logFileName = outDir + "/conversion/" + corpus + "-conversion-log.txt"
        Stream.openLog(logFileName)
        downloaded = downloadCorpus(corpus, outDir, downloadDir, redownload)
        packageSubPath = None
        if corpus == "BB13T2":
            packageSubPath = "task_2"
        elif corpus == "BB13T3":
            packageSubPath = "task_3"
        convertDownloaded(outDir, corpus, downloaded, makeIntermediateFiles, evaluate, processEquiv=processEquiv, addAnalyses=addAnalyses, packageSubPath=packageSubPath)
        Stream.closeLog(logFileName)
        count += 1
    if bioNLP13AnalysesTempDir != None:
        shutil.rmtree(bioNLP13AnalysesTempDir)
        bioNLP13AnalysesTempDir = None
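
# Hedged usage sketch: one way the convert() above might be driven. The corpus
# identifiers and output path are illustrative assumptions, not values taken
# from this module.
def _exampleConvertSharedTaskCorpora():
    # Convert two BioNLP Shared Task corpora, reusing already downloaded files
    convert(["GE13", "CG13"], outDir="/tmp/bionlp-corpora", redownload=False, evaluate=False)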
def convertCorpus(corpus, outDir=None, downloadDir=None, redownload=False, removeAnalyses=True, develFraction=0.3, logPath=None):
    assert corpus in PPI_CORPORA
    if logPath == "AUTO":
        logPath = outDir + "/conversion/" + corpus + "-conversion-log.txt" if outDir != None else None
    if logPath:
        Stream.openLog(logPath)
    print >> sys.stderr, "==========", "Converting PPI corpus", corpus, "=========="
    downloaded = downloadCorpus(corpus, outDir, downloadDir, redownload)
    print >> sys.stderr, "---------------", "Updating Interaction XML format", "---------------"
    print >> sys.stderr, "Loading", downloaded[corpus + "_LEARNING_FORMAT"]
    xml = ETUtils.ETFromObj(downloaded[corpus + "_LEARNING_FORMAT"])
    root = xml.getroot()
    updateXML(root, removeAnalyses)
    print >> sys.stderr, "---------------", "Adding sets from the PPI evaluation standard", "---------------"
    addSets(corpus, root, downloaded["PPI_EVALUATION_STANDARD"])
    if develFraction > 0.0:
        print >> sys.stderr, "---------------", "Generating devel set", "---------------"
        MakeSets.processCorpus(xml, None, "train", [("devel", develFraction), ("train", 1.0)], 1)
    if outDir != None:
        print >> sys.stderr, "---------------", "Writing corpus", "---------------"
        #if intermediateFiles:
        #    print >> sys.stderr, "Writing combined corpus"
        #    ETUtils.write(xml, os.path.join(outDir, corpus + ".xml"))
        print >> sys.stderr, "Dividing into sets"
        Utils.InteractionXML.DivideSets.processCorpus(xml, outDir, corpus, ".xml")
    if logPath != None:
        Stream.closeLog(logPath)
    return xml
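
# Hedged usage sketch: converting a single PPI corpus with the default 30%
# devel split. "AIMed" is assumed to be a member of PPI_CORPORA; the output
# path is illustrative.
def _exampleConvertPPICorpus():
    # Returns the corpus as an Interaction XML ElementTree
    return convertCorpus("AIMed", outDir="/tmp/ppi-corpora", develFraction=0.3, logPath="AUTO")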
def convertCorpus(corpus, outDir=None, downloadDir=None, redownload=False, makeIntermediateFiles=True, evaluate=False, processEquiv=True, analysisMode="INSERT", debug=False, preprocessorSteps=None, preprocessorParameters=None, logPath=None):
    global bioNLP13AnalysesTempDir
    print >> sys.stderr, "==========", "Converting BioNLP Shared Task", corpus, "corpus", "=========="
    assert analysisMode in ("AUTO", "INSERT", "BUILD", "SKIP")
    if logPath == "AUTO":
        if outDir != None:
            logPath = outDir + "/conversion/" + corpus + "-conversion-log.txt"
        else:
            logPath = None
    if logPath:
        Stream.openLog(logPath)
    downloaded = downloadCorpus(corpus, outDir, downloadDir, redownload)
    packageSubPath = None
    if corpus == "BB13T2":
        packageSubPath = "task_2"
    elif corpus == "BB13T3":
        packageSubPath = "task_3"
    xml = convertDownloaded(outDir, corpus, downloaded, makeIntermediateFiles, evaluate,
                            processEquiv=processEquiv, analysisMode=analysisMode,
                            packageSubPath=packageSubPath, debug=debug,
                            preprocessorSteps=preprocessorSteps,
                            preprocessorParameters=preprocessorParameters)
    if logPath != None:
        Stream.closeLog(logPath)
    if bioNLP13AnalysesTempDir != None:
        shutil.rmtree(bioNLP13AnalysesTempDir)
        bioNLP13AnalysesTempDir = None
    return xml
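
# Hedged usage sketch: the Shared Task variant of convertCorpus() with an
# explicit analysisMode (one of "AUTO", "INSERT", "BUILD" or "SKIP", as
# asserted above). The corpus id and output path are illustrative assumptions.
def _exampleConvertSharedTaskCorpus():
    # "INSERT" is the default mode shown in the signature above
    return convertCorpus("BB13T2", outDir="/tmp/bionlp13-corpora", analysisMode="INSERT", logPath="AUTO")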
def convertDDI13(outDir, downloadDir=None, datasets=["DDI13_TRAIN", "DDI13_TEST_TASK_9.1", "DDI13_TEST_TASK_9.2"], redownload=False, insertParses=True, parse=False, makeIntermediateFiles=True, debug=False):
    cwd = os.getcwd()
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI13-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'13 corpus", "======================="
    tempdir = tempfile.mkdtemp()
    downloaded = downloadFiles(downloadDir, tempdir, redownload)
    for dataset in datasets:
        corpusTree = getCorpusXML()
        xml = corpusTree.getroot()
        print >> sys.stderr, "Merging input XMLs"
        assert downloaded[dataset] != None
        combineXML(xml, "train", downloaded[dataset], subDirs=["DrugBank", "MedLine", "NER"])
        print >> sys.stderr, "Processing elements"
        processElements(xml)
        if dataset == "DDI13_TRAIN":
            print >> sys.stderr, "Dividing training set into folds"
            divideSets(xml, "train", 10)
        else:
            for doc in xml.getiterator("document"):
                doc.set("set", "test")
        if parse:
            print >> sys.stderr, "Parsing"
            parseXML(corpusTree, os.path.join(tempdir, "parsing"), debug)
        elif insertParses:
            assert parse == False
            print >> sys.stderr, "Inserting McCC parses"
            Tools.BLLIPParser.insertParses(corpusTree, downloaded[dataset + "_TEES_PARSES"], None, extraAttributes={"source":"TEES"})
            print >> sys.stderr, "Inserting Stanford conversions"
            Tools.StanfordParser.insertParses(corpusTree, downloaded[dataset + "_TEES_PARSES"], None, extraAttributes={"stanfordSource":"TEES"})
        # Check what was produced by the conversion
        print >> sys.stderr, "---------------", "Corpus Structure Analysis", "---------------"
        analyzer = StructureAnalyzer()
        analyzer.analyze([xml])
        print >> sys.stderr, analyzer.toString()
        if "9.1" in dataset:
            outFileName = os.path.join(outDir, "DDI13-test-task9.1.xml")
        elif "9.2" in dataset:
            outFileName = os.path.join(outDir, "DDI13-test-task9.2.xml")
        else:
            outFileName = os.path.join(outDir, "DDI13-train.xml")
        print >> sys.stderr, "Writing output to", outFileName
        ETUtils.write(xml, outFileName)
    Stream.closeLog(logFileName)
    if not debug and tempdir != None:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)
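
# Hedged usage sketch: converting only the DDI'13 training data, inserting the
# preparsed TEES analyses instead of reparsing. The output path is an
# illustrative assumption.
def _exampleConvertDDI13Train():
    convertDDI13("/tmp/ddi13", datasets=["DDI13_TRAIN"], insertParses=True, parse=False)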
def process(self, source, output=None, model=None, fromStep=None, toStep=None, omitSteps=None, logPath=None):
    if logPath == "AUTO":
        if output != None:
            logPath = output
            if "*" in logPath:
                logPath = logPath.split("*")[0].rstrip("-")
            logPath = os.path.join(logPath.rstrip("/").rstrip("\\") + "-log.txt")
        else:
            logPath = None
    elif logPath == "None":
        logPath = None
    if logPath != None:
        if not os.path.exists(os.path.dirname(logPath)):
            os.makedirs(os.path.dirname(logPath))
        Stream.openLog(logPath)
    print >> sys.stderr, "Preprocessor steps:", [x.name for x in self.steps]
    if len(self.steps) == 0:
        raise Exception("No preprocessing steps defined")
    #if omitSteps != None and ((type(omitSteps) in types.StringTypes and omitSteps == "CONVERT") or "CONVERT" in omitSteps):
    #    raise Exception("Preprocessor step 'CONVERT' may not be omitted")
    #if isinstance(source, basestring) and os.path.basename(source).isdigit(): # PMID
    #    print >> sys.stderr, "Preprocessing PubMed abstract", os.path.basename(source)
    #    source = Utils.Download.getPubMed(int(source))
    # Initialize variables and save existing default values
    #self.intermediateFileTag = corpusName
    #parameters = self.getParameters(parameters, model)
    #parameters["CONVERT.dataSetNames"] = sourceDataSetNames
    #parameters["CONVERT.corpusName"] = corpusName
    #convertSetNames = self.stepArgs("CONVERT")["dataSetNames"]
    #convertCorpusName = self.stepArgs("CONVERT")["corpusName"]
    #self.stepArgs("CONVERT")["dataSetNames"] = sourceDataSetNames
    #self.stepArgs("CONVERT")["corpusName"] = corpusName
    # Run the tool chain
    xml = ToolChain.process(self, source, output, model, fromStep, toStep, omitSteps)
    # Reset variables to saved default values
    #self.stepArgs("CONVERT")["dataSetNames"] = convertSetNames
    #self.stepArgs("CONVERT")["corpusName"] = convertCorpusName
    if logPath != None:
        Stream.closeLog(logPath)
    return xml
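
# Hedged usage sketch: calling process() on a Preprocessor instance with
# automatic log placement. The Preprocessor(steps=...) constructor form is
# taken from other call sites in this codebase; the step name and paths are
# illustrative assumptions.
def _examplePreprocessorProcess():
    preprocessor = Preprocessor(steps=["EXPORT_CHEMPROT"])
    # Writes the log next to the output file because logPath="AUTO"
    return preprocessor.process("/tmp/corpus-input.xml", "/tmp/corpus-output.tsv", logPath="AUTO")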
def convert(corpora, outDir, downloadDir=None, redownload=False, makeIntermediateFiles=True, evaluate=False):
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    else:
        assert os.path.isdir(outDir)
    count = 1
    for corpus in corpora:
        print >> sys.stderr, "=======================", "Converting BioNLP'11", corpus, "corpus ("+str(count)+"/"+str(len(corpora))+")", "======================="
        logFileName = outDir + "/conversion/" + corpus + "-conversion-log.txt"
        Stream.openLog(logFileName)
        downloaded = downloadCorpus(corpus, downloadDir, None, redownload)
        convertDownloaded(outDir, corpus, downloaded, makeIntermediateFiles, evaluate)
        Stream.closeLog(logFileName)
        count += 1
def convert(inPath, outDir, corpusId, directed, negatives, preprocess, preprocessorParameters=None, debug=False, clear=False, constParser="BLLIP-BIO", depParser="STANFORD-CONVERT", logging=True):
    assert negatives in ("INCLUDE", "SKIP", "REVERSE_POS")
    # Download the corpus if needed
    if inPath == None:
        if not hasattr(Settings, "SE10T8_CORPUS"):
            SemEval2010Task8Tools.install()
        inPath = Settings.SE10T8_CORPUS
    assert os.path.exists(inPath)
    # Prepare the output directory
    if not os.path.exists(outDir):
        print "Making output directory", outDir
        os.makedirs(outDir)
    elif clear:
        print "Removing output directory", outDir
        shutil.rmtree(outDir)
    # Start logging
    if logging:
        Stream.openLog(os.path.join(outDir, "log.txt"), clear=clear)
    # Read and process the corpus files
    archive = zipfile.ZipFile(inPath, 'r')
    usedIds = set()
    tree = None
    for fileName, setName in [("SemEval2010_task8_all_data/SemEval2010_task8_training/TRAIN_FILE.TXT", "train"),
                              ("SemEval2010_task8_all_data/SemEval2010_task8_testing_keys/TEST_FILE_FULL.TXT", "test")]:
        print "Processing file", fileName, "as set", setName
        f = archive.open(fileName)
        tree = processLines(f.readlines(), setName, directed=directed, negatives=negatives, usedIds=usedIds, tree=tree, corpusId=corpusId)
        f.close()
    # Divide the training set into training and development sets
    MakeSets.processCorpus(tree, None, "train", [("train", 0.7), ("devel", 1.0)], 1)
    # Write out the converted corpus
    convertedPath = os.path.join(outDir, corpusId + "-converted.xml")
    ETUtils.write(tree.getroot(), convertedPath)
    # Preprocess the converted corpus
    if preprocess:
        outPath = os.path.join(outDir, corpusId + ".xml")
        preprocessor = Preprocessor(constParser, depParser)
        preprocessor.setArgForAllSteps("debug", debug)
        preprocessor.stepArgs("CONVERT")["corpusName"] = corpusId
        preprocessor.process(convertedPath, outPath, preprocessorParameters, omitSteps=["SPLIT-SENTENCES", "NER", "SPLIT-NAMES"])
    # Stop logging
    if logging:
        Stream.closeLog(os.path.join(outDir, "log.txt"))
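
# Hedged usage sketch: the SemEval 2010 Task 8 convert() above, letting the
# function install and locate the corpus itself (inPath=None). The output
# directory and corpus id are illustrative assumptions.
def _exampleConvertSE10T8():
    convert(None, "/tmp/SE10T8", "SE10T8", directed=True, negatives="INCLUDE",
            preprocess=True, debug=False, clear=False)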
def convertDDI(outDir, trainUnified=None, trainMTMX=None, testUnified=None, testMTMX=None, downloadDir=None, redownload=False, makeIntermediateFiles=True, debug=False):
    cwd = os.getcwd()
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'11 corpus", "======================="
    bigfileName = os.path.join(outDir, "DDI")
    #oldXML = ETUtils.ETFromObj(bigfileName+".xml")
    if trainUnified == None:
        trainUnified = Settings.URL["DDI_TRAIN_UNIFIED"]
    if trainMTMX == None:
        trainMTMX = Settings.URL["DDI_TRAIN_MTMX"]
    if testUnified == None:
        testUnified = Settings.URL["DDI_TEST_UNIFIED"]
    if testMTMX == None:
        testMTMX = Settings.URL["DDI_TEST_MTMX"]
    tempdir = tempfile.mkdtemp()
    print >> sys.stderr, "Temporary files directory at", tempdir
    if True:
        documents, docById, docCounts = loadDocs(trainUnified, outDir, tempdir)
        sortedDocCounts = sorted(docCounts.iteritems(), key=lambda (k,v): (v,k), reverse=True)
        datasetCounts = {"train":[0,0], "devel":[0,0], "test":[0,0]}
        for i in range(0, len(sortedDocCounts)-3, 4):
            for j in [0,1]:
                docById[sortedDocCounts[i+j][0]].set("set", "train")
                datasetCounts["train"][0] += sortedDocCounts[i+j][1][0]
                datasetCounts["train"][1] += sortedDocCounts[i+j][1][1]
            docById[sortedDocCounts[i+2][0]].set("set", "train") #docById[sortedDocCounts[i+2][0]].set("set", "devel")
            docById[sortedDocCounts[i+3][0]].set("set", "devel") #docById[sortedDocCounts[i+3][0]].set("set", "test")
            datasetCounts["train"][0] += sortedDocCounts[i+2][1][0] #datasetCounts["devel"][0] += sortedDocCounts[i+2][1][0]
            datasetCounts["train"][1] += sortedDocCounts[i+2][1][1] #datasetCounts["devel"][1] += sortedDocCounts[i+2][1][1]
            datasetCounts["devel"][0] += sortedDocCounts[i+3][1][0] #datasetCounts["test"][0] += sortedDocCounts[i+3][1][0]
            datasetCounts["devel"][1] += sortedDocCounts[i+3][1][1] #datasetCounts["test"][1] += sortedDocCounts[i+3][1][1]
        for document in documents: # leftover documents that don't divide evenly go to train
            if document.get("set") == None:
                document.set("set", "train")
        print datasetCounts
        for key in datasetCounts.keys():
            if datasetCounts[key][1] != 0:
                print key, datasetCounts[key][0] / float(datasetCounts[key][1])
            else:
                print key, datasetCounts[key][0], "/", float(datasetCounts[key][1])
    if testUnified != None:
        testDocuments, testDocById, testDocCounts = loadDocs(testUnified, tempdir)
        for document in testDocuments:
            document.set("set", "test")
        documents = documents + testDocuments
    xmlTree = ET.ElementTree(ET.Element("corpus"))
    root = xmlTree.getroot()
    root.set("source", "DrugDDI")
    for document in documents:
        root.append(document)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents-notfixed.xml")
    xml = xmlTree
    print >> sys.stderr, "Fixing DDI XML"
    fixEntities(xml)
    convertToInteractions(xml)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents.xml")
    #sys.exit()
    if False:
        print >> sys.stderr, "Parsing"
        Tools.CharniakJohnsonParser.parse(xml, bigfileName+"-parsed.xml", tokenizationName=None, parseName="McClosky", requireEntities=True, timeout=10)
        print >> sys.stderr, "Stanford Conversion"
        Tools.StanfordParser.convertXML("McClosky", xml, bigfileName+"-stanford.xml")
    #if True:
    #    xml = bigfileName + "-stanford.xml"
    print >> sys.stderr, "Protein Name Splitting"
    splitTarget = "McClosky"
    xml = ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-"+splitTarget, "split-"+splitTarget)
    print >> sys.stderr, "Head Detection"
    xml = FindHeads.findHeads(xml, "split-McClosky", tokenization=None, output=bigfileName+".xml", removeExisting=True)
    print >> sys.stderr, "Dividing into sets"
    InteractionXML.DivideSets.processCorpus(xml, outDir, "DrugDDI-", ".xml", [("devel", "train", "test"), ("devel", "train")])
    #InteractionXML.DivideSets.processCorpus(oldXML, outDir, "DrugDDI-", ".xml", [("devel", "train", "test"), ("devel", "train")])
    #InteractionXML.DivideSets.processCorpus(bigfileName+".xml", outDir, "DrugDDI-", ".xml", [("devel", "train", "test"), ("devel", "train")])
    #if "devel" in [x[0] for x in datasets]:
    #    print >> sys.stderr, "Creating empty devel set"
    #    deletionRules = {"interaction":{},"entity":{"isName":"False"}}
    #    InteractionXML.DeleteElements.processCorpus(corpusName + "-devel.xml", corpusName + "-devel-empty.xml", deletionRules)
    #return xml
    Stream.closeLog(logFileName)
    if not debug:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)
def convertDDI(outDir, downloadDir=None, redownload=False, makeIntermediateFiles=True, debug=False):
    cwd = os.getcwd()
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI11-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'11 corpus", "======================="
    corpusDir = outDir + "/DDI11-original"
    Utils.Download.downloadAndExtract(Settings.URL["DDI11_CORPUS"], corpusDir, downloadDir)
    bigfileName = os.path.join(outDir, "DDI11")
    #oldXML = ETUtils.ETFromObj(bigfileName+".xml")
    trainUnified = corpusDir + "/train"
    trainMTMX = corpusDir + "/train_MTMX"
    testUnified = corpusDir + "/test"
    testMTMX = corpusDir + "/test_MTMX"
    # Load main documents
    tempdir = tempfile.mkdtemp()
    print >> sys.stderr, "Temporary files directory at", tempdir
    documents, docById, docCounts = loadDocs(trainUnified)
    # Divide training data into a train and devel set
    sortedDocCounts = sorted(docCounts.iteritems(), key=lambda (k,v): (v,k), reverse=True)
    datasetCounts = {"train":[0,0], "devel":[0,0], "test":[0,0]}
    for i in range(0, len(sortedDocCounts)-3, 4):
        for j in [0,1]:
            docById[sortedDocCounts[i+j][0]].set("set", "train")
            datasetCounts["train"][0] += sortedDocCounts[i+j][1][0]
            datasetCounts["train"][1] += sortedDocCounts[i+j][1][1]
        docById[sortedDocCounts[i+2][0]].set("set", "train") #docById[sortedDocCounts[i+2][0]].set("set", "devel")
        docById[sortedDocCounts[i+3][0]].set("set", "devel") #docById[sortedDocCounts[i+3][0]].set("set", "test")
        datasetCounts["train"][0] += sortedDocCounts[i+2][1][0] #datasetCounts["devel"][0] += sortedDocCounts[i+2][1][0]
        datasetCounts["train"][1] += sortedDocCounts[i+2][1][1] #datasetCounts["devel"][1] += sortedDocCounts[i+2][1][1]
        datasetCounts["devel"][0] += sortedDocCounts[i+3][1][0] #datasetCounts["test"][0] += sortedDocCounts[i+3][1][0]
        datasetCounts["devel"][1] += sortedDocCounts[i+3][1][1] #datasetCounts["test"][1] += sortedDocCounts[i+3][1][1]
    for document in documents: # leftover documents that don't divide evenly go to train
        if document.get("set") == None:
            document.set("set", "train")
    # Print division results
    print >> sys.stderr, datasetCounts
    for key in datasetCounts.keys():
        if datasetCounts[key][1] != 0:
            print key, datasetCounts[key][0] / float(datasetCounts[key][1])
        else:
            print key, datasetCounts[key][0], "/", float(datasetCounts[key][1])
    # Some of the train and test ids overlap. Let's change the train set ids,
    # because the test set ones are needed for the final evaluation.
    changeIdCount = 1000
    for trainId in ['DrugDDI.d312', 'DrugDDI.d316', 'DrugDDI.d332', 'DrugDDI.d334', 'DrugDDI.d337',
                    'DrugDDI.d342', 'DrugDDI.d349', 'DrugDDI.d354', 'DrugDDI.d373', 'DrugDDI.d379',
                    'DrugDDI.d383', 'DrugDDI.d388', 'DrugDDI.d392', 'DrugDDI.d396', 'DrugDDI.d398',
                    'DrugDDI.d409', 'DrugDDI.d411', 'DrugDDI.d415', 'DrugDDI.d425', 'DrugDDI.d430',
                    'DrugDDI.d433', 'DrugDDI.d448', 'DrugDDI.d450', 'DrugDDI.d452', 'DrugDDI.d462',
                    'DrugDDI.d467', 'DrugDDI.d470', 'DrugDDI.d474', 'DrugDDI.d480', 'DrugDDI.d482',
                    'DrugDDI.d485', 'DrugDDI.d492', 'DrugDDI.d494', 'DrugDDI.d496', 'DrugDDI.d498',
                    'DrugDDI.d500', 'DrugDDI.d503', 'DrugDDI.d506', 'DrugDDI.d518', 'DrugDDI.d523',
                    'DrugDDI.d528', 'DrugDDI.d535', 'DrugDDI.d539', 'DrugDDI.d552', 'DrugDDI.d554',
                    'DrugDDI.d558', 'DrugDDI.d561', 'DrugDDI.d570', 'DrugDDI.d578']:
        newId = "DrugDDI.d" + str(changeIdCount)
        print >> sys.stderr, "Changing train/devel id", trainId, "to", newId
        for element in docById[trainId].getiterator():
            for attrName, attrValue in element.attrib.iteritems():
                if trainId in attrValue:
                    element.set(attrName, attrValue.replace(trainId, newId))
        docById[newId] = docById[trainId]
        del docById[trainId]
        changeIdCount += 1
    # If a test set exists, load it, too
    if testUnified != None:
        testDocuments, testDocById, testDocCounts = loadDocs(testUnified)
        for document in testDocuments:
            document.set("set", "test")
        documents = documents + testDocuments
        overlappingIds = []
        for key in docById:
            if key in testDocById:
                overlappingIds.append(key)
        for key in docById:
            assert key not in testDocById, (key, docById[key].get("origId"), testDocById[key].get("origId"),
                                            sorted(docById.keys()), sorted(testDocById.keys()), sorted(overlappingIds))
        docById.update(testDocById)
    # Add all documents into one XML
    xmlTree = ET.ElementTree(ET.Element("corpus"))
    root = xmlTree.getroot()
    root.set("source", "DDI11")
    for document in documents:
        root.append(document)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents-notfixed.xml")
    xml = xmlTree
    print >> sys.stderr, "Fixing DDI XML"
    fixEntities(xml)
    convertToInteractions(xml)
    # Add MTMX
    if trainMTMX != None:
        inDir = Utils.Download.getTopDir(tempdir, Utils.Download.downloadAndExtract(trainMTMX, tempdir, outDir + "/DDI11-original"))
        DDITools.addMTMX(xml, inDir)
    if testMTMX != None:
        inDir = Utils.Download.getTopDir(tempdir, Utils.Download.downloadAndExtract(testMTMX, tempdir, outDir + "/DDI11-original"))
        DDITools.addMTMX(xml, inDir)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents.xml")
    print >> sys.stderr, "---------------", "Inserting TEES-generated analyses", "---------------"
    Utils.Download.downloadAndExtract(Settings.URL["TEES_PARSES"], os.path.join(Settings.DATAPATH, "TEES-parses"), downloadDir, redownload=redownload)
    extractedFilename = os.path.join(Settings.DATAPATH, "TEES-parses") + "/DDI11"
    print >> sys.stderr, "Making sentences"
    Tools.SentenceSplitter.makeSentences(xml, extractedFilename, None)
    print >> sys.stderr, "Inserting McCC parses"
    Tools.BLLIPParser.insertParses(xml, extractedFilename, None, extraAttributes={"source":"TEES-preparsed"})
    print >> sys.stderr, "Inserting Stanford conversions"
    Tools.StanfordParser.insertParses(xml, extractedFilename, None, extraAttributes={"stanfordSource":"TEES-preparsed"})
    print >> sys.stderr, "Protein Name Splitting"
    splitTarget = "McCC"
    #ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-"+splitTarget, "split-"+splitTarget)
    ProteinNameSplitter.mainFunc(xml, None, splitTarget, removeOld=True)
    print >> sys.stderr, "Head Detection"
    #xml = FindHeads.findHeads(xml, "split-"+splitTarget, tokenization=None, output=None, removeExisting=True)
    xml = FindHeads.findHeads(xml, splitTarget, tokenization=None, output=None, removeExisting=True)
    print >> sys.stderr, "Dividing into sets"
    Utils.InteractionXML.DivideSets.processCorpus(xml, outDir, "DDI11", ".xml")
    Stream.closeLog(logFileName)
    if not debug:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)
def combine(inputA, inputB, inputGold, outPath=None, mode="OR", skip=None, logPath="AUTO"):
    assert mode in ("AND", "OR")
    if skip != None and isinstance(skip, basestring):
        skip = set(skip.split(","))
    if skip != None:
        print "Skipping interaction types:", skip
    if logPath == "AUTO":
        if outPath != None:
            logPath = os.path.join(outPath.rstrip("/").rstrip("\\") + "-log.txt")
        else:
            logPath = None
    if logPath != None:
        if not os.path.exists(os.path.dirname(logPath)):
            os.makedirs(os.path.dirname(logPath))
        Stream.openLog(logPath)
    print "Loading the Interaction XML files"
    print "Loading A from", inputA
    a = ETUtils.ETFromObj(inputA)
    print "Loading B from", inputB
    b = ETUtils.ETFromObj(inputB)
    gold = None
    if inputGold:
        print "Loading gold from", inputGold
        gold = ETUtils.ETFromObj(inputGold)
    print "Copying a as template"
    template = copy.deepcopy(a)
    print "Calculating confidence score ranges"
    scoreRanges = {}
    scoreRanges["a"] = getScoreRange(a, skip)
    scoreRanges["b"] = getScoreRange(b, skip)
    print scoreRanges
    print "Combining"
    counts = defaultdict(int)
    counts["skipped"] = defaultdict(int)
    counter = ProgressCounter(len([x for x in a.findall("document")]), "Combine")
    for docA, docB, docGold, docTemplate in itertools.izip_longest(*[x.findall("document") for x in (a, b, gold, template)]):
        counter.update()
        assert len(set([x.get("id") for x in (docA, docB, docGold, docTemplate)])) == 1
        for sentA, sentB, sentGold, sentTemplate in itertools.izip_longest(*[x.findall("sentence") for x in (docA, docB, docGold, docTemplate)]):
            assert len(set([x.get("id") for x in (sentA, sentB, sentGold, sentTemplate)])) == 1
            interactions = getInteractions(sentA, sentB, sentGold, skip, counts["skipped"])
            for interaction in sentTemplate.findall("interaction"):
                sentTemplate.remove(interaction)
            analyses = sentTemplate.find("analyses")
            if analyses:
                sentTemplate.remove(analyses)
            for key in interactions:
                interaction = getCombinedInteraction(interactions[key], mode, counts, scoreRanges)
                if interaction != None:
                    sentTemplate.append(copy.deepcopy(interaction))
            if analyses:
                sentTemplate.append(analyses)
    counts["skipped"] = dict(counts["skipped"])
    print "Counts:", dict(counts)
    if gold != None:
        print "****** Evaluating A ******"
        evaluateChemProt(a, gold) #EvaluateIXML.run(AveragingMultiClassEvaluator, a, gold, "McCC")
        print "****** Evaluating B ******"
        evaluateChemProt(b, gold) #EvaluateIXML.run(AveragingMultiClassEvaluator, b, gold, "McCC")
        print "****** Evaluating Combined ******"
        evaluateChemProt(template, gold) #EvaluateIXML.run(AveragingMultiClassEvaluator, template, gold, "McCC")
    if outPath != None:
        print "Writing output to", outPath
        if outPath.endswith(".tsv"):
            Preprocessor(steps=["EXPORT_CHEMPROT"]).process(template, outPath)
        else:
            ETUtils.write(template, outPath)
    if logPath != None:
        Stream.closeLog(logPath)
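
# Hedged usage sketch: OR-combining two prediction files against a gold
# standard and exporting ChemProt TSV (the .tsv suffix triggers the
# EXPORT_CHEMPROT branch above). File names are illustrative assumptions.
def _exampleCombine():
    combine("/tmp/predictions-a.xml", "/tmp/predictions-b.xml", "/tmp/gold.xml",
            outPath="/tmp/combined.tsv", mode="OR", skip=None, logPath="AUTO")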
def endLog(logPath):
    if logPath != None:
        Stream.closeLog(logPath)
def train(output, task=None, detector=None, inputFiles=None, models=None, parse=None,
          processUnmerging=None, processModifiers=None, bioNLPSTParams=None, preprocessorParams=None,
          exampleStyles=None, classifierParams=None, doFullGrid=False, deleteOutput=False, copyFrom=None,
          log="log.txt", step=None, omitSteps=None, debug=False, connection=None, subset=None,
          folds=None, corpusDir=None, corpusPreprocessing=None, evaluator=None):
    """
    Train a new model for event or relation detection.

    @param output: A directory where output files will appear.
    @param task: If defined, overridable default settings are used for many of the training parameters. Must be one of the supported TEES tasks.
    @param detector: a Detector object, or a string defining one to be imported
    @param inputFiles: A dictionary of file names, with keys "train", "devel" and "test"
    @param models: A dictionary of file names defining the place for the new models, with keys "devel" and "test"
    @param parse: The parse element name in the training interaction XML
    @param processUnmerging: Use the unmerging step of EventDetector. True, False or None for task default.
    @param processModifiers: Use the modifier detection step of EventDetector. True, False or None for task default.
    @param bioNLPSTParams: Parameters controlling BioNLP ST format output.
    @param preprocessorParams: Parameters controlling the preprocessor. Not used for training, but saved to the model for use when classifying.
    @param exampleStyles: A parameter set for controlling example builders.
    @param classifierParams: A parameter set for controlling classifiers.
    @param doFullGrid: Whether all parameters, as opposed to just recall adjustment, are tested in the EventDetector grid search.
    @param deleteOutput: Remove an existing output directory
    @param copyFrom: Copy an existing output directory for use as a template
    @param log: An optional alternative name for the log file. None is for no logging.
    @param step: A step=substep pair, where the steps are "TRAIN", "DEVEL", "EMPTY" and "TEST"
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved
    @param connection: A parameter set defining a local or remote connection for training the classifier
    @param subset: A parameter set for making subsets of input files
    """
    # Insert default arguments where needed
    inputFiles = setDictDefaults(inputFiles, {"train":None, "devel":None, "test":None})
    models = setDictDefaults(models, {"devel":"model-devel", "test":"model-test"})
    exampleStyles = setDictDefaults(exampleStyles, {"examples":None, "trigger":None, "edge":None, "unmerging":None, "modifiers":None})
    classifierParams = setDictDefaults(classifierParams, {"examples":None, "trigger":None, "recall":None, "edge":None, "unmerging":None, "modifiers":None})
    subset = setDictDefaults(Parameters.get(subset), {"train":None, "devel":None, "test":None, "seed":0, "all":None})
    folds = setDictDefaults(folds, {"train":None, "devel":None, "test":None})
    processUnmerging = getDefinedBool(processUnmerging)
    processModifiers = getDefinedBool(processModifiers)
    # Initialize working directory
    workdir(output, deleteOutput, copyFrom, log)
    # Get task specific parameters
    useKerasDetector = False
    if detector != None and "keras" in detector.lower():
        print >> sys.stderr, "Using a Keras Detector"
        useKerasDetector = True
        if detector.lower() == "keras":
            detector = None
    detector, bioNLPSTParams, preprocessorParams, folds = getTaskSettings(task, detector,
        bioNLPSTParams, preprocessorParams, inputFiles, exampleStyles, classifierParams,
        folds, corpusDir=corpusDir, useKerasDetector=useKerasDetector)
    # Learn training settings from input files
    detector = learnSettings(inputFiles, detector, classifierParams, task, exampleStyles, useKerasDetector=useKerasDetector)
    # Get corpus subsets
    getFolds(inputFiles, folds)
    getSubsets(inputFiles, subset)
    if task != None:
        task = task.replace("-FULL", "")
        if "." in task:
            _, subTask = getSubTask(task)
            if subTask != 3:
                processModifiers = False
    # Preprocess the corpus if required
    if corpusPreprocessing != None:
        preprocessor = Preprocessor(steps=corpusPreprocessing)
        assert preprocessor.steps[0].name == "MERGE_SETS"
        assert preprocessor.steps[-1].name == "DIVIDE_SETS"
        preprocessedCorpusDir = os.path.join(output, "corpus")
        #outputFiles = {x:os.path.join(preprocessedCorpusDir, os.path.basename(inputFiles[x])) for x in inputFiles}
        preprocessor.process(inputFiles, os.path.join(preprocessedCorpusDir, task))
        #inputFiles = outputFiles
        for setName in inputFiles.keys():
            if inputFiles[setName] != None:
                inputFiles[setName] = os.path.join(preprocessedCorpusDir, task + "-" + setName + ".xml")
    # Define processing steps
    selector, detectorSteps, omitDetectorSteps = getSteps(step, omitSteps, ["TRAIN", "DEVEL", "EMPTY", "TEST"])
    # Initialize the detector
    detector, detectorName = getDetector(detector, evaluator=evaluator)
    evaluator, evaluatorName = importClass(evaluator, "evaluator")
    detector = detector() # initialize object
    if evaluator != None:
        print >> sys.stderr, "Using evaluator", evaluator.__name__
        detector.evaluator = evaluator
    detector.debug = debug
    detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(bioNLPSTParams)
    #detector.useBioNLPSTFormat = useBioNLPSTFormat # classify-output and grid evaluation in ST-format
    #detector.stWriteScores = True # write confidence scores into additional st-format files
    connection = getConnection(connection)
    detector.setConnection(connection)
    connection.debug = debug
    if deleteOutput:
        connection.clearWorkDir()
    # Train
    if selector.check("TRAIN"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------------ Train Detector ------------------"
        print >> sys.stderr, "----------------------------------------------------"
        if not isinstance(detector, EventDetector):
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["examples"], classifierParams["examples"], parse, None, task,
                           fromStep=detectorSteps["TRAIN"], workDir="training", testData=inputFiles["test"])
        else:
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["trigger"], exampleStyles["edge"], exampleStyles["unmerging"], exampleStyles["modifiers"],
                           classifierParams["trigger"], classifierParams["edge"], classifierParams["unmerging"], classifierParams["modifiers"],
                           classifierParams["recall"], processUnmerging, processModifiers,
                           doFullGrid, task, parse, None,
                           fromStep=detectorSteps["TRAIN"], workDir="training", testData=inputFiles["test"])
        # Save the detector type
        for model in [models["devel"], models["test"]]:
            if model != None and os.path.exists(model):
                model = Model(model, "a")
                model.addStr("detector", detectorName)
                if evaluatorName != None:
                    model.addStr("detector", evaluatorName)
                if preprocessorParams != None:
                    preprocessor = Preprocessor()
                    model.addStr("preprocessorParams", Parameters.toString(preprocessor.getParameters(preprocessorParams)))
                model.save()
                model.close()
    if selector.check("DEVEL"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Check devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
        detector.classify(inputFiles["devel"], models["devel"], "classification-devel/devel",
                          goldData=inputFiles["devel"], fromStep=detectorSteps["DEVEL"], workDir="classification-devel")
    if selector.check("EMPTY"):
        # By passing an emptied devel set through the prediction system, we can check that we get the same predictions
        # as in the DEVEL step, ensuring the model does not use leaked information.
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Empty devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
        removalScope = "non-given"
        if "names" in str(exampleStyles["examples"]) or "names" in str(exampleStyles["trigger"]):
            removalScope = "all"
        elif "Edge" in detector.__class__.__name__:
            removalScope = "interactions"
        detector.classify(getEmptyCorpus(inputFiles["devel"], scope=removalScope), models["devel"],
                          "classification-empty/devel-empty", fromStep=detectorSteps["EMPTY"], workDir="classification-empty")
        print >> sys.stderr, "*** Evaluate empty devel classification ***"
        if os.path.exists("classification-empty/devel-empty-pred.xml.gz"):
            EvaluateInteractionXML.run(detector.evaluator, "classification-empty/devel-empty-pred.xml.gz", inputFiles["devel"], parse)
        else:
            print >> sys.stderr, "No output file for evaluation"
    if selector.check("TEST"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------- Test set classification --------------"
        print >> sys.stderr, "----------------------------------------------------"
        if inputFiles["test"] == None or not os.path.exists(inputFiles["test"]):
            print >> sys.stderr, "Skipping, test file", inputFiles["test"], "does not exist"
        else:
            #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
            detector.classify(inputFiles["test"], models["test"] if models["test"] != None else models["devel"],
                              "classification-test/test", fromStep=detectorSteps["TEST"], workDir="classification-test")
            if detector.bioNLPSTParams["convert"]:
                extension = ".zip" if (detector.bioNLPSTParams["convert"] == "zip") else ".tar.gz"
                Utils.STFormat.Compare.compare("classification-test/test-events" + extension,
                                               "classification-devel/devel-events" + extension, "a2")
    # Stop logging
    if log != None:
        Stream.closeLog(log)
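
# Hedged usage sketch: training devel and test models for a named TEES task,
# relying on getTaskSettings() for the task defaults. The task name and input
# paths are illustrative assumptions.
def _exampleTrain():
    train("/tmp/ge09-models", task="GE09",
          inputFiles={"train": "/tmp/GE09-train.xml", "devel": "/tmp/GE09-devel.xml", "test": None},
          log="log.txt", debug=False)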