def learnSettings(inputFiles, detector, classifierParameters):
    if detector == None:
        print >> sys.stderr, "*** Analyzing input files to determine training settings ***"
        structureAnalyzer = StructureAnalyzer()
        if not os.path.exists("training/structure.txt"):
            datasets = sorted(filter(None, [inputFiles["train"], inputFiles["devel"]]))
            print >> sys.stderr, "input files:", datasets
            structureAnalyzer.analyze(datasets)
            print >> sys.stderr, structureAnalyzer.toString()
            structureAnalyzer.save(None, "training/structure.txt")
        else:
            print >> sys.stderr, "Using existing analysis from training/structure.txt"
            structureAnalyzer.load(None, "training/structure.txt")
    # Choose detector
    if detector == None:
        if "ENTITY" in structureAnalyzer.targets and "INTERACTION" in structureAnalyzer.targets:
            detector = "Detectors.EventDetector"
        elif "ENTITY" in structureAnalyzer.targets:
            detector = "Detectors.EntityDetector"
        elif "INTERACTION" in structureAnalyzer.targets:
            detector = "Detectors.EdgeDetector"
        else:
            assert False, structureAnalyzer.targets
    print >> sys.stderr, "Using detector '" + str(detector) + "'"
    # Set default parameters
    if detector == "Detectors.EventDetector":
        classifierParameters["unmerging"] = Parameters.cat(
            "c=1,10,100,500,1000,1500,2500,5000,10000,20000,50000,80000,100000",
            classifierParameters["unmerging"], "Classifier parameters for unmerging")
        classifierParameters["modifiers"] = Parameters.cat(
            "c=5000,10000,20000,50000,100000",
            classifierParameters["modifiers"], "Classifier parameters for modifiers")
        classifierParameters["edge"] = Parameters.cat(
            "c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000",
            classifierParameters["edge"], "Classifier parameters for edges")
        classifierParameters["trigger"] = Parameters.cat(
            "c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000",
            classifierParameters["trigger"], "Classifier parameters for triggers")
        classifierParameters["recall"] = Parameters.cat(
            "0.5,0.6,0.65,0.7,0.85,1.0,1.1,1.2",
            classifierParameters["recall"], "Recall adjustment parameters")
    elif detector == "Detectors.EntityDetector":
        classifierParameters["examples"] = Parameters.cat(
            "c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000",
            classifierParameters["examples"], "Classifier parameters for entities")
    elif detector == "Detectors.EdgeDetector":
        classifierParameters["examples"] = Parameters.cat(
            "c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000",
            classifierParameters["examples"], "Classifier parameters for edges")
    return detector

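# A minimal usage sketch (hypothetical file names; assumes learnSettings above is in
# scope). Passing an explicit detector skips the structure analysis, so the call only
# fills in the default parameter grids for any entry left as None.
exampleParameters = {"trigger": None, "edge": None, "unmerging": None,
                     "modifiers": None, "recall": None, "examples": None}
chosenDetector = learnSettings({"train": "train.xml", "devel": "devel.xml"},
                               "Detectors.EventDetector", exampleParameters)
print >> sys.stderr, "Using detector:", chosenDetector
print >> sys.stderr, "Default trigger grid:", exampleParameters["trigger"]
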
def convertDDI13(outDir, downloadDir=None, datasets=["DDI13_TRAIN", "DDI13_TEST_TASK_9.1", "DDI13_TEST_TASK_9.2"],
                 redownload=False, insertParses=True, parse=False, makeIntermediateFiles=True, debug=False):
    cwd = os.getcwd()
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI13-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'13 corpus", "======================="
    tempdir = tempfile.mkdtemp()
    downloaded = downloadFiles(downloadDir, tempdir, redownload)
    for dataset in datasets:
        corpusTree = getCorpusXML()
        xml = corpusTree.getroot()
        print >> sys.stderr, "Merging input XMLs"
        assert downloaded[dataset] != None
        combineXML(xml, "train", downloaded[dataset], subDirs=["DrugBank", "MedLine", "NER"])
        print >> sys.stderr, "Processing elements"
        processElements(xml)
        if dataset == "DDI13_TRAIN":
            print >> sys.stderr, "Dividing training set into folds"
            divideSets(xml, "train", 10)
        else:
            for doc in xml.getiterator("document"):
                doc.set("set", "test")
        if parse:
            print >> sys.stderr, "Parsing"
            parseXML(corpusTree, os.path.join(tempdir, "parsing"), debug)
        elif insertParses:
            assert parse == False
            print >> sys.stderr, "Inserting McCC parses"
            Tools.BLLIPParser.insertParses(corpusTree, downloaded[dataset + "_TEES_PARSES"], None,
                                           extraAttributes={"source": "TEES"})
            print >> sys.stderr, "Inserting Stanford conversions"
            Tools.StanfordParser.insertParses(corpusTree, downloaded[dataset + "_TEES_PARSES"], None,
                                              extraAttributes={"stanfordSource": "TEES"})
        # Check what was produced by the conversion
        print >> sys.stderr, "---------------", "Corpus Structure Analysis", "---------------"
        analyzer = StructureAnalyzer()
        analyzer.analyze([xml])
        print >> sys.stderr, analyzer.toString()
        if "9.1" in dataset:
            outFileName = os.path.join(outDir, "DDI13-test-task9.1.xml")
        elif "9.2" in dataset:
            outFileName = os.path.join(outDir, "DDI13-test-task9.2.xml")
        else:
            outFileName = os.path.join(outDir, "DDI13-train.xml")
        print >> sys.stderr, "Writing output to", outFileName
        ETUtils.write(xml, outFileName)
    Stream.closeLog(logFileName)
    if not debug and tempdir != None:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)

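# A hedged usage sketch (hypothetical directories; assumes the DDI'13 packages can be
# located under downloadDir or downloaded). Parse insertion and parsing are both turned
# off here so the sketch does not depend on the precomputed TEES parse packages.
convertDDI13("/tmp/DDI13-corpus", downloadDir="/tmp/DDI13-download",
             datasets=["DDI13_TRAIN"], insertParses=False, parse=False, debug=True)
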
def learnSettings(inputFiles, detector, classifierParameters):
    if detector == None:
        print >> sys.stderr, "*** Analyzing input files to determine training settings ***"
        structureAnalyzer = StructureAnalyzer()
        if not os.path.exists("training/structure.txt"):
            datasets = sorted(filter(None, [inputFiles["train"], inputFiles["devel"]]))
            print >> sys.stderr, "input files:", datasets
            structureAnalyzer.analyze(datasets)
            print >> sys.stderr, structureAnalyzer.toString()
            structureAnalyzer.save(None, "training/structure.txt")
        else:
            print >> sys.stderr, "Using existing analysis from training/structure.txt"
            structureAnalyzer.load(None, "training/structure.txt")
    # Choose detector
    if detector == None:
        if "ENTITY" in structureAnalyzer.targets and "INTERACTION" in structureAnalyzer.targets:
            detector = "Detectors.EventDetector"
        elif "ENTITY" in structureAnalyzer.targets:
            detector = "Detectors.EntityDetector"
        elif "INTERACTION" in structureAnalyzer.targets:
            detector = "Detectors.EdgeDetector"
        else:
            assert False, structureAnalyzer.targets
    print >> sys.stderr, "Using detector '" + str(detector) + "'"
    # Set default parameters
    cp = classifierParameters
    if detector == "Detectors.EventDetector":
        # Add common classifier parameters
        if cp["examples"] != None:
            cp["unmerging"] = Parameters.cat(cp["examples"], cp["unmerging"])
            cp["modifiers"] = Parameters.cat(cp["examples"], cp["modifiers"])
            cp["edge"] = Parameters.cat(cp["examples"], cp["edge"])
            cp["trigger"] = Parameters.cat(cp["examples"], cp["trigger"])
        cp["unmerging"] = Parameters.cat("c=1,10,100,500,1000,1500,2500,5000,10000,20000,50000,80000,100000",
                                         cp["unmerging"], "Classifier parameters for unmerging")
        cp["modifiers"] = Parameters.cat("c=5000,10000,20000,50000,100000",
                                         cp["modifiers"], "Classifier parameters for modifiers")
        cp["edge"] = Parameters.cat("c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000",
                                    cp["edge"], "Classifier parameters for edges")
        cp["trigger"] = Parameters.cat("c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000",
                                       cp["trigger"], "Classifier parameters for triggers")
        cp["recall"] = Parameters.cat("0.5,0.6,0.65,0.7,0.85,1.0,1.1,1.2",
                                      cp["recall"], "Recall adjustment parameters")
    elif detector == "Detectors.EntityDetector":
        cp["examples"] = Parameters.cat("c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000",
                                        cp["examples"], "Classifier parameters for entities")
    elif detector == "Detectors.EdgeDetector":
        cp["examples"] = Parameters.cat("c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000",
                                        cp["examples"], "Classifier parameters for edges")
    return detector

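# A hedged sketch of the shared-parameter layering above (hypothetical values): a grid
# supplied as "examples" is concatenated onto every EventDetector component before the
# per-component defaults are applied.
sharedParameters = {"examples": "c=1000,10000", "trigger": None, "edge": None,
                    "unmerging": None, "modifiers": None, "recall": None}
learnSettings({"train": "train.xml", "devel": "devel.xml"},
              "Detectors.EventDetector", sharedParameters)
print >> sys.stderr, "Edge grid with shared values:", sharedParameters["edge"]
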
def convertDownloaded(outdir, corpus, files, intermediateFiles=True, evaluate=True, processEquiv=True,
                      analysisMode="INSERT", packageSubPath=None, debug=False):
    global moveBI
    if evaluate:
        workdir = outdir + "/conversion/" + corpus
        if os.path.exists(workdir):
            shutil.rmtree(workdir)
        os.makedirs(workdir)
    print >> sys.stderr, "---------------", "Converting to XML", "---------------"
    # All datasets are processed as one XML, to ensure all the steps (parse modification etc.) are
    # applied equally
    #print corpus, files
    datasets = []
    for setName in ["devel", "train", "test"]:
        if corpus + "_" + setName.upper() in files:
            datasets.append(setName)
    bigfileName = os.path.join(outdir, corpus + "-" + "-and-".join(datasets))
    documents = []
    for setName in datasets:
        sourceFile = files[corpus + "_" + setName.upper()]
        print >> sys.stderr, "Reading", setName, "set from", sourceFile
        docs = ST.loadSet(sourceFile, setName, "a2", subPath=packageSubPath)
        print >> sys.stderr, "Read", len(docs), "documents"
        documents.extend(docs)
        if len(docs) > 0 and docs[0].license != None:
            licenseFile = open(os.path.join(outdir, corpus + "-LICENSE"), "wt")
            licenseFile.write(docs[0].license)
            licenseFile.close()
    if processEquiv:
        print >> sys.stderr, "Resolving equivalences"
        Utils.STFormat.Equiv.process(documents)
    else:
        print >> sys.stderr, "Skipping resolving of equivalences"
    if evaluate:
        #print >> sys.stderr, "Checking data validity"
        #for doc in documents:
        #    Utils.STFormat.Validate.validate(doc.events, simulation=True, verbose=True, docId=doc.id)
        print >> sys.stderr, "Writing all documents to geniaformat"
        ST.writeSet(documents, os.path.join(workdir, "all-geniaformat"), resultFileTag="a2", debug=False)
    if intermediateFiles:
        print >> sys.stderr, "Converting to XML, writing combined corpus to", bigfileName + "-documents.xml"
        xml = STConvert.toInteractionXML(documents, corpus, bigfileName + "-documents.xml")
    else:
        print >> sys.stderr, "Converting to XML"
        xml = STConvert.toInteractionXML(documents, corpus, None)
    if corpus == "BI11":
        Utils.InteractionXML.MixSets.mixSets(xml, None, set(moveBI), "train", "devel")
    if corpus == "REN11":
        corpusRENtoASCII(xml)
    if analysisMode == "INSERT":
        insertAnalyses(xml, corpus, datasets, files, bigfileName, packageSubPath=packageSubPath)
        if intermediateFiles:
            print >> sys.stderr, "Writing combined corpus", bigfileName + "-sentences.xml"
            ETUtils.write(xml, bigfileName + "-sentences.xml")
        processParses(xml)
    elif analysisMode == "BUILD":
        parseXML(xml, bigfileName, intermediateFiles, debug, bbResources=(corpus.startswith("BB_")))
    else:
        print >> sys.stderr, "Skipping analyses"
    # A hack for GRN13 task that breaks the official BioNLP Shared Task convention of trigger and event
    # having the same type. Let's remove the unused triggers, so that there won't be an unusable node
    # class. There is no clean way to fix this, as the GRN13 task not following the official rules
    # introduces yet another mechanism into the Shared Task format, and supporting this would require
    # rewriting everything.
    if corpus == "GRN13":
        Utils.InteractionXML.DeleteElements.processCorpus(xml, None, {"entity": {"type": ["Action"]}})
    print >> sys.stderr, "---------------", "Writing corpora", "---------------"
    checkAttributes(xml)
    # Write out converted data
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName + ".xml"
        ETUtils.write(xml, bigfileName + ".xml")
    print >> sys.stderr, "Dividing into sets"
    Utils.InteractionXML.DivideSets.processCorpus(xml, outdir, corpus, ".xml")
    if evaluate and "devel" in datasets:
        print >> sys.stderr, "---------------", "Evaluating conversion", "---------------"
        if corpus != "REL11": # Task 1 (removal of Entity-entities) cannot work for REL
            print >> sys.stderr, "Evaluating task 1 back-conversion"
            STConvert.toSTFormat(os.path.join(outdir, corpus + "-devel.xml"),
                                 workdir + "/roundtrip/" + corpus + "-devel" + "-task1",
                                 outputTag="a2", skipArgs=["Site"])
            BioNLP11GeniaTools.evaluate(workdir + "/roundtrip/" + corpus + "-devel" + "-task1", corpus + ".1")
        print >> sys.stderr, "Evaluating task 2 back-conversion"
        STConvert.toSTFormat(os.path.join(outdir, corpus + "-devel.xml"),
                             workdir + "/roundtrip/" + corpus + "-devel" + "-task2", outputTag="a2")
        BioNLP11GeniaTools.evaluate(workdir + "/roundtrip/" + corpus + "-devel" + "-task2", corpus + ".2")
        print >> sys.stderr, "Note! Evaluation of Task 2 back-conversion can be less than 100% due to site-argument mapping"
    # Check what was produced by the conversion
    print >> sys.stderr, "---------------", "Corpus Structure Analysis", "---------------"
    analyzer = StructureAnalyzer()
    analyzer.analyze([xml])
    print >> sys.stderr, analyzer.toString()

def convertDownloaded(outdir, corpus, files, intermediateFiles=True, evaluate=True, processEquiv=True,
                      addAnalyses=True, packageSubPath=None):
    global moveBI
    if evaluate:
        workdir = outdir + "/conversion/" + corpus
        if os.path.exists(workdir):
            shutil.rmtree(workdir)
        os.makedirs(workdir)
    print >> sys.stderr, "---------------", "Converting to XML", "---------------"
    # All datasets are processed as one XML, to ensure all the steps (parse modification etc.) are
    # applied equally
    #print corpus, files
    datasets = []
    for setName in ["devel", "train", "test"]:
        if corpus + "_" + setName.upper() in files:
            datasets.append(setName)
    bigfileName = os.path.join(outdir, corpus + "-" + "-and-".join(datasets))
    documents = []
    for setName in datasets:
        sourceFile = files[corpus + "_" + setName.upper()]
        print >> sys.stderr, "Reading", setName, "set from", sourceFile
        docs = ST.loadSet(sourceFile, setName, "a2", subPath=packageSubPath)
        print >> sys.stderr, "Read", len(docs), "documents"
        documents.extend(docs)
        if len(docs) > 0 and docs[0].license != None:
            licenseFile = open(os.path.join(outdir, corpus + "-LICENSE"), "wt")
            licenseFile.write(docs[0].license)
            licenseFile.close()
    if processEquiv:
        print >> sys.stderr, "Resolving equivalences"
        Utils.STFormat.Equiv.process(documents)
    else:
        print >> sys.stderr, "Skipping resolving of equivalences"
    if evaluate:
        #print >> sys.stderr, "Checking data validity"
        #for doc in documents:
        #    Utils.STFormat.Validate.validate(doc.events, simulation=True, verbose=True, docId=doc.id)
        print >> sys.stderr, "Writing all documents to geniaformat"
        ST.writeSet(documents, os.path.join(workdir, "all-geniaformat"), resultFileTag="a2", debug=False)
    if intermediateFiles:
        print >> sys.stderr, "Converting to XML, writing combined corpus to", bigfileName + "-documents.xml"
        xml = STConvert.toInteractionXML(documents, corpus, bigfileName + "-documents.xml")
    else:
        print >> sys.stderr, "Converting to XML"
        xml = STConvert.toInteractionXML(documents, corpus, None)
    if corpus == "BI11":
        Utils.InteractionXML.MixSets.mixSets(xml, None, set(moveBI), "train", "devel")
    if corpus == "REN11":
        corpusRENtoASCII(xml)
    if addAnalyses:
        insertAnalyses(xml, corpus, datasets, files, bigfileName, packageSubPath=packageSubPath)
    else:
        print >> sys.stderr, "Skipping adding analyses"
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName + "-sentences.xml"
        ETUtils.write(xml, bigfileName + "-sentences.xml")
    processParses(xml)
    # A hack for GRN13 task that breaks the official BioNLP Shared Task convention of trigger and event
    # having the same type. Let's remove the unused triggers, so that there won't be an unusable node
    # class. There is no clean way to fix this, as the GRN13 task not following the official rules
    # introduces yet another mechanism into the Shared Task format, and supporting this would require
    # rewriting everything.
    if corpus == "GRN13":
        Utils.InteractionXML.DeleteElements.processCorpus(xml, None, {"entity": {"type": ["Action"]}})
    print >> sys.stderr, "---------------", "Writing corpora", "---------------"
    checkAttributes(xml)
    # Write out converted data
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName + ".xml"
        ETUtils.write(xml, bigfileName + ".xml")
    print >> sys.stderr, "Dividing into sets"
    Utils.InteractionXML.DivideSets.processCorpus(xml, outdir, corpus, ".xml")
    if evaluate and "devel" in datasets:
        print >> sys.stderr, "---------------", "Evaluating conversion", "---------------"
        if corpus != "REL11": # Task 1 (removal of Entity-entities) cannot work for REL
            print >> sys.stderr, "Evaluating task 1 back-conversion"
            STConvert.toSTFormat(os.path.join(outdir, corpus + "-devel.xml"),
                                 workdir + "/roundtrip/" + corpus + "-devel" + "-task1",
                                 outputTag="a2", skipArgs=["Site"])
            BioNLP11GeniaTools.evaluate(workdir + "/roundtrip/" + corpus + "-devel" + "-task1", corpus + ".1")
        print >> sys.stderr, "Evaluating task 2 back-conversion"
        STConvert.toSTFormat(os.path.join(outdir, corpus + "-devel.xml"),
                             workdir + "/roundtrip/" + corpus + "-devel" + "-task2", outputTag="a2")
        BioNLP11GeniaTools.evaluate(workdir + "/roundtrip/" + corpus + "-devel" + "-task2", corpus + ".2")
        print >> sys.stderr, "Note! Evaluation of Task 2 back-conversion can be less than 100% due to site-argument mapping"
    # Check what was produced by the conversion
    print >> sys.stderr, "---------------", "Corpus Structure Analysis", "---------------"
    analyzer = StructureAnalyzer()
    analyzer.analyze([xml])
    print >> sys.stderr, analyzer.toString()

try:
    import psyco
    psyco.full()
    print >> sys.stderr, "Found Psyco, using"
except ImportError:
    print >> sys.stderr, "Psyco not installed"

from optparse import OptionParser
optparser = OptionParser(usage="%prog [options]\nBuild machine learning examples from interaction XML.")
addBasicOptions(optparser)
(options, args) = optparser.parse_args()

if options.gold == "AUTO":
    options.gold = options.input
print >> sys.stderr, "Importing modules"
exec "from ExampleBuilders." + options.exampleBuilder + " import " + options.exampleBuilder + " as ExampleBuilderClass"
structureAnalyzer = None
if options.structure == None: # define structure from input file
    structureAnalyzer = StructureAnalyzer()
    structureAnalyzer.analyze(options.input)
    print >> sys.stderr, "--- Structure Analysis ----"
    print >> sys.stderr, structureAnalyzer.toString()
elif options.structure != "NONE": # a file name
    structureAnalyzer = StructureAnalyzer(options.structure)
#input, output, parse, tokenization, style, classIds=None, featureIds=None, gold=None, append=False)
ExampleBuilderClass.run(options.input, options.output, options.parse, None, options.parameters,
                        options.classes, options.features, allowNewIds=options.addIds,
                        structureAnalyzer=structureAnalyzer, debug=options.debug, gold=options.gold)

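# An equivalent importlib-based form of the exec import above, shown only as a reference
# sketch (assumes the ExampleBuilders package is importable from the current path).
import importlib
builderModule = importlib.import_module("ExampleBuilders." + options.exampleBuilder)
ExampleBuilderClass = getattr(builderModule, options.exampleBuilder)
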
def learnSettings(inputFiles, detector, classifierParameters, task, exampleStyles, useKerasDetector=False):
    if detector == None:
        print >> sys.stderr, "*** Analyzing input files to determine training settings ***"
        structureAnalyzer = StructureAnalyzer()
        if not os.path.exists("training/structure.txt"):
            datasets = sorted(filter(None, [inputFiles["train"], inputFiles["devel"]]))
            print >> sys.stderr, "input files:", datasets
            structureAnalyzer.analyze(datasets)
            print >> sys.stderr, structureAnalyzer.toString()
            structureAnalyzer.save(None, "training/structure.txt")
        else:
            print >> sys.stderr, "Using existing analysis from training/structure.txt"
            structureAnalyzer.load(None, "training/structure.txt")
    # Choose detector
    if detector == None:
        if "ENTITY" in structureAnalyzer.targets and "INTERACTION" in structureAnalyzer.targets:
            detector = "Detectors.EventDetector"
        elif "ENTITY" in structureAnalyzer.targets:
            detector = "Detectors.EntityDetector"
        elif "INTERACTION" in structureAnalyzer.targets:
            detector = "Detectors.EdgeDetector"
        else:
            assert False, structureAnalyzer.targets
    if useKerasDetector and not "Keras" in detector:
        detector = detector.replace("Detectors.", "Detectors.Keras")
    print >> sys.stderr, "Using detector '" + str(detector) + "'"
    # Set default parameters
    cp = classifierParameters
    if detector == "Detectors.EventDetector":
        # Add common classifier parameters
        if cp["examples"] != None:
            cp["unmerging"] = Parameters.cat(cp["examples"], cp["unmerging"])
            cp["modifiers"] = Parameters.cat(cp["examples"], cp["modifiers"])
            cp["edge"] = Parameters.cat(cp["examples"], cp["edge"])
            cp["trigger"] = Parameters.cat(cp["examples"], cp["trigger"])
        cp["unmerging"] = Parameters.cat("c=1,10,100,500,1000,1500,2500,5000,10000,20000,50000,80000,100000",
                                         cp["unmerging"], "Classifier parameters for unmerging")
        cp["modifiers"] = Parameters.cat("c=5000,10000,20000,50000,100000",
                                         cp["modifiers"], "Classifier parameters for modifiers")
        cp["edge"] = Parameters.cat("c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000",
                                    cp["edge"], "Classifier parameters for edges")
        cp["trigger"] = Parameters.cat("c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000",
                                       cp["trigger"], "Classifier parameters for triggers")
        cp["recall"] = Parameters.cat("0.5,0.6,0.65,0.7,0.85,1.0,1.1,1.2",
                                      cp["recall"], "Recall adjustment parameters")
    elif detector == "Detectors.EntityDetector":
        cp["examples"] = Parameters.cat("c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000",
                                        cp["examples"], "Classifier parameters for entities")
    elif detector == "Detectors.EdgeDetector":
        cp["examples"] = Parameters.cat("c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000",
                                        cp["examples"], "Classifier parameters for edges")
    elif detector == "Detectors.UnmergingDetector":
        cp["examples"] = Parameters.cat("c=1,10,100,500,1000,1500,2500,5000,10000,20000,50000,80000,100000",
                                        cp["examples"], "Classifier parameters for unmerging")

    #######################################################################
    # Keras example styles
    #######################################################################
    if useKerasDetector:
        task, subTask = getSubTask(task)
        msg = "Keras example style"
        #overrideStyles = {x:(Parameters.get(exampleStyles[x]) if (exampleStyles[x] != None and "override" in exampleStyles[x]) else {"override":True}) for x in exampleStyles}
        overrideStyles = {"all":{}}
        for key in exampleStyles:
            overrideStyles[key] = {}
            params = Parameters.get(exampleStyles[key])
            if "override" in params:
                exampleStyles[key] = None
                overrideStyles[key] = params
                overrideStyles[key].pop("override")
            elif "override_all" in params:
                exampleStyles[key] = None
                overrideStyles["all"] = params
                overrideStyles["all"].pop("override_all")
            #exampleStyles[key] = exampleStyles[key] if (exampleStyles[key] != None and not "override" in exampleStyles[key]) else None
        print >> sys.stderr, "Override styles:", overrideStyles
        if "EventDetector" in detector:
            if task == "EPI11":
                exampleStyles["trigger"] = Parameters.cat("keras:epochs=500:patience=10:nf=512:path=4:el=41:mods=20:epi_merge_negated", exampleStyles["trigger"])
            else:
                exampleStyles["trigger"] = Parameters.cat("keras:epochs=500:patience=10:nf=512:path=4:el=41:mods=20", exampleStyles["trigger"])
            if task in ["GE09", "GE11", "GE13"] and subTask == 1:
                exampleStyles["edge"] = Parameters.cat("keras:genia_task1:epochs=500:patience=10:nf=256:path=4:ol=15:mods=20", exampleStyles["edge"])
            else:
                exampleStyles["edge"] = Parameters.cat("keras:epochs=500:patience=10:nf=256:path=4:ol=15:mods=20", exampleStyles["edge"])
            exampleStyles["unmerging"] = Parameters.cat("keras:epochs=500:patience=10:nf=256:path=4:ol=15:mods=20", exampleStyles["unmerging"])
            exampleStyles["modifiers"] = Parameters.cat("keras:epochs=500:patience=10:nf=256:path=4:el=41:mods=20", exampleStyles["modifiers"])
        elif "EntityDetector" in detector:
            if task == "DDI13T91":
                exampleStyles["examples"] = Parameters.cat("keras:epochs=500:patience=10:nf=512:path=4:el=41:mods=20:names:build_for_nameless", exampleStyles["examples"])
            else:
                exampleStyles["examples"] = Parameters.cat("keras:epochs=500:patience=10:nf=512:path=4:el=41:mods=20", exampleStyles["examples"])
        elif "EdgeDetector" in detector:
            if "DDI" in task:
                exampleStyles["examples"] = Parameters.cat("keras:epochs=500:patience=10:nf=256:path=0:do=0.2:dense=800:ol=50:mods=20", exampleStyles["examples"])
            elif task == "CP17":
                exampleStyles["examples"] = Parameters.cat("keras:epochs=500:patience=10:nf=512:path=0:do=0.2:ol=50:skip_labels=CPR\:0,CPR\:1,CPR\:2,CPR\:7,CPR\:8,CPR\:10:mods=20", exampleStyles["examples"])
            else:
                exampleStyles["examples"] = Parameters.cat("keras:epochs=500:patience=10:nf=256:path=4:ol=15:mods=20", exampleStyles["examples"])
        print >> sys.stderr, "Keras initial example styles:", exampleStyles
        for key in exampleStyles:
            if exampleStyles[key] != None:
                exampleStyles[key] = Parameters.get(exampleStyles[key])
                exampleStyles[key].update(overrideStyles[key])
                exampleStyles[key].update(overrideStyles["all"])
                exampleStyles[key] = Parameters.toString(exampleStyles[key])
                print >> sys.stderr, "Keras final example style for " + key + ": ", exampleStyles[key]
    return detector

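# A hedged usage sketch for the Keras path (hypothetical file names and task). With an
# explicit Keras detector the structure analysis is skipped; the call fills the Keras
# example styles, and a style containing "override" is applied on top of that component's
# default style instead of being concatenated with it.
kerasStyles = {"trigger": "epochs=100:patience=5:override", "edge": None,
               "unmerging": None, "modifiers": None}
kerasParameters = {"examples": None, "trigger": None, "edge": None,
                   "unmerging": None, "modifiers": None, "recall": None}
learnSettings({"train": "train.xml", "devel": "devel.xml"}, "Detectors.KerasEventDetector",
              kerasParameters, "GE11", kerasStyles, useKerasDetector=True)
print >> sys.stderr, "Final trigger style:", kerasStyles["trigger"]
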
def convertDownloaded(outdir, corpus, files, intermediateFiles=True, evaluate=True, processEquiv=True,
                      addAnalyses=True, packageSubPath=None):
    global moveBI
    if evaluate:
        workdir = outdir + "/conversion/" + corpus
        if os.path.exists(workdir):
            shutil.rmtree(workdir)
        os.makedirs(workdir)
    print >> sys.stderr, "---------------", "Converting to XML", "---------------"
    # All datasets are processed as one XML, to ensure all the steps (parse modification etc.) are
    # applied equally
    #print corpus, files
    datasets = []
    for setName in ["devel", "train", "test"]:
        if corpus + "_" + setName.upper() in files:
            datasets.append(setName)
    bigfileName = os.path.join(outdir, corpus + "-" + "-and-".join(datasets))
    documents = []
    for setName in datasets:
        sourceFile = files[corpus + "_" + setName.upper()]
        print >> sys.stderr, "Reading", setName, "set from", sourceFile
        docs = ST.loadSet(sourceFile, setName, "a2", subPath=packageSubPath)
        print >> sys.stderr, "Read", len(docs), "documents"
        documents.extend(docs)
        if len(docs) > 0 and docs[0].license != None:
            licenseFile = open(os.path.join(outdir, corpus + "-LICENSE"), "wt")
            licenseFile.write(docs[0].license)
            licenseFile.close()
    if processEquiv:
        print >> sys.stderr, "Resolving equivalences"
        Utils.STFormat.Equiv.process(documents)
    else:
        print >> sys.stderr, "Skipping resolving of equivalences"
    if evaluate:
        #print >> sys.stderr, "Checking data validity"
        #for doc in documents:
        #    Utils.STFormat.Validate.validate(doc.events, simulation=True, verbose=True, docId=doc.id)
        print >> sys.stderr, "Writing all documents to geniaformat"
        ST.writeSet(documents, os.path.join(workdir, "all-geniaformat"), resultFileTag="a2", debug=False)
    if intermediateFiles:
        print >> sys.stderr, "Converting to XML, writing combined corpus to", bigfileName + "-documents.xml"
        xml = STConvert.toInteractionXML(documents, corpus, bigfileName + "-documents.xml")
    else:
        print >> sys.stderr, "Converting to XML"
        xml = STConvert.toInteractionXML(documents, corpus, None)
    if corpus == "BI11":
        Utils.InteractionXML.MixSets.mixSets(xml, None, set(moveBI), "train", "devel")
    if corpus == "REN11":
        corpusRENtoASCII(xml)
    if addAnalyses:
        insertAnalyses(xml, corpus, datasets, files, bigfileName, packageSubPath=packageSubPath)
    else:
        print >> sys.stderr, "Skipping adding analyses"
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName + "-sentences.xml"
        ETUtils.write(xml, bigfileName + "-sentences.xml")
    processParses(xml)
    print >> sys.stderr, "---------------", "Writing corpora", "---------------"
    checkAttributes(xml)
    # Write out converted data
    if intermediateFiles:
        print >> sys.stderr, "Writing combined corpus", bigfileName + ".xml"
        ETUtils.write(xml, bigfileName + ".xml")
    print >> sys.stderr, "Dividing into sets"
    Utils.InteractionXML.DivideSets.processCorpus(xml, outdir, corpus, ".xml")
    if evaluate and "devel" in datasets:
        print >> sys.stderr, "---------------", "Evaluating conversion", "---------------"
        if corpus != "REL11": # Task 1 (removal of Entity-entities) cannot work for REL
            print >> sys.stderr, "Evaluating task 1 back-conversion"
            STConvert.toSTFormat(os.path.join(outdir, corpus + "-devel.xml"),
                                 workdir + "/roundtrip/" + corpus + "-devel" + "-task1",
                                 outputTag="a2", skipArgs=["Site"])
            BioNLP11GeniaTools.evaluate(workdir + "/roundtrip/" + corpus + "-devel" + "-task1", corpus + ".1")
        print >> sys.stderr, "Evaluating task 2 back-conversion"
        STConvert.toSTFormat(os.path.join(outdir, corpus + "-devel.xml"),
                             workdir + "/roundtrip/" + corpus + "-devel" + "-task2", outputTag="a2")
        BioNLP11GeniaTools.evaluate(workdir + "/roundtrip/" + corpus + "-devel" + "-task2", corpus + ".2")
        print >> sys.stderr, "Note! Evaluation of Task 2 back-conversion can be less than 100% due to site-argument mapping"
    # Check what was produced by the conversion
    print >> sys.stderr, "---------------", "Corpus Structure Analysis", "---------------"
    analyzer = StructureAnalyzer()
    analyzer.analyze([xml])
    print >> sys.stderr, analyzer.toString()