def getDDI13Result(output, numFolds=10, catenate=False): global mainTEESDir foldPaths = [] scores = [] matrix = defaultdict(lambda:defaultdict(int)) for fold in range(numFolds): foldPath = os.path.join(output, "DDI13-fold" + str(fold), "classification-test", "test-pred.xml.gz") foldPaths.append(foldPath) logPath = os.path.join(output, "DDI13-fold" + str(fold), "log.txt") getDDI13ResultLine(logPath, "DDI13-fold" + str(fold), scores) foldPath = os.path.join(output, "DDI13-fold" + str(fold), "classification-test") classPath = os.path.join(output, "DDI13-fold" + str(fold), "model-test", "trigger-ids.classes") if not os.path.exists(classPath): classPath = os.path.join(output, "DDI13-fold" + str(fold), "model-test", "edge-ids.classes") addExamples(os.path.join(foldPath, "test-edge-examples.gz"), os.path.join(foldPath, "test-edge-classifications"), classPath, matrix) else: addExamples(os.path.join(foldPath, "test-trigger-examples.gz"), os.path.join(foldPath, "test-trigger-classifications"), classPath, matrix) #parameterPaths = [[":TRAIN:END-MODEL", "Selected parameters"]] #print "DDI13-fold" + str(fold) + ": " + getResultLine(logPath, parameterPaths) print "-----" for testSet in ["DDI13-test9.1", "DDI13-test9.2"]: logPath = os.path.join(output, testSet, "log.txt") getDDI13ResultLine(logPath, testSet) #parameterPaths = [[":TRAIN:END-MODEL", "Selected parameters"]] #print testSet + ": " + getResultLine(logPath, parameterPaths) predPath = os.path.join(output, testSet, "classification-test", "test-pred.xml.gz") DDITools.makeDDI13SubmissionFile(predPath, os.path.join(output, testSet + "-interactions.txt"), "interactions") DDITools.makeDDI13SubmissionFile(predPath, os.path.join(output, testSet + "-entities.txt"), "entities") print "-----" print "Avg-score: ", stats.mean(scores), "stdev", stats.stdev(scores) print "-----" print matrixToString(matrix) print matrixToString(matrix, True) if catenate and len(foldPaths) > 1: catPath = os.path.join(output, 
"DDI13-train-analyses.xml.gz") Catenate.catenate(foldPaths, catPath, fast=True) DDITools.makeDDI13SubmissionFile(catPath, os.path.join(output, "DDI13-train-interactions.txt"), "interactions") DDITools.makeDDI13SubmissionFile(catPath, os.path.join(output, "DDI13-train-entities.txt"), "entities")
def getDDI13Result(output, numFolds=10): global mainTEESDir from batch import batch foldPaths = [] for fold in range(numFolds): foldPath = os.path.join(output, "DDI13-fold" + str(fold), "classification-test", "test-pred.xml.gz") logPath = os.path.join(output, "DDI13-fold" + str(fold), "log.txt") tagPaths = [[ "------------ Test set classification ------------", "##### EvaluateInteractionXML #####", "Interactions", "micro p/n:" ]] print "DDI13-fold" + str(fold) + ": " + getResultLine( logPath, tagPaths) foldPaths.append(foldPath) if len(foldPaths) > 1: Catenate.catenate(foldPaths, os.path.join(output, "DDI13-train-analyses.xml.gz"), fast=True)
def getTaskSettings(task, detector, processUnmerging, processModifiers, isSingleStage,
                    bioNLPSTParams, preprocessorParams, inputFiles, exampleStyles, classifierParameters):
    """
    Fill in task-specific defaults for every training setting that is undefined (None).

    When 'task' is None no defaults are applied and only 'removeNamesFromEmpty'
    is derived from the given example styles. Otherwise the task id (optionally
    with a ".<subTask>" suffix and/or a "-MINI" marker) selects default input
    files, detector, shared task and preprocessor parameters, example styles
    and classifier parameters. Each chosen default is reported on stderr.

    @param task: task identifier or None
    @param detector: detector class name or None
    @param processUnmerging: True/False or None for the task default
    @param processModifiers: True/False or None for the task default
    @param isSingleStage: whether a single-stage detector is used (set to True
                          when an EdgeDetector is chosen by default)
    @param bioNLPSTParams: shared task parameter string or None
    @param preprocessorParams: preprocessor parameter string or None
    @param inputFiles: dict with keys "devel", "train", "test"; modified in place
    @param exampleStyles: dict with keys "examples", "edge", "trigger", "unmerging"; modified in place
    @param classifierParameters: dict with keys "examples", "trigger", "recall",
                                 "edge", "unmerging", "modifiers"; modified in place
    @return: tuple (detector, processUnmerging, processModifiers, isSingleStage,
             bioNLPSTParams, preprocessorParams, exampleStyles,
             classifierParameters, removeNamesFromEmpty)
    @raise AssertionError: if the task id is not one of the known tasks
    """
    if task is not None:
        print >> sys.stderr, "Determining training settings for task", task
        assert task.replace("-MINI", "") in ["GE09", "GE09.1", "GE09.2", "GE", "GE.1", "GE.2", "EPI", "ID", "BB", "BI", "BI-FULL", "CO", "REL", "REN", "DDI", "DDI-FULL"], task

        fullTaskId = task
        subTask = 2
        if "." in task:
            task, subTask = task.split(".")
            subTask = int(subTask)
        dataPath = Settings.CORPUS_DIR
        # Default input files. The "-MINI" marker is still part of 'task' here,
        # so it ends up in the default file names; "-FULL" does not.
        corpusName = task.replace("-FULL", "")
        if inputFiles["devel"] is None:
            inputFiles["devel"] = os.path.join(dataPath, corpusName + "-devel.xml")
        if inputFiles["train"] is None:
            if task == "ID": # add GE-task data to the ID training set
                inputFiles["train"] = Catenate.catenate([os.path.join(dataPath, "ID-train.xml"),
                                                         os.path.join(dataPath, "GE-devel.xml"),
                                                         os.path.join(dataPath, "GE-train.xml")],
                                                        "training/ID-train-and-GE-devel-and-train.xml.gz", fast=True)
            else:
                inputFiles["train"] = os.path.join(dataPath, corpusName + "-train.xml")
        if inputFiles["test"] is None:
            inputFiles["test"] = os.path.join(dataPath, corpusName + "-test.xml")

        task = task.replace("-MINI", "")
        # Example generation parameters
        if detector is None:
            detector = "Detectors.EventDetector"
            if task == "CO":
                detector = "Detectors.CODetector"
            elif task in ["REN", "BI", "DDI"]:
                detector = "Detectors.EdgeDetector"
                isSingleStage = True
            print >> sys.stderr, "Detector undefined, using default '" + detector + "' for task", fullTaskId
        if bioNLPSTParams is None and task not in ["DDI", "DDI-FULL"]:
            bioNLPSTParams = "convert:evaluate:scores"
            if task == "BI-FULL":
                bioNLPSTParams = "convert:scores" # the shared task evaluator is not designed for predicted entities
            print >> sys.stderr, "BioNLP Shared Task parameters undefined, using default '" + bioNLPSTParams + "' for task", fullTaskId
        if preprocessorParams is None:
            preprocessorParams = ["intermediateFiles"]
            if task in ["BI", "BI-FULL", "BB", "DDI", "DDI-FULL"]:
                preprocessorParams += ["omitSteps=NER,DIVIDE-SETS"]
            else:
                preprocessorParams += ["omitSteps=DIVIDE-SETS"]
                preprocessorParams += ["PARSE.requireEntities"] # parse only sentences where BANNER found an entity
            preprocessorParams = ":".join(preprocessorParams)
            print >> sys.stderr, "Preprocessor parameters undefined, using default '" + preprocessorParams + "' for task", fullTaskId
        if processUnmerging is None and not isSingleStage:
            processUnmerging = True
            if task in ["CO", "REL", "BB", "BI-FULL", "DDI-FULL"]:
                processUnmerging = False
            print >> sys.stderr, "Unmerging undefined, using default", processUnmerging, "for task", fullTaskId
        if processModifiers is None:
            processModifiers = False
            if task in ["GE", "EPI", "ID"]:
                processModifiers = True
            print >> sys.stderr, "Modifier prediction undefined, using default", processModifiers, " for task", fullTaskId

        # Example style parameters
        if exampleStyles["examples"] is None and isSingleStage:
            if task == "REN":
                exampleStyles["examples"] = "trigger_features:typed:no_linear:entities:noMasking:maxFeatures:bacteria_renaming:maskTypeAsProtein=Gene"
            elif task == "BI":
                exampleStyles["examples"] = "trigger_features:typed:directed:no_linear:entities:noMasking:maxFeatures:bi_limits"
            elif task == "DDI":
                exampleStyles["examples"] = "trigger_features:typed:no_linear:entities:noMasking:maxFeatures:ddi_features:ddi_mtmx:filter_shortest_path=conj_and"
            # Only REN, BI and DDI have a single-stage default. For any other
            # task the style stays None, which must not be concatenated into
            # the message below.
            if exampleStyles["examples"] is not None:
                print >> sys.stderr, "Single-stage examples style undefined, using default '" + exampleStyles["examples"] + "' for task", fullTaskId
            else:
                print >> sys.stderr, "Single-stage examples style undefined, no default available for task", fullTaskId
        if exampleStyles["edge"] is None and not isSingleStage:
            print >> sys.stderr, "Edge example style undefined, using default for task", fullTaskId
            if task in ["GE09", "GE"]:
                exampleStyles["edge"] = "trigger_features:typed:directed:no_linear:entities:genia_limits:noMasking:maxFeatures"
                if subTask == 1:
                    exampleStyles["edge"] += ":genia_task1"
            elif task in ["BB"]:
                exampleStyles["edge"] = "trigger_features:typed:directed:no_linear:entities:bb_limits:noMasking:maxFeatures"
            elif task == "EPI":
                exampleStyles["edge"] = "trigger_features:typed:directed:no_linear:entities:epi_limits:noMasking:maxFeatures"
            elif task == "ID":
                exampleStyles["edge"] = "trigger_features:typed:directed:no_linear:entities:id_limits:noMasking:maxFeatures"
            elif task == "REL":
                exampleStyles["edge"] = "trigger_features:typed:directed:no_linear:entities:noMasking:maxFeatures:rel_limits:rel_features"
            elif task == "CO":
                exampleStyles["edge"] = "trigger_features:typed:directed:no_linear:entities:noMasking:maxFeatures:co_limits"
            elif task == "BI-FULL":
                exampleStyles["edge"] = "trigger_features:typed:directed:no_linear:entities:noMasking:maxFeatures:bi_limits"
            elif task == "DDI-FULL":
                exampleStyles["edge"] = "trigger_features:typed:no_linear:entities:noMasking:maxFeatures:ddi_features:filter_shortest_path=conj_and"
            else:
                exampleStyles["edge"] = "trigger_features:typed:directed:no_linear:entities:noMasking:maxFeatures"
        if exampleStyles["trigger"] is None and not isSingleStage:
            print >> sys.stderr, "Trigger example style undefined, using default for task", fullTaskId
            if task in ["GE09", "GE"] and subTask == 1:
                exampleStyles["trigger"] = "genia_task1"
            elif task == "EPI":
                exampleStyles["trigger"] = "epi_merge_negated"
            elif task == "BB":
                exampleStyles["trigger"] = "bb_features:build_for_nameless:wordnet"
            elif task == "REL":
                exampleStyles["trigger"] = "rel_features"
            elif task == "CO":
                # NOTE(review): this writes to a module-level 'options' object that
                # is not a parameter of this function -- confirm it is defined
                # wherever this function is called from.
                options.triggerExampleBuilder = "PhraseTriggerExampleBuilder"
            elif task in ["BI-FULL", "DDI-FULL"]:
                exampleStyles["trigger"] = "build_for_nameless:names"
        if exampleStyles["unmerging"] is None and not isSingleStage:
            exampleStyles["unmerging"] = "trigger_features:typed:directed:no_linear:entities:genia_limits:noMasking:maxFeatures"

        # Classifier parameters
        if classifierParameters["examples"] is None and isSingleStage:
            print >> sys.stderr, "Classifier parameters for single-stage examples undefined, using default for task", fullTaskId
            if task == "REN":
                classifierParameters["examples"] = "10,100,1000,2000,3000,4000,4500,5000,5500,6000,7500,10000,20000,25000,28000,50000,60000"
            elif task == "BI":
                classifierParameters["examples"] = "10,100,1000,2500,5000,7500,10000,20000,25000,28000,50000,60000,65000,80000,100000,150000"
            elif task == "DDI":
                classifierParameters["examples"] = "c=10,100,1000,2500,4000,5000,6000,7500,10000,20000,25000,50000:TEES.threshold"
        if classifierParameters["trigger"] is None and not isSingleStage:
            print >> sys.stderr, "Classifier parameters for trigger examples undefined, using default for task", fullTaskId
            classifierParameters["trigger"] = "1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000"
        if classifierParameters["recall"] is None and not isSingleStage:
            print >> sys.stderr, "Recall adjust parameter undefined, using default for task", fullTaskId
            classifierParameters["recall"] = "0.5,0.6,0.65,0.7,0.85,1.0,1.1,1.2"
            if task == "CO":
                classifierParameters["recall"] = "0.8,0.9,0.95,1.0"
        if classifierParameters["edge"] is None and not isSingleStage:
            print >> sys.stderr, "Classifier parameters for edge examples undefined, using default for task", fullTaskId
            classifierParameters["edge"] = "5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000"
            if task in ["REL", "CO"]:
                classifierParameters["edge"] = "10,100,1000,5000,7500,10000,20000,25000,28000,50000,60000,65000,100000,500000,1000000"
        if classifierParameters["unmerging"] is None and not isSingleStage:
            print >> sys.stderr, "Classifier parameters for unmerging examples undefined, using default for task", fullTaskId
            classifierParameters["unmerging"] = "1,10,100,500,1000,1500,2500,5000,10000,20000,50000,80000,100000"
        if classifierParameters["modifiers"] is None and not isSingleStage:
            print >> sys.stderr, "Classifier parameters for modifier examples undefined, using default for task", fullTaskId
            classifierParameters["modifiers"] = "5000,10000,20000,50000,100000"

    # The "names" style is looked up in the single-stage style for single-stage
    # detectors and in the trigger style otherwise.
    if isSingleStage:
        namesStyle = exampleStyles["examples"]
    else:
        namesStyle = exampleStyles["trigger"]
    removeNamesFromEmpty = namesStyle is not None and "names" in namesStyle
    return detector, processUnmerging, processModifiers, isSingleStage, bioNLPSTParams, preprocessorParams, exampleStyles, classifierParameters, removeNamesFromEmpty
def getTaskSettings(task, detector, bioNLPSTParams, preprocessorParams,
                    inputFiles, exampleStyles, classifierParameters):
    """
    Apply the task-specific training defaults for 'task'.

    When 'task' is None nothing is changed. Otherwise default input files are
    chosen for undefined datasets, missing files are dropped (with a message
    on stderr), and the task id selects the detector, the shared task
    parameters and the entries of 'exampleStyles' and 'classifierParameters'.
    Task defaults are combined with the caller's values via Parameters.cat.

    @param task: task identifier, optionally with a ".<subTask>" suffix, or None
    @param detector: detector class name; replaced for CO11, BI11-FULL and DDI11-FULL
    @param bioNLPSTParams: shared task parameters given by the caller
    @param preprocessorParams: preprocessor parameters given by the caller
    @param inputFiles: dict with keys "devel", "train", "test"; modified in place.
                       None selects the task default, the string "None" disables the set.
    @param exampleStyles: dict of example styles; modified in place
    @param classifierParameters: dict of classifier parameters; modified in place
    @return: tuple (detector, bioNLPSTParams, preprocessorParams)
    @raise AssertionError: if no training file exists for the task
    """
    if task is not None:
        print >> sys.stderr, "*** Defining training settings for task", task, "***"
        fullTaskId = task
        subTask = 2
        if "." in task:
            task, subTask = task.split(".")
            subTask = int(subTask)
        dataPath = Settings.CORPUS_DIR
        for dataset in ["devel", "train", "test"]:
            if inputFiles[dataset] is None:
                inputFiles[dataset] = os.path.join(dataPath, task.replace("-FULL", "") + "-" + dataset + ".xml")
                if task == "ID11" and dataset == "train":
                    # The ID11 training set is extended with the GE11 data
                    inputFiles[dataset] = Catenate.catenate([os.path.join(dataPath, "ID11-train.xml"),
                                                             os.path.join(dataPath, "GE11-devel.xml"),
                                                             os.path.join(dataPath, "GE11-train.xml")],
                                                            "training/ID11-train-and-GE11-devel-and-train.xml.gz", fast=True)
            if inputFiles[dataset] == "None":
                inputFiles[dataset] = None
            if inputFiles[dataset] is not None and not os.path.exists(inputFiles[dataset]):
                # Report the missing path before clearing it, so the message names the file
                print >> sys.stderr, "Input file", inputFiles[dataset], "for set '" + dataset + "' does not exist, skipping."
                inputFiles[dataset] = None
        assert inputFiles["train"] is not None # at least training set must exist
        # Example generation parameters
        if task == "CO11":
            detector = "Detectors.CODetector"
        elif task in ["BI11-FULL", "DDI11-FULL"]:
            detector = "Detectors.EventDetector"

        # BioNLP Shared Task and preprocessing parameters
        if task == "BI11-FULL":
            bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert:scores", "BioNLP Shared Task / " + fullTaskId, ["default"]) # the shared task evaluator is not designed for predicted entities
        elif task == "REL11":
            bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert:evaluate:scores:a2Tag=rel", "BioNLP Shared Task / " + fullTaskId, ["default"])
        elif task not in ["DDI11", "DDI11-FULL", "DDI13"]:
            bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert:evaluate:scores", "BioNLP Shared Task / " + fullTaskId, ["default"])

        # Preprocessing parameters
        # NOTE(review): the value returned by Parameters.cat is discarded in both
        # branches, so the preprocessorParams returned below is whatever the
        # caller passed in (unless Parameters.cat has side effects) -- confirm
        # whether that is intended.
        if task in ["BI11", "BI11-FULL", "BB11", "DDI11", "DDI11-FULL"]:
            Parameters.cat("intermediateFiles:omitSteps=NER,DIVIDE-SETS", preprocessorParams, "Preprocessor /" + fullTaskId, ["default"])
        else: # parse only sentences where BANNER found an entity
            Parameters.cat("intermediateFiles:omitSteps=DIVIDE-SETS:PARSE.requireEntities", preprocessorParams, "Preprocessor /" + fullTaskId, ["default"])

        # Example style parameters for single-stage tasks
        if task == "REN11":
            exampleStyles["examples"] = Parameters.cat("undirected:bacteria_renaming:maskTypeAsProtein=Gene", exampleStyles["examples"], "Single-stage example style / " + fullTaskId)
        elif task == "DDI11":
            exampleStyles["examples"] = Parameters.cat("drugbank_features:ddi_mtmx:filter_shortest_path=conj_and", exampleStyles["examples"], "Single-stage example style / " + fullTaskId)
        elif task == "DDI13":
            exampleStyles["examples"] = Parameters.cat("keep_neg:drugbank_features:filter_shortest_path=conj_and", exampleStyles["examples"], "Single-stage example style / " + fullTaskId)
        elif task == "BI11":
            # NOTE(review): this branch sits with the single-stage styles but
            # sets the "edge" style -- confirm which key BI11 should use.
            exampleStyles["edge"] = Parameters.cat("bi_features", exampleStyles["edge"], "Edge example style / " + fullTaskId)
        # Edge style
        if task in ["GE09", "GE11", "GE13"] and subTask == 1:
            exampleStyles["edge"] = Parameters.cat("genia_features:genia_task1", exampleStyles["edge"])
        elif task in ["GE09", "GE11", "GE13"]:
            exampleStyles["edge"] = Parameters.cat("genia_features", exampleStyles["edge"])
        elif task == "REL11":
            exampleStyles["edge"] = Parameters.cat("rel_features", exampleStyles["edge"], "Edge example style / " + fullTaskId)
        elif task == "DDI11-FULL":
            exampleStyles["edge"] = Parameters.cat("drugbank_features:filter_shortest_path=conj_and", exampleStyles["edge"], "Edge example style / " + fullTaskId)
        elif task == "CO11":
            exampleStyles["edge"] = Parameters.cat("co_features", exampleStyles["edge"], "Edge example style / " + fullTaskId)
        elif task == "BI11-FULL":
            exampleStyles["edge"] = Parameters.cat("bi_features", exampleStyles["edge"], "Edge example style / " + fullTaskId)
        # Trigger style
        if task in ["GE09", "GE11", "GE13"] and subTask == 1:
            exampleStyles["trigger"] = Parameters.cat("genia_task1", exampleStyles["trigger"], "Trigger example style / " + fullTaskId)
        elif task in ["EPI11", "PC13"]:
            exampleStyles["trigger"] = Parameters.cat("epi_merge_negated", exampleStyles["trigger"], "Trigger example style / " + fullTaskId)
        elif task == "BB11": # "bb_features:build_for_nameless:wordnet"
            exampleStyles["trigger"] = Parameters.cat("bb_features:build_for_nameless", exampleStyles["trigger"], "Trigger example style / " + fullTaskId)
        elif task == "BB13T3": # "bb_features:build_for_nameless:wordnet"
            exampleStyles["trigger"] = Parameters.cat("bb_features:build_for_nameless", exampleStyles["trigger"], "Trigger example style / " + fullTaskId)
        elif task == "REL11":
            exampleStyles["trigger"] = Parameters.cat("rel_features", exampleStyles["trigger"], "Trigger example style / " + fullTaskId)
        elif task in ["BI11-FULL", "DDI11-FULL"]:
            exampleStyles["trigger"] = "build_for_nameless:names"
        # Classifier parameters
        if task == "DDI11":
            classifierParameters["examples"] = Parameters.cat("c=10,100,1000,2500,4000,5000,6000,7500,10000,20000,25000,50000:TEES.threshold", classifierParameters["examples"], "Classifier parameters for single-stage examples" + fullTaskId)
        elif task == "CO11":
            # Each default is merged with the caller's value for the same key
            # ("edge" with "edge", "trigger" with "trigger").
            classifierParameters["edge"] = Parameters.cat("c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000", classifierParameters["edge"], "Classifier parameters for edges / " + fullTaskId)
            classifierParameters["trigger"] = Parameters.cat("c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000", classifierParameters["trigger"], "Classifier parameters for triggers / " + fullTaskId)
            classifierParameters["recall"] = Parameters.cat("0.8,0.9,0.95,1.0", classifierParameters["recall"], "Recall adjust / " + fullTaskId)

    return detector, bioNLPSTParams, preprocessorParams
def getTaskSettings(task, detector, bioNLPSTParams, preprocessorParams,
                    inputFiles, exampleStyles, classifierParameters):
    """
    Apply the task-specific training defaults for 'task'.

    When 'task' is None nothing is changed. Otherwise default input files are
    chosen for undefined datasets, missing files are dropped (with a message
    on stderr), and the task id selects the detector, the shared task
    parameters and the entries of 'exampleStyles' and 'classifierParameters'.
    Task defaults are combined with the caller's values via Parameters.cat.

    @param task: task identifier, optionally with a ".<subTask>" suffix, or None
    @param detector: detector class name; replaced for CO11, BI11-FULL and DDI11-FULL
    @param bioNLPSTParams: shared task parameters given by the caller
    @param preprocessorParams: preprocessor parameters given by the caller
    @param inputFiles: dict with keys "devel", "train", "test"; modified in place.
                       None selects the task default, the string "None" disables the set.
    @param exampleStyles: dict of example styles; modified in place
    @param classifierParameters: dict of classifier parameters; modified in place
    @return: tuple (detector, bioNLPSTParams, preprocessorParams)
    @raise AssertionError: if no training file exists for the task
    """
    if task is not None:
        print >> sys.stderr, "*** Defining training settings for task", task, "***"
        fullTaskId = task
        subTask = 2
        if "." in task:
            task, subTask = task.split(".")
            subTask = int(subTask)
        dataPath = Settings.CORPUS_DIR
        for dataset in ["devel", "train", "test"]:
            if inputFiles[dataset] is None:
                inputFiles[dataset] = os.path.join(
                    dataPath, task.replace("-FULL", "") + "-" + dataset + ".xml")
                if task == "ID11" and dataset == "train":
                    # The ID11 training set is extended with the GE11 data
                    inputFiles[dataset] = Catenate.catenate(
                        [
                            os.path.join(dataPath, "ID11-train.xml"),
                            os.path.join(dataPath, "GE11-devel.xml"),
                            os.path.join(dataPath, "GE11-train.xml")
                        ],
                        "training/ID11-train-and-GE11-devel-and-train.xml.gz",
                        fast=True)
            if inputFiles[dataset] == "None":
                inputFiles[dataset] = None
            if inputFiles[dataset] is not None and not os.path.exists(
                    inputFiles[dataset]):
                # Report the missing path before clearing it, so the message names the file
                print >> sys.stderr, "Input file", inputFiles[
                    dataset], "for set '" + dataset + "' does not exist, skipping."
                inputFiles[dataset] = None
        assert inputFiles["train"] is not None  # at least training set must exist
        # Example generation parameters
        if task == "CO11":
            detector = "Detectors.CODetector"
        elif task in ["BI11-FULL", "DDI11-FULL"]:
            detector = "Detectors.EventDetector"

        # BioNLP Shared Task and preprocessing parameters
        stMsg = "BioNLP Shared Task / " + fullTaskId
        if task == "BI11-FULL":
            # the shared task evaluator is not designed for predicted entities
            bioNLPSTParams = Parameters.cat(
                bioNLPSTParams, "convert:scores", stMsg, ["default"])
        elif task == "REL11":
            bioNLPSTParams = Parameters.cat(
                bioNLPSTParams, "convert:evaluate:scores:a2Tag=rel", stMsg, ["default"])
        elif task not in ["DDI11", "DDI11-FULL", "DDI13"]:
            bioNLPSTParams = Parameters.cat(
                bioNLPSTParams, "convert:evaluate:scores", stMsg, ["default"])

        # Preprocessing parameters
        # NOTE(review): the value returned by Parameters.cat is discarded in both
        # branches, so the preprocessorParams returned below is whatever the
        # caller passed in (unless Parameters.cat has side effects) -- confirm
        # whether that is intended.
        if task in ["BI11", "BI11-FULL", "BB11", "DDI11", "DDI11-FULL"]:
            Parameters.cat("intermediateFiles:omitSteps=NER,DIVIDE-SETS",
                           preprocessorParams, "Preprocessor /" + fullTaskId,
                           ["default"])
        else:  # parse only sentences where BANNER found an entity
            Parameters.cat(
                "intermediateFiles:omitSteps=DIVIDE-SETS:PARSE.requireEntities",
                preprocessorParams, "Preprocessor /" + fullTaskId, ["default"])

        # Example style parameters for single-stage tasks
        singleMsg = "Single-stage example style / " + fullTaskId
        edgeMsg = "Edge example style / " + fullTaskId
        triggerMsg = "Trigger example style / " + fullTaskId
        if task == "REN11":
            exampleStyles["examples"] = Parameters.cat(
                "undirected:bacteria_renaming:maskTypeAsProtein=Gene",
                exampleStyles["examples"], singleMsg)
        elif task == "DDI11":
            exampleStyles["examples"] = Parameters.cat(
                "drugbank_features:ddi_mtmx:filter_shortest_path=conj_and",
                exampleStyles["examples"], singleMsg)
        elif task == "DDI13":
            exampleStyles["examples"] = Parameters.cat(
                "keep_neg:drugbank_features:filter_shortest_path=conj_and",
                exampleStyles["examples"], singleMsg)
        elif task == "BI11":
            # NOTE(review): this branch sits with the single-stage styles but
            # sets the "edge" style -- confirm which key BI11 should use.
            exampleStyles["edge"] = Parameters.cat(
                "bi_features", exampleStyles["edge"], edgeMsg)
        # Edge style
        if task in ["GE09", "GE11", "GE13"] and subTask == 1:
            exampleStyles["edge"] = Parameters.cat(
                "genia_features:genia_task1", exampleStyles["edge"])
        elif task in ["GE09", "GE11", "GE13"]:
            exampleStyles["edge"] = Parameters.cat(
                "genia_features", exampleStyles["edge"])
        elif task == "REL11":
            exampleStyles["edge"] = Parameters.cat(
                "rel_features", exampleStyles["edge"], edgeMsg)
        elif task == "DDI11-FULL":
            exampleStyles["edge"] = Parameters.cat(
                "drugbank_features:filter_shortest_path=conj_and",
                exampleStyles["edge"], edgeMsg)
        elif task == "CO11":
            exampleStyles["edge"] = Parameters.cat(
                "co_features", exampleStyles["edge"], edgeMsg)
        elif task == "BI11-FULL":
            exampleStyles["edge"] = Parameters.cat(
                "bi_features", exampleStyles["edge"], edgeMsg)
        # Trigger style
        if task in ["GE09", "GE11", "GE13"] and subTask == 1:
            exampleStyles["trigger"] = Parameters.cat(
                "genia_task1", exampleStyles["trigger"], triggerMsg)
        elif task in ["EPI11", "PC13"]:
            exampleStyles["trigger"] = Parameters.cat(
                "epi_merge_negated", exampleStyles["trigger"], triggerMsg)
        elif task == "BB11":  # "bb_features:build_for_nameless:wordnet"
            exampleStyles["trigger"] = Parameters.cat(
                "bb_features:build_for_nameless", exampleStyles["trigger"], triggerMsg)
        elif task == "BB13T3":  # "bb_features:build_for_nameless:wordnet"
            exampleStyles["trigger"] = Parameters.cat(
                "bb_features:build_for_nameless", exampleStyles["trigger"], triggerMsg)
        elif task == "REL11":
            exampleStyles["trigger"] = Parameters.cat(
                "rel_features", exampleStyles["trigger"], triggerMsg)
        elif task in ["BI11-FULL", "DDI11-FULL"]:
            exampleStyles["trigger"] = "build_for_nameless:names"
        # Classifier parameters
        if task == "DDI11":
            classifierParameters["examples"] = Parameters.cat(
                "c=10,100,1000,2500,4000,5000,6000,7500,10000,20000,25000,50000:TEES.threshold",
                classifierParameters["examples"],
                "Classifier parameters for single-stage examples" + fullTaskId)
        elif task == "CO11":
            # Each default is merged with the caller's value for the same key
            # ("edge" with "edge", "trigger" with "trigger").
            classifierParameters["edge"] = Parameters.cat(
                "c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000",
                classifierParameters["edge"],
                "Classifier parameters for edges / " + fullTaskId)
            classifierParameters["trigger"] = Parameters.cat(
                "c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000",
                classifierParameters["trigger"],
                "Classifier parameters for triggers / " + fullTaskId)
            classifierParameters["recall"] = Parameters.cat(
                "0.8,0.9,0.95,1.0", classifierParameters["recall"],
                "Recall adjust / " + fullTaskId)

    return detector, bioNLPSTParams, preprocessorParams
def getTaskSettings(task,
                    detector,
                    bioNLPSTParams,
                    preprocessorParams,
                    inputFiles,
                    exampleStyles,
                    classifierParameters,
                    folds,
                    corpusDir=None):
    """
    Apply the task-specific training defaults for 'task'.

    When 'task' is None nothing is changed. Otherwise default input files are
    chosen for undefined datasets, files that cannot be found are dropped
    (with a message on stderr), and the task id selects the detector, the
    shared task parameters, the entries of 'exampleStyles' and
    'classifierParameters' and, for DDI13 tasks, the training folds. Task
    defaults are combined with the caller's values via Parameters.cat.

    @param task: task identifier, optionally with a ".<subTask>" suffix, or None
    @param detector: detector class name; replaced for tasks that need a specific detector
    @param bioNLPSTParams: shared task parameters given by the caller
    @param preprocessorParams: preprocessor parameters given by the caller
    @param inputFiles: dict with keys "devel", "train", "test"; modified in place.
                       None selects the task default, the string "None" disables the set.
    @param exampleStyles: dict of example styles; modified in place
    @param classifierParameters: dict of classifier parameters; modified in place
    @param folds: dict of fold names per dataset; "devel" and "train" are set for DDI13 tasks
    @param corpusDir: directory of the default corpus files, Settings.CORPUS_DIR if None
    @return: tuple (detector, bioNLPSTParams, preprocessorParams, folds)
    @raise AssertionError: if no training file exists for the task
    """
    if task is not None:
        print >> sys.stderr, "*** Defining training settings for task", task, "***"
        fullTaskId = task
        subTask = 2
        if "." in task:
            task, subTask = task.split(".")
            subTask = int(subTask)
        if corpusDir is None:
            corpusDir = Settings.CORPUS_DIR
        for dataset in ["devel", "train", "test"]:
            if inputFiles[dataset] is None:
                if task.startswith("DDI13"):
                    # Both devel and train come from the DDI13 training corpus;
                    # the folds set at the end of this function tell them apart.
                    if dataset in ["devel", "train"]:
                        inputFiles[dataset] = os.path.join(corpusDir, "DDI13-train.xml")
                    elif dataset == "test":
                        if task.endswith("T91"):
                            inputFiles[dataset] = os.path.join(corpusDir, "DDI13-test-task9.1.xml")
                        elif task.endswith("T92") or task.endswith("FULL"):
                            inputFiles[dataset] = os.path.join(corpusDir, "DDI13-test-task9.2.xml")
                elif task == "ID11" and dataset == "train":
                    # The ID11 training set is extended with the GE11 data
                    inputFiles[dataset] = Catenate.catenate(
                        [
                            os.path.join(corpusDir, "ID11-train.xml"),
                            os.path.join(corpusDir, "GE11-devel.xml"),
                            os.path.join(corpusDir, "GE11-train.xml")
                        ],
                        "training/ID11-train-and-GE11-devel-and-train.xml.gz",
                        fast=True)
                else:
                    inputFiles[dataset] = os.path.join(
                        corpusDir, task.replace("-FULL", "") + "-" + dataset + ".xml")
            if inputFiles[dataset] == "None":
                inputFiles[dataset] = None
            if inputFiles[dataset] is not None and not os.path.exists(inputFiles[dataset]):
                # Retry the path relative to the default corpus directory.
                # NOTE(review): this uses Settings.CORPUS_DIR even when a
                # 'corpusDir' argument was given -- confirm that is intended.
                fullPath = os.path.join(Settings.CORPUS_DIR, inputFiles[dataset])
                if os.path.exists(fullPath):
                    inputFiles[dataset] = fullPath
                else:
                    # Report the missing path before clearing it, so the message names the file
                    print >> sys.stderr, "Input file", inputFiles[dataset], "for set '" + dataset + "' does not exist, skipping."
                    inputFiles[dataset] = None
        assert inputFiles["train"] is not None  # at least training set must exist
        # Example generation parameters
        if task == "CO11":
            detector = "Detectors.CODetector"
        elif task in ["BI11-FULL", "DDI11-FULL", "DDI13-FULL", "BB_EVENT_16-FULL"]:
            detector = "Detectors.EventDetector"
        elif task.startswith("DDI13"):
            if task.endswith("T91"):
                detector = "Detectors.EntityDetector"
            elif task.endswith("T92"):
                detector = "Detectors.EdgeDetector"

        #######################################################################
        # BioNLP Shared Task and preprocessing parameters
        #######################################################################
        msg = "BioNLP Shared Task / " + fullTaskId
        if task == "BI11-FULL":
            # the shared task evaluator is not designed for predicted entities
            bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert:scores", msg, ["default"])
        elif task == "REL11":
            bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert:evaluate:scores:a2Tag=rel", msg, ["default"])
        elif task in ("BB_EVENT_16", "BB_EVENT_16-FULL", "BB_EVENT_NER_16", "SDB16"):
            bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert=zip", msg, ["default"])
        elif task not in ["DDI11", "DDI11-FULL", "DDI13T91", "DDI13T92", "DDI13-FULL"]:
            bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert:evaluate:scores", msg, ["default"])

        #######################################################################
        # Preprocessing parameters
        #######################################################################
        # NOTE(review): the value returned by Parameters.cat is discarded in both
        # branches, so the preprocessorParams returned below is whatever the
        # caller passed in (unless Parameters.cat has side effects) -- confirm
        # whether that is intended.
        if task in ["BI11", "BI11-FULL", "BB11", "DDI11", "DDI11-FULL", "DDI13T91", "DDI13T92", "DDI13-FULL"]:
            Parameters.cat("intermediateFiles:omitSteps=NER,DIVIDE-SETS",
                           preprocessorParams, "Preprocessor /" + fullTaskId, ["default"])
        else:  # parse only sentences where BANNER found an entity
            Parameters.cat("intermediateFiles:omitSteps=DIVIDE-SETS:PARSE.requireEntities",
                           preprocessorParams, "Preprocessor /" + fullTaskId, ["default"])

        #######################################################################
        # Example style parameters
        #######################################################################
        # Example style parameters for single-stage tasks #####################
        msg = "Single-stage example style / " + fullTaskId
        if task == "REN11":
            exampleStyles["examples"] = Parameters.cat(
                "undirected:bacteria_renaming:maskTypeAsProtein=Gene", exampleStyles["examples"], msg)
        elif task == "DDI11":
            exampleStyles["examples"] = Parameters.cat(
                "drugbank_features:ddi_mtmx:filter_shortest_path=conj_and", exampleStyles["examples"], msg)
        elif task.startswith("DDI13"):
            if task.endswith("T91"):
                exampleStyles["examples"] = Parameters.cat(
                    "names:build_for_nameless:ddi13_features:drugbank_features", exampleStyles["examples"], msg)
            elif task.endswith("T92"):
                exampleStyles["examples"] = Parameters.cat(
                    "keep_neg:drugbank_features:filter_shortest_path=conj_and", exampleStyles["examples"], msg)
        elif task == "BI11":
            exampleStyles["examples"] = Parameters.cat("bi_features", exampleStyles["examples"], msg)
        elif task == "BB_EVENT_16":
            exampleStyles["examples"] = Parameters.cat("keep_neg", exampleStyles["examples"], msg)
        elif task == "SDB16":
            exampleStyles["examples"] = Parameters.cat("sdb_merge:sdb_features", exampleStyles["examples"], msg)
        # Edge style ##########################################################
        msg = "Edge example style / " + fullTaskId
        if task in ["GE09", "GE11", "GE13"] and subTask == 1:
            exampleStyles["edge"] = Parameters.cat("genia_features:genia_task1", exampleStyles["edge"], msg)
        elif task in ["GE09", "GE11", "GE13"]:
            exampleStyles["edge"] = Parameters.cat("genia_features", exampleStyles["edge"], msg)
        elif task == "REL11":
            exampleStyles["edge"] = Parameters.cat("rel_features", exampleStyles["edge"], msg)
        elif task == "DDI11-FULL":
            exampleStyles["edge"] = Parameters.cat(
                "drugbank_features:filter_shortest_path=conj_and", exampleStyles["edge"], msg)
        elif task == "DDI13-FULL":
            exampleStyles["edge"] = Parameters.cat(
                "keep_neg:drugbank_features:filter_shortest_path=conj_and", exampleStyles["edge"], msg)
        elif task == "CO11":
            exampleStyles["edge"] = Parameters.cat("co_features", exampleStyles["edge"], msg)
        elif task == "BI11-FULL":
            exampleStyles["edge"] = Parameters.cat("bi_features", exampleStyles["edge"], msg)
        # Trigger style #######################################################
        msg = "Trigger example style / " + fullTaskId
        if task in ["GE09", "GE11", "GE13"] and subTask == 1:
            exampleStyles["trigger"] = Parameters.cat("genia_task1", exampleStyles["trigger"], msg)
        elif task in ["EPI11", "PC13"]:
            exampleStyles["trigger"] = Parameters.cat("epi_merge_negated", exampleStyles["trigger"], msg)
        elif task == "BB11":  # "bb_features:build_for_nameless:wordnet"
            exampleStyles["trigger"] = Parameters.cat("bb_features", exampleStyles["trigger"], msg)
        elif task == "BB13T3":  # "bb_features:build_for_nameless:wordnet"
            exampleStyles["trigger"] = Parameters.cat("bb_features", exampleStyles["trigger"], msg)
        elif task == "REL11":
            exampleStyles["trigger"] = Parameters.cat("rel_features", exampleStyles["trigger"], msg)
        elif task in ["BI11-FULL", "DDI11-FULL"]:
            exampleStyles["trigger"] = "names:build_for_nameless"
        elif task == "DDI13-FULL":
            exampleStyles["trigger"] = "names:build_for_nameless:ddi13_features:drugbank_features"
        elif task == "BB_EVENT_16-FULL":
            exampleStyles["trigger"] = Parameters.cat(
                "bb_spans:bb_features:ontobiotope_features:build_for_nameless:all_tokens:only_types=Bacteria,Habitat,Geographical",
                exampleStyles["trigger"], msg)
        elif task == "BB_EVENT_NER_16":
            # Compared with ==, not 'in': 'in' against a string is a substring
            # test and would also match unrelated ids such as "16" or "NER".
            exampleStyles["trigger"] = Parameters.cat(
                "bb_spans:bb_features:ontobiotope_features:build_for_nameless:all_tokens",
                exampleStyles["trigger"], msg)

        #######################################################################
        # Classifier parameters
        #######################################################################
        if task == "DDI11":
            classifierParameters["examples"] = Parameters.cat(
                "c=10,100,1000,2500,4000,5000,6000,7500,10000,20000,25000,50000:TEES.threshold",
                classifierParameters["examples"],
                "Classifier parameters for single-stage examples" + fullTaskId)
        elif task == "CO11":
            classifierParameters["edge"] = Parameters.cat(
                "c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000",
                classifierParameters["edge"],
                "Classifier parameters for edges / " + fullTaskId)
            classifierParameters["trigger"] = Parameters.cat(
                "c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000",
                classifierParameters["trigger"],
                "Classifier parameters for triggers / " + fullTaskId)
            classifierParameters["recall"] = Parameters.cat(
                "0.8,0.9,0.95,1.0", classifierParameters["recall"],
                "Recall adjust / " + fullTaskId)
        elif task == "BB_EVENT_16":
            classifierParameters["examples"] = Parameters.cat(
                "c=10,20,30,40,50,60,70,80,100,110,115,120,125,130,140,150,200,500,1000,2000,3000,4000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000",
                classifierParameters["examples"],
                "Classifier parameters for edges / " + fullTaskId)
        elif task in ("BB_EVENT_16-FULL", "BB_EVENT_NER_16"):
            classifierParameters["edge"] = Parameters.cat(
                "c=10,20,50,80,100,110,115,120,125,130,140,150,200,500,1000",
                classifierParameters["edge"],
                "Classifier parameters for edges / " + fullTaskId)
        elif task == "SDB16":
            classifierParameters["examples"] = Parameters.cat(
                "c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000,80000,100000,150000",
                classifierParameters["examples"],
                "Classifier parameters for single-stage examples / " + fullTaskId)
        # Training fold parameters ############################################
        if task.startswith("DDI13"):
            folds["devel"] = ["train1", "train2", "train3", "train4"]
            folds["train"] = ["train5", "train6", "train7", "train8", "train9"]

    return detector, bioNLPSTParams, preprocessorParams, folds
def getTaskSettings(task, detector, bioNLPSTParams, preprocessorParams, inputFiles, exampleStyles, classifierParameters, folds, corpusDir=None, useKerasDetector=False):
    """
    Fill in the task-specific default training settings.

    When 'task' is None nothing is modified and the arguments are returned
    as they were given. Otherwise the task identifier is split with
    getSubTask() and the task name is used to choose default corpus files,
    a detector, shared task / preprocessing parameters, example styles,
    classifier parameters and training folds. Task defaults are combined
    with the caller's values through Parameters.cat().

    @param task: task identifier string, or None to skip all task defaults
    @param detector: detector class name, or None to pick one for the task
    @param bioNLPSTParams: shared task parameters given by the caller
    @param preprocessorParams: preprocessor parameters given by the caller
    @param inputFiles: dictionary with the keys "devel", "train" and "test".
        A None value is replaced with the task's default corpus file, the
        value "skip" is replaced with None. Modified in place.
    @param exampleStyles: dictionary with the keys "examples", "edge" and
        "trigger". Modified in place.
    @param classifierParameters: dictionary with the keys "examples",
        "edge", "trigger" and "recall". Modified in place.
    @param folds: dictionary that receives the "devel" and "train" fold
        lists for the DDI13 subtasks. Modified in place.
    @param corpusDir: directory of the corpus files, Settings.CORPUS_DIR
        is used if this is None
    @param useKerasDetector: if True, the example style defaults are not
        applied
    @return: the tuple (detector, bioNLPSTParams, preprocessorParams, folds)
    """
    if task is None:
        return detector, bioNLPSTParams, preprocessorParams, folds

    print >> sys.stderr, "*** Defining training settings for task", task, "***"
    fullTaskId = task  # the unsplit identifier is used in the messages
    task, subTask = getSubTask(task)
    if corpusDir is None:
        corpusDir = Settings.CORPUS_DIR
    print >> sys.stderr, "Loading corpus", task, "from", corpusDir

    #######################################################################
    # Input files
    #######################################################################
    for dataset in ["devel", "train", "test"]:
        if inputFiles[dataset] is None:
            if task.startswith("DDI13") and task != "DDI13":
                # The DDI13 subtasks take both devel and train from the same file
                if dataset in ["devel", "train"]:
                    inputFiles[dataset] = os.path.join(corpusDir, "DDI13-train.xml")
                elif dataset == "test":
                    if task.endswith("T91"):
                        inputFiles[dataset] = os.path.join(corpusDir, "DDI13-test-task9.1.xml")
                    elif task.endswith("T92") or task.endswith("FULL"):
                        inputFiles[dataset] = os.path.join(corpusDir, "DDI13-test-task9.2.xml")
            elif task == "ID11" and dataset == "train":
                inputFiles[dataset] = Catenate.catenate([os.path.join(corpusDir, "ID11-train.xml"),
                                                         os.path.join(corpusDir, "GE11-devel.xml"),
                                                         os.path.join(corpusDir, "GE11-train.xml")],
                                                        "training/ID11-train-and-GE11-devel-and-train.xml.gz", fast=True)
            else:
                inputFiles[dataset] = os.path.join(corpusDir, task.replace("-FULL", "") + "-" + dataset + ".xml")
        # "skip" is an explicit request to leave the set out
        if inputFiles[dataset] == "skip":
            inputFiles[dataset] = None
        if inputFiles[dataset] is not None and not os.path.exists(inputFiles[dataset]):
            # Retry the path relative to the default corpus directory
            fullPath = os.path.join(Settings.CORPUS_DIR, inputFiles[dataset])
            if os.path.exists(fullPath):
                inputFiles[dataset] = fullPath
            else:
                # Report the missing path before it is cleared, otherwise the message shows None
                print >> sys.stderr, "Input file", inputFiles[dataset], "for set '" + dataset + "' does not exist, skipping."
                inputFiles[dataset] = None
    assert inputFiles["train"] is not None # at least training set must exist

    #######################################################################
    # Example generation parameters
    #######################################################################
    if detector is None:
        if task == "CO11":
            detector = "Detectors.CODetector"
        elif task in ["BI11-FULL", "DDI11-FULL", "DDI13-FULL", "BB_EVENT_16-FULL"]:
            detector = "Detectors.EventDetector"
        elif task.startswith("DDI13"):
            if task.endswith("T91"):
                detector = "Detectors.EntityDetector"
            elif task.endswith("T92") or task == "DDI13":
                detector = "Detectors.EdgeDetector"

    #######################################################################
    # BioNLP Shared Task and preprocessing parameters
    #######################################################################
    if task == "BI11-FULL":
        # the shared task evaluator is not designed for predicted entities
        bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert:scores", "BioNLP Shared Task / " + fullTaskId, ["default"])
    elif task == "REL11":
        bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert:evaluate:scores:a2Tag=rel", "BioNLP Shared Task / " + fullTaskId, ["default"])
    elif task in ("BB_EVENT_16", "BB_EVENT_16-FULL", "BB_EVENT_NER_16", "SDB16"):
        bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert=zip", "BioNLP Shared Task / " + fullTaskId, ["default"])
    elif task not in ["DDI11", "DDI11-FULL", "DDI13T91", "DDI13T92", "DDI13-FULL", "DDI13", "CP17", "SEMEVAL10T8"]:
        bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert:evaluate:scores", "BioNLP Shared Task / " + fullTaskId, ["default"])
    else:
        bioNLPSTParams = "skip"

    #######################################################################
    # Preprocessing parameters
    #######################################################################
    # NOTE(review): the return value of Parameters.cat is discarded in both
    # branches below, unlike every other call in this function, so
    # preprocessorParams is returned as given unless Parameters.cat has side
    # effects -- confirm whether this is intended.
    if task in ["BI11", "BI11-FULL", "BB11", "DDI11", "DDI11-FULL", "DDI13T91", "DDI13T92", "DDI13-FULL", "DDI13"]:
        Parameters.cat("intermediateFiles:omitSteps=NER,DIVIDE-SETS", preprocessorParams, "Preprocessor /" + fullTaskId, ["default"])
    else: # parse only sentences where BANNER found an entity
        Parameters.cat("intermediateFiles:omitSteps=DIVIDE-SETS:PARSE.requireEntities", preprocessorParams, "Preprocessor /" + fullTaskId, ["default"])

    #######################################################################
    # Example style parameters
    #######################################################################
    if not useKerasDetector:
        # Example style parameters for single-stage tasks #####################
        msg = "Single-stage example style / " + fullTaskId
        if task == "REN11":
            exampleStyles["examples"] = Parameters.cat("undirected:bacteria_renaming:maskTypeAsProtein=Gene", exampleStyles["examples"], msg)
        elif task == "DDI11":
            exampleStyles["examples"] = Parameters.cat("drugbank_features:ddi_mtmx:filter_shortest_path=conj_and", exampleStyles["examples"], msg)
        elif task.startswith("DDI13"):
            if task.endswith("T91"):
                exampleStyles["examples"] = Parameters.cat("names:build_for_nameless:ddi13_features:drugbank_features", exampleStyles["examples"], msg)
            elif task.endswith("T92") or task == "DDI13":
                exampleStyles["examples"] = Parameters.cat("keep_neg:drugbank_features:filter_shortest_path=conj_and", exampleStyles["examples"], msg)
        elif task == "BI11":
            exampleStyles["examples"] = Parameters.cat("bi_features", exampleStyles["examples"], msg)
        elif task == "BB_EVENT_16":
            exampleStyles["examples"] = Parameters.cat("keep_neg", exampleStyles["examples"], msg)
            #exampleStyles["examples"] = Parameters.cat("linear_features:keep_neg", exampleStyles["examples"], msg)
        elif task == "SDB16":
            exampleStyles["examples"] = Parameters.cat("sdb_merge:sdb_features", exampleStyles["examples"], msg)

        # Edge style ##########################################################
        msg = "Edge example style / " + fullTaskId
        if task in ["GE09", "GE11", "GE13"] and subTask == 1:
            exampleStyles["edge"] = Parameters.cat("genia_features:genia_task1", exampleStyles["edge"], msg)
        elif task in ["GE09", "GE11", "GE13"]:
            exampleStyles["edge"] = Parameters.cat("genia_features", exampleStyles["edge"], msg)
        elif task == "REL11":
            exampleStyles["edge"] = Parameters.cat("rel_features", exampleStyles["edge"], msg)
        elif task == "DDI11-FULL":
            exampleStyles["edge"] = Parameters.cat("drugbank_features:filter_shortest_path=conj_and", exampleStyles["edge"], msg)
        elif task == "DDI13-FULL":
            exampleStyles["edge"] = Parameters.cat("keep_neg:drugbank_features:filter_shortest_path=conj_and", exampleStyles["edge"], msg)
        elif task == "CO11":
            exampleStyles["edge"] = Parameters.cat("co_features", exampleStyles["edge"], msg)
        elif task == "BI11-FULL":
            exampleStyles["edge"] = Parameters.cat("bi_features", exampleStyles["edge"], msg)

        # Trigger style #######################################################
        msg = "Trigger example style / " + fullTaskId
        if task in ["GE09", "GE11", "GE13"] and subTask == 1:
            exampleStyles["trigger"] = Parameters.cat("genia_task1", exampleStyles["trigger"], msg)
        elif task in ["EPI11", "PC13"]:
            exampleStyles["trigger"] = Parameters.cat("epi_merge_negated", exampleStyles["trigger"], msg)
        elif task == "BB11": # "bb_features:build_for_nameless:wordnet"
            exampleStyles["trigger"] = Parameters.cat("bb_features", exampleStyles["trigger"], msg)
        elif task == "BB13T3": # "bb_features:build_for_nameless:wordnet"
            exampleStyles["trigger"] = Parameters.cat("bb_features", exampleStyles["trigger"], msg)
        elif task == "REL11":
            exampleStyles["trigger"] = Parameters.cat("rel_features", exampleStyles["trigger"], msg)
        elif task in ["BI11-FULL", "DDI11-FULL"]:
            # These two branches overwrite the caller's trigger style instead of combining with it
            exampleStyles["trigger"] = "names:build_for_nameless"
        elif task == "DDI13-FULL":
            exampleStyles["trigger"] = "names:build_for_nameless:ddi13_features:drugbank_features"
        elif task == "BB_EVENT_16-FULL":
            exampleStyles["trigger"] = Parameters.cat("bb_spans:bb_features:ontobiotope_features:build_for_nameless:all_tokens:only_types=Bacteria,Habitat,Geographical", exampleStyles["trigger"], msg)
        elif task == "BB_EVENT_NER_16":
            # Compared with "==": "in" against a string literal is a substring test
            exampleStyles["trigger"] = Parameters.cat("bb_spans:bb_features:ontobiotope_features:build_for_nameless:all_tokens", exampleStyles["trigger"], msg)

    #######################################################################
    # Classifier parameters
    #######################################################################
    if task == "DDI11":
        classifierParameters["examples"] = Parameters.cat("c=10,100,1000,2500,4000,5000,6000,7500,10000,20000,25000,50000:TEES.threshold", classifierParameters["examples"], "Classifier parameters for single-stage examples" + fullTaskId)
    #elif task == "DDI13":
    #    classifierParameters["examples"] = Parameters.cat("c=10,100,1000,2500,4000,5000,6000,7500,10000,20000,25000,50000:TEES.threshold", classifierParameters["examples"], "Classifier parameters for single-stage examples" + fullTaskId)
    elif task == "CO11":
        classifierParameters["edge"] = Parameters.cat("c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000", classifierParameters["edge"], "Classifier parameters for edges / " + fullTaskId)
        classifierParameters["trigger"] = Parameters.cat("c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000", classifierParameters["trigger"], "Classifier parameters for triggers / " + fullTaskId)
        classifierParameters["recall"] = Parameters.cat("0.8,0.9,0.95,1.0", classifierParameters["recall"], "Recall adjust / " + fullTaskId)
    elif task == "BB_EVENT_16":
        classifierParameters["examples"] = Parameters.cat("c=10,20,30,40,50,60,70,80,100,110,115,120,125,130,140,150,200,500,1000,2000,3000,4000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000", classifierParameters["examples"], "Classifier parameters for edges / " + fullTaskId)
    elif task in ("BB_EVENT_16-FULL", "BB_EVENT_NER_16"):
        classifierParameters["edge"] = Parameters.cat("c=10,20,50,80,100,110,115,120,125,130,140,150,200,500,1000", classifierParameters["edge"], "Classifier parameters for edges / " + fullTaskId)
    elif task == "SDB16":
        classifierParameters["examples"] = Parameters.cat("c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000,80000,100000,150000", classifierParameters["examples"], "Classifier parameters for single-stage examples / " + fullTaskId)

    # Training fold parameters ############################################
    if task.startswith("DDI13") and task != "DDI13":
        #folds["devel"]=["train1", "train2", "train3", "train4"]
        #folds["train"]=["train5", "train6", "train7", "train8", "train9"]
        folds["devel"] = ["train1", "train2", "train3"]
        folds["train"] = ["train4", "train5", "train6", "train7", "train8", "train9"]

    return detector, bioNLPSTParams, preprocessorParams, folds