def getConnection(connection):
    """
    Build a connection object from a connection description.

    @param connection: One of
        - None, in which case a default local connection
          ("connection=Unix:jobLimit=1") is returned
        - the name of an attribute of Settings, whose value is then used as
          the connection description
        - a parameter string or dictionary accepted by Parameters.get
    @return: An instance of the class named by the "connection" parameter
        followed by "Connection" (e.g. "Unix" -> UnixConnection), constructed
        with all other parameters that are not None as keyword arguments.
    @raise NameError: If no class with that name is defined in this module.
    """
    if connection is None: # return a "dummy" local connection
        return getConnection("connection=Unix:jobLimit=1")
    if isinstance(connection, types.StringTypes) and hasattr(Settings, connection): # connection is a Settings key
        print >> sys.stderr, "Using connection", connection
        return getConnection(getattr(Settings, connection))

    # connection is a parameter string or dictionary
    defaultParams = dict.fromkeys(["connection", "account", "workdir", "settings", "memory", "cores", "modules", "wallTime", "jobLimit", "preamble", "debug"])
    defaultParams["debug"] = False
    connection = Parameters.get(connection, valueListKey="connection", valueTypes={"debug":[bool]}, defaults=defaultParams)
    if connection["connection"] is None:
        connection["connection"] = "Unix"
    if connection["account"] is None:
        # A connection without an account is reported as local and must not define a work directory
        assert connection["workdir"] is None
        print >> sys.stderr, "New local connection", Parameters.toString(connection)
    else:
        print >> sys.stderr, "New remote connection:", Parameters.toString(connection)

    # Make the connection. The class is looked up by name instead of running
    # "exec" on a string assembled from the parameter value, so a crafted
    # "connection" value cannot execute arbitrary code.
    className = connection["connection"] + "Connection"
    try:
        ConnectionClass = globals()[className]
    except KeyError:
        # Keep the exception type an unknown name produced with "exec"
        raise NameError("name '" + className + "' is not defined")
    connectionArgs = {}
    for key in connection:
        if key != "connection" and connection[key] is not None:
            connectionArgs[key] = connection[key]
    return ConnectionClass(**connectionArgs)
def addClassifierModel(self, model, classifierModelPath, classifierParameters, threshold=None):
    """
    Copy a classifier model file into a model and record its settings there.

    @param model: The model object that receives the file and the strings
    @param classifierModelPath: Path of the classifier model file to copy
    @param classifierParameters: Classifier parameters, normalized with
        Parameters.get and stored under "<tag>classifier-parameter"
    @param threshold: Optional value stored as a string under "<tag>threshold"
    @return: The path inside the model to which the file was copied
    """
    targetPath = model.get(self.tag + "classifier-model", True)
    shutil.copy2(classifierModelPath, targetPath)
    parameterString = Parameters.toString(Parameters.get(classifierParameters))
    model.addStr(self.tag + "classifier-parameter", parameterString)
    if threshold is not None:
        model.addStr(self.tag + "threshold", str(threshold))
    return targetPath
def saveModel(self, teesModel, tag=""):
    """
    Store this object's model file, parameters and threshold in a model.

    Each of the attributes "model", "parameters" and "threshold" is saved
    only if it exists on this object and is not None.

    @param teesModel: The model object that receives the data
    @param tag: Prefix for the keys "classifier-model",
        "classifier-parameter" and "threshold"
    """
    modelFile = getattr(self, "model", None)
    if modelFile is not None:
        shutil.copy2(modelFile, teesModel.get(tag + "classifier-model", True))
    parameters = getattr(self, "parameters", None)
    if parameters is not None:
        parameterString = Parameters.toString(Parameters.get(parameters))
        teesModel.addStr(tag + "classifier-parameter", parameterString)
    threshold = getattr(self, "threshold", None)
    if threshold is not None:
        teesModel.addStr(tag + "threshold", str(threshold))
def saveModel(self, teesModel, tag=""):
    """
    Store this object's model file, parameters and threshold in a model.

    Each of the attributes "model", "parameters" and "threshold" is saved
    only if it exists on this object and is not None.

    @param teesModel: The model object that receives the data
    @param tag: Prefix for the keys "classifier-model",
        "classifier-parameter" and "threshold"
    """
    modelFile = getattr(self, "model", None)
    if modelFile is not None:
        shutil.copy2(modelFile, teesModel.get(tag + "classifier-model", True))
    parameters = getattr(self, "parameters", None)
    if parameters is not None:
        parameterString = Parameters.toString(Parameters.get(parameters))
        teesModel.addStr(tag + "classifier-parameter", parameterString)
    threshold = getattr(self, "threshold", None)
    if threshold is not None:
        teesModel.addStr(tag + "threshold", str(threshold))
def initModel(self, model, saveParams=None):
    """
    Prepare a model for writing and store selected attributes of this object in it.

    @param model: None, a path string (opened with self.openModel in "w"
        mode), or an already open model whose mode must be "a" or "w"
    @param saveParams: Optional sequence of items where item[0] is the name
        of an attribute of this object and item[1] is the model key under
        which Parameters.toString of that attribute is stored
    @return: The opened and saved model, or None if model was None
    """
    if model is None:
        return model
    if isinstance(model, types.StringTypes):
        model = self.openModel(model, "w")
    else:
        assert model.mode in ["a", "w"]
    # A None default replaces the shared mutable list default
    for param in (saveParams or ()):
        model.addStr(param[1], Parameters.toString(getattr(self, param[0])))
    model.save()
    return model
def getConnection(connection):
    """
    Build a connection object from a connection description.

    @param connection: One of
        - None, in which case a default local connection
          ("connection=Unix:jobLimit=1") is returned
        - the name of an attribute of Settings, whose value is then used as
          the connection description
        - a parameter string or dictionary accepted by Parameters.get
    @return: An instance of the class named by the "connection" parameter
        followed by "Connection" (e.g. "Unix" -> UnixConnection), constructed
        with all other parameters that are not None as keyword arguments.
    @raise NameError: If no class with that name is defined in this module.
    """
    if connection is None: # return a "dummy" local connection
        return getConnection("connection=Unix:jobLimit=1")
    if isinstance(connection, types.StringTypes) and hasattr(Settings, connection): # connection is a Settings key
        print >> sys.stderr, "Using connection", connection
        return getConnection(getattr(Settings, connection))

    # connection is a parameter string or dictionary
    defaultParams = dict.fromkeys(["connection", "account", "workdir", "settings", "memory", "cores", "modules", "wallTime", "jobLimit", "preamble", "debug"])
    defaultParams["debug"] = False
    connection = Parameters.get(connection, valueListKey="connection", valueTypes={"debug":[bool]}, defaults=defaultParams)
    if connection["connection"] is None:
        connection["connection"] = "Unix"
    if connection["account"] is None:
        # A connection without an account is reported as local and must not define a work directory
        assert connection["workdir"] is None
        print >> sys.stderr, "New local connection", Parameters.toString(connection)
    else:
        print >> sys.stderr, "New remote connection:", Parameters.toString(connection)

    # Make the connection. The class is looked up by name instead of running
    # "exec" on a string assembled from the parameter value, so a crafted
    # "connection" value cannot execute arbitrary code.
    className = connection["connection"] + "Connection"
    try:
        ConnectionClass = globals()[className]
    except KeyError:
        # Keep the exception type an unknown name produced with "exec"
        raise NameError("name '" + className + "' is not defined")
    connectionArgs = {}
    for key in connection:
        if key != "connection" and connection[key] is not None:
            connectionArgs[key] = connection[key]
    return ConnectionClass(**connectionArgs)
def beginModel(self, step, model, trainExampleFiles, testExampleFile, importIdsFromModel=None):
    """
    Begin the training process leading to a new model.

    Nothing is done unless self.checkStep(step, False) is true and model is
    not None.

    @param step: The step name passed to self.checkStep
    @param model: The model to create, opened with self.openModel in "w" mode
    @param trainExampleFiles: A single example file name, or a list of
        gzipped example files. A list with other than one item is catenated
        into a single "<tag>combined-examples.gz" file in the work directory.
    @param testExampleFile: Passed on to the classifier's optimize method
    @param importIdsFromModel: Optional existing model from which class ids,
        feature ids, structure and training settings are imported
    """
    if not self.checkStep(step, False):
        return
    if model is None:
        return
    if self.state is not None and step is not None:
        print >> sys.stderr, self.__class__.__name__ + ":" + self.state + ":" + step
    # Create combined model
    model = self.openModel(model, "w")
    assert model.mode in ["a", "w"], (model.path, model.mode)
    # Information can be imported from an existing model. In this case, model is trained
    # with the parameter already defined in the import source. This is used when training
    # the combined model.
    if importIdsFromModel is not None:
        model.importFrom(self.openModel(importIdsFromModel, "r"),
                         [self.tag+"ids.classes", self.tag+"ids.features", "structure.txt"],
                         [self.tag+"classifier-parameter", self.tag+"example-style", self.tag+"parse", self.tag+"task"])
        # Train the model with the parameters defined in the import source
        model.addStr(self.tag+"classifier-parameters-train", model.getStr(self.tag+"classifier-parameter"))
    if self.bioNLPSTParams is not None and len(self.bioNLPSTParams) > 0:
        model.addStr("BioNLPSTParams", Parameters.toString(self.bioNLPSTParams))
    # Catenate example files
    if isinstance(trainExampleFiles, types.StringTypes):
        combinedTrainExamples = trainExampleFiles
    elif len(trainExampleFiles) == 1:
        combinedTrainExamples = trainExampleFiles[0]
    else:
        combinedTrainExamples = self.workDir + os.path.normpath(model.path)+"-"+self.tag+"combined-examples.gz"
        combinedTrainExamplesFile = gzip.open(combinedTrainExamples, 'wb')
        # try/finally makes sure that both the output and every input file
        # are closed even if copying fails
        try:
            for trainExampleFile in trainExampleFiles:
                print >> sys.stderr, "Catenating", trainExampleFile, "to", combinedTrainExamples
                sourceFile = gzip.open(trainExampleFile, 'rb')
                try:
                    shutil.copyfileobj(sourceFile, combinedTrainExamplesFile)
                finally:
                    sourceFile.close()
        finally:
            combinedTrainExamplesFile.close()
    # Upload training model
    # The parameter grid is stored in the model as "*classifier-parameters-train" so that endModel can
    # use it, and also as annotation for the trained model. The final selected parameter will
    # be stored as "*classifier-parameter"
    classifierWorkDir = self.workDir + os.path.normpath(model.path) + "-" + self.tag + "models"
    classifier = self.getClassifier(model.getStr(self.tag+"classifier-parameters-train"))(self.connection)
    classifier.optimize(combinedTrainExamples, classifierWorkDir, model.getStr(self.tag+"classifier-parameters-train"),
                        testExampleFile, model.get(self.tag+"ids.classes"), step="SUBMIT", evaluator=self.evaluator)
    model.save()
def process(self, input, output, parameters=None, model=None, fromStep=None, toStep=None, omitSteps=None):
    """
    Run the steps of the tool chain on the input.

    Each item of self.steps is indexed as follows: [0] is the step name,
    [1] the callable that is run, [2] its default keyword arguments and
    [4] a dictionary whose "input" and "output" values name the keyword
    arguments used for i/o.

    @param input: The source data, passed to self.initVariables
    @param output: The output file. If it is a directory, it is used only
        for intermediate files and no final output is written.
    @param parameters: Tool chain parameters, resolved with self.getParameters
    @param model: Passed to self.getParameters
    @param fromStep: Passed to self.enterState
    @param toStep: Passed to self.enterState
    @param omitSteps: Passed to self.enterState
    @return: The resulting xml if self.state is None after exiting the
        state, otherwise None
    """
    self.initVariables(source=input, xml=input, outDir=os.path.dirname(output))
    outputName = os.path.basename(output)
    self.intermediateFileTag = outputName if outputName != "" else ""
    stepNames = [x[0] for x in self.steps]
    self.enterState(self.STATE_TOOLCHAIN, stepNames, fromStep, toStep, omitSteps)
    parameters = self.getParameters(parameters, model, defaultValue=NOTHING)
    self.applyParameters(parameters)
    # Run the tools
    print >> sys.stderr, "Tool chain parameters:", Parameters.toString(parameters, skipKeysWithValues=[NOTHING], skipDefaults=self.getDefaultParameters())
    if os.path.exists(output) and not os.path.isdir(output):
        print >> sys.stderr, "Removing existing preprocessor output file", output
        os.remove(output)
    pendingInput = None # Output from a previous step if "fromStep" is used
    for step in self.steps:
        stepName = step[0]
        if not self.checkStep(stepName):
            if self.getStepStatus(stepName) == "BEFORE": # this step was run earlier
                pendingInput = self.getIntermediateFilePath(step)
            continue
        if pendingInput is not None: # A previous run of the program saved an intermediate file
            print >> sys.stderr, "Reading input from saved intermediate file", pendingInput
            self.xml = ETUtils.ETFromObj(pendingInput)
            pendingInput = None
        arguments = copy.copy(step[2]) # make a copy of the arguments to which i/o can be added
        ioNames = step[4]
        arguments[ioNames["input"]] = self.xml # the input
        if self.getIntermediateFilePath(step) != None: # this step should save an intermediate file
            arguments[ioNames["output"]] = self.getIntermediateFilePath(step)
        print >> sys.stderr, "Running step", stepName, "with arguments", arguments
        step[1](**arguments) # call the tool
    # End state and return
    xml = self.xml # state-specific member variable self.xml will be removed when exiting state
    self.exitState()
    if self.state is not None: # the tool chain has not finished yet
        return None
    # The whole toolchain has finished, return the final product
    if not os.path.isdir(output): # if output is a directory, it was given only for storing intermediate files ...
        ETUtils.write(xml, output) # ... otherwise, save the final output
    return xml
def learnSettings(inputFiles, detector, classifierParameters, task, exampleStyles, useKerasDetector=False):
    """
    Choose the detector and fill in default classifier parameters and example styles.

    If no detector is given, the structure of the "train" and "devel" input
    files is analyzed (or a saved analysis is loaded from
    "training/structure.txt") and the detector is chosen based on the targets
    found. The classifierParameters and exampleStyles dictionaries are
    modified in place.

    @param inputFiles: A dictionary with the keys "train" and "devel"
    @param detector: The name of the detector or None to choose it automatically
    @param classifierParameters: A dictionary of classifier parameters,
        to which the default parameter grids are added
    @param task: The task name, used only for choosing the Keras example styles
    @param exampleStyles: A dictionary of example styles, to which the Keras
        defaults are added if useKerasDetector is True
    @param useKerasDetector: Use the Keras version of the detector
    @return: The name of the detector
    """
    structurePath = "training/structure.txt"
    if detector is None:
        print >> sys.stderr, "*** Analyzing input files to determine training settings ***"
        structureAnalyzer = StructureAnalyzer()
        if os.path.exists(structurePath):
            print >> sys.stderr, "Using existing analysis from training/structure.txt"
            structureAnalyzer.load(None, structurePath)
        else:
            datasets = sorted([x for x in [inputFiles["train"], inputFiles["devel"]] if x])
            print >> sys.stderr, "input files:", datasets
            structureAnalyzer.analyze(datasets)
            print >> sys.stderr, structureAnalyzer.toString()
            structureAnalyzer.save(None, structurePath)
        # Choose detector
        targets = structureAnalyzer.targets
        if "ENTITY" in targets and "INTERACTION" in targets:
            detector = "Detectors.EventDetector"
        elif "ENTITY" in targets:
            detector = "Detectors.EntityDetector"
        elif "INTERACTION" in targets:
            detector = "Detectors.EdgeDetector"
        else:
            assert False, structureAnalyzer.targets

    if useKerasDetector and "Keras" not in detector:
        detector = detector.replace("Detectors.", "Detectors.Keras")
    print >> sys.stderr, "Using detector '" + str(detector) + "'"

    # Set default parameters
    unmergingGrid = "c=1,10,100,500,1000,1500,2500,5000,10000,20000,50000,80000,100000"
    modifierGrid = "c=5000,10000,20000,50000,100000"
    edgeGrid = "c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000"
    triggerGrid = "c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000"
    recallGrid = "0.5,0.6,0.65,0.7,0.85,1.0,1.1,1.2"
    cp = classifierParameters
    if detector == "Detectors.EventDetector":
        # Add common classifier parameters
        if cp["examples"] is not None:
            for name in ["unmerging", "modifiers", "edge", "trigger"]:
                cp[name] = Parameters.cat(cp["examples"], cp[name])
        eventDefaults = [("unmerging", unmergingGrid, "Classifier parameters for unmerging"),
                         ("modifiers", modifierGrid, "Classifier parameters for modifiers"),
                         ("edge", edgeGrid, "Classifier parameters for edges"),
                         ("trigger", triggerGrid, "Classifier parameters for triggers"),
                         ("recall", recallGrid, "Recall adjustment parameters")]
        for name, grid, description in eventDefaults:
            cp[name] = Parameters.cat(grid, cp[name], description)
    elif detector == "Detectors.EntityDetector":
        cp["examples"] = Parameters.cat(triggerGrid, cp["examples"], "Classifier parameters for entities")
    elif detector == "Detectors.EdgeDetector":
        cp["examples"] = Parameters.cat(edgeGrid, cp["examples"], "Classifier parameters for edges")
    elif detector == "Detectors.UnmergingDetector":
        cp["examples"] = Parameters.cat(unmergingGrid, cp["examples"], "Classifier parameters for unmerging")

    #######################################################################
    # Keras example styles
    #######################################################################
    if not useKerasDetector:
        return detector

    task, subTask = getSubTask(task)
    # A style with the "override" flag replaces the defaults of its own key,
    # one with "override_all" is applied on top of every key
    overrideStyles = {"all":{}}
    for key in exampleStyles:
        overrideStyles[key] = {}
        params = Parameters.get(exampleStyles[key])
        if "override" in params:
            exampleStyles[key] = None
            params.pop("override")
            overrideStyles[key] = params
        elif "override_all" in params:
            exampleStyles[key] = None
            params.pop("override_all")
            overrideStyles["all"] = params
    print >> sys.stderr, "Override styles:", overrideStyles

    if "EventDetector" in detector:
        if task == "EPI11":
            triggerStyle = "keras:epochs=500:patience=10:nf=512:path=4:el=41:mods=20:epi_merge_negated"
        else:
            triggerStyle = "keras:epochs=500:patience=10:nf=512:path=4:el=41:mods=20"
        exampleStyles["trigger"] = Parameters.cat(triggerStyle, exampleStyles["trigger"])
        if task in ["GE09", "GE11", "GE13"] and subTask == 1:
            edgeStyle = "keras:genia_task1:epochs=500:patience=10:nf=256:path=4:ol=15:mods=20"
        else:
            edgeStyle = "keras:epochs=500:patience=10:nf=256:path=4:ol=15:mods=20"
        exampleStyles["edge"] = Parameters.cat(edgeStyle, exampleStyles["edge"])
        exampleStyles["unmerging"] = Parameters.cat("keras:epochs=500:patience=10:nf=256:path=4:ol=15:mods=20", exampleStyles["unmerging"])
        exampleStyles["modifiers"] = Parameters.cat("keras:epochs=500:patience=10:nf=256:path=4:el=41:mods=20", exampleStyles["modifiers"])
    elif "EntityDetector" in detector:
        if task == "DDI13T91":
            entityStyle = "keras:epochs=500:patience=10:nf=512:path=4:el=41:mods=20:names:build_for_nameless"
        else:
            entityStyle = "keras:epochs=500:patience=10:nf=512:path=4:el=41:mods=20"
        exampleStyles["examples"] = Parameters.cat(entityStyle, exampleStyles["examples"])
    elif "EdgeDetector" in detector:
        if "DDI" in task:
            relationStyle = "keras:epochs=500:patience=10:nf=256:path=0:do=0.2:dense=800:ol=50:mods=20"
        elif task == "CP17":
            relationStyle = "keras:epochs=500:patience=10:nf=512:path=0:do=0.2:ol=50:skip_labels=CPR\:0,CPR\:1,CPR\:2,CPR\:7,CPR\:8,CPR\:10:mods=20"
        else:
            relationStyle = "keras:epochs=500:patience=10:nf=256:path=4:ol=15:mods=20"
        exampleStyles["examples"] = Parameters.cat(relationStyle, exampleStyles["examples"])
    print >> sys.stderr, "Keras initial example styles:", exampleStyles

    # Apply the overrides on top of the default styles
    for key in exampleStyles:
        if exampleStyles[key] is None:
            continue
        merged = Parameters.get(exampleStyles[key])
        merged.update(overrideStyles[key])
        merged.update(overrideStyles["all"])
        exampleStyles[key] = Parameters.toString(merged)
        print >> sys.stderr, "Keras final example style for " + key + ": ", exampleStyles[key]
    return detector
def doGrid(self):
    """
    Search for the best combination of trigger, recall adjustment ("booster")
    and edge parameters.

    Every combination is used to classify self.optData, first with the
    trigger detector and then with the edge detector, and the result is
    passed to self.evaluateGrid. The recall adjustment value of the best
    combination is saved to self.model and self.combinedModel. If
    self.fullGrid is set, the classifier models of the best combination are
    also added to self.model.
    """
    print >> sys.stderr, "--------- Parameter grid search ---------"
    # Build trigger examples
    self.triggerDetector.buildExamples(self.model, [self.optData], [self.workDir+"grid-trigger-examples.gz"])
    # A full grid reads the "*classifier-parameters-train" string of the
    # model, otherwise the "*classifier-parameter" string is read
    if self.fullGrid:
        parameterKey = "classifier-parameters-train"
    else:
        parameterKey = "classifier-parameter"
    stepParams = {
        "trigger":Parameters.get(self.model.getStr(self.triggerDetector.tag+parameterKey, defaultIfNotExist=""), valueListKey="c"),
        "booster":[float(i) for i in self.recallAdjustParameters.split(",")],
        "edge":Parameters.get(self.model.getStr(self.edgeDetector.tag+parameterKey, defaultIfNotExist=""), valueListKey="c")}

    for step in ["trigger", "edge"]:
        stepParams[step] = [Parameters.toString(x) for x in Parameters.getCombinations(stepParams[step])]
    stepOrder = ["trigger", "booster", "edge"]
    print >> sys.stderr, [stepParams[x] for x in stepOrder]
    paramCombinations = combine(*[stepParams[x] for x in stepOrder])
    print >> sys.stderr, paramCombinations
    paramCombinations = [{"trigger":x[0], "booster":x[1], "edge":x[2]} for x in paramCombinations]

    prevParams = None
    EDGE_MODEL_STEM = os.path.join(self.edgeDetector.workDir, os.path.normpath(self.model.path)+"-edge-models/model")
    TRIGGER_MODEL_STEM = os.path.join(self.triggerDetector.workDir, os.path.normpath(self.model.path)+"-trigger-models/model")
    self.structureAnalyzer.load(self.model)
    bestResults = None
    for i, params in enumerate(paramCombinations):
        print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
        print >> sys.stderr, "Processing params", str(i+1) + "/" + str(len(paramCombinations)), params
        print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
        # Triggers and Boost
        # The triggers must be reclassified when either the trigger parameters
        # or the recall adjustment changes, as both are inputs of the call
        # below. (The condition used to test the trigger parameters twice and
        # never the booster.)
        if prevParams is None or prevParams["trigger"] != params["trigger"] or prevParams["booster"] != params["booster"]:
            print >> sys.stderr, "Classifying trigger examples for parameters", "trigger:" + str(params["trigger"]), "booster:" + str(params["booster"])
            xml = self.triggerDetector.classifyToXML(self.optData, self.model, self.workDir+"grid-trigger-examples", self.workDir+"grid-",
                                                     classifierModel=TRIGGER_MODEL_STEM + Parameters.toId(params["trigger"]),
                                                     recallAdjust=params["booster"])
        prevParams = params
        # Classify with pre-defined model
        edgeClassifierModel = EDGE_MODEL_STEM + Parameters.toId(params["edge"])
        xml = self.edgeDetector.classifyToXML(xml, self.model, self.workDir+"grid-edge-examples", self.workDir+"grid-",
                                              classifierModel=edgeClassifierModel, goldData=self.optData)
        bestResults = self.evaluateGrid(xml, params, bestResults)
    # Remove remaining intermediate grid files
    for tag1 in ["edge", "trigger", "unmerging"]:
        for tag2 in ["examples", "pred.xml.gz"]:
            if os.path.exists(self.workDir+"grid-"+tag1+"-"+tag2):
                os.remove(self.workDir+"grid-"+tag1+"-"+tag2)
    print >> sys.stderr, "Parameter grid search complete"
    print >> sys.stderr, "Tested", len(paramCombinations), "combinations"
    print >> sys.stderr, "Best parameters:", bestResults[0]
    print >> sys.stderr, "Best result:", bestResults[2] # f-score
    # Save grid model
    self.saveStr("recallAdjustParameter", str(bestResults[0]["booster"]), self.model)
    self.saveStr("recallAdjustParameter", str(bestResults[0]["booster"]), self.combinedModel, False)
    if self.fullGrid: # define best models
        # NOTE(review): the model paths are built here with str(), while the
        # classification above uses Parameters.toId() for the same
        # parameters - confirm that both produce the same file name
        self.triggerDetector.addClassifierModel(self.model, TRIGGER_MODEL_STEM+str(bestResults[0]["trigger"]), bestResults[0]["trigger"])
        self.edgeDetector.addClassifierModel(self.model, EDGE_MODEL_STEM+str(bestResults[0]["edge"]), bestResults[0]["edge"])
    # Remove work files
    # NOTE(review): "examples.gz" and "pred.xml.gz" are appended without a
    # "-" separator, unlike in the cleanup above - confirm these names match
    # the files that are actually written
    for stepTag in [self.workDir+"grid-trigger", self.workDir+"grid-edge", self.workDir+"grid-unmerging"]:
        for fileStem in ["-classifications", "-classifications.log", "examples.gz", "pred.xml.gz"]:
            if os.path.exists(stepTag+fileStem):
                os.remove(stepTag+fileStem)
def addClassifierModel(self, model, classifierModelPath, classifierParameters):
    """
    Copy a classifier model file into a model and record its parameters there.

    @param model: The model object that receives the file and the string
    @param classifierModelPath: Path of the classifier model file to copy
    @param classifierParameters: Classifier parameters, normalized with
        Parameters.get and stored under "<tag>classifier-parameter"
    @return: The path inside the model to which the file was copied
    """
    targetPath = model.get(self.tag + "classifier-model", True)
    shutil.copy2(classifierModelPath, targetPath)
    parameterString = Parameters.toString(Parameters.get(classifierParameters))
    model.addStr(self.tag + "classifier-parameter", parameterString)
    return targetPath
def train(output, task=None, detector=None, inputFiles=None, models=None, parse=None,
          processUnmerging=None, processModifiers=None, bioNLPSTParams=None,
          preprocessorParams=None, exampleStyles=None, classifierParams=None,
          doFullGrid=False, deleteOutput=False, copyFrom=None, log="log.txt",
          step=None, omitSteps=None, debug=False, connection=None, subset=None,
          folds=None):
    """
    Train a new model for event or relation detection.

    The steps "TRAIN", "DEVEL", "EMPTY" and "TEST" are run in this order,
    each only if it is selected with the step and omitSteps arguments.

    @param output: A directory where output files will appear.
    @param task: If defined, overridable default settings are used for many of the training parameters. Must be one of the supported TEES tasks.
    @param detector: a Detector object, or a string defining one to be imported
    @param inputFiles: A dictionary of file names, with keys "train", "devel" and, "test"
    @param models: A dictionary of file names defining the place for the new models, with keys "devel" and, "test"
    @param parse: The parse element name in the training interaction XML
    @param processUnmerging: Use the unmerging step of EventDetector. True, False or None for task default.
    @param processModifiers: Use the modifier detection step of EventDetector. True, False or None for task default.
    @param bioNLPSTParams: Parameters controlling BioNLP ST format output.
    @param preprocessorParams: Parameters controlling the preprocessor. Not used for training, but saved to the model for use when classifying.
    @param exampleStyles: A parameter set for controlling example builders.
    @param classifierParams: A parameter set for controlling classifiers.
    @param doFullGrid: Whether all parameters, as opposed to just recall adjustment, are tested in the EventDetector grid search.
    @param deleteOutput: Remove an existing output directory
    @param copyFrom: Copy an existing output directory for use as a template
    @param log: An optional alternative name for the log file. None is for no logging.
    @param step: A step=substep pair, where the steps are "TRAIN", "DEVEL", "EMPTY" and "TEST"
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved
    @param connection: A parameter set defining a local or remote connection for training the classifier
    @param subset: A parameter set for making subsets of input files
    @param folds: A dictionary with the keys "train", "devel" and "test", passed to getFolds together with inputFiles
    """
    def printHeader(title):
        # Print a step title between two separator lines
        separator = "----------------------------------------------------"
        print >> sys.stderr, separator
        print >> sys.stderr, title
        print >> sys.stderr, separator

    # Insert default arguments where needed
    datasetNames = ["train", "devel", "test"]
    inputFiles = setDictDefaults(inputFiles, dict.fromkeys(datasetNames))
    models = setDictDefaults(models, dict.fromkeys(["devel", "test"]))
    exampleStyles = setDictDefaults(exampleStyles, dict.fromkeys(["examples", "trigger", "edge", "unmerging", "modifiers"]))
    classifierParams = setDictDefaults(classifierParams, dict.fromkeys(["examples", "trigger", "recall", "edge", "unmerging", "modifiers"]))
    subsetDefaults = dict.fromkeys(datasetNames + ["all"])
    subsetDefaults["seed"] = 0
    subset = setDictDefaults(Parameters.get(subset), subsetDefaults)
    folds = setDictDefaults(folds, dict.fromkeys(datasetNames))
    processUnmerging = getDefinedBool(processUnmerging)
    processModifiers = getDefinedBool(processModifiers)
    # Initialize working directory
    workdir(output, deleteOutput, copyFrom, log)
    # Get task specific parameters
    detector, bioNLPSTParams, preprocessorParams = getTaskSettings(task, detector, bioNLPSTParams, preprocessorParams, inputFiles, exampleStyles, classifierParams)
    # Learn training settings from input files
    detector = learnSettings(inputFiles, detector, classifierParams)
    # Get corpus subsets
    getFolds(inputFiles, folds)
    getSubsets(inputFiles, subset)
    if task is not None:
        task = task.replace("-FULL", "")
    # Define processing steps
    selector, detectorSteps, omitDetectorSteps = getSteps(step, omitSteps, ["TRAIN", "DEVEL", "EMPTY", "TEST"])

    # Initialize the detector
    detector, detectorName = getDetector(detector)
    detector = detector() # initialize object
    detector.debug = debug
    detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(bioNLPSTParams)
    connection = getConnection(connection)
    detector.setConnection(connection)
    connection.debug = debug
    if deleteOutput:
        connection.clearWorkDir()

    # Train
    if selector.check("TRAIN"):
        printHeader("------------------ Train Detector ------------------")
        if isinstance(detector, SingleStageDetector):
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["examples"], classifierParams["examples"], parse, None, task,
                           fromStep=detectorSteps["TRAIN"], workDir="training")
        else:
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["trigger"], exampleStyles["edge"], exampleStyles["unmerging"], exampleStyles["modifiers"],
                           classifierParams["trigger"], classifierParams["edge"], classifierParams["unmerging"], classifierParams["modifiers"],
                           classifierParams["recall"], processUnmerging, processModifiers,
                           doFullGrid, task, parse, None,
                           fromStep=detectorSteps["TRAIN"], workDir="training")
        # Save the detector type
        for modelPath in [models["devel"], models["test"]]:
            if modelPath is None or not os.path.exists(modelPath):
                continue
            model = Model(modelPath, "a")
            model.addStr("detector", detectorName)
            if preprocessorParams is not None:
                preprocessor = Preprocessor()
                model.addStr("preprocessorParams", Parameters.toString(preprocessor.getParameters(preprocessorParams)))
            model.save()
            model.close()
    if selector.check("DEVEL"):
        printHeader("------------ Check devel classification ------------")
        detector.classify(inputFiles["devel"], models["devel"], "classification-devel/devel", goldData=inputFiles["devel"],
                          fromStep=detectorSteps["DEVEL"], workDir="classification-devel")
    if selector.check("EMPTY"):
        # By passing an emptied devel set through the prediction system, we can check that we get the same predictions
        # as in the DEVEL step, ensuring the model does not use leaked information.
        printHeader("------------ Empty devel classification ------------")
        removeNames = "names" in str(exampleStyles["examples"]) or "names" in str(exampleStyles["trigger"])
        emptyCorpus = getEmptyCorpus(inputFiles["devel"], removeNames=removeNames)
        detector.classify(emptyCorpus, models["devel"], "classification-empty/devel-empty",
                          fromStep=detectorSteps["EMPTY"], workDir="classification-empty")
    if selector.check("TEST"):
        printHeader("------------- Test set classification --------------")
        testFile = inputFiles["test"]
        if testFile is None or not os.path.exists(testFile):
            print >> sys.stderr, "Skipping, test file", inputFiles["test"], "does not exist"
        else:
            detector.classify(testFile, models["test"], "classification-test/test",
                              fromStep=detectorSteps["TEST"], workDir="classification-test")
            if detector.bioNLPSTParams["convert"]:
                Utils.STFormat.Compare.compare("classification-test/test-events.tar.gz", "classification-devel/devel-events.tar.gz", "a2")
def process(self, input, output, parameters=None, model=None, fromStep=None, toStep=None, omitSteps=None):
    """
    Run the steps of the tool chain on the input.

    Each item of self.steps is indexed as follows: [0] is the step name,
    [1] the callable that is run, [2] its default keyword arguments and
    [4] a dictionary whose "input" and "output" values name the keyword
    arguments used for i/o.

    @param input: The source data, passed to self.initVariables
    @param output: The output file. If it is a directory, it is used only
        for intermediate files and no final output is written.
    @param parameters: Tool chain parameters, resolved with self.getParameters
    @param model: Passed to self.getParameters
    @param fromStep: Passed to self.enterState
    @param toStep: Passed to self.enterState
    @param omitSteps: Passed to self.enterState
    @return: The resulting xml if self.state is None after exiting the
        state, otherwise None
    """
    self.initVariables(source=input, xml=input, outDir=os.path.dirname(output))
    outputName = os.path.basename(output)
    self.intermediateFileTag = outputName if outputName != "" else ""
    stepNames = [x[0] for x in self.steps]
    self.enterState(self.STATE_TOOLCHAIN, stepNames, fromStep, toStep, omitSteps)
    parameters = self.getParameters(parameters, model, defaultValue=NOTHING)
    self.applyParameters(parameters)
    # Run the tools
    print >> sys.stderr, "Tool chain parameters:", Parameters.toString(parameters, skipKeysWithValues=[NOTHING], skipDefaults=self.getDefaultParameters())
    if os.path.exists(output) and not os.path.isdir(output):
        print >> sys.stderr, "Removing existing preprocessor output file", output
        os.remove(output)
    pendingInput = None # Output from a previous step if "fromStep" is used
    for step in self.steps:
        stepName = step[0]
        if not self.checkStep(stepName):
            if self.getStepStatus(stepName) == "BEFORE": # this step was run earlier
                pendingInput = self.getIntermediateFilePath(step)
            continue
        if pendingInput is not None: # A previous run of the program saved an intermediate file
            print >> sys.stderr, "Reading input from saved intermediate file", pendingInput
            self.xml = ETUtils.ETFromObj(pendingInput)
            pendingInput = None
        arguments = copy.copy(step[2]) # make a copy of the arguments to which i/o can be added
        ioNames = step[4]
        arguments[ioNames["input"]] = self.xml # the input
        if self.getIntermediateFilePath(step) != None: # this step should save an intermediate file
            arguments[ioNames["output"]] = self.getIntermediateFilePath(step)
        print >> sys.stderr, "Running step", stepName, "with arguments", arguments
        step[1](**arguments) # call the tool
    # End state and return
    xml = self.xml # state-specific member variable self.xml will be removed when exiting state
    self.exitState()
    if self.state is not None: # the tool chain has not finished yet
        return None
    # The whole toolchain has finished, return the final product
    if not os.path.isdir(output): # if output is a directory, it was given only for storing intermediate files ...
        ETUtils.write(xml, output) # ... otherwise, save the final output
    return xml
def beginModel(self, step, model, trainExampleFiles, testExampleFile, importIdsFromModel=None):
    """
    Begin the training process leading to a new model.

    Nothing is done if the step is not active (checkStep) or if no model is defined.

    @param step: The name of the step, checked with checkStep.
    @param model: The model to be created, opened in write mode with openModel.
    @param trainExampleFiles: A single example file name, or a list of them. Multiple files are
                              catenated into one gzipped file in the work directory.
    @param testExampleFile: The example file passed to the classifier's optimize method.
    @param importIdsFromModel: An optional existing model from which the class and feature ids,
                               the structure and the final parameters are imported.
    """
    if not self.checkStep(step, False):
        return
    if model is None:
        return
    if self.state is not None and step is not None:
        print >> sys.stderr, self.__class__.__name__ + ":" + self.state + ":" + step
    # Create combined model
    model = self.openModel(model, "w")
    assert model.mode in ["a", "w"], (model.path, model.mode)
    # Information can be imported from an existing model. In this case, model is trained
    # with the parameter already defined in the import source. This is used when training
    # the combined model.
    if importIdsFromModel is not None:
        model.importFrom(self.openModel(importIdsFromModel, "r"),
                         [self.tag + "ids.classes", self.tag + "ids.features", "structure.txt"],
                         [self.tag + "classifier-parameter", self.tag + "example-style", self.tag + "parse", self.tag + "task"])
        # Train the model with the parameters defined in the import source
        model.addStr(self.tag + "classifier-parameters-train", model.getStr(self.tag + "classifier-parameter"))
    if self.bioNLPSTParams is not None and len(self.bioNLPSTParams) > 0:
        model.addStr("BioNLPSTParams", Parameters.toString(self.bioNLPSTParams))
    # Catenate example files
    if isinstance(trainExampleFiles, types.StringTypes):
        combinedTrainExamples = trainExampleFiles
    elif len(trainExampleFiles) == 1:
        combinedTrainExamples = trainExampleFiles[0]
    else:
        combinedTrainExamples = self.workDir + os.path.normpath(model.path) + "-" + self.tag + "combined-examples.gz"
        combinedTrainExamplesFile = gzip.open(combinedTrainExamples, 'wb')
        # try/finally is used so that every opened file is closed even if the copying fails
        try:
            for trainExampleFile in trainExampleFiles:
                print >> sys.stderr, "Catenating", trainExampleFile, "to", combinedTrainExamples
                inputExamplesFile = gzip.open(trainExampleFile, 'rb')
                try:
                    shutil.copyfileobj(inputExamplesFile, combinedTrainExamplesFile)
                finally:
                    inputExamplesFile.close()
        finally:
            combinedTrainExamplesFile.close()
    # Upload training model
    # The parameter grid is stored in the model as "*classifier-parameters-train" so that endModel can
    # use it, and also as annotation for the trained model. The final selected parameter will
    # be stored as "*classifier-parameter"
    classifierWorkDir = self.workDir + os.path.normpath(model.path) + "-" + self.tag + "models"
    trainParameters = model.getStr(self.tag + "classifier-parameters-train")
    classifier = self.getClassifier(trainParameters)(self.connection)
    classifier.optimize(combinedTrainExamples, classifierWorkDir, trainParameters, testExampleFile,
                        model.get(self.tag + "ids.classes"), step="SUBMIT", evaluator=self.evaluator)
    model.save()
def train(output, task=None, detector=None, inputFiles=None, models=None, parse=None, processUnmerging=None, processModifiers=None, isSingleStage=False, bioNLPSTParams=None, preprocessorParams=None, exampleStyles=None, classifierParams=None, doFullGrid=False, deleteOutput=False, copyFrom=None, log="log.txt", step=None, omitSteps=None, debug=False, connection=None):
    """
    Train a new model for event or relation detection.

    @param output: A directory where output files will appear.
    @param task: If defined, overridable default settings are used for many of the training parameters. Must be one of the supported TEES tasks.
    @param detector: a Detector object, or a string defining one to be imported
    @param inputFiles: A dictionary of file names, with keys "train", "devel" and "test"
    @param models: A dictionary of file names defining the place for the new models, with keys "devel" and "test"
    @param parse: The parse element name in the training interaction XML
    @param processUnmerging: Use the unmerging step of EventDetector. True, False or None for task default.
    @param processModifiers: Use the modifier detection step of EventDetector. True, False or None for task default.
    @param isSingleStage: False for EventDetector, True for a single stage detector.
    @param bioNLPSTParams: Parameters controlling BioNLP ST format output.
    @param preprocessorParams: Parameters controlling the preprocessor. Not used for training, but saved to the model for use when classifying.
    @param exampleStyles: A parameter set for controlling example builders.
    @param classifierParams: A parameter set for controlling classifiers.
    @param doFullGrid: Whether all parameters, as opposed to just recall adjustment, are tested in the EventDetector grid search.
    @param deleteOutput: Remove an existing output directory
    @param copyFrom: Copy an existing output directory for use as a template
    @param log: An optional alternative name for the log file. None is for no logging.
    @param step: A step=substep pair, where the steps are "TRAIN", "DEVEL", "EMPTY" and "TEST"
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved
    @param connection: A parameter set defining a local or remote connection for training the classifier
    """
    # Insert default arguments where needed
    inputFiles = Parameters.get(inputFiles, {"train":None, "devel":None, "test":None})
    models = Parameters.get(models, {"devel":None, "test":None})
    exampleStyles = Parameters.get(exampleStyles, {"examples":None, "trigger":None, "edge":None, "unmerging":None, "modifiers":None})
    classifierParams = Parameters.get(classifierParams, {"examples":None, "trigger":None, "recall":None, "edge":None, "unmerging":None, "modifiers":None})
    processUnmerging = getDefinedBool(processUnmerging)
    processModifiers = getDefinedBool(processModifiers)
    # Initialize working directory
    workdir(output, deleteOutput, copyFrom, log)
    # Get task specific parameters
    taskSettings = getTaskSettings(task, detector, processUnmerging, processModifiers, isSingleStage,
                                   bioNLPSTParams, preprocessorParams, inputFiles, exampleStyles, classifierParams)
    (detector, processUnmerging, processModifiers, isSingleStage, bioNLPSTParams,
     preprocessorParams, exampleStyles, classifierParams, removeNamesFromEmpty) = taskSettings
    if task is not None:
        task = task.replace("-MINI", "").replace("-FULL", "")
    # Define processing steps
    selector, detectorSteps, omitDetectorSteps = getSteps(step, omitSteps, ["TRAIN", "DEVEL", "EMPTY", "TEST"])

    # Initialize the detector
    detector, detectorName = getDetector(detector)
    detector = detector()  # initialize object
    detector.debug = debug
    detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(bioNLPSTParams)
    connection = getConnection(connection)
    detector.setConnection(connection)
    connection.debug = debug
    if deleteOutput:
        connection.clearWorkDir()

    # Train
    if selector.check("TRAIN"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------------ Train Detector ------------------"
        print >> sys.stderr, "----------------------------------------------------"
        if isSingleStage:
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["examples"], classifierParams["examples"], parse, None, task,
                           fromStep=detectorSteps["TRAIN"], workDir="training")
        else:
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["trigger"], exampleStyles["edge"], exampleStyles["unmerging"], exampleStyles["modifiers"],
                           classifierParams["trigger"], classifierParams["edge"], classifierParams["unmerging"], classifierParams["modifiers"],
                           classifierParams["recall"], processUnmerging, processModifiers,
                           doFullGrid, task, parse, None,
                           fromStep=detectorSteps["TRAIN"], workDir="training")
        # Save the detector type
        for modelPath in [models["devel"], models["test"]]:
            # The model paths default to None, which os.path.exists cannot take as an argument
            if modelPath is None or not os.path.exists(modelPath):
                continue
            model = Model(modelPath, "a")
            model.addStr("detector", detectorName)
            if preprocessorParams is not None:
                preprocessor = Preprocessor()
                model.addStr("preprocessorParams", Parameters.toString(preprocessor.getParameters(preprocessorParams)))
            model.save()
            model.close()
    if selector.check("DEVEL"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Check devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        detector.classify(inputFiles["devel"], models["devel"], "classification-devel/devel", goldData=inputFiles["devel"], fromStep=detectorSteps["DEVEL"], workDir="classification-devel")
    if selector.check("EMPTY"):
        # By passing an emptied devel set through the prediction system, we can check that we get the same predictions
        # as in the DEVEL step, ensuring the model does not use leaked information.
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Empty devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        detector.classify(getEmptyCorpus(inputFiles["devel"], removeNames=removeNamesFromEmpty), models["devel"], "classification-empty/devel-empty", fromStep=detectorSteps["EMPTY"], workDir="classification-empty")
    if selector.check("TEST"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------- Test set classification --------------"
        print >> sys.stderr, "----------------------------------------------------"
        if inputFiles["test"] is None or not os.path.exists(inputFiles["test"]):
            print >> sys.stderr, "Skipping, test file", inputFiles["test"], "does not exist"
        else:
            detector.bioNLPSTParams["scores"] = False  # the evaluation server doesn't like additional files
            detector.classify(inputFiles["test"], models["test"], "classification-test/test", fromStep=detectorSteps["TEST"], workDir="classification-test")
            if detector.bioNLPSTParams["convert"]:
                Utils.STFormat.Compare.compare("classification-test/test-events.tar.gz", "classification-devel/devel-events.tar.gz", "a2")
def doGrid(self):
    """
    Run the parameter grid search over the trigger, recall booster and edge parameters.

    Every parameter combination is classified with the trigger and edge detectors and
    evaluated with evaluateGrid. The recall adjustment parameter of the best combination is
    saved to both the model and the combined model. When the full grid is used, the
    classifier models of the best combination are also added to the model.
    """
    print >> sys.stderr, "--------- Parameter grid search ---------"
    # Build trigger examples
    self.triggerDetector.buildExamples(self.model, [self.optData], [self.workDir + "grid-trigger-examples"])
    # The full grid tests all the training parameters, otherwise only the already
    # selected classifier parameter is used together with the recall boosters
    if self.fullGrid:
        parameterKey = "classifier-parameters-train"
    else:
        parameterKey = "classifier-parameter"
    stepParams = {}
    stepParams["trigger"] = Parameters.get(self.model.getStr(self.triggerDetector.tag + parameterKey, defaultIfNotExist=""), valueListKey="c")
    stepParams["booster"] = [float(i) for i in self.recallAdjustParameters.split(",")]
    stepParams["edge"] = Parameters.get(self.model.getStr(self.edgeDetector.tag + parameterKey, defaultIfNotExist=""), valueListKey="c")
    # The classifier parameter combinations are handled in their string form
    for step in ["trigger", "edge"]:
        stepParams[step] = [Parameters.toString(x) for x in Parameters.getCombinations(stepParams[step])]
    gridAxes = [stepParams[x] for x in ["trigger", "booster", "edge"]]
    print >> sys.stderr, "Parameters", gridAxes
    combined = combine(*gridAxes)
    print >> sys.stderr, "Combinations", combined
    paramCombinations = [{"trigger": x[0], "booster": x[1], "edge": x[2]} for x in combined]

    modelPath = os.path.normpath(self.model.path)
    edgeModelStem = os.path.join(self.edgeDetector.workDir, modelPath + "-edge-models/model")
    triggerModelStem = os.path.join(self.triggerDetector.workDir, modelPath + "-trigger-models/model")
    self.structureAnalyzer.load(self.model)
    bestResults = None
    prevParams = None
    separator = "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
    numCombinations = len(paramCombinations)
    for index, params in enumerate(paramCombinations):
        print >> sys.stderr, separator
        print >> sys.stderr, "Processing params", str(index + 1) + "/" + str(numCombinations), params
        print >> sys.stderr, separator
        # Triggers and Boost (the trigger predictions are recalculated only when the relevant parameters change)
        triggersUpToDate = (prevParams != None) and (prevParams["trigger"] == params["trigger"]) and (prevParams["booster"] == params["booster"])
        if not triggersUpToDate:
            print >> sys.stderr, "Classifying trigger examples for parameters", "trigger:" + str(params["trigger"]), "booster:" + str(params["booster"])
            xml = self.triggerDetector.classifyToXML(self.optData, self.model, self.workDir + "grid-trigger-examples", self.workDir + "grid-",
                                                     classifierModel=triggerModelStem + Parameters.toId(params["trigger"]),
                                                     recallAdjust=params["booster"], useExistingExamples=True)
        prevParams = params
        # Classify with pre-defined model
        edgeClassifierModel = edgeModelStem + Parameters.toId(params["edge"])
        xml = self.edgeDetector.classifyToXML(xml, self.model, self.workDir + "grid-edge-examples", self.workDir + "grid-",
                                              classifierModel=edgeClassifierModel, goldData=self.optData)
        bestResults = self.evaluateGrid(xml, params, bestResults)
    # Remove remaining intermediate grid files
    for tag1 in ["edge", "trigger", "unmerging"]:
        for tag2 in ["examples", "pred.xml.gz"]:
            gridFile = self.workDir + "grid-" + tag1 + "-" + tag2
            if os.path.exists(gridFile):
                os.remove(gridFile)
    print >> sys.stderr, "Parameter grid search complete"
    print >> sys.stderr, "Tested", numCombinations, "combinations"
    bestParams = bestResults[0]
    print >> sys.stderr, "Best parameters:", bestParams
    print >> sys.stderr, "Best result:", bestResults[2]  # f-score
    # Save grid model
    bestBooster = str(bestParams["booster"])
    self.saveStr("recallAdjustParameter", bestBooster, self.model)
    self.saveStr("recallAdjustParameter", bestBooster, self.combinedModel, False)
    if self.fullGrid:
        # Define best models
        # NOTE(review): the grid loop above builds the model paths with Parameters.toId, but
        # here the parameter string is appended as such -- confirm these name the same files
        self.triggerDetector.addClassifierModel(self.model, triggerModelStem + str(bestParams["trigger"]), bestParams["trigger"])
        self.edgeDetector.addClassifierModel(self.model, edgeModelStem + str(bestParams["edge"]), bestParams["edge"])
    # Remove work files
    # NOTE(review): "examples.gz" and "pred.xml.gz" have no leading "-", unlike the other
    # stems and the file names used above -- confirm whether this is intended
    workFileStems = ["-classifications", "-classifications.log", "examples.gz", "pred.xml.gz"]
    for stepTag in [self.workDir + "grid-trigger", self.workDir + "grid-edge", self.workDir + "grid-unmerging"]:
        for fileStem in workFileStems:
            workFile = stepTag + fileStem
            if os.path.exists(workFile):
                os.remove(workFile)
def train(output, task=None, detector=None, inputFiles=None, models=None, parse=None, processUnmerging=None, processModifiers=None, bioNLPSTParams=None, preprocessorParams=None, exampleStyles=None, classifierParams=None, doFullGrid=False, deleteOutput=False, copyFrom=None, log="log.txt", step=None, omitSteps=None, debug=False, connection=None, subset=None, folds=None, corpusDir=None, corpusPreprocessing=None, evaluator=None):
    """
    Train a new model for event or relation detection.

    @param output: A directory where output files will appear.
    @param task: If defined, overridable default settings are used for many of the training parameters. Must be one of the supported TEES tasks.
    @param detector: a Detector object, or a string defining one to be imported
    @param inputFiles: A dictionary of file names, with keys "train", "devel" and "test"
    @param models: A dictionary of file names defining the place for the new models, with keys "devel" and "test"
    @param parse: The parse element name in the training interaction XML
    @param processUnmerging: Use the unmerging step of EventDetector. True, False or None for task default.
    @param processModifiers: Use the modifier detection step of EventDetector. True, False or None for task default.
    @param bioNLPSTParams: Parameters controlling BioNLP ST format output.
    @param preprocessorParams: Parameters controlling the preprocessor. Not used for training, but saved to the model for use when classifying.
    @param exampleStyles: A parameter set for controlling example builders.
    @param classifierParams: A parameter set for controlling classifiers.
    @param doFullGrid: Whether all parameters, as opposed to just recall adjustment, are tested in the EventDetector grid search.
    @param deleteOutput: Remove an existing output directory
    @param copyFrom: Copy an existing output directory for use as a template
    @param log: An optional alternative name for the log file. None is for no logging.
    @param step: A step=substep pair, where the steps are "TRAIN", "DEVEL", "EMPTY" and "TEST"
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved
    @param connection: A parameter set defining a local or remote connection for training the classifier
    @param subset: A parameter set for making subsets of input files
    @param folds: A dictionary with keys "train", "devel" and "test", passed to getTaskSettings and getFolds
    @param corpusDir: Passed to getTaskSettings
    @param corpusPreprocessing: If defined, the steps of a Preprocessor that is run on the input files before training. The first step must be MERGE_SETS and the last one DIVIDE_SETS. The task is used for naming the preprocessed files.
    @param evaluator: Passed to getDetector and importClass. If a class is imported it is set as the evaluator of the detector.
    """
    # Insert default arguments where needed
    inputFiles = setDictDefaults(inputFiles, {"train":None, "devel":None, "test":None})
    models = setDictDefaults(models, {"devel":"model-devel", "test":"model-test"})
    exampleStyles = setDictDefaults(exampleStyles, {"examples":None, "trigger":None, "edge":None, "unmerging":None, "modifiers":None})
    classifierParams = setDictDefaults(classifierParams, {"examples":None, "trigger":None, "recall":None, "edge":None, "unmerging":None, "modifiers":None})
    subset = setDictDefaults(Parameters.get(subset), {"train":None, "devel":None, "test":None, "seed":0, "all":None})
    folds = setDictDefaults(folds, {"train":None, "devel":None, "test":None})
    processUnmerging = getDefinedBool(processUnmerging)
    processModifiers = getDefinedBool(processModifiers)
    # Initialize working directory
    workdir(output, deleteOutput, copyFrom, log)
    # Get task specific parameters
    useKerasDetector = False
    # The detector can also be an object, in which case it has no name to check
    if detector is not None and hasattr(detector, "lower") and "keras" in detector.lower():
        print >> sys.stderr, "Using a Keras Detector"
        useKerasDetector = True
        if detector.lower() == "keras":
            detector = None
    detector, bioNLPSTParams, preprocessorParams, folds = getTaskSettings(task, detector,
        bioNLPSTParams, preprocessorParams, inputFiles, exampleStyles, classifierParams, folds,
        corpusDir=corpusDir, useKerasDetector=useKerasDetector)
    # Learn training settings from input files
    detector = learnSettings(inputFiles, detector, classifierParams, task, exampleStyles, useKerasDetector=useKerasDetector)
    # Get corpus subsets
    getFolds(inputFiles, folds)
    getSubsets(inputFiles, subset)
    if task is not None:
        task = task.replace("-FULL", "")
        # The subtask can be checked only when a task has been defined
        if "." in task:
            _, subTask = getSubTask(task)
            if subTask != 3:
                processModifiers = False
    # Preprocess the corpus if required
    if corpusPreprocessing is not None:
        preprocessor = Preprocessor(steps=corpusPreprocessing)
        assert preprocessor.steps[0].name == "MERGE_SETS"
        assert preprocessor.steps[-1].name == "DIVIDE_SETS"
        preprocessedCorpusDir = os.path.join(output, "corpus")
        preprocessor.process(inputFiles, os.path.join(preprocessedCorpusDir, task))
        # Continue with the preprocessed files, which are named after the task and the set
        for setName in inputFiles.keys():
            if inputFiles[setName] is not None:
                inputFiles[setName] = os.path.join(preprocessedCorpusDir, task + "-" + setName + ".xml")
    # Define processing steps
    selector, detectorSteps, omitDetectorSteps = getSteps(step, omitSteps, ["TRAIN", "DEVEL", "EMPTY", "TEST"])

    # Initialize the detector
    detector, detectorName = getDetector(detector, evaluator=evaluator)
    evaluator, evaluatorName = importClass(evaluator, "evaluator")
    detector = detector()  # initialize object
    if evaluator is not None:
        print >> sys.stderr, "Using evaluator", evaluator.__name__
        detector.evaluator = evaluator
    detector.debug = debug
    detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(bioNLPSTParams)
    connection = getConnection(connection)
    detector.setConnection(connection)
    connection.debug = debug
    if deleteOutput:
        connection.clearWorkDir()

    # Train
    if selector.check("TRAIN"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------------ Train Detector ------------------"
        print >> sys.stderr, "----------------------------------------------------"
        if not isinstance(detector, EventDetector):
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["examples"], classifierParams["examples"], parse, None, task,
                           fromStep=detectorSteps["TRAIN"], workDir="training", testData=inputFiles["test"])
        else:
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["trigger"], exampleStyles["edge"], exampleStyles["unmerging"], exampleStyles["modifiers"],
                           classifierParams["trigger"], classifierParams["edge"], classifierParams["unmerging"], classifierParams["modifiers"],
                           classifierParams["recall"], processUnmerging, processModifiers,
                           doFullGrid, task, parse, None,
                           fromStep=detectorSteps["TRAIN"], workDir="training", testData=inputFiles["test"])
        # Save the detector type
        for modelPath in [models["devel"], models["test"]]:
            if modelPath is None or not os.path.exists(modelPath):
                continue
            model = Model(modelPath, "a")
            model.addStr("detector", detectorName)
            if evaluatorName is not None:
                # The evaluator has its own key so that it does not replace the detector name saved above
                model.addStr("evaluator", evaluatorName)
            if preprocessorParams is not None:
                preprocessor = Preprocessor()
                model.addStr("preprocessorParams", Parameters.toString(preprocessor.getParameters(preprocessorParams)))
            model.save()
            model.close()
    if selector.check("DEVEL"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Check devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        detector.classify(inputFiles["devel"], models["devel"], "classification-devel/devel", goldData=inputFiles["devel"], fromStep=detectorSteps["DEVEL"], workDir="classification-devel")
    if selector.check("EMPTY"):
        # By passing an emptied devel set through the prediction system, we can check that we get the same predictions
        # as in the DEVEL step, ensuring the model does not use leaked information.
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Empty devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        removalScope = "non-given"
        if "names" in str(exampleStyles["examples"]) or "names" in str(exampleStyles["trigger"]):
            removalScope = "all"
        elif "Edge" in detector.__class__.__name__:
            removalScope = "interactions"
        detector.classify(getEmptyCorpus(inputFiles["devel"], scope=removalScope), models["devel"], "classification-empty/devel-empty", fromStep=detectorSteps["EMPTY"], workDir="classification-empty")
        print >> sys.stderr, "*** Evaluate empty devel classification ***"
        if os.path.exists("classification-empty/devel-empty-pred.xml.gz"):
            EvaluateInteractionXML.run(detector.evaluator, "classification-empty/devel-empty-pred.xml.gz", inputFiles["devel"], parse)
        else:
            print >> sys.stderr, "No output file for evaluation"
    if selector.check("TEST"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------- Test set classification --------------"
        print >> sys.stderr, "----------------------------------------------------"
        if inputFiles["test"] is None or not os.path.exists(inputFiles["test"]):
            print >> sys.stderr, "Skipping, test file", inputFiles["test"], "does not exist"
        else:
            # The devel model is used if no separate test model has been defined
            testModel = models["test"] if models["test"] is not None else models["devel"]
            detector.classify(inputFiles["test"], testModel, "classification-test/test", fromStep=detectorSteps["TEST"], workDir="classification-test")
            if detector.bioNLPSTParams["convert"]:
                extension = ".zip" if (detector.bioNLPSTParams["convert"] == "zip") else ".tar.gz"
                Utils.STFormat.Compare.compare("classification-test/test-events" + extension, "classification-devel/devel-events" + extension, "a2")
    # Stop logging
    if log is not None:
        Stream.closeLog(log)