Esempio n. 1
0
def getConnection(connection): #, account=None, workDirBase=None, remoteSettingsPath=None):
    """
    Create a connection object from a connection definition.

    @param connection: One of
        - None: a default local connection ("connection=Unix:jobLimit=1") is returned
        - the name of an attribute of Settings: the value of that attribute is used as the definition
        - a parameter string or dictionary, parsed with Parameters.get
    @return: An instance of the class named "<connection>Connection", constructed with every other
        parameter whose value is not None as a keyword argument.
    @raise NameError: If no class with that name exists in this module's namespace.
    """
    if connection is None: # return a "dummy" local connection
        return getConnection("connection=Unix:jobLimit=1")
    if isinstance(connection, types.StringTypes) and hasattr(Settings, connection): # connection is a Settings key
        print >> sys.stderr, "Using connection", connection
        return getConnection(getattr(Settings, connection))
    # connection is a parameter string or dictionary
    defaultParams = dict.fromkeys(["connection", "account", "workdir", "settings", "memory", "cores", "modules", "wallTime", "jobLimit", "preamble", "debug"])
    defaultParams["debug"] = False
    connection = Parameters.get(connection, valueListKey="connection", valueTypes={"debug":[bool]}, defaults=defaultParams)
    if connection["connection"] is None:
        connection["connection"] = "Unix"
    if connection["account"] is None:
        assert connection["workdir"] is None
        print >> sys.stderr, "New local connection", Parameters.toString(connection)
    else:
        print >> sys.stderr, "New remote connection:", Parameters.toString(connection)
    # Make the connection. The class is looked up by name in the module namespace
    # instead of exec'ing a statement built from the connection string, so that a
    # connection definition can never be executed as code.
    className = connection["connection"] + "Connection"
    try:
        ConnectionClass = globals()[className]
    except KeyError:
        raise NameError("name '" + className + "' is not defined")
    # Only explicitly defined parameters are passed on to the constructor
    connectionArgs = {}
    for key in connection:
        if key != "connection" and connection[key] is not None:
            connectionArgs[key] = connection[key]
    return ConnectionClass(**connectionArgs)
Esempio n. 2
0
 def addClassifierModel(self, model, classifierModelPath, classifierParameters, threshold=None):
     """
     Store a classifier model file and its parameters in a model.

     @param model: the model object receiving the classifier; it must provide get() and addStr()
     @param classifierModelPath: path of the classifier model file to copy
     @param classifierParameters: classifier parameters, passed through Parameters.get before saving
     @param threshold: optional value, saved as a string under "<tag>threshold" if defined
     @return: the path returned by model.get(), to which the classifier file was copied
     """
     tag = self.tag
     # Ask the model where the classifier file belongs and copy the file there
     destination = model.get(tag + "classifier-model", True)
     shutil.copy2(classifierModelPath, destination)
     # The parameters are saved as a string
     parameterString = Parameters.toString(Parameters.get(classifierParameters))
     model.addStr(tag + "classifier-parameter", parameterString)
     if threshold != None:
         model.addStr(tag + "threshold", str(threshold))
     return destination
Esempio n. 3
0
 def addClassifierModel(self, model, classifierModelPath, classifierParameters, threshold=None):
     """
     Copy a classifier model file into a model and record its parameters there.

     @param model: target model object (uses its get() and addStr() methods)
     @param classifierModelPath: the classifier model file that is copied
     @param classifierParameters: parameters of the classifier; normalized with Parameters.get
         and saved as a string under "<tag>classifier-parameter"
     @param threshold: if not None, saved as a string under "<tag>threshold"
     @return: the location inside the model, as returned by model.get()
     """
     modelFileKey = self.tag + "classifier-model"
     storedPath = model.get(modelFileKey, True)
     shutil.copy2(classifierModelPath, storedPath)
     normalized = Parameters.get(classifierParameters)
     model.addStr(self.tag + "classifier-parameter", Parameters.toString(normalized))
     if threshold != None:
         thresholdKey = self.tag + "threshold"
         model.addStr(thresholdKey, str(threshold))
     return storedPath
Esempio n. 4
0
 def saveModel(self, teesModel, tag=""):
     """
     Save the classifier state of this object into a TEES model.

     Each of the attributes "model", "parameters" and "threshold" is saved only
     if it exists on this object and is not None.

     @param teesModel: the model object to save into (uses its get() and addStr() methods)
     @param tag: a prefix for the names under which the values are stored
     """
     def isDefined(name):
         # An attribute counts as defined when it exists and is not None
         return hasattr(self, name) and getattr(self, name) != None

     if isDefined("model"):
         # Copy the classifier model file to the path given by the TEES model
         destination = teesModel.get(tag + "classifier-model", True)
         shutil.copy2(self.model, destination)
     if isDefined("parameters"):
         parameterString = Parameters.toString(Parameters.get(self.parameters))
         teesModel.addStr(tag + "classifier-parameter", parameterString)
     if isDefined("threshold"):
         teesModel.addStr(tag + "threshold", str(self.threshold))
Esempio n. 5
0
 def saveModel(self, teesModel, tag=""):
     """
     Write this object's classifier model file, parameters and threshold to a TEES model.

     An attribute that is missing from this object, or is None, is skipped.

     @param teesModel: the target model object (uses its get() and addStr() methods)
     @param tag: prefix added to the names "classifier-model", "classifier-parameter" and "threshold"
     """
     # Classifier model file
     hasModelFile = hasattr(self, "model") and self.model != None
     if hasModelFile:
         targetPath = teesModel.get(tag + "classifier-model", True)
         shutil.copy2(self.model, targetPath)
     # Classifier parameters, saved as a string
     hasParameters = hasattr(self, "parameters") and self.parameters != None
     if hasParameters:
         normalized = Parameters.get(self.parameters)
         teesModel.addStr(tag + "classifier-parameter", Parameters.toString(normalized))
     # Threshold, saved as a string
     hasThreshold = hasattr(self, "threshold") and self.threshold != None
     if hasThreshold:
         teesModel.addStr(tag + "threshold", str(self.threshold))
Esempio n. 6
0
 def initModel(self, model, saveParams=[]):
     """
     Prepare a model for writing and save selected attributes of this object into it.

     @param model: None (returned as such), a string (opened with self.openModel in "w" mode)
         or a model object, whose mode must be "a" or "w"
     @param saveParams: a sequence of items where item[0] is the name of an attribute of this
         object and item[1] is the name under which its value is saved in the model
     @return: the saved model object, or the model argument itself if it equals None
     """
     if model == None:
         return model
     if type(model) in types.StringTypes:
         model = self.openModel(model, "w")
     else:
         assert model.mode in ("a", "w")
     for entry in saveParams:
         attributeName, modelKey = entry[0], entry[1]
         model.addStr(modelKey, Parameters.toString(getattr(self, attributeName)))
     model.save()
     return model
Esempio n. 7
0
 def initModel(self, model, saveParams=[]):
     """
     Open the model if needed, store the listed attributes in it and save it.

     @param model: a model object in mode "a" or "w", a string that is opened with
         self.openModel(model, "w"), or None, in which case nothing is done
     @param saveParams: pairs of (attribute name on this object, name used in the model)
     @return: the model that was saved, or the unchanged argument if it equals None
     """
     if model == None:
         return model
     givenAsString = type(model) in types.StringTypes
     if not givenAsString:
         assert model.mode in ["a", "w"]
     else:
         model = self.openModel(model, "w")
     for param in saveParams:
         value = getattr(self, param[0])
         model.addStr(param[1], Parameters.toString(value))
     model.save()
     return model
Esempio n. 8
0
def getConnection(
        connection
):  #, account=None, workDirBase=None, remoteSettingsPath=None):
    """
    Build the connection object described by a connection definition.

    @param connection: None for a default local connection ("connection=Unix:jobLimit=1"),
        the name of an attribute of Settings whose value is the actual definition,
        or a parameter string / dictionary that is parsed with Parameters.get.
    @return: An instance of the class called "<connection>Connection". All other
        parameters that are not None are given to it as keyword arguments.
    @raise NameError: If this module's namespace has no class with that name.
    """
    if connection is None:  # return a "dummy" local connection
        return getConnection("connection=Unix:jobLimit=1")
    if isinstance(connection, types.StringTypes) and hasattr(
            Settings, connection):  # connection is a Settings key
        print >> sys.stderr, "Using connection", connection
        return getConnection(getattr(Settings, connection))

    # connection is a parameter string or dictionary
    defaultParams = dict.fromkeys([
        "connection", "account", "workdir", "settings", "memory", "cores",
        "modules", "wallTime", "jobLimit", "preamble", "debug"
    ])
    defaultParams["debug"] = False
    connection = Parameters.get(connection,
                                valueListKey="connection",
                                valueTypes={"debug": [bool]},
                                defaults=defaultParams)
    if connection["connection"] is None:
        connection["connection"] = "Unix"
    if connection["account"] is None:
        assert connection["workdir"] is None
        print >> sys.stderr, "New local connection", Parameters.toString(
            connection)
    else:
        print >> sys.stderr, "New remote connection:", Parameters.toString(
            connection)

    # Make the connection. Resolving the class through the module namespace,
    # rather than exec'ing a statement assembled from the connection string,
    # guarantees that the definition is only ever used as a name.
    className = connection["connection"] + "Connection"
    try:
        ConnectionClass = globals()[className]
    except KeyError:
        raise NameError("name '" + className + "' is not defined")
    # Undefined (None) parameters are left for the class to fill in
    connectionArgs = {}
    for key in connection:
        if key != "connection" and connection[key] is not None:
            connectionArgs[key] = connection[key]
    return ConnectionClass(**connectionArgs)
Esempio n. 9
0
 def beginModel(self, step, model, trainExampleFiles, testExampleFile, importIdsFromModel=None):
     """
     Begin the training process leading to a new model.

     Nothing is done unless self.checkStep(step, False) is true and a model is given.

     @param step: the name of the current step, checked with self.checkStep
     @param model: the model to create; opened with self.openModel in "w" mode
     @param trainExampleFiles: a single file name, or a list of gzipped example files. Several
         files are catenated into one combined gzip file in the work directory.
     @param testExampleFile: passed to the classifier's optimize method
     @param importIdsFromModel: an optional existing model from which the class and feature ids,
         "structure.txt" and the classifier and example parameters are imported
     """
     if self.checkStep(step, False):
         if model is not None:
             if self.state is not None and step is not None:
                 print >> sys.stderr, self.__class__.__name__ + ":" + self.state + ":" + step
             # Create combined model
             model = self.openModel(model, "w")
             assert model.mode in ["a", "w"], (model.path, model.mode)
             # Information can be imported from an existing model. In this case, model is trained
             # with the parameter already defined in the import source. This is used when training
             # the combined model.
             if importIdsFromModel is not None:
                 model.importFrom(self.openModel(importIdsFromModel, "r"), [self.tag+"ids.classes", self.tag+"ids.features", "structure.txt"],
                                  [self.tag+"classifier-parameter", self.tag+"example-style", self.tag+"parse", self.tag+"task"])
                 # Train the model with the parameters defined in the import source
                 model.addStr(self.tag+"classifier-parameters-train", model.getStr(self.tag+"classifier-parameter"))
             if self.bioNLPSTParams is not None and len(self.bioNLPSTParams) > 0:
                 model.addStr("BioNLPSTParams", Parameters.toString(self.bioNLPSTParams))
             # Catenate example files
             if type(trainExampleFiles) in types.StringTypes:
                 combinedTrainExamples = trainExampleFiles
             elif len(trainExampleFiles) == 1:
                 combinedTrainExamples = trainExampleFiles[0]
             else:
                 combinedTrainExamples = self.workDir + os.path.normpath(model.path)+"-"+self.tag+"combined-examples.gz"
                 combinedTrainExamplesFile = gzip.open(combinedTrainExamples, 'wb')
                 # try/finally ensures that both the combined file and every source
                 # file are closed, also when copying fails
                 try:
                     for trainExampleFile in trainExampleFiles:
                         print >> sys.stderr, "Catenating", trainExampleFile, "to", combinedTrainExamples
                         sourceFile = gzip.open(trainExampleFile, 'rb')
                         try:
                             shutil.copyfileobj(sourceFile, combinedTrainExamplesFile)
                         finally:
                             sourceFile.close()
                 finally:
                     combinedTrainExamplesFile.close()
             # Upload training model
             # The parameter grid is stored in the model as "*classifier-parameters-train" so that endModel can
             # use it, and also as annotation for the trained model. The final selected parameter will
             # be stored as "*classifier-parameter"
             classifierWorkDir = self.workDir + os.path.normpath(model.path) + "-" + self.tag + "models"
             trainParameters = model.getStr(self.tag+"classifier-parameters-train")
             classifier = self.getClassifier(trainParameters)(self.connection)
             classifier.optimize(combinedTrainExamples, classifierWorkDir, trainParameters, testExampleFile, model.get(self.tag+"ids.classes"), step="SUBMIT", evaluator=self.evaluator)
             model.save()
Esempio n. 10
0
 def process(self, input, output, parameters=None, model=None, fromStep=None, toStep=None, omitSteps=None):
     """
     Run the steps of the tool chain on the input and return the resulting XML.

     @param input: the source data, passed to initVariables as both "source" and "xml"
     @param output: the output path. Its directory part is used as the output directory and its
         file name part as the tag of intermediate files. If it is a directory, the final
         XML is not written to disk.
     @param parameters: tool chain parameters, resolved with self.getParameters
     @param model: passed to self.getParameters together with the parameters
     @param fromStep: passed to self.enterState together with the step names
     @param toStep: passed to self.enterState together with the step names
     @param omitSteps: passed to self.enterState together with the step names
     @return: the XML if self.state is None after exiting the state, otherwise None
     """
     self.initVariables(source=input, xml=input, outDir=os.path.dirname(output))
     outputName = os.path.basename(output)
     self.intermediateFileTag = outputName if outputName != "" else ""
     stepNames = [x[0] for x in self.steps]
     self.enterState(self.STATE_TOOLCHAIN, stepNames, fromStep, toStep, omitSteps)
     parameters = self.getParameters(parameters, model, defaultValue=NOTHING)
     self.applyParameters(parameters)
     # Run the tools
     print >> sys.stderr, "Tool chain parameters:", Parameters.toString(parameters, skipKeysWithValues=[NOTHING], skipDefaults=self.getDefaultParameters())
     if os.path.exists(output) and not os.path.isdir(output):
         print >> sys.stderr, "Removing existing preprocessor output file", output
         os.remove(output)
     pendingIntermediate = None # Output from a previous step if "fromStep" is used
     for step in self.steps:
         stepName = step[0]
         if not self.checkStep(stepName):
             # A step that was run earlier may have left an intermediate file to continue from
             if self.getStepStatus(stepName) == "BEFORE":
                 pendingIntermediate = self.getIntermediateFilePath(step)
             continue
         if pendingIntermediate != None: # A previous run of the program saved an intermediate file
             print >> sys.stderr, "Reading input from saved intermediate file", pendingIntermediate
             self.xml = ETUtils.ETFromObj(pendingIntermediate)
             pendingIntermediate = None
         tool = step[1]
         ioNames = step[4]
         stepArgs = copy.copy(step[2]) # make a copy of the arguments to which i/o can be added
         stepArgs[ioNames["input"]] = self.xml # the input
         if self.getIntermediateFilePath(step) != None: # this step should save an intermediate file
             stepArgs[ioNames["output"]] = self.getIntermediateFilePath(step)
         print >> sys.stderr, "Running step", stepName, "with arguments", stepArgs
         tool(**stepArgs) # call the tool
     # End state and return
     finalXML = self.xml # state-specific member variable self.xml will be removed when exiting state
     self.exitState()
     if self.state != None:
         return None
     # The whole toolchain has finished, return the final product
     if not os.path.isdir(output): # if output is a directory, it was given only for storing intermediate files ...
         ETUtils.write(finalXML, output) # ... otherwise, save the final output
     return finalXML
Esempio n. 11
0
def learnSettings(inputFiles, detector, classifierParameters, task, exampleStyles, useKerasDetector=False):
    """
    Choose the detector and fill in default classifier parameters and Keras example styles.

    If no detector is given, the "train" and "devel" input files are analyzed with a
    StructureAnalyzer (or a saved analysis is loaded from "training/structure.txt") and the
    detector is chosen based on the targets found. The classifierParameters and exampleStyles
    dictionaries are updated in place.

    @param inputFiles: a dictionary of input files, with the keys "train" and "devel"
    @param detector: the name of the detector, or None for choosing it automatically
    @param classifierParameters: a dictionary of classifier parameters for each training step
    @param task: the name of the task, used for choosing the Keras example styles
    @param exampleStyles: a dictionary of example styles for each training step
    @param useKerasDetector: use the Keras version of the detector
    @return: the name of the detector
    """
    if detector == None:
        print >> sys.stderr, "*** Analyzing input files to determine training settings ***"
        structureAnalyzer = StructureAnalyzer()
        if os.path.exists("training/structure.txt"):
            print >> sys.stderr, "Using existing analysis from training/structure.txt"
            structureAnalyzer.load(None, "training/structure.txt")
        else:
            datasets = sorted(filter(None, [inputFiles["train"], inputFiles["devel"]]))
            print >> sys.stderr, "input files:", datasets
            structureAnalyzer.analyze(datasets)
            print >> sys.stderr, structureAnalyzer.toString()
            structureAnalyzer.save(None, "training/structure.txt")

        # Choose detector
        targets = structureAnalyzer.targets
        hasEntities = "ENTITY" in targets
        hasInteractions = "INTERACTION" in targets
        if hasEntities and hasInteractions:
            detector = "Detectors.EventDetector"
        elif hasEntities:
            detector = "Detectors.EntityDetector"
        elif hasInteractions:
            detector = "Detectors.EdgeDetector"
        else:
            assert False, structureAnalyzer.targets

    if useKerasDetector and "Keras" not in detector:
        detector = detector.replace("Detectors.", "Detectors.Keras")
    print >> sys.stderr, "Using detector '" + str(detector) + "'"

    # Set default parameters
    triggerGrid = "c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000"
    edgeGrid = "c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000"
    unmergingGrid = "c=1,10,100,500,1000,1500,2500,5000,10000,20000,50000,80000,100000"
    cp = classifierParameters
    if detector == "Detectors.EventDetector":
        # Add common classifier parameters
        if cp["examples"] != None:
            for stepKey in ["unmerging", "modifiers", "edge", "trigger"]:
                cp[stepKey] = Parameters.cat(cp["examples"], cp[stepKey])
        eventDefaults = [
            ("unmerging", unmergingGrid, "Classifier parameters for unmerging"),
            ("modifiers", "c=5000,10000,20000,50000,100000", "Classifier parameters for modifiers"),
            ("edge", edgeGrid, "Classifier parameters for edges"),
            ("trigger", triggerGrid, "Classifier parameters for triggers"),
            ("recall", "0.5,0.6,0.65,0.7,0.85,1.0,1.1,1.2", "Recall adjustment parameters")]
        for stepKey, defaultValue, message in eventDefaults:
            cp[stepKey] = Parameters.cat(defaultValue, cp[stepKey], message)
    elif detector == "Detectors.EntityDetector":
        cp["examples"] = Parameters.cat(triggerGrid, cp["examples"], "Classifier parameters for entities")
    elif detector == "Detectors.EdgeDetector":
        cp["examples"] = Parameters.cat(edgeGrid, cp["examples"], "Classifier parameters for edges")
    elif detector == "Detectors.UnmergingDetector":
        cp["examples"] = Parameters.cat(unmergingGrid, cp["examples"], "Classifier parameters for unmerging")

    #######################################################################
    # Keras example styles
    #######################################################################
    if not useKerasDetector:
        return detector

    task, subTask = getSubTask(task)
    # Styles marked with "override" replace the defaults of one step, styles marked
    # with "override_all" are applied to every step
    overrideStyles = {"all":{}}
    for key in exampleStyles:
        overrideStyles[key] = {}
        params = Parameters.get(exampleStyles[key])
        if "override" in params:
            params.pop("override")
            overrideStyles[key] = params
            exampleStyles[key] = None
        elif "override_all" in params:
            params.pop("override_all")
            overrideStyles["all"] = params
            exampleStyles[key] = None
    print >> sys.stderr, "Override styles:", overrideStyles

    if "EventDetector" in detector:
        if task == "EPI11":
            triggerStyle = "keras:epochs=500:patience=10:nf=512:path=4:el=41:mods=20:epi_merge_negated"
        else:
            triggerStyle = "keras:epochs=500:patience=10:nf=512:path=4:el=41:mods=20"
        exampleStyles["trigger"] = Parameters.cat(triggerStyle, exampleStyles["trigger"])
        if task in ["GE09", "GE11", "GE13"] and subTask == 1:
            edgeStyle = "keras:genia_task1:epochs=500:patience=10:nf=256:path=4:ol=15:mods=20"
        else:
            edgeStyle = "keras:epochs=500:patience=10:nf=256:path=4:ol=15:mods=20"
        exampleStyles["edge"] = Parameters.cat(edgeStyle, exampleStyles["edge"])
        exampleStyles["unmerging"] = Parameters.cat("keras:epochs=500:patience=10:nf=256:path=4:ol=15:mods=20", exampleStyles["unmerging"])
        exampleStyles["modifiers"] = Parameters.cat("keras:epochs=500:patience=10:nf=256:path=4:el=41:mods=20", exampleStyles["modifiers"])
    elif "EntityDetector" in detector:
        if task == "DDI13T91":
            entityStyle = "keras:epochs=500:patience=10:nf=512:path=4:el=41:mods=20:names:build_for_nameless"
        else:
            entityStyle = "keras:epochs=500:patience=10:nf=512:path=4:el=41:mods=20"
        exampleStyles["examples"] = Parameters.cat(entityStyle, exampleStyles["examples"])
    elif "EdgeDetector" in detector:
        if "DDI" in task:
            relationStyle = "keras:epochs=500:patience=10:nf=256:path=0:do=0.2:dense=800:ol=50:mods=20"
        elif task == "CP17":
            relationStyle = "keras:epochs=500:patience=10:nf=512:path=0:do=0.2:ol=50:skip_labels=CPR\:0,CPR\:1,CPR\:2,CPR\:7,CPR\:8,CPR\:10:mods=20"
        else:
            relationStyle = "keras:epochs=500:patience=10:nf=256:path=4:ol=15:mods=20"
        exampleStyles["examples"] = Parameters.cat(relationStyle, exampleStyles["examples"])
    print >> sys.stderr, "Keras initial example styles:", exampleStyles

    # Apply the overrides on top of the default styles
    for key in exampleStyles:
        if exampleStyles[key] != None:
            merged = Parameters.get(exampleStyles[key])
            merged.update(overrideStyles[key])
            merged.update(overrideStyles["all"])
            exampleStyles[key] = Parameters.toString(merged)
        print >> sys.stderr, "Keras final example style for " + key + ": ", exampleStyles[key]

    return detector
Esempio n. 12
0
    def doGrid(self):
        """
        Search the grid of trigger, recall adjustment ("booster") and edge parameters.

        Every parameter combination is used for classifying self.optData, first with
        the trigger detector and then with the edge detector, and the result is passed
        to self.evaluateGrid. The recall adjustment parameter of the best combination
        is saved, and if self.fullGrid is set, the trigger and edge classifier models
        of the best combination are added to self.model.
        """
        print >> sys.stderr, "--------- Parameter grid search ---------"
        # Build trigger examples
        self.triggerDetector.buildExamples(self.model, [self.optData], [self.workDir+"grid-trigger-examples.gz"])

        # The full grid reads the parameters stored as "classifier-parameters-train",
        # otherwise the ones stored as "classifier-parameter" are used
        if self.fullGrid:
            paramName = "classifier-parameters-train"
        else:
            paramName = "classifier-parameter"
        stepParams = {
            "trigger":Parameters.get(self.model.getStr(self.triggerDetector.tag+paramName, defaultIfNotExist=""), valueListKey="c"),
            "booster":[float(i) for i in self.recallAdjustParameters.split(",")],
            "edge":Parameters.get(self.model.getStr(self.edgeDetector.tag+paramName, defaultIfNotExist=""), valueListKey="c")}

        for step in ["trigger", "edge"]:
            stepParams[step] = [Parameters.toString(x) for x in Parameters.getCombinations(stepParams[step])]
        orderedParams = [stepParams[x] for x in ["trigger", "booster", "edge"]]
        print >> sys.stderr, orderedParams
        paramCombinations = combine(*orderedParams)
        print >> sys.stderr, paramCombinations
        paramCombinations = [{"trigger":x[0], "booster":x[1], "edge":x[2]} for x in paramCombinations]

        prevParams = None
        EDGE_MODEL_STEM = os.path.join(self.edgeDetector.workDir, os.path.normpath(self.model.path)+"-edge-models/model")
        TRIGGER_MODEL_STEM = os.path.join(self.triggerDetector.workDir, os.path.normpath(self.model.path)+"-trigger-models/model")
        self.structureAnalyzer.load(self.model)
        bestResults = None
        for i, params in enumerate(paramCombinations):
            print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
            print >> sys.stderr, "Processing params", str(i+1) + "/" + str(len(paramCombinations)), params
            print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
            # Triggers and Boost
            # The triggers must be reclassified when either the trigger parameters or the
            # recall adjustment ("booster") change, as both are used in the classification
            if prevParams is None or prevParams["trigger"] != params["trigger"] or prevParams["booster"] != params["booster"]:
                print >> sys.stderr, "Classifying trigger examples for parameters", "trigger:" + str(params["trigger"]), "booster:" + str(params["booster"])
                xml = self.triggerDetector.classifyToXML(self.optData, self.model, self.workDir+"grid-trigger-examples", self.workDir+"grid-", classifierModel=TRIGGER_MODEL_STEM + Parameters.toId(params["trigger"]), recallAdjust=params["booster"])
            prevParams = params
            # Classify with pre-defined model
            # NOTE(review): when only the edge parameters change, the input here is the edge
            # classification output of the previous combination -- confirm this is intended
            edgeClassifierModel = EDGE_MODEL_STEM + Parameters.toId(params["edge"])
            xml = self.edgeDetector.classifyToXML(xml, self.model, self.workDir+"grid-edge-examples", self.workDir+"grid-", classifierModel=edgeClassifierModel, goldData=self.optData)
            bestResults = self.evaluateGrid(xml, params, bestResults)
        # Remove remaining intermediate grid files
        for tag1 in ["edge", "trigger", "unmerging"]:
            for tag2 in ["examples", "pred.xml.gz"]:
                if os.path.exists(self.workDir+"grid-"+tag1+"-"+tag2):
                    os.remove(self.workDir+"grid-"+tag1+"-"+tag2)
        print >> sys.stderr, "Parameter grid search complete"
        print >> sys.stderr, "Tested", len(paramCombinations), "combinations"
        print >> sys.stderr, "Best parameters:", bestResults[0]
        print >> sys.stderr, "Best result:", bestResults[2] # f-score
        # Save grid model
        self.saveStr("recallAdjustParameter", str(bestResults[0]["booster"]), self.model)
        self.saveStr("recallAdjustParameter", str(bestResults[0]["booster"]), self.combinedModel, False)
        if self.fullGrid: # define best models
            # NOTE(review): the model paths are built here with str(), but with Parameters.toId()
            # when classifying above -- verify that both give the same file name
            self.triggerDetector.addClassifierModel(self.model, TRIGGER_MODEL_STEM+str(bestResults[0]["trigger"]), bestResults[0]["trigger"])
            self.edgeDetector.addClassifierModel(self.model, EDGE_MODEL_STEM+str(bestResults[0]["edge"]), bestResults[0]["edge"])
        # Remove work files
        # NOTE(review): "examples.gz" and "pred.xml.gz" have no leading dash, so e.g. the file
        # "grid-trigger-examples.gz" built above is not matched -- confirm the intended names
        for stepTag in [self.workDir+"grid-trigger", self.workDir+"grid-edge", self.workDir+"grid-unmerging"]:
            for fileStem in ["-classifications", "-classifications.log", "examples.gz", "pred.xml.gz"]:
                if os.path.exists(stepTag+fileStem):
                    os.remove(stepTag+fileStem)
Esempio n. 13
0
 def addClassifierModel(self, model, classifierModelPath, classifierParameters):
     """
     Add a classifier model file and its parameters to a model.

     @param model: the model object to add to (uses its get() and addStr() methods)
     @param classifierModelPath: path of the classifier model file to copy
     @param classifierParameters: classifier parameters, normalized with Parameters.get and
         saved as a string under "<tag>classifier-parameter"
     @return: the path returned by model.get(), to which the classifier file was copied
     """
     tag = self.tag
     pathInModel = model.get(tag + "classifier-model", True)
     shutil.copy2(classifierModelPath, pathInModel)
     parameterString = Parameters.toString(Parameters.get(classifierParameters))
     model.addStr(tag + "classifier-parameter", parameterString)
     return pathInModel
Esempio n. 14
0
def train(output,
          task=None,
          detector=None,
          inputFiles=None,
          models=None,
          parse=None,
          processUnmerging=None,
          processModifiers=None,
          bioNLPSTParams=None,
          preprocessorParams=None,
          exampleStyles=None,
          classifierParams=None,
          doFullGrid=False,
          deleteOutput=False,
          copyFrom=None,
          log="log.txt",
          step=None,
          omitSteps=None,
          debug=False,
          connection=None,
          subset=None,
          folds=None):
    """
    Train a new model for event or relation detection.
    
    @param output: A directory where output files will appear.
    @param task: If defined, overridable default settings are used for many of the training parameters. Must be one of the supported TEES tasks.
    @param detector: a Detector object, or a string defining one to be imported
    @param inputFiles: A dictionary of file names, with keys "train", "devel" and, "test"
    @param models: A dictionary of file names defining the place for the new models, with keys "devel" and, "test"
    @param parse: The parse element name in the training interaction XML
    @param processUnmerging: Use the unmerging step of EventDetector. True, False or None for task default.
    @param processModifiers: Use the modifier detection step of EventDetector. True, False or None for task default.
    @param bioNLPSTParams: Parameters controlling BioNLP ST format output.
    @param preprocessorParams: Parameters controlling the preprocessor. Not used for training, but saved to the model for use when classifying.
    @param exampleStyles: A parameter set for controlling example builders.
    @param classifierParams: A parameter set for controlling classifiers.
    @param doFullGrid: Whether all parameters, as opposed to just recall adjustment, are tested in the EventDetector grid search.
    @param deleteOutput: Remove an existing output directory
    @param copyFrom: Copy an existing output directory for use as a template
    @param log: An optional alternative name for the log file. None is for no logging.
    @param step: A step=substep pair, where the steps are "TRAIN", "DEVEL", "EMPTY" and "TEST"
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved
    @param connection: A parameter set defining a local or remote connection for training the classifier
    @param subset: A parameter set for making subsets of input files
    """
    # Insert default arguments where needed
    inputFiles = setDictDefaults(inputFiles, {
        "train": None,
        "devel": None,
        "test": None
    })
    models = setDictDefaults(models, {"devel": None, "test": None})
    exampleStyles = setDictDefaults(
        exampleStyles, {
            "examples": None,
            "trigger": None,
            "edge": None,
            "unmerging": None,
            "modifiers": None
        })
    classifierParams = setDictDefaults(
        classifierParams, {
            "examples": None,
            "trigger": None,
            "recall": None,
            "edge": None,
            "unmerging": None,
            "modifiers": None
        })
    subset = setDictDefaults(Parameters.get(subset), {
        "train": None,
        "devel": None,
        "test": None,
        "seed": 0,
        "all": None
    })
    folds = setDictDefaults(folds, {
        "train": None,
        "devel": None,
        "test": None
    })
    processUnmerging = getDefinedBool(processUnmerging)
    processModifiers = getDefinedBool(processModifiers)
    # Initialize working directory
    workdir(output, deleteOutput, copyFrom, log)
    # Get task specific parameters
    detector, bioNLPSTParams, preprocessorParams = getTaskSettings(
        task, detector, bioNLPSTParams, preprocessorParams, inputFiles,
        exampleStyles, classifierParams)
    # Learn training settings from input files
    detector = learnSettings(inputFiles, detector, classifierParams)
    # Get corpus subsets
    getFolds(inputFiles, folds)
    getSubsets(inputFiles, subset)
    if task != None:
        task = task.replace("-FULL", "")
    # Define processing steps
    selector, detectorSteps, omitDetectorSteps = getSteps(
        step, omitSteps, ["TRAIN", "DEVEL", "EMPTY", "TEST"])

    # Initialize the detector
    detector, detectorName = getDetector(detector)
    detector = detector()  # initialize object
    detector.debug = debug
    detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(
        bioNLPSTParams)
    #detector.useBioNLPSTFormat = useBioNLPSTFormat # classify-output and grid evaluation in ST-format
    #detector.stWriteScores = True # write confidence scores into additional st-format files
    connection = getConnection(connection)
    detector.setConnection(connection)
    connection.debug = debug
    if deleteOutput:
        connection.clearWorkDir()

    # Train
    if selector.check("TRAIN"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------------ Train Detector ------------------"
        print >> sys.stderr, "----------------------------------------------------"
        if isinstance(detector, SingleStageDetector):
            detector.train(inputFiles["train"],
                           inputFiles["devel"],
                           models["devel"],
                           models["test"],
                           exampleStyles["examples"],
                           classifierParams["examples"],
                           parse,
                           None,
                           task,
                           fromStep=detectorSteps["TRAIN"],
                           workDir="training")
        else:
            detector.train(inputFiles["train"],
                           inputFiles["devel"],
                           models["devel"],
                           models["test"],
                           exampleStyles["trigger"],
                           exampleStyles["edge"],
                           exampleStyles["unmerging"],
                           exampleStyles["modifiers"],
                           classifierParams["trigger"],
                           classifierParams["edge"],
                           classifierParams["unmerging"],
                           classifierParams["modifiers"],
                           classifierParams["recall"],
                           processUnmerging,
                           processModifiers,
                           doFullGrid,
                           task,
                           parse,
                           None,
                           fromStep=detectorSteps["TRAIN"],
                           workDir="training")
        # Save the detector type
        for model in [models["devel"], models["test"]]:
            if model != None and os.path.exists(model):
                model = Model(model, "a")
                model.addStr("detector", detectorName)
                if preprocessorParams != None:
                    preprocessor = Preprocessor()
                    model.addStr(
                        "preprocessorParams",
                        Parameters.toString(
                            preprocessor.getParameters(preprocessorParams)))
                model.save()
                model.close()
    if selector.check("DEVEL"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Check devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
        detector.classify(inputFiles["devel"],
                          models["devel"],
                          "classification-devel/devel",
                          goldData=inputFiles["devel"],
                          fromStep=detectorSteps["DEVEL"],
                          workDir="classification-devel")
    if selector.check("EMPTY"):
        # By passing an emptied devel set through the prediction system, we can check that we get the same predictions
        # as in the DEVEL step, ensuring the model does not use leaked information.
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Empty devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
        detector.classify(getEmptyCorpus(
            inputFiles["devel"],
            removeNames=("names" in str(exampleStyles["examples"])
                         or "names" in str(exampleStyles["trigger"]))),
                          models["devel"],
                          "classification-empty/devel-empty",
                          fromStep=detectorSteps["EMPTY"],
                          workDir="classification-empty")
    if selector.check("TEST"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------- Test set classification --------------"
        print >> sys.stderr, "----------------------------------------------------"
        if inputFiles["test"] == None or not os.path.exists(
                inputFiles["test"]):
            print >> sys.stderr, "Skipping, test file", inputFiles[
                "test"], "does not exist"
        else:
            #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
            detector.classify(inputFiles["test"],
                              models["test"],
                              "classification-test/test",
                              fromStep=detectorSteps["TEST"],
                              workDir="classification-test")
            if detector.bioNLPSTParams["convert"]:
                Utils.STFormat.Compare.compare(
                    "classification-test/test-events.tar.gz",
                    "classification-devel/devel-events.tar.gz", "a2")
Esempio n. 15
0
 def process(self,
             input,
             output,
             parameters=None,
             model=None,
             fromStep=None,
             toStep=None,
             omitSteps=None):
     """
     Run the steps of the tool chain on the input.

     The chain is entered as the STATE_TOOLCHAIN state. Each step accepted by
     checkStep is called with its stored arguments plus the current xml as
     input and, if the step has an intermediate file path, that path as
     output. For a skipped step whose status is "BEFORE", the intermediate
     file path is remembered and read as the input of the next step that is run.

     @param input: The input, given to initVariables as both source and xml
     @param output: The output path. Its directory part becomes the output directory and its file name part the intermediate file tag. An existing non-directory file at this path is removed before running.
     @param parameters: Tool chain parameters, resolved with getParameters
     @param model: Passed to getParameters together with the parameters
     @param fromStep: Passed to enterState
     @param toStep: Passed to enterState
     @param omitSteps: Passed to enterState
     @return: The xml if self.state is None after exiting the state, otherwise None
     """
     self.initVariables(source=input,
                        xml=input,
                        outDir=os.path.dirname(output))
     # An empty file name part (output is a directory path) gives an empty tag
     self.intermediateFileTag = os.path.basename(output) or ""
     stepNames = [entry[0] for entry in self.steps]
     self.enterState(self.STATE_TOOLCHAIN, stepNames, fromStep, toStep,
                     omitSteps)
     parameters = self.getParameters(parameters, model, defaultValue=NOTHING)
     self.applyParameters(parameters)

     # Report the effective parameters and clear a stale output file
     parameterString = Parameters.toString(
         parameters,
         skipKeysWithValues=[NOTHING],
         skipDefaults=self.getDefaultParameters())
     print >> sys.stderr, "Tool chain parameters:", parameterString
     if os.path.exists(output) and not os.path.isdir(output):
         print >> sys.stderr, "Removing existing preprocessor output file", output
         os.remove(output)

     # Run the tools
     pendingInput = None  # intermediate file of an earlier run, used with "fromStep"
     for step in self.steps:
         stepName = step[0]
         if not self.checkStep(stepName):
             if self.getStepStatus(stepName) == "BEFORE":
                 # this step was run earlier, its output may be needed as input
                 pendingInput = self.getIntermediateFilePath(step)
             continue
         if pendingInput != None:
             print >> sys.stderr, "Reading input from saved intermediate file", pendingInput
             self.xml = ETUtils.ETFromObj(pendingInput)
             pendingInput = None
         # Work on a copy so that the i/o arguments do not end up in the step definition
         toolArgs = copy.copy(step[2])
         toolArgs[step[4]["input"]] = self.xml
         if self.getIntermediateFilePath(step) != None:
             # this step should save an intermediate file
             toolArgs[step[4]["output"]] = self.getIntermediateFilePath(step)
         print >> sys.stderr, "Running step", stepName, "with arguments", toolArgs
         step[1](**toolArgs)

     # self.xml is state-specific and is removed by exitState, so keep a reference
     result = self.xml
     self.exitState()
     if self.state != None:
         # the tool chain has not finished completely
         return None
     if not os.path.isdir(output):
         # a directory output is only for intermediate files, otherwise save the final product
         ETUtils.write(result, output)
     return result
Esempio n. 16
0
 def beginModel(self,
                step,
                model,
                trainExampleFiles,
                testExampleFile,
                importIdsFromModel=None):
     """
     Begin the training process leading to a new model.

     Opens the model for writing, optionally imports ids and settings from an
     existing model, catenates the training example files when several are
     given, and calls the classifier's optimize method with step "SUBMIT".
     Nothing is done if checkStep rejects the step or if model is None.

     @param step: The step name given to checkStep
     @param model: The model to create, in a form accepted by openModel
     @param trainExampleFiles: A single example file name, or a sequence of example file names. Multiple files are read with gzip and catenated into one gzipped file in the work directory.
     @param testExampleFile: The example file passed on to the classifier's optimize call
     @param importIdsFromModel: An optional existing model from which ids and settings are imported
     """
     if not self.checkStep(step, False) or model is None:
         return
     if self.state is not None and step is not None:
         print >> sys.stderr, self.__class__.__name__ + ":" + self.state + ":" + step
     # Create combined model
     model = self.openModel(model, "w")
     assert model.mode in ["a", "w"], (model.path, model.mode)
     # Information can be imported from an existing model. In this case, model is trained
     # with the parameter already defined in the import source. This is used when training
     # the combined model.
     if importIdsFromModel is not None:
         model.importFrom(
             self.openModel(importIdsFromModel, "r"),
             [self.tag + "ids.classes", self.tag + "ids.features", "structure.txt"],
             [self.tag + "classifier-parameter", self.tag + "example-style",
              self.tag + "parse", self.tag + "task"])
         # Train the model with the parameters defined in the import source
         model.addStr(self.tag + "classifier-parameters-train",
                      model.getStr(self.tag + "classifier-parameter"))
     if self.bioNLPSTParams is not None and len(self.bioNLPSTParams) > 0:
         model.addStr("BioNLPSTParams",
                      Parameters.toString(self.bioNLPSTParams))
     # Catenate example files
     if isinstance(trainExampleFiles, types.StringTypes):
         combinedTrainExamples = trainExampleFiles
     elif len(trainExampleFiles) == 1:
         combinedTrainExamples = trainExampleFiles[0]
     else:
         combinedTrainExamples = self.workDir + os.path.normpath(
             model.path) + "-" + self.tag + "combined-examples.gz"
         # try/finally (rather than "with") closes every handle even if a copy
         # fails, and also works on Python versions where GzipFile is not a
         # context manager
         combinedTrainExamplesFile = gzip.open(combinedTrainExamples, 'wb')
         try:
             for trainExampleFile in trainExampleFiles:
                 print >> sys.stderr, "Catenating", trainExampleFile, "to", combinedTrainExamples
                 trainExampleInput = gzip.open(trainExampleFile, 'rb')
                 try:
                     shutil.copyfileobj(trainExampleInput,
                                        combinedTrainExamplesFile)
                 finally:
                     trainExampleInput.close()
         finally:
             combinedTrainExamplesFile.close()
     # Upload training model
     # The parameter grid is stored in the model as "*classifier-parameters-train" so that endModel can
     # use it, and also as annotation for the trained model. The final selected parameter will
     # be stored as "*classifier-parameter"
     classifierWorkDir = self.workDir + os.path.normpath(
         model.path) + "-" + self.tag + "models"
     trainParameters = model.getStr(self.tag + "classifier-parameters-train")
     classifier = self.getClassifier(trainParameters)(self.connection)
     classifier.optimize(combinedTrainExamples,
                         classifierWorkDir,
                         trainParameters,
                         testExampleFile,
                         model.get(self.tag + "ids.classes"),
                         step="SUBMIT",
                         evaluator=self.evaluator)
     model.save()
Esempio n. 17
0
File: train.py Progetto: ninjin/TEES
def train(output, task=None, detector=None, inputFiles=None, models=None, parse=None,
          processUnmerging=None, processModifiers=None, isSingleStage=False,
          bioNLPSTParams=None, preprocessorParams=None, exampleStyles=None,
          classifierParams=None, doFullGrid=False, deleteOutput=False, copyFrom=None,
          log="log.txt", step=None, omitSteps=None, debug=False, connection=None):
    """
    Train a new model for event or relation detection.

    The process consists of the steps "TRAIN", "DEVEL", "EMPTY" and "TEST". The
    "TEST" step is skipped with a message if the test input file is not defined
    or does not exist.

    @param output: A directory where output files will appear.
    @param task: If defined, overridable default settings are used for many of the training parameters. Must be one of the supported TEES tasks.
    @param detector: a Detector object, or a string defining one to be imported
    @param inputFiles: A dictionary of file names, with keys "train", "devel" and "test"
    @param models: A dictionary of file names defining the place for the new models, with keys "devel" and "test"
    @param parse: The parse element name in the training interaction XML
    @param processUnmerging: Use the unmerging step of EventDetector. True, False or None for task default.
    @param processModifiers: Use the modifier detection step of EventDetector. True, False or None for task default.
    @param isSingleStage: False for EventDetector, True for a single stage detector.
    @param bioNLPSTParams: Parameters controlling BioNLP ST format output.
    @param preprocessorParams: Parameters controlling the preprocessor. Not used for training, but saved to the model for use when classifying.
    @param exampleStyles: A parameter set for controlling example builders.
    @param classifierParams: A parameter set for controlling classifiers.
    @param doFullGrid: Whether all parameters, as opposed to just recall adjustment, are tested in the EventDetector grid search.
    @param deleteOutput: Remove an existing output directory
    @param copyFrom: Copy an existing output directory for use as a template
    @param log: An optional alternative name for the log file. None is for no logging.
    @param step: A step=substep pair, where the steps are "TRAIN", "DEVEL", "EMPTY" and "TEST"
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved
    @param connection: A parameter set defining a local or remote connection for training the classifier
    """
    # Insert default arguments where needed
    inputFiles = Parameters.get(inputFiles, {"train":None, "devel":None, "test":None})
    models = Parameters.get(models, {"devel":None, "test":None})
    exampleStyles = Parameters.get(exampleStyles, {"examples":None, "trigger":None, "edge":None, "unmerging":None, "modifiers":None})
    classifierParams = Parameters.get(classifierParams, {"examples":None, "trigger":None, "recall":None, "edge":None, "unmerging":None, "modifiers":None})
    processUnmerging = getDefinedBool(processUnmerging)
    processModifiers = getDefinedBool(processModifiers)
    # Initialize working directory
    workdir(output, deleteOutput, copyFrom, log)
    # Get task specific parameters
    (detector, processUnmerging, processModifiers, isSingleStage, bioNLPSTParams,
     preprocessorParams, exampleStyles, classifierParams, removeNamesFromEmpty) = getTaskSettings(
        task, detector, processUnmerging, processModifiers, isSingleStage, bioNLPSTParams,
        preprocessorParams, inputFiles, exampleStyles, classifierParams)
    if task is not None:
        # The size suffix only affects the task settings chosen above
        task = task.replace("-MINI", "").replace("-FULL", "")
    # Define processing steps
    selector, detectorSteps, omitDetectorSteps = getSteps(step, omitSteps, ["TRAIN", "DEVEL", "EMPTY", "TEST"])

    # Initialize the detector
    detector, detectorName = getDetector(detector)
    detector = detector() # initialize object
    detector.debug = debug
    detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(bioNLPSTParams)
    connection = getConnection(connection)
    detector.setConnection(connection)
    connection.debug = debug
    if deleteOutput:
        connection.clearWorkDir()

    # Train
    if selector.check("TRAIN"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------------ Train Detector ------------------"
        print >> sys.stderr, "----------------------------------------------------"
        if isSingleStage:
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["examples"], classifierParams["examples"], parse, None, task,
                           fromStep=detectorSteps["TRAIN"], workDir="training")
        else:
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["trigger"], exampleStyles["edge"], exampleStyles["unmerging"], exampleStyles["modifiers"],
                           classifierParams["trigger"], classifierParams["edge"], classifierParams["unmerging"], classifierParams["modifiers"],
                           classifierParams["recall"], processUnmerging, processModifiers,
                           doFullGrid, task, parse, None,
                           fromStep=detectorSteps["TRAIN"], workDir="training")
        # Save the detector type
        for modelPath in [models["devel"], models["test"]]:
            # A model path defaults to None, and os.path.exists(None) raises a
            # TypeError, so undefined models must be skipped explicitly
            if modelPath is None or not os.path.exists(modelPath):
                continue
            model = Model(modelPath, "a")
            model.addStr("detector", detectorName)
            if preprocessorParams is not None:
                preprocessor = Preprocessor()
                model.addStr("preprocessorParams", Parameters.toString(preprocessor.getParameters(preprocessorParams)))
            model.save()
            model.close()
    if selector.check("DEVEL"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Check devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        detector.classify(inputFiles["devel"], models["devel"], "classification-devel/devel",
                          goldData=inputFiles["devel"], fromStep=detectorSteps["DEVEL"],
                          workDir="classification-devel")
    if selector.check("EMPTY"):
        # By passing an emptied devel set through the prediction system, we can check that we get the same predictions
        # as in the DEVEL step, ensuring the model does not use leaked information.
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Empty devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        detector.classify(getEmptyCorpus(inputFiles["devel"], removeNames=removeNamesFromEmpty),
                          models["devel"], "classification-empty/devel-empty",
                          fromStep=detectorSteps["EMPTY"], workDir="classification-empty")
    if selector.check("TEST"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------- Test set classification --------------"
        print >> sys.stderr, "----------------------------------------------------"
        if inputFiles["test"] is None or not os.path.exists(inputFiles["test"]):
            print >> sys.stderr, "Skipping, test file", inputFiles["test"], "does not exist"
        else:
            detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
            detector.classify(inputFiles["test"], models["test"], "classification-test/test",
                              fromStep=detectorSteps["TEST"], workDir="classification-test")
            if detector.bioNLPSTParams["convert"]:
                Utils.STFormat.Compare.compare("classification-test/test-events.tar.gz", "classification-devel/devel-events.tar.gz", "a2")
Esempio n. 18
0
    def doGrid(self):
        """
        Search the parameter grid of trigger, recall adjustment ("booster") and edge parameters.

        Every parameter combination is classified with the trigger and edge
        detectors against self.optData and scored with evaluateGrid. The recall
        adjustment parameter of the best combination is saved to self.model and
        self.combinedModel. In a full grid search the classifier models of the
        best trigger and edge parameters are also added to self.model.
        Intermediate grid files in the work directory are removed at the end.
        """
        print >> sys.stderr, "--------- Parameter grid search ---------"
        # Build trigger examples
        self.triggerDetector.buildExamples(
            self.model, [self.optData],
            [self.workDir + "grid-trigger-examples"])

        # A full grid search reads the parameters stored for training, otherwise
        # the parameters stored under "classifier-parameter" are used
        if self.fullGrid:
            parameterKey = "classifier-parameters-train"
        else:
            parameterKey = "classifier-parameter"
        stepParams = {
            "trigger":
            Parameters.get(self.model.getStr(self.triggerDetector.tag + parameterKey,
                                             defaultIfNotExist=""),
                           valueListKey="c"),
            "booster":
            [float(i) for i in self.recallAdjustParameters.split(",")],
            "edge":
            Parameters.get(self.model.getStr(self.edgeDetector.tag + parameterKey,
                                             defaultIfNotExist=""),
                           valueListKey="c")
        }

        # Expand the classifier parameters into individual parameter strings
        for step in ["trigger", "edge"]:
            stepParams[step] = [
                Parameters.toString(combination)
                for combination in Parameters.getCombinations(stepParams[step])
            ]
        gridAxes = [stepParams[x] for x in ["trigger", "booster", "edge"]]
        print >> sys.stderr, "Parameters", gridAxes
        paramCombinations = combine(*gridAxes)
        print >> sys.stderr, "Combinations", paramCombinations
        paramCombinations = [{
            "trigger": combination[0],
            "booster": combination[1],
            "edge": combination[2]
        } for combination in paramCombinations]

        prevParams = None
        EDGE_MODEL_STEM = os.path.join(
            self.edgeDetector.workDir,
            os.path.normpath(self.model.path) + "-edge-models/model")
        TRIGGER_MODEL_STEM = os.path.join(
            self.triggerDetector.workDir,
            os.path.normpath(self.model.path) + "-trigger-models/model")
        self.structureAnalyzer.load(self.model)
        bestResults = None
        triggerXML = None  # trigger predictions, reused while trigger and booster stay the same
        for i, params in enumerate(paramCombinations):
            print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
            print >> sys.stderr, "Processing params", str(i + 1) + "/" + str(
                len(paramCombinations)), params
            print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
            # Triggers and Boost (the trigger predictions are recalculated only when the relevant parameters change)
            if (prevParams is None) or (
                    prevParams["trigger"] != params["trigger"]) or (
                        prevParams["booster"] != params["booster"]):
                print >> sys.stderr, "Classifying trigger examples for parameters", "trigger:" + str(
                    params["trigger"]), "booster:" + str(params["booster"])
                triggerXML = self.triggerDetector.classifyToXML(
                    self.optData,
                    self.model,
                    self.workDir + "grid-trigger-examples",
                    self.workDir + "grid-",
                    classifierModel=TRIGGER_MODEL_STEM +
                    Parameters.toId(params["trigger"]),
                    recallAdjust=params["booster"],
                    useExistingExamples=True)
            prevParams = params
            # Classify with pre-defined model. The input is always the trigger
            # prediction, kept in its own variable so that the edge output of the
            # previous combination is not fed back in when the triggers are reused.
            # NOTE(review): if classifyToXML modifies its input in place, the reused
            # object may still carry earlier edge predictions -- confirm.
            edgeClassifierModel = EDGE_MODEL_STEM + Parameters.toId(
                params["edge"])
            xml = self.edgeDetector.classifyToXML(
                triggerXML,
                self.model,
                self.workDir + "grid-edge-examples",
                self.workDir + "grid-",
                classifierModel=edgeClassifierModel,
                goldData=self.optData)
            bestResults = self.evaluateGrid(xml, params, bestResults)
        # Remove remaining intermediate grid files
        for tag1 in ["edge", "trigger", "unmerging"]:
            for tag2 in ["examples", "pred.xml.gz"]:
                gridFile = self.workDir + "grid-" + tag1 + "-" + tag2
                if os.path.exists(gridFile):
                    os.remove(gridFile)
        bestParams = bestResults[0]
        print >> sys.stderr, "Parameter grid search complete"
        print >> sys.stderr, "Tested", len(paramCombinations), "combinations"
        print >> sys.stderr, "Best parameters:", bestParams
        print >> sys.stderr, "Best result:", bestResults[2]  # f-score
        # Save grid model
        self.saveStr("recallAdjustParameter", str(bestParams["booster"]),
                     self.model)
        self.saveStr("recallAdjustParameter", str(bestParams["booster"]),
                     self.combinedModel, False)
        if self.fullGrid:  # define best models
            # The model files are addressed with Parameters.toId, exactly as
            # when they were used for classification in the grid loop above
            self.triggerDetector.addClassifierModel(
                self.model,
                TRIGGER_MODEL_STEM + Parameters.toId(bestParams["trigger"]),
                bestParams["trigger"])
            self.edgeDetector.addClassifierModel(
                self.model,
                EDGE_MODEL_STEM + Parameters.toId(bestParams["edge"]),
                bestParams["edge"])
        # Remove work files
        # NOTE(review): "examples.gz" and "pred.xml.gz" have no leading "-", unlike
        # the other stems and the file names used above -- confirm whether
        # "-examples.gz" and "-pred.xml.gz" were intended.
        for stepTag in [
                self.workDir + "grid-trigger", self.workDir + "grid-edge",
                self.workDir + "grid-unmerging"
        ]:
            for fileStem in [
                    "-classifications", "-classifications.log", "examples.gz",
                    "pred.xml.gz"
            ]:
                if os.path.exists(stepTag + fileStem):
                    os.remove(stepTag + fileStem)
Esempio n. 19
0
def train(output, task=None, detector=None, inputFiles=None, models=None, parse=None,
          processUnmerging=None, processModifiers=None, 
          bioNLPSTParams=None, preprocessorParams=None, exampleStyles=None, 
          classifierParams=None,  doFullGrid=False, deleteOutput=False, copyFrom=None, 
          log="log.txt", step=None, omitSteps=None, debug=False, connection=None, subset=None, 
          folds=None, corpusDir=None, corpusPreprocessing=None, evaluator=None):
    """
    Train a new model for event or relation detection.
    
    The function runs up to four selectable steps in order: "TRAIN" (train the
    detector and store its type in the models), "DEVEL" (classify the devel set),
    "EMPTY" (classify an emptied devel set and evaluate it against the devel gold
    data) and "TEST" (classify the test set, if the test file exists).
    
    @param output: A directory where output files will appear.
    @param task: If defined, overridable default settings are used for many of the training parameters. Must be one of the supported TEES tasks.
    @param detector: a Detector object, or a string defining one to be imported. A string containing "keras" enables the Keras detector mode.
    @param inputFiles: A dictionary of file names, with keys "train", "devel" and, "test"
    @param models: A dictionary of file names defining the place for the new models, with keys "devel" and, "test"
    @param parse: The parse element name in the training interaction XML
    @param processUnmerging: Use the unmerging step of EventDetector. True, False or None for task default.
    @param processModifiers: Use the modifier detection step of EventDetector. True, False or None for task default.
    @param bioNLPSTParams: Parameters controlling BioNLP ST format output.
    @param preprocessorParams: Parameters controlling the preprocessor. Not used for training, but saved to the model for use when classifying.
    @param exampleStyles: A parameter set for controlling example builders.
    @param classifierParams: A parameter set for controlling classifiers.
    @param doFullGrid: Whether all parameters, as opposed to just recall adjustment, are tested in the EventDetector grid search.
    @param deleteOutput: Remove an existing output directory
    @param copyFrom: Copy an existing output directory for use as a template
    @param log: An optional alternative name for the log file. None is for no logging.
    @param step: A step=substep pair, where the steps are "TRAIN", "DEVEL", "EMPTY" and "TEST"
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved
    @param connection: A parameter set defining a local or remote connection for training the classifier
    @param subset: A parameter set for making subsets of input files
    @param folds: A dictionary with keys "train", "devel" and "test", passed to the task settings and to getFolds.
    @param corpusDir: Passed on to getTaskSettings (presumably the directory of the task corpora -- confirm there).
    @param corpusPreprocessing: If defined, preprocessing steps applied to the input files before training. The first step must be "MERGE_SETS" and the last "DIVIDE_SETS". Requires a defined task, as the task name is used for the preprocessed file names.
    @param evaluator: If defined, an evaluator (or a string defining one to be imported) that is assigned to the detector.
    """
    # Insert default arguments where needed
    inputFiles = setDictDefaults(inputFiles, {"train":None, "devel":None, "test":None})
    models = setDictDefaults(models, {"devel":"model-devel", "test":"model-test"})
    exampleStyles = setDictDefaults(exampleStyles, {"examples":None, "trigger":None, "edge":None, "unmerging":None, "modifiers":None})
    classifierParams = setDictDefaults(classifierParams, {"examples":None, "trigger":None, "recall":None, "edge":None, "unmerging":None, "modifiers":None})
    subset = setDictDefaults(Parameters.get(subset), {"train":None, "devel":None, "test":None, "seed":0, "all":None})
    folds = setDictDefaults(folds, {"train":None, "devel":None, "test":None})
    processUnmerging = getDefinedBool(processUnmerging)
    processModifiers = getDefinedBool(processModifiers)
    # Initialize working directory
    workdir(output, deleteOutput, copyFrom, log)
    # Get task specific parameters
    useKerasDetector = False
    # The detector may also be a Detector object (see the docstring), which has no
    # lower() method, so only string-like values are inspected for the Keras marker.
    if detector is not None and hasattr(detector, "lower") and "keras" in detector.lower():
        print >> sys.stderr, "Using a Keras Detector"
        useKerasDetector = True
        if detector.lower() == "keras":
            # A plain "keras" only selects the mode; the actual detector comes from the task settings
            detector = None
    detector, bioNLPSTParams, preprocessorParams, folds = getTaskSettings(task, detector, 
        bioNLPSTParams, preprocessorParams, inputFiles, exampleStyles, classifierParams, folds, corpusDir=corpusDir, useKerasDetector=useKerasDetector)
    # Learn training settings from input files
    detector = learnSettings(inputFiles, detector, classifierParams, task, exampleStyles, useKerasDetector=useKerasDetector)   
    # Get corpus subsets   
    getFolds(inputFiles, folds)
    getSubsets(inputFiles, subset)
    if task is not None:
        task = task.replace("-FULL", "")
        # The sub-task check must stay inside the None guard: testing '"." in None' raises a TypeError
        if "." in task:
            _, subTask = getSubTask(task)
            if subTask != 3:
                processModifiers = False
    # Preprocess the corpus if required
    if corpusPreprocessing is not None:
        preprocessor = Preprocessor(steps=corpusPreprocessing)
        assert preprocessor.steps[0].name == "MERGE_SETS"
        assert preprocessor.steps[-1].name == "DIVIDE_SETS"
        preprocessedCorpusDir = os.path.join(output, "corpus")
        # The task name is the file stem of the preprocessed corpus, so task must be defined here
        preprocessor.process(inputFiles, os.path.join(preprocessedCorpusDir, task))
        # Redirect the defined input sets to the preprocessed files
        for setName in inputFiles.keys():
            if inputFiles[setName] is not None:
                inputFiles[setName] = os.path.join(preprocessedCorpusDir, task + "-" + setName + ".xml")
    # Define processing steps
    selector, detectorSteps, omitDetectorSteps = getSteps(step, omitSteps, ["TRAIN", "DEVEL", "EMPTY", "TEST"])
    
    # Initialize the detector
    detector, detectorName = getDetector(detector, evaluator=evaluator)
    evaluator, evaluatorName = importClass(evaluator, "evaluator")
    detector = detector() # initialize object
    if evaluator is not None:
        print >> sys.stderr, "Using evaluator", evaluator.__name__
        detector.evaluator = evaluator
    detector.debug = debug
    detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(bioNLPSTParams)
    connection = getConnection(connection)
    detector.setConnection(connection)
    connection.debug = debug
    if deleteOutput:
        connection.clearWorkDir()
    
    # Train
    if selector.check("TRAIN"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------------ Train Detector ------------------"
        print >> sys.stderr, "----------------------------------------------------"
        if not isinstance(detector, EventDetector):
            # Single-stage detectors use only the generic "examples" settings
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["examples"], classifierParams["examples"], parse, None, task,
                           fromStep=detectorSteps["TRAIN"], workDir="training", testData=inputFiles["test"])
        else:
            # The EventDetector takes separate settings for each of its component steps
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["trigger"], exampleStyles["edge"], exampleStyles["unmerging"], exampleStyles["modifiers"],
                           classifierParams["trigger"], classifierParams["edge"], classifierParams["unmerging"], classifierParams["modifiers"],
                           classifierParams["recall"], processUnmerging, processModifiers, 
                           doFullGrid, task, parse, None,
                           fromStep=detectorSteps["TRAIN"], workDir="training", testData=inputFiles["test"])
        # Save the detector type
        for modelPath in [models["devel"], models["test"]]:
            if modelPath is not None and os.path.exists(modelPath):
                model = Model(modelPath, "a")
                model.addStr("detector", detectorName)
                if evaluatorName is not None:
                    # NOTE(review): the evaluator name is stored under the "detector" key, the same key
                    # as the detector name above. This looks like it should be "evaluator" -- confirm
                    # against the code that reads the model before changing the key.
                    model.addStr("detector", evaluatorName)
                if preprocessorParams is not None:
                    preprocessor = Preprocessor()
                    model.addStr("preprocessorParams", Parameters.toString(preprocessor.getParameters(preprocessorParams)))
                model.save()
                model.close()
    if selector.check("DEVEL"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Check devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        detector.classify(inputFiles["devel"], models["devel"], "classification-devel/devel", goldData=inputFiles["devel"], fromStep=detectorSteps["DEVEL"], workDir="classification-devel")
    if selector.check("EMPTY"):
        # By passing an emptied devel set through the prediction system, we can check that we get the same predictions
        # as in the DEVEL step, ensuring the model does not use leaked information.
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Empty devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        # Choose how much annotation is removed from the devel set before classification
        removalScope = "non-given"
        if "names" in str(exampleStyles["examples"]) or "names" in str(exampleStyles["trigger"]):
            removalScope = "all"
        elif "Edge" in detector.__class__.__name__:
            removalScope = "interactions"
        detector.classify(getEmptyCorpus(inputFiles["devel"], scope=removalScope), models["devel"], "classification-empty/devel-empty", fromStep=detectorSteps["EMPTY"], workDir="classification-empty")
        print >> sys.stderr, "*** Evaluate empty devel classification ***"
        if os.path.exists("classification-empty/devel-empty-pred.xml.gz"):
            EvaluateInteractionXML.run(detector.evaluator, "classification-empty/devel-empty-pred.xml.gz", inputFiles["devel"], parse)
        else:
            print >> sys.stderr, "No output file for evaluation"
    if selector.check("TEST"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------- Test set classification --------------"
        print >> sys.stderr, "----------------------------------------------------"
        if inputFiles["test"] is None or not os.path.exists(inputFiles["test"]):
            print >> sys.stderr, "Skipping, test file", inputFiles["test"], "does not exist"
        else:
            # Fall back to the devel model if no separate test model is defined
            testModel = models["test"] if models["test"] is not None else models["devel"]
            detector.classify(inputFiles["test"], testModel, "classification-test/test", fromStep=detectorSteps["TEST"], workDir="classification-test")
            if detector.bioNLPSTParams["convert"]:
                # Compare the converted test set output against the devel set output
                extension = ".zip" if (detector.bioNLPSTParams["convert"] == "zip") else ".tar.gz" 
                Utils.STFormat.Compare.compare("classification-test/test-events" + extension, "classification-devel/devel-events" + extension, "a2")
    # Stop logging
    if log is not None:
        Stream.closeLog(log)